diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index b15cf30106c76640016b1957fe9906ac01c74858..37bf814d85569a5aa8dd827e2fce17894da5a397 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -16,6 +16,7 @@ static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0);
  * LLVM bindings for Rust, and we are *not* writing any C++.
  */
 pub fn cpu_codegen<W: Write>(
+    module_name: &str,
     function: &Function,
     types: &Vec<Type>,
     constants: &Vec<Constant>,
@@ -27,6 +28,7 @@ pub fn cpu_codegen<W: Write>(
     w: &mut W,
 ) -> Result<(), Error> {
     let ctx = CPUContext {
+        module_name,
         function,
         types,
         constants,
@@ -40,6 +42,7 @@ pub fn cpu_codegen<W: Write>(
 }
 
 struct CPUContext<'a> {
+    module_name: &'a str,
     function: &'a Function,
     types: &'a Vec<Type>,
     constants: &'a Vec<Constant>,
@@ -65,16 +68,18 @@ impl<'a> CPUContext<'a> {
             if self.types[return_type.idx()].is_primitive() {
                 write!(
                     w,
-                    "define dso_local {} @{}(",
+                    "define dso_local {} @{}_{}(",
                     self.get_type(return_type),
-                    self.function.name
+                    self.module_name,
+                    self.function.name,
                 )?;
             } else {
                 write!(
                     w,
-                    "define dso_local nonnull noundef {} @{}(",
+                    "define dso_local nonnull noundef {} @{}_{}(",
                     self.get_type(return_type),
-                    self.function.name
+                    self.module_name,
+                    self.function.name,
                 )?;
             }
         } else {
@@ -89,7 +94,11 @@ impl<'a> CPUContext<'a> {
                     .collect::<Vec<_>>()
                     .join(", "),
             )?;
-            write!(w, "define dso_local void @{}(", self.function.name,)?;
+            write!(
+                w,
+                "define dso_local void @{}_{}(",
+                self.module_name, self.function.name,
+            )?;
         }
         let mut first_param = true;
         // The first parameter is a pointer to CPU backing memory, if it's
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index a3eea2745ea6dde2929b9cd6fcc17f5c6483643f..c9720273c03243d4874b27fc6c74f20fd21a6c33 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -14,6 +14,7 @@ use crate::*;
  * of similarities with the CPU LLVM generation plus custom GPU parallelization.
  */
 pub fn gpu_codegen<W: Write>(
+    module_name: &str,
     function: &Function,
     types: &Vec<Type>,
     constants: &Vec<Constant>,
@@ -170,6 +171,7 @@ pub fn gpu_codegen<W: Write>(
     };
 
     let ctx = GPUContext {
+        module_name,
         function,
         types,
         constants,
@@ -199,6 +201,7 @@ struct GPUKernelParams {
 }
 
 struct GPUContext<'a> {
+    module_name: &'a str,
     function: &'a Function,
     types: &'a Vec<Type>,
     constants: &'a Vec<Constant>,
@@ -395,8 +398,8 @@ namespace cg = cooperative_groups;
     fn codegen_kernel_begin<W: Write>(&self, w: &mut W) -> Result<(), Error> {
         write!(
             w,
-            "__global__ void __launch_bounds__({}) {}_gpu(",
-            self.kernel_params.max_num_threads, self.function.name
+            "__global__ void __launch_bounds__({}) {}_{}_gpu(",
+            self.kernel_params.max_num_threads, self.module_name, self.function.name
         )?;
         let mut first_param = true;
         // The first parameter is a pointer to GPU backing memory, if it's
@@ -645,7 +648,7 @@ namespace cg = cooperative_groups;
         } else {
             write!(w, "{}", self.get_type(self.function.return_types[0], false))?;
         }
-        write!(w, " {}(", self.function.name)?;
+        write!(w, " {}_{}(", self.module_name, self.function.name)?;
 
         let mut first_param = true;
         // The first parameter is a pointer to GPU backing memory, if it's
@@ -721,8 +724,13 @@ namespace cg = cooperative_groups;
         write!(w, "\tcudaError_t err;\n")?;
         write!(
             w,
-            "\t{}_gpu<<<{}, {}, {}>>>({});\n",
-            self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args
+            "\t{}_{}_gpu<<<{}, {}, {}>>>({});\n",
+            self.module_name,
+            self.function.name,
+            num_blocks,
+            num_threads,
+            dynamic_shared_offset,
+            pass_args
         )?;
         write!(w, "\terr = cudaGetLastError();\n")?;
         write!(
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 8fa0c09ee512e3f2e43c5280bc3cb6947bc31dc5..6981a3da7e59176f73d6fecdde07fe636cc6aecf 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -74,6 +74,7 @@ use crate::*;
  * set some CUDA memory - the user can then take a CUDA reference to that box.
  */
 pub fn rt_codegen<W: Write>(
+    module_name: &str,
     func_id: FunctionID,
     module: &Module,
     def_use: &ImmutableDefUseMap,
@@ -96,6 +97,7 @@ pub fn rt_codegen<W: Write>(
         .map(|(fork, join)| (*join, *fork))
         .collect();
     let ctx = RTContext {
+        module_name,
         func_id,
         module,
         def_use,
@@ -117,6 +119,7 @@ pub fn rt_codegen<W: Write>(
 }
 
 struct RTContext<'a> {
+    module_name: &'a str,
     func_id: FunctionID,
     module: &'a Module,
     def_use: &'a ImmutableDefUseMap,
@@ -157,7 +160,8 @@ impl<'a> RTContext<'a> {
         // Dump the function signature.
         write!(
             w,
-            "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}(",
+            "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}_{}(",
+            self.module_name,
             func.name
         )?;
         let mut first_param = true;
@@ -236,7 +240,7 @@ impl<'a> RTContext<'a> {
                 // Create the return struct
                 write!(w, "let mut ret_struct: ::std::mem::MaybeUninit<ReturnStruct> = ::std::mem::MaybeUninit::uninit();")?;
                 // Call the device function
-                write!(w, "{}(", callee.name)?;
+                write!(w, "{}_{}(", self.module_name, callee.name)?;
                 if self.backing_allocations[&callee_id].contains_key(&self.devices[callee_id.idx()])
                 {
                     write!(w, "backing, ")?;
@@ -672,8 +676,9 @@ impl<'a> RTContext<'a> {
                 };
                 write!(
                     block,
-                    "{}{}(",
+                    "{}{}_{}(",
                     prefix,
+                    self.module_name,
                     self.module.functions[callee_id.idx()].name
                 )?;
                 for (device, (offset, size)) in self.backing_allocations[&self.func_id]
@@ -1463,7 +1468,7 @@ impl<'a> RTContext<'a> {
         }
 
         // Call the wrapped function.
-        write!(w, "let ret = {}(", func.name)?;
+        write!(w, "let ret = {}_{}(", self.module_name, func.name)?;
         for (device, _) in self.backing_allocations[&self.func_id].iter() {
             write!(
                 w,
@@ -1630,8 +1635,9 @@ impl<'a> RTContext<'a> {
         let func = &self.module.functions[func_id.idx()];
         write!(
             w,
-            "{}fn {}(",
+            "{}fn {}_{}(",
             if is_unsafe { "unsafe " } else { "" },
+            self.module_name,
             func.name
         )?;
         let mut first_param = true;
@@ -1667,7 +1673,7 @@ impl<'a> RTContext<'a> {
         func_id: FunctionID,
     ) -> Result<(), Error> {
         let func = &self.module.functions[func_id.idx()];
-        write!(w, "fn {}(", func.name)?;
+        write!(w, "fn {}_{}(", self.module_name, func.name)?;
         let mut first_param = true;
         if self.backing_allocations[&func_id].contains_key(&self.devices[func_id.idx()]) {
             first_param = false;
diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs
index 17cea32500ab86779c01e37e2b03db842f9f3712..0e332a0033c50585160242f04bfdcceb37f87ad5 100644
--- a/hercules_opt/src/editor.rs
+++ b/hercules_opt/src/editor.rs
@@ -880,6 +880,18 @@ impl<'a, 'b> FunctionEdit<'a, 'b> {
         }
     }
 
+    pub fn get_param_types(&self) -> &Vec<TypeID> {
+        self.updated_param_types
+            .as_ref()
+            .unwrap_or(&self.editor.function.param_types)
+    }
+
+    pub fn get_return_types(&self) -> &Vec<TypeID> {
+        self.updated_return_types
+            .as_ref()
+            .unwrap_or(&self.editor.function.return_types)
+    }
+
     pub fn set_param_types(&mut self, tys: Vec<TypeID>) {
         self.updated_param_types = Some(tys);
     }
diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs
index 99187dd2bbfb2a9bbc2bf5937cb40f5246ba5b29..38ed1b22d2d81be971aa867857fd01e3752ecc0a 100644
--- a/hercules_opt/src/inline.rs
+++ b/hercules_opt/src/inline.rs
@@ -1,8 +1,7 @@
+use std::cell::Ref;
 use std::collections::HashMap;
 
-use hercules_ir::callgraph::*;
-use hercules_ir::def_use::*;
-use hercules_ir::ir::*;
+use hercules_ir::*;
 
 use crate::*;
 
@@ -235,3 +234,205 @@ fn inline_func(
         });
     }
 }
+
+#[derive(Clone, Debug, Copy, PartialEq, Eq)]
+enum ParameterLattice {
+    Top,
+    Constant(ConstantID),
+    // Dynamic constant
+    DynamicConstant(DynamicConstantID, FunctionID),
+    Bottom,
+}
+
+impl ParameterLattice {
+    fn from_node(node: &Node, func_id: FunctionID) -> Self {
+        use ParameterLattice::*;
+        match node {
+            Node::Undef { ty: _ } => Top,
+            Node::Constant { id } => Constant(*id),
+            Node::DynamicConstant { id } => DynamicConstant(*id, func_id),
+            _ => Bottom,
+        }
+    }
+
+    fn meet(&mut self, b: Self, cons: Ref<'_, Vec<Constant>>, dcs: Ref<'_, Vec<DynamicConstant>>) {
+        use ParameterLattice::*;
+        *self = match (*self, b) {
+            (Top, b) => b,
+            (a, Top) => a,
+            (Bottom, _) | (_, Bottom) => Bottom,
+            (Constant(id_a), Constant(id_b)) => {
+                if id_a == id_b {
+                    Constant(id_a)
+                } else {
+                    Bottom
+                }
+            }
+            (DynamicConstant(dc_a, f_a), DynamicConstant(dc_b, f_b)) => {
+                if dc_a == dc_b && f_a == f_b {
+                    DynamicConstant(dc_a, f_a)
+                } else if let (
+                    ir::DynamicConstant::Constant(dcv_a),
+                    ir::DynamicConstant::Constant(dcv_b),
+                ) = (&dcs[dc_a.idx()], &dcs[dc_b.idx()])
+                    && *dcv_a == *dcv_b
+                {
+                    DynamicConstant(dc_a, f_a)
+                } else {
+                    Bottom
+                }
+            }
+            (DynamicConstant(dc, _), Constant(con)) | (Constant(con), DynamicConstant(dc, _)) => {
+                match (&cons[con.idx()], &dcs[dc.idx()]) {
+                    (ir::Constant::UnsignedInteger64(conv), ir::DynamicConstant::Constant(dcv))
+                        if *conv as usize == *dcv =>
+                    {
+                        Constant(con)
+                    }
+                    _ => Bottom,
+                }
+            }
+        }
+    }
+}
+
+/*
+ * Top level function to inline constant parameters and constant dynamic
+ * constant parameters. Identifies functions that are:
+ *
+ * 1. Not marked as entry.
+ * 2. At every call site, a particular parameter is always a specific constant
+ *    or dynamic constant.
+ *
+ * These functions can have that constant "inlined" - the parameter is removed
+ * and all uses of the parameter becomes uses of the constant directly.
+ */
+pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) {
+    // Run const inlining on each function, starting at the most shallow
+    // function first, since we want to propagate constants down the call graph.
+    for func_id in callgraph.topo().into_iter().rev() {
+        let func = editors[func_id.idx()].func();
+        if func.entry || callgraph.num_callers(func_id) == 0 {
+            continue;
+        }
+
+        // Figure out what we know about the parameters to this function.
+        let mut param_lattice = vec![ParameterLattice::Top; func.param_types.len()];
+        let mut callers = vec![];
+        for caller in callgraph.get_callers(func_id) {
+            let editor = &editors[caller.idx()];
+            let nodes = &editor.func().nodes;
+            for id in editor.node_ids() {
+                if let Some((_, callee, _, args)) = nodes[id.idx()].try_call()
+                    && callee == func_id
+                {
+                    if editor.is_mutable(id) {
+                        for (idx, id) in args.into_iter().enumerate() {
+                            let lattice = ParameterLattice::from_node(&nodes[id.idx()], callee);
+                            param_lattice[idx].meet(
+                                lattice,
+                                editor.get_constants(),
+                                editor.get_dynamic_constants(),
+                            );
+                        }
+                    } else {
+                        // If we can't modify the call node in the caller, then
+                        // we can't perform the inlining.
+                        param_lattice = vec![ParameterLattice::Bottom; func.param_types.len()];
+                    }
+                    callers.push((caller, id));
+                }
+            }
+        }
+        if param_lattice.iter().all(|v| *v == ParameterLattice::Bottom) {
+            continue;
+        }
+
+        // Replace the arguments.
+        let editor = &mut editors[func_id.idx()];
+        let mut param_idx_to_ids: HashMap<usize, Vec<NodeID>> = HashMap::new();
+        for id in editor.node_ids() {
+            if let Some(idx) = editor.func().nodes[id.idx()].try_parameter() {
+                param_idx_to_ids.entry(idx).or_default().push(id);
+            }
+        }
+        let mut params_to_remove = vec![];
+        let success = editor.edit(|mut edit| {
+            let mut param_tys = edit.get_param_types().clone();
+            let mut decrement_index_by = 0;
+            for idx in 0..param_tys.len() {
+                if let Some(node) = match param_lattice[idx] {
+                    ParameterLattice::Top => Some(Node::Undef { ty: param_tys[idx] }),
+                    ParameterLattice::Constant(id) => Some(Node::Constant { id }),
+                    ParameterLattice::DynamicConstant(id, _) => {
+                        // Rust moment.
+                        let maybe_cons = edit.get_dynamic_constant(id).try_constant();
+                        if let Some(val) = maybe_cons {
+                            Some(Node::DynamicConstant {
+                                id: edit.add_dynamic_constant(DynamicConstant::Constant(val)),
+                            })
+                        } else {
+                            None
+                        }
+                    }
+                    _ => None,
+                } && let Some(ids) = param_idx_to_ids.get(&idx)
+                {
+                    let node = edit.add_node(node);
+                    for id in ids {
+                        edit = edit.replace_all_uses(*id, node)?;
+                        edit = edit.delete_node(*id)?;
+                    }
+                    param_tys.remove(idx - decrement_index_by);
+                    params_to_remove.push(idx);
+                    decrement_index_by += 1;
+                } else if decrement_index_by != 0
+                    && let Some(ids) = param_idx_to_ids.get(&idx)
+                {
+                    let node = edit.add_node(Node::Parameter {
+                        index: idx - decrement_index_by,
+                    });
+                    for id in ids {
+                        edit = edit.replace_all_uses(*id, node)?;
+                        edit = edit.delete_node(*id)?;
+                    }
+                }
+            }
+            edit.set_param_types(param_tys);
+            Ok(edit)
+        });
+        params_to_remove.reverse();
+
+        // Update callers.
+        if success {
+            for (caller, call) in callers {
+                let editor = &mut editors[caller.idx()];
+                let success = editor.edit(|mut edit| {
+                    let Node::Call {
+                        control,
+                        function,
+                        dynamic_constants,
+                        args,
+                    } = edit.get_node(call).clone()
+                    else {
+                        panic!();
+                    };
+                    let mut args = args.into_vec();
+                    for idx in params_to_remove.iter() {
+                        args.remove(*idx);
+                    }
+                    let node = edit.add_node(Node::Call {
+                        control,
+                        function,
+                        dynamic_constants,
+                        args: args.into_boxed_slice(),
+                    });
+                    edit = edit.replace_all_uses(call, node)?;
+                    edit = edit.delete_node(call)?;
+                    Ok(edit)
+                });
+                assert!(success);
+            }
+        }
+    }
+}
diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn
index c7f4345bc5dc89d2fb55ee96231e6b5f6604ef4f..356bb3d91836ba0994cad56315b9a5588b0df8b7 100644
--- a/juno_samples/rodinia/backprop/src/backprop.jn
+++ b/juno_samples/rodinia/backprop/src/backprop.jn
@@ -4,7 +4,7 @@ fn squash(x: f32) -> f32 {
 }
 
 fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] {
-  let result : f32[m + 1];
+  @res let result : f32[m + 1];
   result[0] = 1.0;
 
   for j in 1..=m {
diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index 56fc2c9ae401985116fa7fbfdf69ed0e4e0ab926..fa3dccf123fb51fbe8d989b97a8ed35d1c763251 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -1,24 +1,24 @@
-gvn(*);
-dce(*);
-phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  infer-schedules(X);
+}
 
-let auto = auto-outline(backprop);
-cpu(auto.backprop);
-
-inline(auto.backprop);
-inline(auto.backprop);
+simpl!(*);
+inline(layer_forward);
 delete-uncalled(*);
 
-sroa[true](*);
-dce(*);
-float-collections(*);
-reuse-products(*);
-dce(*);
+no-memset(layer_forward@res);
+lift-dc-math(*);
+loop-bound-canon(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+  fork-coalesce(*);
+}
 
 gcm(*);
-
diff --git a/juno_samples/rodinia/bfs/build.rs b/juno_samples/rodinia/bfs/build.rs
index c19bae5d54b185ac2ec97ffc645fc86840c7ad15..bb8f9ff507e818b6010cf9e12bbf3e9cdf8c342d 100644
--- a/juno_samples/rodinia/bfs/build.rs
+++ b/juno_samples/rodinia/bfs/build.rs
@@ -13,6 +13,8 @@ fn main() {
     JunoCompiler::new()
         .file_in_src("bfs.jn")
         .unwrap()
+        .schedule_in_src("cpu.sch")
+        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn
index cf2ea086619dd431e8edd30c51d86d35972296fc..51dcd945429dfde02cb2313afa404e81f8722c84 100644
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
@@ -13,8 +13,8 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
   let visited: bool[n];
   visited[source as u64] = true;
   
-  let cost: i32[n];
-  for i in 0..n {
+  @cost @cost_init let cost: i32[n];
+  @cost_init for i in 0..n {
     cost[i] = -1;
   }
   cost[source as u64] = 0;
@@ -25,7 +25,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
   while !stop {
     stop = true;
 
-    for i in 0..n {
+    @loop1 for i in 0..n {
       if mask[i] {
         mask[i] = false;
 
@@ -42,7 +42,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
       }
     }
 
-    for i in 0..n {
+    @loop2 for i in 0..n {
       if updated[i] {
         mask[i] = true;
         visited[i] = true;
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..44cfa8ad0161fac0afbccc2d383637ec8a2f1aa0
--- /dev/null
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -0,0 +1,29 @@
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  infer-schedules(X);
+}
+
+phi-elim(bfs);
+no-memset(bfs@cost);
+outline(bfs@cost_init);
+let loop1 = outline(bfs@loop1);
+let loop2 = outline(bfs@loop2);
+
+simpl!(*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
+
+unforkify(*);
+
+gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch
index 9cbdb942bb484342afe42a2fc711852878474508..5fe48a8395cfb6fada1d668b4f73fa6eb3487f5e 100644
--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
@@ -1,23 +1,30 @@
-gvn(*);
-dce(*);
-phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  crc(X);
+  slf(X);
+  dce(X);
+  infer-schedules(X);
+}
 
-let auto = auto-outline(euler);
-cpu(auto.euler);
-
-inline(auto.euler);
-inline(auto.euler);
+simpl!(*);
+inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
 delete-uncalled(*);
+simpl!(*);
+ip-sroa[false](*);
+sroa[false](*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
 
-sroa[false](auto.euler);
-dce(*);
-float-collections(*);
-dce(*);
+unforkify(*);
 
 gcm(*);
-
diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
index 252015c368d594e843af34367450257d3459034f..6329c5046e15ca0bce646f4a778dcf8c8d781656 100644
--- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
@@ -1,23 +1,30 @@
-gvn(*);
-dce(*);
-phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  crc(X);
+  slf(X);
+  dce(X);
+  infer-schedules(X);
+}
 
-let auto = auto-outline(pre_euler);
-cpu(auto.pre_euler);
-
-inline(auto.pre_euler);
-inline(auto.pre_euler);
+simpl!(*);
+inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
 delete-uncalled(*);
+simpl!(*);
+ip-sroa[false](*);
+sroa[false](*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
 
-sroa[false](auto.pre_euler);
-dce(*);
-float-collections(*);
-dce(*);
+unforkify(*);
 
 gcm(*);
-
diff --git a/juno_samples/rodinia/srad/build.rs b/juno_samples/rodinia/srad/build.rs
index 36ba61207bb08766e95f7437859d6d6d2146339c..5e1f78f762a39dcc10e131b4d359cfb1097575c8 100644
--- a/juno_samples/rodinia/srad/build.rs
+++ b/juno_samples/rodinia/srad/build.rs
@@ -13,6 +13,8 @@ fn main() {
     JunoCompiler::new()
         .file_in_src("srad.jn")
         .unwrap()
+        .schedule_in_src("cpu.sch")
+        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..1a81ddad3b55bcf9ffb76660ebdc1069338affd4
--- /dev/null
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -0,0 +1,35 @@
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  infer-schedules(X);
+}
+
+phi-elim(*);
+let loop1 = outline(srad@loop1);
+let loop2 = outline(srad@loop2);
+let loop3 = outline(srad@loop3);
+simpl!(*);
+const-inline(*);
+crc(*);
+slf(*);
+write-predication(*);
+simpl!(*);
+predication(*);
+simpl!(*);
+predication(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+  fork-coalesce(*);
+}
+simpl!(*);
+
+fork-split(*);
+unforkify(*);
+
+gcm(*);
diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn
index 5eea647c58949ebd951149f57e1961cebe6fc443..3e016a99b574c1dcde982e7277a5cbcdc1743c19 100644
--- a/juno_samples/rodinia/srad/src/srad.jn
+++ b/juno_samples/rodinia/srad/src/srad.jn
@@ -38,7 +38,7 @@ fn srad<nrows, ncols: usize>(
 
     // These loops should really be interchanged, but they aren't in the
     // Rodinia source (though they are in the HPVM source)
-    for i in 0..nrows {
+    @loop1 for i in 0..nrows {
       for j in 0..ncols {
         let tmp = image[j, i];
         sum += tmp;
@@ -57,7 +57,7 @@ fn srad<nrows, ncols: usize>(
 
     let c : f32[ncols, nrows];
 
-    for j in 0..ncols {
+    @loop2 for j in 0..ncols {
       for i in 0..nrows {
         let Jc = image[j, i];
         dN[j, i] = image[j, iN[i] as u64] - Jc;
@@ -75,14 +75,15 @@ fn srad<nrows, ncols: usize>(
         let qsqr = num / (den * den);
 
         let den = (qsqr - q0sqr) / (q0sqr * (1 + q0sqr));
-        c[j, i] = 1.0 / (1.0 + den);
+	let val = 1.0 / (1.0 + den);
 
-        if c[j, i] < 0      { c[j, i] = 0; }
-        else if c[j, i] > 1 { c[j, i] = 1; }
+        if val < 0      { c[j, i] = 0; }
+        else if val > 1 { c[j, i] = 1; }
+	else            { c[j, i] = val; }
       }
     }
 
-    for j in 0..ncols {
+    @loop3 for j in 0..ncols {
       for i in 0..nrows {
         let cN = c[j, i];
         let cS = c[j, iS[i] as u64];
diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs
index 3c288ca7a61c855b3982ba6ed66c215c9e2942fe..bd27350a26d58f4e729b24d0026f12cd13ca7195 100644
--- a/juno_scheduler/src/compile.rs
+++ b/juno_scheduler/src/compile.rs
@@ -112,6 +112,7 @@ impl FromStr for Appliable {
             "ccp" => Ok(Appliable::Pass(ir::Pass::CCP)),
             "crc" | "collapse-read-chains" => Ok(Appliable::Pass(ir::Pass::CRC)),
             "clean-monoid-reduces" => Ok(Appliable::Pass(ir::Pass::CleanMonoidReduces)),
+            "const-inline" => Ok(Appliable::Pass(ir::Pass::ConstInline)),
             "dce" => Ok(Appliable::Pass(ir::Pass::DCE)),
             "delete-uncalled" => Ok(Appliable::DeleteUncalled),
             "float-collections" | "collections" => Ok(Appliable::Pass(ir::Pass::FloatCollections)),
diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs
index 3a087c0d40093c6363e67332c1bd489f22727a42..a0db884492120a43d0bb8fff89e689746ef1579e 100644
--- a/juno_scheduler/src/ir.rs
+++ b/juno_scheduler/src/ir.rs
@@ -8,8 +8,9 @@ pub enum Pass {
     ArrayToProduct,
     AutoOutline,
     CCP,
-    CleanMonoidReduces,
     CRC,
+    CleanMonoidReduces,
+    ConstInline,
     DCE,
     FloatCollections,
     ForkChunk,
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 5f2fa4cce02e04ca4499cbff8806356f81cc86bc..e049f985e0db36ae78368b8d33c01d22744fdcc6 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -951,6 +951,7 @@ impl PassManager {
         for idx in 0..module.functions.len() {
             match devices[idx] {
                 Device::LLVM => cpu_codegen(
+                    &module_name,
                     &module.functions[idx],
                     &module.types,
                     &module.constants,
@@ -966,6 +967,7 @@ impl PassManager {
                     error: format!("{}", e),
                 })?,
                 Device::CUDA => gpu_codegen(
+                    &module_name,
                     &module.functions[idx],
                     &module.types,
                     &module.constants,
@@ -986,6 +988,7 @@ impl PassManager {
                     error: format!("{}", e),
                 })?,
                 Device::AsyncRust => rt_codegen(
+                    &module_name,
                     FunctionID::new(idx),
                     &module,
                     &def_uses[idx],
@@ -1833,6 +1836,24 @@ fn run_pass(
             pm.delete_gravestones();
             pm.clear_analyses();
         }
+        Pass::ConstInline => {
+            assert!(args.is_empty());
+            pm.make_callgraph();
+            let callgraph = pm.callgraph.take().unwrap();
+
+            let mut editors: Vec<_> = build_selection(pm, selection, true)
+                .into_iter()
+                .map(|editor| editor.unwrap())
+                .collect();
+            const_inline(&mut editors, &callgraph);
+
+            for func in editors {
+                changed |= func.modified();
+            }
+
+            pm.delete_gravestones();
+            pm.clear_analyses();
+        }
         Pass::CRC => {
             assert!(args.is_empty());
             for func in build_selection(pm, selection, false) {