diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index b15cf30106c76640016b1957fe9906ac01c74858..37bf814d85569a5aa8dd827e2fce17894da5a397 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -16,6 +16,7 @@ static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0);
  * LLVM bindings for Rust, and we are *not* writing any C++.
  */
 pub fn cpu_codegen<W: Write>(
+    module_name: &str,
     function: &Function,
     types: &Vec<Type>,
     constants: &Vec<Constant>,
@@ -27,6 +28,7 @@ pub fn cpu_codegen<W: Write>(
     w: &mut W,
 ) -> Result<(), Error> {
     let ctx = CPUContext {
+        module_name,
         function,
         types,
         constants,
@@ -40,6 +42,7 @@ pub fn cpu_codegen<W: Write>(
 }
 
 struct CPUContext<'a> {
+    module_name: &'a str,
     function: &'a Function,
     types: &'a Vec<Type>,
     constants: &'a Vec<Constant>,
@@ -65,16 +68,18 @@ impl<'a> CPUContext<'a> {
             if self.types[return_type.idx()].is_primitive() {
                 write!(
                     w,
-                    "define dso_local {} @{}(",
+                    "define dso_local {} @{}_{}(",
                     self.get_type(return_type),
-                    self.function.name
+                    self.module_name,
+                    self.function.name,
                 )?;
             } else {
                 write!(
                     w,
-                    "define dso_local nonnull noundef {} @{}(",
+                    "define dso_local nonnull noundef {} @{}_{}(",
                     self.get_type(return_type),
-                    self.function.name
+                    self.module_name,
+                    self.function.name,
                 )?;
             }
         } else {
@@ -89,7 +94,11 @@ impl<'a> CPUContext<'a> {
                     .collect::<Vec<_>>()
                     .join(", "),
             )?;
-            write!(w, "define dso_local void @{}(", self.function.name,)?;
+            write!(
+                w,
+                "define dso_local void @{}_{}(",
+                self.module_name, self.function.name,
+            )?;
         }
         let mut first_param = true;
         // The first parameter is a pointer to CPU backing memory, if it's
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index a3eea2745ea6dde2929b9cd6fcc17f5c6483643f..1ba8302ab84c96d818e8fdbfecea10576ed946a5 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -14,6 +14,7 @@ use crate::*;
  * of similarities with the CPU LLVM generation plus custom GPU parallelization.
  */
 pub fn gpu_codegen<W: Write>(
+    module_name: &str,
     function: &Function,
     types: &Vec<Type>,
     constants: &Vec<Constant>,
@@ -170,6 +171,7 @@ pub fn gpu_codegen<W: Write>(
     };
 
     let ctx = GPUContext {
+        module_name,
         function,
         types,
         constants,
@@ -199,6 +201,7 @@ struct GPUKernelParams {
 }
 
 struct GPUContext<'a> {
+    module_name: &'a str,
     function: &'a Function,
     types: &'a Vec<Type>,
     constants: &'a Vec<Constant>,
@@ -395,8 +398,8 @@ namespace cg = cooperative_groups;
     fn codegen_kernel_begin<W: Write>(&self, w: &mut W) -> Result<(), Error> {
         write!(
             w,
-            "__global__ void __launch_bounds__({}) {}_gpu(",
-            self.kernel_params.max_num_threads, self.function.name
+            "__global__ void __launch_bounds__({}) {}_{}_gpu(",
+            self.kernel_params.max_num_threads, self.module_name, self.function.name
         )?;
         let mut first_param = true;
         // The first parameter is a pointer to GPU backing memory, if it's
@@ -645,7 +648,7 @@ namespace cg = cooperative_groups;
         } else {
             write!(w, "{}", self.get_type(self.function.return_types[0], false))?;
         }
-        write!(w, " {}(", self.function.name)?;
+        write!(w, " {}_{}(", self.module_name, self.function.name)?;
 
         let mut first_param = true;
         // The first parameter is a pointer to GPU backing memory, if it's
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 8fa0c09ee512e3f2e43c5280bc3cb6947bc31dc5..6981a3da7e59176f73d6fecdde07fe636cc6aecf 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -74,6 +74,7 @@ use crate::*;
  * set some CUDA memory - the user can then take a CUDA reference to that box.
  */
 pub fn rt_codegen<W: Write>(
+    module_name: &str,
     func_id: FunctionID,
     module: &Module,
     def_use: &ImmutableDefUseMap,
@@ -96,6 +97,7 @@ pub fn rt_codegen<W: Write>(
         .map(|(fork, join)| (*join, *fork))
         .collect();
     let ctx = RTContext {
+        module_name,
         func_id,
         module,
         def_use,
@@ -117,6 +119,7 @@ pub fn rt_codegen<W: Write>(
 }
 
 struct RTContext<'a> {
+    module_name: &'a str,
     func_id: FunctionID,
     module: &'a Module,
     def_use: &'a ImmutableDefUseMap,
@@ -157,7 +160,8 @@ impl<'a> RTContext<'a> {
         // Dump the function signature.
         write!(
             w,
-            "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}(",
+            "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}_{}(",
+            self.module_name,
             func.name
         )?;
         let mut first_param = true;
@@ -236,7 +240,7 @@ impl<'a> RTContext<'a> {
                 // Create the return struct
                 write!(w, "let mut ret_struct: ::std::mem::MaybeUninit<ReturnStruct> = ::std::mem::MaybeUninit::uninit();")?;
                 // Call the device function
-                write!(w, "{}(", callee.name)?;
+                write!(w, "{}_{}(", self.module_name, callee.name)?;
                 if self.backing_allocations[&callee_id].contains_key(&self.devices[callee_id.idx()])
                 {
                     write!(w, "backing, ")?;
@@ -672,8 +676,9 @@ impl<'a> RTContext<'a> {
                 };
                 write!(
                     block,
-                    "{}{}(",
+                    "{}{}_{}(",
                     prefix,
+                    self.module_name,
                     self.module.functions[callee_id.idx()].name
                 )?;
                 for (device, (offset, size)) in self.backing_allocations[&self.func_id]
@@ -1463,7 +1468,7 @@ impl<'a> RTContext<'a> {
         }
 
         // Call the wrapped function.
-        write!(w, "let ret = {}(", func.name)?;
+        write!(w, "let ret = {}_{}(", self.module_name, func.name)?;
         for (device, _) in self.backing_allocations[&self.func_id].iter() {
             write!(
                 w,
@@ -1630,8 +1635,9 @@ impl<'a> RTContext<'a> {
         let func = &self.module.functions[func_id.idx()];
         write!(
             w,
-            "{}fn {}(",
+            "{}fn {}_{}(",
             if is_unsafe { "unsafe " } else { "" },
+            self.module_name,
             func.name
         )?;
         let mut first_param = true;
@@ -1667,7 +1673,7 @@ impl<'a> RTContext<'a> {
         func_id: FunctionID,
     ) -> Result<(), Error> {
         let func = &self.module.functions[func_id.idx()];
-        write!(w, "fn {}(", func.name)?;
+        write!(w, "fn {}_{}(", self.module_name, func.name)?;
         let mut first_param = true;
         if self.backing_allocations[&func_id].contains_key(&self.devices[func_id.idx()]) {
             first_param = false;
diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch
index 9cbdb942bb484342afe42a2fc711852878474508..5fe48a8395cfb6fada1d668b4f73fa6eb3487f5e 100644
--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
@@ -1,23 +1,30 @@
-gvn(*);
-dce(*);
-phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  crc(X);
+  slf(X);
+  dce(X);
+  infer-schedules(X);
+}
 
-let auto = auto-outline(euler);
-cpu(auto.euler);
-
-inline(auto.euler);
-inline(auto.euler);
+simpl!(*);
+inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
 delete-uncalled(*);
+simpl!(*);
+ip-sroa[false](*);
+sroa[false](*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
 
-sroa[false](auto.euler);
-dce(*);
-float-collections(*);
-dce(*);
+unforkify(*);
 
 gcm(*);
-
diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
index 252015c368d594e843af34367450257d3459034f..6329c5046e15ca0bce646f4a778dcf8c8d781656 100644
--- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
@@ -1,23 +1,30 @@
-gvn(*);
-dce(*);
-phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  crc(X);
+  slf(X);
+  dce(X);
+  infer-schedules(X);
+}
 
-let auto = auto-outline(pre_euler);
-cpu(auto.pre_euler);
-
-inline(auto.pre_euler);
-inline(auto.pre_euler);
+simpl!(*);
+inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
 delete-uncalled(*);
+simpl!(*);
+ip-sroa[false](*);
+sroa[false](*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
 
-sroa[false](auto.pre_euler);
-dce(*);
-float-collections(*);
-dce(*);
+unforkify(*);
 
 gcm(*);
-
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 3c14270057cb14559942499ff5f92df45cd7deab..e049f985e0db36ae78368b8d33c01d22744fdcc6 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -951,6 +951,7 @@ impl PassManager {
         for idx in 0..module.functions.len() {
             match devices[idx] {
                 Device::LLVM => cpu_codegen(
+                    &module_name,
                     &module.functions[idx],
                     &module.types,
                     &module.constants,
@@ -966,6 +967,7 @@ impl PassManager {
                     error: format!("{}", e),
                 })?,
                 Device::CUDA => gpu_codegen(
+                    &module_name,
                     &module.functions[idx],
                     &module.types,
                     &module.constants,
@@ -986,6 +988,7 @@ impl PassManager {
                     error: format!("{}", e),
                 })?,
                 Device::AsyncRust => rt_codegen(
+                    &module_name,
                     FunctionID::new(idx),
                     &module,
                     &def_uses[idx],