diff --git a/.gitignore b/.gitignore
index 516108ddfe2c09b0e5d556e6b4ca96c8a9b3a72d..6298d7bbc1b72492a0d1da0892a78ff2c3f13b85 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-/target
+**/target
 *.dot
 !paper_resources/*.dot
 *.bc
@@ -14,3 +14,4 @@
 .vscode
 *_env
 *.txt
+*ncu-rep
\ No newline at end of file
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index c9720273c03243d4874b27fc6c74f20fd21a6c33..07dd3ebfc551b84b20cea797421bf5e6846f30c8 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups;
                             // because Fork basic block's init section already does gating
                             write!(
                                 w,
-                                "{}{} = (threadIdx.x % {}) / {};\n",
+                                "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n",
                                 tabs,
                                 define_variable,
                                 use_thread_quota.unwrap(),
-                                use_thread_quota.unwrap() / parallel_factor.unwrap()
+                                use_thread_quota.unwrap() / parallel_factor.unwrap(),
+                                divide,
+                                modulo,
                             )?;
                         }
                     }
diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index e6db0345def31324243cdee2bdcb6b5cca5d9a7b..8bd3f735fa60cf4e0404edde28eb858e0ed7e581 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge(
                     op: BinaryOperator::Rem,
                 });
                 edit.sub_edit(tid, rem);
+                edit.sub_edit(tid, outer_tid);
                 edit = edit.replace_all_uses(tid, rem)?;
             } else if tid_dim == inner_idx {
                 let outer_tid = Node::ThreadID {
@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge(
                     op: BinaryOperator::Div,
                 });
                 edit.sub_edit(tid, div);
+                edit.sub_edit(tid, outer_tid);
                 edit = edit.replace_all_uses(tid, div)?;
             }
         }
@@ -1479,7 +1481,12 @@ fn fork_fusion(
     }
 
     // Perform the fusion.
+    let bottom_tids: Vec<_> = editor
+        .get_users(bottom_fork)
+        .filter(|id| nodes[id.idx()].is_thread_id())
+        .collect();
     editor.edit(|mut edit| {
+        edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?;
         if bottom_join_pred != bottom_fork {
             // If there is control flow in the bottom fork-join, stitch it into
             // the top fork-join.
diff --git a/hercules_opt/src/interprocedural_sroa.rs b/hercules_opt/src/interprocedural_sroa.rs
index c7ea58360bf09b2a8e221a19f66a9bfe81126516..a01b0f5553ef59aaf45fe6ccc2c3ee7bdc881a0b 100644
--- a/hercules_opt/src/interprocedural_sroa.rs
+++ b/hercules_opt/src/interprocedural_sroa.rs
@@ -85,7 +85,6 @@ pub fn interprocedural_sroa(
                 param_nodes[idx].push(id);
             }
         }
-        println!("{}", editor.func().name);
         let success = editor.edit(|mut edit| {
             for (idx, ids) in param_nodes.into_iter().enumerate() {
                 let new_indices = &old_param_type_map[idx];
diff --git a/hercules_opt/src/sroa.rs b/hercules_opt/src/sroa.rs
index e658ff8818246b31b4dbf3c9dd7ad0372c59f3c6..2718f99d82321d6ddce59320ffc5ea2b9bbdc2cf 100644
--- a/hercules_opt/src/sroa.rs
+++ b/hercules_opt/src/sroa.rs
@@ -447,7 +447,7 @@ pub fn sroa(
                 field_map.insert(node, generate_reads(editor, types[&node], node));
             }
             Node::Constant { id } => {
-                field_map.insert(node, generate_constant_fields(editor, id));
+                field_map.insert(node, generate_constant_fields(editor, id, node));
                 to_delete.push(node);
             }
             _ => {
@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID
 
 // Given a constant cnst adds node to the function which are the constant values of each field and
 // returns a list of pairs of indices and the node that holds that index
-fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> {
+fn generate_constant_fields(
+    editor: &mut FunctionEditor,
+    cnst: ConstantID,
+    old_node: NodeID,
+) -> IndexTree<NodeID> {
     let cs: Option<Vec<ConstantID>> =
         if let Some(cs) = editor.get_constant(cnst).try_product_fields() {
             Some(cs.into())
@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In
     if let Some(cs) = cs {
         let mut fields = vec![];
         for c in cs {
-            fields.push(generate_constant_fields(editor, c));
+            fields.push(generate_constant_fields(editor, c, old_node));
         }
         IndexTree::Node(fields)
     } else {
         let mut node = None;
         editor.edit(|mut edit| {
             node = Some(edit.add_node(Node::Constant { id: cnst }));
+            edit.sub_edit(old_node, node.unwrap());
             Ok(edit)
         });
         IndexTree::Leaf(node.expect("Add node cannot fail"))
diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn
index 4d02b2cdb5f843774ebae9b873388e03c82071b0..931e78f8196eba77ff3f9015ab7f4561b1153c57 100644
--- a/juno_samples/cava/src/cava.jn
+++ b/juno_samples/cava/src/cava.jn
@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
   @image_loop for r = 0 to row {
     for c = 0 to col {
       @l2 let l2_dist : f32[num_ctrl_pts];
-      for cp = 0 to num_ctrl_pts {
+      @cp_loop for cp = 0 to num_ctrl_pts {
         let v1 = input[0, r, c] - ctrl_pts[cp, 0];
         let v2 = input[1, r, c] - ctrl_pts[cp, 1];
         let v3 = input[2, r, c] - ctrl_pts[cp, 2];
@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
      
       @channel_loop for chan = 0 to CHAN {
         let chan_val : f32 = 0.0;
-        for cp = 0 to num_ctrl_pts {
+        @cp_loop for cp = 0 to num_ctrl_pts {
           chan_val += l2_dist[cp] * weights[cp, chan];
         }
 
diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index 8f22b37d4c3a77a31e8d2467c8e6e130c281513f..6fc8adbb1fff30856978ce201af09fd2e02d6298 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -115,7 +115,7 @@ array-slf(fuse4);
 simpl!(fuse4);
 let par = fuse4@image_loop \ fuse4@channel_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let fuse4_body = outline(split.cava_3.fj2);
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index bacfd3abca363a6dd93496c1adb23fa54c860b9f..0ef466c00d08f3edffe02bad9612ce23ebf14768 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -117,9 +117,9 @@ fixpoint {
 simpl!(fuse4);
 array-slf(fuse4);
 simpl!(fuse4);
-//fork-tile[2, 0, false, true](fuse4@channel_loop);
-//fork-split(fuse4@channel_loop);
-//clean-monoid-reduces(fuse4);
+fork-tile[2, 0, false, true](fuse4@channel_loop);
+let out = fork-split(fuse4@channel_loop);
+fork-unroll(out.cava_3.fj1);
 unforkify(fuse4@channel_loop);
 
 no-memset(fuse5@res1);
@@ -133,6 +133,13 @@ simpl!(fuse5);
 array-slf(fuse5);
 simpl!(fuse5);
 
+fork-tile[4, 1, false, true](fuse4);
+fork-tile[8, 0, false, true](fuse4);
+fork-interchange[1, 2](fuse4);
+let split = fork-split(fuse4);
+fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
+fork-coalesce(split.cava_3.fj2);
+
 delete-uncalled(*);
 simpl!(*);
 
diff --git a/juno_samples/cava/src/lib.rs b/juno_samples/cava/src/lib.rs
index 1810a24670d8fa7325b2ff353ba70756824a0269..47e3b1b3542217b9a17905d8a52ef431fd97541f 100644
--- a/juno_samples/cava/src/lib.rs
+++ b/juno_samples/cava/src/lib.rs
@@ -124,9 +124,9 @@ pub struct CavaInputs {
     #[clap(long = "output-verify", value_name = "PATH")]
     pub output_verify: Option<String>,
     pub cam_model: String,
-    #[clap(short, long)]
+    #[clap(long)]
     pub crop_rows: Option<usize>,
-    #[clap(short, long)]
+    #[clap(long)]
     pub crop_cols: Option<usize>,
 }
 
diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index ec9e423dc4c160d08b61eaf45d2b75329886f94e..3e1321c517b55f72e529e8a6df46d9d55bf59efb 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -26,14 +26,14 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
 fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
 simpl!(gaussian_smoothing, gaussian_smoothing_body);
 
-no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
+no-memset(laplacian_estimate@res);
 fixpoint {
   forkify(laplacian_estimate);
   fork-guard-elim(laplacian_estimate);
@@ -42,15 +42,15 @@ fixpoint {
 simpl!(laplacian_estimate);
 let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2;
+let body = split._1_laplacian_estimate.fj2;
 let laplacian_estimate_body = outline(body);
 fork-coalesce(laplacian_estimate, laplacian_estimate_body);
 simpl!(laplacian_estimate, laplacian_estimate_body);
 
-no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
+no-memset(zero_crossings@res);
 fixpoint {
   forkify(zero_crossings);
   fork-guard-elim(zero_crossings);
@@ -59,10 +59,10 @@ fixpoint {
 simpl!(zero_crossings);
 let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2;
+let body = split._2_zero_crossings.fj2;
 let zero_crossings_body = outline(body);
 fork-coalesce(zero_crossings, zero_crossings_body);
 simpl!(zero_crossings, zero_crossings_body);
@@ -86,7 +86,7 @@ fixpoint {
 simpl!(max_gradient);
 fork-dim-merge(max_gradient);
 simpl!(max_gradient);
-fork-tile[16, 0, false, false](max_gradient);
+fork-tile[32, 0, false, false](max_gradient);
 let split = fork-split(max_gradient);
 clean-monoid-reduces(max_gradient);
 let out = outline(split._4_max_gradient.fj1);
@@ -105,7 +105,7 @@ fixpoint {
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);
 fork-tile[4, 1, false, false](reject_zero_crossings);
-fork-tile[4, 0, false, false](reject_zero_crossings);
+fork-tile[8, 0, false, false](reject_zero_crossings);
 fork-interchange[1, 2](reject_zero_crossings);
 let split = fork-split(reject_zero_crossings);
 let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn
index 58f364dc1cea77b27c83ba2340bfbd11fde07f31..3e49cb365b186037cf6f380c3aa6a4d3b483fb5b 100644
--- a/juno_samples/edge_detection/src/edge_detection.jn
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>(
 
   @image_loop for row = 0 to n {
     for col = 0 to m {
-      // Copy data for dilation filter
-      @shr1 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-	                       || row + i - r > n - 1
-			       || col + j < r
-			       || col + j - r > m - 1 then MIN_BR
-			    else input[row + i - r, col + j - r];
-        }
-      }
-
       // Compute pixel of dilated image
       let dilated = MIN_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
-        }
-      }
-
-      // Data copy for erotion filter
-      @shr2 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-	                       || row + i - r > n - 1
-			       || col + j < r
-			       || col + j - r > m - 1 then MAX_BR
-			    else input[row + i - r, col + j - r];
+	  let filter = if row + i < r
+                         || row + i - r > n - 1
+                         || col + j < r
+                         || col + j - r > m - 1 then MIN_BR
+                      else input[row + i - r, col + j - r];
+          dilated = max!(dilated, filter * structure[i, j]);
         }
       }
 
@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>(
       let eroded = MAX_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+	  let filter = if row + i < r
+                         || row + i - r > n - 1
+                         || col + j < r
+                         || col + j - r > m - 1 then MAX_BR
+                      else input[row + i - r, col + j - r];
+          eroded = min!(eroded, filter * structure[i, j]);
         }
       }
 
@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>(
 
   @image_loop for row = 0 to n {
     for col = 0 to m {
-      // Data copy for dilation filter
-      @shr1 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-                               || row + i - r > n - 1
-                               || col + j < r
-                               || col + j - r > m - 1 then MIN_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
-                            else MIN_BR;
-        }
-      }
-
       // Compute the pixel of dilated image
       let dilated = MIN_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
-        }
-      }
-
-      // Data copy for erotion filter
-      @shr2 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-                               || row + i - r > n - 1
-                               || col + j < r
-                               || col + j - r > m - 1 then MAX_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
-                            else MIN_BR;
+	  let filter = if row + i < r
+                          || row + i - r > n - 1
+                          || col + j < r
+                          || col + j - r > m - 1 then MIN_BR
+                       else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
+                       else MIN_BR;
+          dilated = max!(dilated, filter * structure[i, j]);
         }
       }
 
@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>(
       let eroded = MAX_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+	  let filter = if row + i < r
+                          || row + i - r > n - 1
+                          || col + j < r
+                          || col + j - r > m - 1 then MAX_BR
+                       else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
+                       else MIN_BR;
+          eroded = min!(eroded, filter * structure[i, j]);
         }
       }
 
@@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>(
       let gx = 0;
       let gy = 0;
 
-      for i = 0 to sb {
+      @filter_loop for i = 0 to sb {
         for j = 0 to sb {
           let val = input[if row + i < sbr              then 0
                           else if row + i - sbr > n - 1 then n - 1
diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch
index 7ee2904f7d1bb59780390360dbd9abc6b3934aba..666f6cef30f0c0ecedef57f02e50bc25d2b26b8f 100644
--- a/juno_samples/edge_detection/src/gpu.sch
+++ b/juno_samples/edge_detection/src/gpu.sch
@@ -26,22 +26,49 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
+fork-dim-merge(gaussian_smoothing@filter_loop);
+unforkify(gaussian_smoothing@filter_loop);
+simpl!(gaussian_smoothing);
 
-no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
+fork-dim-merge(gaussian_smoothing);
+fork-tile[32, 0, false, true](gaussian_smoothing);
+simpl!(gaussian_smoothing);
+fork-split(gaussian_smoothing);
+simpl!(gaussian_smoothing);
+
+no-memset(laplacian_estimate@res);
 fixpoint {
   forkify(laplacian_estimate);
   fork-guard-elim(laplacian_estimate);
   fork-coalesce(laplacian_estimate);
 }
 simpl!(laplacian_estimate);
+fork-dim-merge(laplacian_estimate@filter_loop);
+unforkify(laplacian_estimate@filter_loop);
+simpl!(laplacian_estimate);
 
-no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
+fork-dim-merge(laplacian_estimate);
+fork-tile[32, 0, false, true](laplacian_estimate);
+simpl!(laplacian_estimate);
+fork-split(laplacian_estimate);
+simpl!(laplacian_estimate);
+
+no-memset(zero_crossings@res);
 fixpoint {
   forkify(zero_crossings);
   fork-guard-elim(zero_crossings);
   fork-coalesce(zero_crossings);
 }
 simpl!(zero_crossings);
+fork-dim-merge(zero_crossings@filter_loop);
+unforkify(zero_crossings@filter_loop);
+simpl!(zero_crossings);
+
+fork-dim-merge(zero_crossings);
+fork-tile[32, 0, false, true](zero_crossings);
+simpl!(zero_crossings);
+fork-split(zero_crossings);
+simpl!(zero_crossings);
 
 no-memset(gradient@res);
 fixpoint {
@@ -53,6 +80,15 @@ predication(gradient);
 simpl!(gradient);
 predication(gradient);
 simpl!(gradient);
+fork-dim-merge(gradient@filter_loop);
+unforkify(gradient@filter_loop);
+simpl!(gradient);
+
+fork-dim-merge(gradient);
+fork-tile[32, 0, false, true](gradient);
+simpl!(gradient);
+fork-split(gradient);
+simpl!(gradient);
 
 fixpoint {
   forkify(max_gradient);
@@ -88,6 +124,12 @@ fixpoint {
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);
 
+fork-dim-merge(reject_zero_crossings);
+fork-tile[32, 0, false, true](reject_zero_crossings);
+simpl!(reject_zero_crossings);
+fork-split(reject_zero_crossings);
+simpl!(reject_zero_crossings);
+
 async-call(edge_detection@le, edge_detection@zc);
 
 simpl!(*);
diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch
index 4cf320a6d8f8d1245272c0855f567a1834939963..7a284a9a3bed0c1800366c64ceeca20265d845a6 100644
--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
@@ -12,6 +12,7 @@ macro simpl!(X) {
 
 simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
 simpl!(*);
 ip-sroa[true](*);
@@ -24,8 +25,31 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
-no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
-parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
+unforkify(compute_flux@inner_loop);
+
+fork-tile[32, 0, false, false](compute_step_factor);
+let split = fork-split(compute_step_factor);
+let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+fork-coalesce(compute_step_factor, compute_step_factor_body);
+simpl!(compute_step_factor, compute_step_factor_body);
+
+fork-tile[32, 0, false, false](compute_flux);
+let split = fork-split(compute_flux);
+let compute_flux_body = outline(split._6_compute_flux.fj1);
+fork-coalesce(compute_flux, compute_flux_body);
+simpl!(compute_flux, compute_flux_body);
+
+fork-tile[32, 0, false, false](time_step);
+let split = fork-split(time_step);
+let time_step_body = outline(split._7_time_step.fj1);
+fork-coalesce(time_step, time_step_body);
+simpl!(time_step, time_step_body);
+
+fork-tile[32, 0, false, false](copy_vars);
+let split = fork-split(copy_vars);
+let copy_vars_body = outline(split._8_copy_vars.fj1);
+fork-coalesce(copy_vars, copy_vars_body);
+simpl!(copy_vars, copy_vars_body);
 
-unforkify(*);
+unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
index 14eb690641107e7e807cd24b741ecf9098553891..518c656d99652dd9813e6aa5c8e8e84388836d47 100644
--- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
@@ -24,7 +24,38 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
+no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
+unforkify(compute_flux@inner_loop);
 
-unforkify(*);
+fork-tile[32, 0, false, false](compute_step_factor);
+let split = fork-split(compute_step_factor);
+let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+fork-coalesce(compute_step_factor, compute_step_factor_body);
+simpl!(compute_step_factor, compute_step_factor_body);
 
+fork-tile[32, 0, false, false](compute_flux_contributions);
+let split = fork-split(compute_flux_contributions);
+let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
+fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
+simpl!(compute_flux_contributions, compute_flux_contributions_body);
+
+fork-tile[32, 0, false, false](compute_flux);
+let split = fork-split(compute_flux);
+let compute_flux_body = outline(split._7_compute_flux.fj1);
+fork-coalesce(compute_flux, compute_flux_body);
+simpl!(compute_flux, compute_flux_body);
+
+fork-tile[32, 0, false, false](time_step);
+let split = fork-split(time_step);
+let time_step_body = outline(split._8_time_step.fj1);
+fork-coalesce(time_step, time_step_body);
+simpl!(time_step, time_step_body);
+
+fork-tile[32, 0, false, false](copy_vars);
+let split = fork-split(copy_vars);
+let copy_vars_body = outline(split._9_copy_vars.fj1);
+fork-coalesce(copy_vars, copy_vars_body);
+simpl!(copy_vars, copy_vars_body);
+
+unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/gpu_euler.sch b/juno_samples/rodinia/cfd/src/gpu_euler.sch
index aed6115e7cf790a9274862b1ab2e4d099a194a32..3700f79d24aa22657020ace6a37931feb49048a4 100644
--- a/juno_samples/rodinia/cfd/src/gpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/gpu_euler.sch
@@ -12,6 +12,7 @@ macro simpl!(X) {
 
 simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
 gpu(copy_vars, compute_step_factor, compute_flux, time_step);
 
@@ -26,9 +27,18 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
-no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
-parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
+unforkify(compute_flux@inner_loop);
+
+fork-tile[32, 0, false, true](compute_step_factor);
+fork-split(compute_step_factor);
+
+fork-tile[32, 0, false, true](compute_flux);
+fork-split(compute_flux);
+
+fork-tile[32, 0, false, true](time_step);
+fork-split(time_step);
+
+fork-tile[32, 0, false, true](copy_vars);
+fork-split(copy_vars);
 
-unforkify(*);
-float-collections(*);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
index d91f1b001ca1fe4b7ff544c3f8cab561490acd00..d6db675b0682490a84ba30408c3dd96993dce0f9 100644
--- a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
@@ -12,6 +12,7 @@ macro simpl!(X) {
 
 simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
 gpu(copy_vars, compute_step_factor, compute_flux_contributions, compute_flux, time_step);
 
@@ -26,7 +27,21 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
+unforkify(compute_flux@inner_loop);
+
+fork-tile[32, 0, false, true](compute_step_factor);
+fork-split(compute_step_factor);
+
+fork-tile[32, 0, false, true](compute_flux_contributions);
+fork-split(compute_flux_contributions);
+
+fork-tile[32, 0, false, true](compute_flux);
+fork-split(compute_flux);
+
+fork-tile[32, 0, false, true](time_step);
+fork-split(time_step);
+
+fork-tile[32, 0, false, true](copy_vars);
+fork-split(copy_vars);
 
-unforkify(*);
-float-collections(*);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs
index 62ee59f4f285b3a829ad95a9a323811b3b3895b7..d61df4c5b7bb0c7915d8605debd72e375a5482b0 100644
--- a/juno_samples/rodinia/cfd/src/lib.rs
+++ b/juno_samples/rodinia/cfd/src/lib.rs
@@ -48,8 +48,7 @@ fn run_euler(
     let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
     let mut runner = runner!(euler);
-    let (density, momentum_x, momentum_y, momentum_z, energy) = 
-    async_std::task::block_on(async {
+    let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
         runner
             .run(
                 nelr as u64,
@@ -123,8 +122,7 @@ fn run_pre_euler(
     let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
     let mut runner = runner!(pre_euler);
-    let (density, momentum_x, momentum_y, momentum_z, energy) = 
-    async_std::task::block_on(async {
+    let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
         runner
             .run(
                 nelr as u64,
@@ -189,15 +187,30 @@ fn compare_floats(xs: &Variables, ys: &Variables) -> bool {
     let ys_energy = ys.energy.as_slice();
 
     xs_density.len() == ys_density.len()
-        && xs_density.iter().zip(ys_density.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_density
+            .iter()
+            .zip(ys_density.iter())
+            .all(|(x, y)| compare_float(*x, *y))
         && xs_momentum_x.len() == ys_momentum_x.len()
-        && xs_momentum_x.iter().zip(ys_momentum_x.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_momentum_x
+            .iter()
+            .zip(ys_momentum_x.iter())
+            .all(|(x, y)| compare_float(*x, *y))
         && xs_momentum_y.len() == ys_momentum_y.len()
-        && xs_momentum_y.iter().zip(ys_momentum_y.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_momentum_y
+            .iter()
+            .zip(ys_momentum_y.iter())
+            .all(|(x, y)| compare_float(*x, *y))
         && xs_momentum_z.len() == ys_momentum_z.len()
-        && xs_momentum_z.iter().zip(ys_momentum_z.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_momentum_z
+            .iter()
+            .zip(ys_momentum_z.iter())
+            .all(|(x, y)| compare_float(*x, *y))
         && xs_energy.len() == ys_energy.len()
-        && xs_energy.iter().zip(ys_energy.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_energy
+            .iter()
+            .zip(ys_energy.iter())
+            .all(|(x, y)| compare_float(*x, *y))
 }
 
 pub fn cfd_harness(args: CFDInputs) {
@@ -224,6 +237,7 @@ pub fn cfd_harness(args: CFDInputs) {
     } = read_domain_geometry(data_file, block_size);
 
     let variables = initialize_variables(nelr, &ff_variable);
+    println!("Running CFD with nelr = {}.", nelr);
 
     let res_juno = if pre_euler {
         run_pre_euler(
diff --git a/juno_samples/rodinia/cfd/src/pre_euler.jn b/juno_samples/rodinia/cfd/src/pre_euler.jn
index c200f2db9a2be4708c4cf1c90c60eef0b7d7a11e..979c2e9a16d3ff2d0b5c690af7645dbf10462139 100644
--- a/juno_samples/rodinia/cfd/src/pre_euler.jn
+++ b/juno_samples/rodinia/cfd/src/pre_euler.jn
@@ -58,7 +58,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 {
 }
 
 fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] {
-  let step_factors : f32[nelr];
+  @res let step_factors : f32[nelr];
 
   for i in 0..nelr {
     let density = variables.density[i];
@@ -109,10 +109,10 @@ fn compute_flux_contribution(
 fn compute_flux_contributions<nelr: usize>(
   variables: Variables::<nelr>,
 ) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) {
-  let fc_momentum_x: Momentum::<nelr>;
-  let fc_momentum_y: Momentum::<nelr>;
-  let fc_momentum_z: Momentum::<nelr>;
-  let fc_density_energy: Momentum::<nelr>;
+  @res let fc_momentum_x: Momentum::<nelr>;
+  @res let fc_momentum_y: Momentum::<nelr>;
+  @res let fc_momentum_z: Momentum::<nelr>;
+  @res let fc_density_energy: Momentum::<nelr>;
 
   for i in 0..nelr {
     let density_i = variables.density[i];
@@ -167,9 +167,9 @@ fn compute_flux<nelr: usize>(
   ff_fc_momentum_z: float3,
 ) -> Variables::<nelr> {
   const smoothing_coefficient : f32 = 0.2;
-  let fluxes: Variables::<nelr>;
+  @res let fluxes: Variables::<nelr>;
 
-  for i in 0..nelr {
+  @outer_loop for i in 0..nelr {
     let density_i = variables.density[i];
 
     let momentum_i = float3 { x: variables.momentum.x[i],
@@ -201,7 +201,7 @@ fn compute_flux<nelr: usize>(
     let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 };
     let flux_i_density_energy : f32 = 0.0;
 
-    for j in 0..NNB {
+    @inner_loop for j in 0..NNB {
       let nb = elements_surrounding_elements[j, i];
       let normal = float3 {
         x: normals.x[j, i],
@@ -328,7 +328,7 @@ fn time_step<nelr: usize>(
 }
 
 fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> {
-  let result : Variables::<nelr>;
+  @res let result : Variables::<nelr>;
 
   for i in 0..nelr {
     result.density[i] = variables.density[i];
diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch
index 289548f9e01cdf402a3e1b1057fa52d4029f6173..f736c0b776759596cfc93a5025fb4f280c3fc932 100644
--- a/juno_samples/rodinia/srad/src/gpu.sch
+++ b/juno_samples/rodinia/srad/src/gpu.sch
@@ -54,4 +54,10 @@ ip-sroa(*);
 sroa(*);
 simpl!(*);
 
+fork-dim-merge(main_loops);
+fork-tile[32, 0, false, true](main_loops);
+dce(main_loops);
+fork-split(main_loops);
+simpl!(main_loops);
+
 gcm(*);
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 456df2eda49b93a6c80327a090b6f6606ae711bb..62bdaf739b69b52a0b540c6a602e1e5beb632786 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -1090,7 +1090,9 @@ impl PassManager {
 
             let mut nvcc_process = Command::new("nvcc")
                 .arg("-c")
+                .arg("-Xptxas")
                 .arg("-O3")
+                .arg("-use_fast_math")
                 .arg("-diag-suppress")
                 .arg("177")
                 .arg("-o")