diff --git a/.gitignore b/.gitignore
index 516108ddfe2c09b0e5d556e6b4ca96c8a9b3a72d..6298d7bbc1b72492a0d1da0892a78ff2c3f13b85 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-/target
+**/target
 *.dot
 !paper_resources/*.dot
 *.bc
@@ -14,3 +14,4 @@
 .vscode
 *_env
 *.txt
+*ncu-rep
\ No newline at end of file
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index c9720273c03243d4874b27fc6c74f20fd21a6c33..07dd3ebfc551b84b20cea797421bf5e6846f30c8 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups;
                             // because Fork basic block's init section already does gating
                             write!(
                                 w,
-                                "{}{} = (threadIdx.x % {}) / {};\n",
+                                "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n",
                                 tabs,
                                 define_variable,
                                 use_thread_quota.unwrap(),
-                                use_thread_quota.unwrap() / parallel_factor.unwrap()
+                                use_thread_quota.unwrap() / parallel_factor.unwrap(),
+                                divide,
+                                modulo,
                             )?;
                         }
                     }
diff --git a/hercules_ir/src/typecheck.rs b/hercules_ir/src/typecheck.rs
index 2a3f9fb1aa86dd092d048bf79b51aa118d2489b8..b2567b8f1313e97aeb70268b11847b6d70b1a2f5 100644
--- a/hercules_ir/src/typecheck.rs
+++ b/hercules_ir/src/typecheck.rs
@@ -716,8 +716,10 @@ fn typeflow(
             // Check number of run-time arguments.
             if inputs.len() - 1 != callee.param_types.len() {
                 return Error(format!(
-                    "Call node has {} inputs, but calls a function with {} parameters.",
+                    "Call node in {} has {} inputs, but calls a function ({}) with {} parameters.",
+                    function.name,
                     inputs.len() - 1,
+                    callee.name,
                     callee.param_types.len(),
                 ));
             }
@@ -725,8 +727,10 @@ fn typeflow(
             // Check number of dynamic constant arguments.
             if dc_args.len() != callee.num_dynamic_constants as usize {
                 return Error(format!(
-                    "Call node references {} dynamic constants, but calls a function expecting {} dynamic constants.",
+                    "Call node in {} references {} dynamic constants, but calls a function ({}) expecting {} dynamic constants.",
+                    function.name,
                     dc_args.len(),
+                    callee.name,
                     callee.num_dynamic_constants
                 ));
             }
diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index e6db0345def31324243cdee2bdcb6b5cca5d9a7b..8bd3f735fa60cf4e0404edde28eb858e0ed7e581 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge(
                     op: BinaryOperator::Rem,
                 });
                 edit.sub_edit(tid, rem);
+                edit.sub_edit(tid, outer_tid);
                 edit = edit.replace_all_uses(tid, rem)?;
             } else if tid_dim == inner_idx {
                 let outer_tid = Node::ThreadID {
@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge(
                     op: BinaryOperator::Div,
                 });
                 edit.sub_edit(tid, div);
+                edit.sub_edit(tid, outer_tid);
                 edit = edit.replace_all_uses(tid, div)?;
             }
         }
@@ -1479,7 +1481,12 @@ fn fork_fusion(
     }
 
     // Perform the fusion.
+    let bottom_tids: Vec<_> = editor
+        .get_users(bottom_fork)
+        .filter(|id| nodes[id.idx()].is_thread_id())
+        .collect();
     editor.edit(|mut edit| {
+        edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?;
         if bottom_join_pred != bottom_fork {
             // If there is control flow in the bottom fork-join, stitch it into
             // the top fork-join.
diff --git a/hercules_opt/src/interprocedural_sroa.rs b/hercules_opt/src/interprocedural_sroa.rs
index ad4ce19ede72c3fd373e67f1a51f46de00c47241..a01b0f5553ef59aaf45fe6ccc2c3ee7bdc881a0b 100644
--- a/hercules_opt/src/interprocedural_sroa.rs
+++ b/hercules_opt/src/interprocedural_sroa.rs
@@ -1,6 +1,3 @@
-use std::collections::HashMap;
-use std::iter::zip;
-
 use hercules_ir::ir::*;
 
 use crate::*;
@@ -39,15 +36,30 @@ pub fn interprocedural_sroa(
         }
 
         let editor: &mut FunctionEditor = &mut editors[func_id.idx()];
+        let param_types = &editor.func().param_types.to_vec();
         let return_types = &editor.func().return_types.to_vec();
 
-        // We determine the new return types of the function and track a map
-        // that tells us how the old return values are constructed from the
-        // new ones
+        // We determine the new param/return types of the function and track a
+        // map that tells us how the old param/return values are constructed
+        // from the new ones.
+        let mut new_param_types = vec![];
+        let mut old_param_type_map = vec![];
         let mut new_return_types = vec![];
         let mut old_return_type_map = vec![];
         let mut changed = false;
 
+        for par_typ in param_types.iter() {
+            if !can_sroa_type(editor, *par_typ) {
+                old_param_type_map.push(IndexTree::Leaf(new_param_types.len()));
+                new_param_types.push(*par_typ);
+            } else {
+                let (types, index) = sroa_type(editor, *par_typ, new_param_types.len());
+                old_param_type_map.push(index);
+                new_param_types.extend(types);
+                changed = true;
+            }
+        }
+
         for ret_typ in return_types.iter() {
             if !can_sroa_type(editor, *ret_typ) {
                 old_return_type_map.push(IndexTree::Leaf(new_return_types.len()));
@@ -60,25 +72,54 @@ pub fn interprocedural_sroa(
             }
         }
 
-        // If the return type is not changed by IP SROA, skip to the next function
+        // If the param/return types aren't changed by IP SROA, skip to the next
+        // function.
         if !changed {
             continue;
         }
 
-        // Now, modify each return in the current function and the return type
-        let return_nodes = editor
-            .func()
-            .nodes
-            .iter()
-            .enumerate()
-            .filter_map(|(idx, node)| {
-                if node.try_return().is_some() {
-                    Some(NodeID::new(idx))
+        // Modify each parameter in the current function and the param types.
+        let mut param_nodes: Vec<_> = vec![vec![]; param_types.len()];
+        for id in editor.node_ids() {
+            if let Some(idx) = editor.func().nodes[id.idx()].try_parameter() {
+                param_nodes[idx].push(id);
+            }
+        }
+        let success = editor.edit(|mut edit| {
+            for (idx, ids) in param_nodes.into_iter().enumerate() {
+                let new_indices = &old_param_type_map[idx];
+                let built = if let IndexTree::Leaf(new_idx) = new_indices {
+                    edit.add_node(Node::Parameter { index: *new_idx })
                 } else {
-                    None
+                    let prod_ty = param_types[idx];
+                    let cons = edit.add_zero_constant(prod_ty);
+                    let mut cons = edit.add_node(Node::Constant { id: cons });
+                    new_indices.for_each(|idx: &Vec<Index>, param_idx: &usize| {
+                        let param = edit.add_node(Node::Parameter { index: *param_idx });
+                        cons = edit.add_node(Node::Write {
+                            collect: cons,
+                            data: param,
+                            indices: idx.clone().into_boxed_slice(),
+                        });
+                    });
+                    cons
+                };
+                for id in ids {
+                    edit = edit.replace_all_uses(id, built)?;
+                    edit = edit.delete_node(id)?;
                 }
-            })
-            .collect::<Vec<_>>();
+            }
+
+            edit.set_param_types(new_param_types);
+            Ok(edit)
+        });
+        assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument");
+
+        // Modify each return in the current function and the return types.
+        let return_nodes: Vec<_> = editor
+            .node_ids()
+            .filter(|id| editor.func().nodes[id.idx()].is_return())
+            .collect();
         let success = editor.edit(|mut edit| {
             for node in return_nodes {
                 let Node::Return { control, data } = edit.get_node(node) else {
@@ -114,17 +155,15 @@ pub fn interprocedural_sroa(
             }
 
             edit.set_return_types(new_return_types);
-
             Ok(edit)
         });
         assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument");
 
-        // Finally, update calls of this function
-        // In particular, we actually don't have to update the call node at all but have to update
-        // its DataProjection users
+        // Finally, update calls of this function.
         for (caller, callsite) in callsites {
             let editor = &mut editors[caller.idx()];
             assert!(editor.func_id() == caller);
+
             let projs = editor.get_users(callsite).collect::<Vec<_>>();
             for proj_id in projs {
                 let Node::DataProjection { data: _, selection } = editor.node(proj_id) else {
@@ -134,6 +173,40 @@ pub fn interprocedural_sroa(
                 let typ = types[caller.idx()][proj_id.idx()];
                 replace_returned_value(editor, proj_id, typ, new_return_info, callsite);
             }
+
+            let (control, callee, dc_args, args) =
+                editor.func().nodes[callsite.idx()].try_call().unwrap();
+            let dc_args = dc_args.clone();
+            let args = args.clone();
+            let success = editor.edit(|mut edit| {
+                let mut new_args = vec![];
+                for (idx, (data_id, update_info)) in
+                    args.iter().zip(old_param_type_map.iter()).enumerate()
+                {
+                    if let IndexTree::Leaf(new_idx) = update_info {
+                        // Unchanged parameter value
+                        assert!(new_args.len() == *new_idx);
+                        new_args.push(*data_id);
+                    } else {
+                        // SROA'd parameter value
+                        let reads = generate_reads_edit(&mut edit, param_types[idx], *data_id);
+                        reads.zip(update_info).for_each(|_, (read_id, ret_idx)| {
+                            assert!(new_args.len() == **ret_idx);
+                            new_args.push(*read_id);
+                        });
+                    }
+                }
+                let new_call = edit.add_node(Node::Call {
+                    control,
+                    function: callee,
+                    dynamic_constants: dc_args,
+                    args: new_args.into_boxed_slice(),
+                });
+                edit = edit.replace_all_uses(callsite, new_call)?;
+                edit = edit.delete_node(callsite)?;
+                Ok(edit)
+            });
+            assert!(success);
         }
     }
 }
diff --git a/hercules_opt/src/sroa.rs b/hercules_opt/src/sroa.rs
index e658ff8818246b31b4dbf3c9dd7ad0372c59f3c6..2718f99d82321d6ddce59320ffc5ea2b9bbdc2cf 100644
--- a/hercules_opt/src/sroa.rs
+++ b/hercules_opt/src/sroa.rs
@@ -447,7 +447,7 @@ pub fn sroa(
                 field_map.insert(node, generate_reads(editor, types[&node], node));
             }
             Node::Constant { id } => {
-                field_map.insert(node, generate_constant_fields(editor, id));
+                field_map.insert(node, generate_constant_fields(editor, id, node));
                 to_delete.push(node);
             }
             _ => {
@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID
 
 // Given a constant cnst adds node to the function which are the constant values of each field and
 // returns a list of pairs of indices and the node that holds that index
-fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> {
+fn generate_constant_fields(
+    editor: &mut FunctionEditor,
+    cnst: ConstantID,
+    old_node: NodeID,
+) -> IndexTree<NodeID> {
     let cs: Option<Vec<ConstantID>> =
         if let Some(cs) = editor.get_constant(cnst).try_product_fields() {
             Some(cs.into())
@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In
     if let Some(cs) = cs {
         let mut fields = vec![];
         for c in cs {
-            fields.push(generate_constant_fields(editor, c));
+            fields.push(generate_constant_fields(editor, c, old_node));
         }
         IndexTree::Node(fields)
     } else {
         let mut node = None;
         editor.edit(|mut edit| {
             node = Some(edit.add_node(Node::Constant { id: cnst }));
+            edit.sub_edit(old_node, node.unwrap());
             Ok(edit)
         });
         IndexTree::Leaf(node.expect("Add node cannot fail"))
diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn
index 4d02b2cdb5f843774ebae9b873388e03c82071b0..931e78f8196eba77ff3f9015ab7f4561b1153c57 100644
--- a/juno_samples/cava/src/cava.jn
+++ b/juno_samples/cava/src/cava.jn
@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
   @image_loop for r = 0 to row {
     for c = 0 to col {
       @l2 let l2_dist : f32[num_ctrl_pts];
-      for cp = 0 to num_ctrl_pts {
+      @cp_loop for cp = 0 to num_ctrl_pts {
         let v1 = input[0, r, c] - ctrl_pts[cp, 0];
         let v2 = input[1, r, c] - ctrl_pts[cp, 1];
         let v3 = input[2, r, c] - ctrl_pts[cp, 2];
@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
      
       @channel_loop for chan = 0 to CHAN {
         let chan_val : f32 = 0.0;
-        for cp = 0 to num_ctrl_pts {
+        @cp_loop for cp = 0 to num_ctrl_pts {
           chan_val += l2_dist[cp] * weights[cp, chan];
         }
 
diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index 8f22b37d4c3a77a31e8d2467c8e6e130c281513f..6fc8adbb1fff30856978ce201af09fd2e02d6298 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -115,7 +115,7 @@ array-slf(fuse4);
 simpl!(fuse4);
 let par = fuse4@image_loop \ fuse4@channel_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let fuse4_body = outline(split.cava_3.fj2);
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index bacfd3abca363a6dd93496c1adb23fa54c860b9f..0ef466c00d08f3edffe02bad9612ce23ebf14768 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -117,9 +117,9 @@ fixpoint {
 simpl!(fuse4);
 array-slf(fuse4);
 simpl!(fuse4);
-//fork-tile[2, 0, false, true](fuse4@channel_loop);
-//fork-split(fuse4@channel_loop);
-//clean-monoid-reduces(fuse4);
+fork-tile[2, 0, false, true](fuse4@channel_loop);
+let out = fork-split(fuse4@channel_loop);
+fork-unroll(out.cava_3.fj1);
 unforkify(fuse4@channel_loop);
 
 no-memset(fuse5@res1);
@@ -133,6 +133,13 @@ simpl!(fuse5);
 array-slf(fuse5);
 simpl!(fuse5);
 
+fork-tile[4, 1, false, true](fuse4);
+fork-tile[8, 0, false, true](fuse4);
+fork-interchange[1, 2](fuse4);
+let split = fork-split(fuse4);
+fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
+fork-coalesce(split.cava_3.fj2);
+
 delete-uncalled(*);
 simpl!(*);
 
diff --git a/juno_samples/cava/src/lib.rs b/juno_samples/cava/src/lib.rs
index 1810a24670d8fa7325b2ff353ba70756824a0269..47e3b1b3542217b9a17905d8a52ef431fd97541f 100644
--- a/juno_samples/cava/src/lib.rs
+++ b/juno_samples/cava/src/lib.rs
@@ -124,9 +124,9 @@ pub struct CavaInputs {
     #[clap(long = "output-verify", value_name = "PATH")]
     pub output_verify: Option<String>,
     pub cam_model: String,
-    #[clap(short, long)]
+    #[clap(long)]
     pub crop_rows: Option<usize>,
-    #[clap(short, long)]
+    #[clap(long)]
     pub crop_cols: Option<usize>,
 }
 
diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index ec9e423dc4c160d08b61eaf45d2b75329886f94e..3e1321c517b55f72e529e8a6df46d9d55bf59efb 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -26,14 +26,14 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
 fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
 simpl!(gaussian_smoothing, gaussian_smoothing_body);
 
-no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
+no-memset(laplacian_estimate@res);
 fixpoint {
   forkify(laplacian_estimate);
   fork-guard-elim(laplacian_estimate);
@@ -42,15 +42,15 @@ fixpoint {
 simpl!(laplacian_estimate);
 let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2;
+let body = split._1_laplacian_estimate.fj2;
 let laplacian_estimate_body = outline(body);
 fork-coalesce(laplacian_estimate, laplacian_estimate_body);
 simpl!(laplacian_estimate, laplacian_estimate_body);
 
-no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
+no-memset(zero_crossings@res);
 fixpoint {
   forkify(zero_crossings);
   fork-guard-elim(zero_crossings);
@@ -59,10 +59,10 @@ fixpoint {
 simpl!(zero_crossings);
 let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2;
+let body = split._2_zero_crossings.fj2;
 let zero_crossings_body = outline(body);
 fork-coalesce(zero_crossings, zero_crossings_body);
 simpl!(zero_crossings, zero_crossings_body);
@@ -86,7 +86,7 @@ fixpoint {
 simpl!(max_gradient);
 fork-dim-merge(max_gradient);
 simpl!(max_gradient);
-fork-tile[16, 0, false, false](max_gradient);
+fork-tile[32, 0, false, false](max_gradient);
 let split = fork-split(max_gradient);
 clean-monoid-reduces(max_gradient);
 let out = outline(split._4_max_gradient.fj1);
@@ -105,7 +105,7 @@ fixpoint {
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);
 fork-tile[4, 1, false, false](reject_zero_crossings);
-fork-tile[4, 0, false, false](reject_zero_crossings);
+fork-tile[8, 0, false, false](reject_zero_crossings);
 fork-interchange[1, 2](reject_zero_crossings);
 let split = fork-split(reject_zero_crossings);
 let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn
index 58f364dc1cea77b27c83ba2340bfbd11fde07f31..3e49cb365b186037cf6f380c3aa6a4d3b483fb5b 100644
--- a/juno_samples/edge_detection/src/edge_detection.jn
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>(
 
   @image_loop for row = 0 to n {
     for col = 0 to m {
-      // Copy data for dilation filter
-      @shr1 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-	                       || row + i - r > n - 1
-			       || col + j < r
-			       || col + j - r > m - 1 then MIN_BR
-			    else input[row + i - r, col + j - r];
-        }
-      }
-
       // Compute pixel of dilated image
       let dilated = MIN_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
-        }
-      }
-
-      // Data copy for erotion filter
-      @shr2 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-	                       || row + i - r > n - 1
-			       || col + j < r
-			       || col + j - r > m - 1 then MAX_BR
-			    else input[row + i - r, col + j - r];
+	  let filter = if row + i < r
+                         || row + i - r > n - 1
+                         || col + j < r
+                         || col + j - r > m - 1 then MIN_BR
+                      else input[row + i - r, col + j - r];
+          dilated = max!(dilated, filter * structure[i, j]);
         }
       }
 
@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>(
       let eroded = MAX_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+	  let filter = if row + i < r
+                         || row + i - r > n - 1
+                         || col + j < r
+                         || col + j - r > m - 1 then MAX_BR
+                      else input[row + i - r, col + j - r];
+          eroded = min!(eroded, filter * structure[i, j]);
         }
       }
 
@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>(
 
   @image_loop for row = 0 to n {
     for col = 0 to m {
-      // Data copy for dilation filter
-      @shr1 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-                               || row + i - r > n - 1
-                               || col + j < r
-                               || col + j - r > m - 1 then MIN_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
-                            else MIN_BR;
-        }
-      }
-
       // Compute the pixel of dilated image
       let dilated = MIN_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
-        }
-      }
-
-      // Data copy for erotion filter
-      @shr2 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-                               || row + i - r > n - 1
-                               || col + j < r
-                               || col + j - r > m - 1 then MAX_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
-                            else MIN_BR;
+	  let filter = if row + i < r
+                          || row + i - r > n - 1
+                          || col + j < r
+                          || col + j - r > m - 1 then MIN_BR
+                       else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
+                       else MIN_BR;
+          dilated = max!(dilated, filter * structure[i, j]);
         }
       }
 
@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>(
       let eroded = MAX_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+	  let filter = if row + i < r
+                          || row + i - r > n - 1
+                          || col + j < r
+                          || col + j - r > m - 1 then MAX_BR
+                       else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
+                       else MIN_BR;
+          eroded = min!(eroded, filter * structure[i, j]);
         }
       }
 
@@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>(
       let gx = 0;
       let gy = 0;
 
-      for i = 0 to sb {
+      @filter_loop for i = 0 to sb {
         for j = 0 to sb {
           let val = input[if row + i < sbr              then 0
                           else if row + i - sbr > n - 1 then n - 1
diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch
index 7ee2904f7d1bb59780390360dbd9abc6b3934aba..666f6cef30f0c0ecedef57f02e50bc25d2b26b8f 100644
--- a/juno_samples/edge_detection/src/gpu.sch
+++ b/juno_samples/edge_detection/src/gpu.sch
@@ -26,22 +26,49 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
+fork-dim-merge(gaussian_smoothing@filter_loop);
+unforkify(gaussian_smoothing@filter_loop);
+simpl!(gaussian_smoothing);
 
-no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
+fork-dim-merge(gaussian_smoothing);
+fork-tile[32, 0, false, true](gaussian_smoothing);
+simpl!(gaussian_smoothing);
+fork-split(gaussian_smoothing);
+simpl!(gaussian_smoothing);
+
+no-memset(laplacian_estimate@res);
 fixpoint {
   forkify(laplacian_estimate);
   fork-guard-elim(laplacian_estimate);
   fork-coalesce(laplacian_estimate);
 }
 simpl!(laplacian_estimate);
+fork-dim-merge(laplacian_estimate@filter_loop);
+unforkify(laplacian_estimate@filter_loop);
+simpl!(laplacian_estimate);
 
-no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
+fork-dim-merge(laplacian_estimate);
+fork-tile[32, 0, false, true](laplacian_estimate);
+simpl!(laplacian_estimate);
+fork-split(laplacian_estimate);
+simpl!(laplacian_estimate);
+
+no-memset(zero_crossings@res);
 fixpoint {
   forkify(zero_crossings);
   fork-guard-elim(zero_crossings);
   fork-coalesce(zero_crossings);
 }
 simpl!(zero_crossings);
+fork-dim-merge(zero_crossings@filter_loop);
+unforkify(zero_crossings@filter_loop);
+simpl!(zero_crossings);
+
+fork-dim-merge(zero_crossings);
+fork-tile[32, 0, false, true](zero_crossings);
+simpl!(zero_crossings);
+fork-split(zero_crossings);
+simpl!(zero_crossings);
 
 no-memset(gradient@res);
 fixpoint {
@@ -53,6 +80,15 @@ predication(gradient);
 simpl!(gradient);
 predication(gradient);
 simpl!(gradient);
+fork-dim-merge(gradient@filter_loop);
+unforkify(gradient@filter_loop);
+simpl!(gradient);
+
+fork-dim-merge(gradient);
+fork-tile[32, 0, false, true](gradient);
+simpl!(gradient);
+fork-split(gradient);
+simpl!(gradient);
 
 fixpoint {
   forkify(max_gradient);
@@ -88,6 +124,12 @@ fixpoint {
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);
 
+fork-dim-merge(reject_zero_crossings);
+fork-tile[32, 0, false, true](reject_zero_crossings);
+simpl!(reject_zero_crossings);
+fork-split(reject_zero_crossings);
+simpl!(reject_zero_crossings);
+
 async-call(edge_detection@le, edge_detection@zc);
 
 simpl!(*);
diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs
index d2813388e0e7a1d7bd1696ffbb641e629096e2c2..7bc2083cf07c063eccda5855fa4fed3bfca91f87 100644
--- a/juno_samples/matmul/build.rs
+++ b/juno_samples/matmul/build.rs
@@ -1,24 +1,11 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(not(feature = "cuda"))]
-    {
-        JunoCompiler::new()
-            .file_in_src("matmul.jn")
-            .unwrap()
-            .schedule_in_src("cpu.sch")
-            .unwrap()
-            .build()
-            .unwrap();
-    }
-    #[cfg(feature = "cuda")]
-    {
-        JunoCompiler::new()
-            .file_in_src("matmul.jn")
-            .unwrap()
-            .schedule_in_src("gpu.sch")
-            .unwrap()
-            .build()
-            .unwrap();
-    }
+    JunoCompiler::new()
+        .file_in_src("matmul.jn")
+        .unwrap()
+        .schedule_in_src("matmul.sch")
+        .unwrap()
+        .build()
+        .unwrap();
 }
diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
deleted file mode 100644
index 69f1811d08a5691de6dc10fa86f0720e416fb85a..0000000000000000000000000000000000000000
--- a/juno_samples/matmul/src/cpu.sch
+++ /dev/null
@@ -1,61 +0,0 @@
-macro optimize!(X) {
-  gvn(X);
-  phi-elim(X);
-  dce(X);
-  ip-sroa(X);
-  sroa(X);
-  dce(X);
-  gvn(X);
-  phi-elim(X);
-  dce(X);
-}
-
-macro codegen-prep!(X) {
-  optimize!(X);
-  gcm(X);
-  float-collections(X);
-  dce(X);
-  gcm(X);
-}
-
-macro forkify!(X) {
-  fixpoint {
-    forkify(X);
-    fork-guard-elim(X);
-  }
-}
-
-macro fork-tile![n](X) {
-  fork-tile[n, 0, false, true](X);
-}
-
-macro parallelize!(X) {
-  parallel-fork(X);
-  parallel-reduce(X);
-}
-
-macro unforkify!(X) {
-  fork-split(X);
-  unforkify(X);
-}
-
-optimize!(*);
-forkify!(*);
-associative(matmul@outer);
-
-// Parallelize by computing output array as 16 chunks
-let par = matmul@outer \ matmul@inner;
-fork-tile![4](par);
-let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
-parallelize!(outer \ inner);
-
-let body = outline(inner);
-cpu(body);
-
-// Tile for cache, assuming 64B cache lines
-fork-tile![16](body);
-let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
-
-reduce-slf(inner);
-unforkify!(body);
-codegen-prep!(*);
diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
deleted file mode 100644
index 76808149e7b99dce97e43f0e936536d3d13b7417..0000000000000000000000000000000000000000
--- a/juno_samples/matmul/src/gpu.sch
+++ /dev/null
@@ -1,26 +0,0 @@
-phi-elim(*);
-
-forkify(*);
-fork-guard-elim(*);
-dce(*);
-
-fixpoint {
-  reduce-slf(*);
-  slf(*);
-  infer-schedules(*);
-}
-fork-coalesce(*);
-infer-schedules(*);
-dce(*);
-rewrite(*);
-fixpoint {
-  simplify-cfg(*);
-  dce(*);
-}
-
-ip-sroa(*);
-sroa(*);
-dce(*);
-
-float-collections(*);
-gcm(*);
diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch
new file mode 100644
index 0000000000000000000000000000000000000000..306997f58eb217f9ce301dc18e418c412e6df621
--- /dev/null
+++ b/juno_samples/matmul/src/matmul.sch
@@ -0,0 +1,81 @@
+macro optimize!(X) {
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  ip-sroa(X);
+  sroa(X);
+  dce(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+}
+
+macro codegen-prep!(X) {
+  optimize!(X);
+  gcm(X);
+  float-collections(X);
+  dce(X);
+  gcm(X);
+}
+
+macro forkify!(X) {
+  fixpoint {
+    forkify(X);
+    fork-guard-elim(X);
+  }
+}
+
+macro fork-tile![n](X) {
+  fork-tile[n, 0, false, true](X);
+}
+
+macro parallelize!(X) {
+  parallel-fork(X);
+  parallel-reduce(X);
+}
+
+macro unforkify!(X) {
+  fork-split(X);
+  unforkify(X);
+}
+
+optimize!(*);
+forkify!(*);
+
+if feature("cuda") {
+  fixpoint {
+    reduce-slf(*);
+    slf(*);
+    infer-schedules(*);
+  }
+  fork-coalesce(*);
+  infer-schedules(*);
+  dce(*);
+  rewrite(*);
+  fixpoint {
+    simplify-cfg(*);
+    dce(*);
+  }
+
+  optimize!(*);
+  codegen-prep!(*);
+} else {
+  associative(matmul@outer);
+
+  // Parallelize by computing output array as 16 chunks
+  let par = matmul@outer \ matmul@inner;
+  fork-tile![4](par);
+  let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
+  parallelize!(outer \ inner);
+
+  let body = outline(inner);
+  cpu(body);
+
+  // Tile for cache, assuming 64B cache lines
+  fork-tile![16](body);
+  let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
+
+  reduce-slf(inner);
+  unforkify!(body);
+  codegen-prep!(*);
+}
diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
index fd614b42a55488bfcda64a853105fd40e53ff7bc..5fc73db9d369be5ef9695f37ad1f39122facf991 100644
--- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs
+++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
@@ -28,40 +28,55 @@ fn cfd_bench(c: &mut Criterion) {
         elements_surrounding_elements,
         normals,
     } = read_domain_geometry(data_file, block_size);
-    let mut variables = initialize_variables(nelr, ff_variable.as_slice());
-    let mut variables = HerculesMutBox::from(variables.as_mut_slice());
+    let mut variables = initialize_variables(nelr, &ff_variable);
+
+    let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+    let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+    let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+    let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+    let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+
     let areas = HerculesImmBox::from(areas.as_slice());
     let elements_surrounding_elements =
         HerculesImmBox::from(elements_surrounding_elements.as_slice());
-    let normals = HerculesImmBox::from(normals.as_slice());
-    let ff_variable = HerculesImmBox::from(ff_variable.as_slice());
-    let ff_fc_density_energy = vec![
-        ff_fc_density_energy.x,
-        ff_fc_density_energy.y,
-        ff_fc_density_energy.z,
-    ];
-    let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
-    let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
-    let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
-    let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
-    let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
-    let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
-    let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
+
+    let normals_x = HerculesImmBox::from(normals.x.as_slice());
+    let normals_y = HerculesImmBox::from(normals.y.as_slice());
+    let normals_z = HerculesImmBox::from(normals.z.as_slice());
+
     group.bench_function("cfd bench euler", |b| {
         b.iter(|| {
             async_std::task::block_on(async {
                 r.run(
                     nelr as u64,
                     iterations as u64,
-                    variables.to(),
+                    v_density.to(),
+                    v_momentum_x.to(),
+                    v_momentum_y.to(),
+                    v_momentum_z.to(),
+                    v_energy.to(),
                     areas.to(),
                     elements_surrounding_elements.to(),
-                    normals.to(),
-                    ff_variable.to(),
-                    ff_fc_density_energy.to(),
-                    ff_fc_momentum_x.to(),
-                    ff_fc_momentum_y.to(),
-                    ff_fc_momentum_z.to(),
+                    normals_x.to(),
+                    normals_y.to(),
+                    normals_z.to(),
+                    ff_variable.density,
+                    ff_variable.momentum.x,
+                    ff_variable.momentum.y,
+                    ff_variable.momentum.z,
+                    ff_variable.energy,
+                    ff_fc_density_energy.x,
+                    ff_fc_density_energy.y,
+                    ff_fc_density_energy.z,
+                    ff_fc_momentum_x.x,
+                    ff_fc_momentum_x.y,
+                    ff_fc_momentum_x.z,
+                    ff_fc_momentum_y.x,
+                    ff_fc_momentum_y.y,
+                    ff_fc_momentum_y.z,
+                    ff_fc_momentum_z.x,
+                    ff_fc_momentum_z.y,
+                    ff_fc_momentum_z.z,
                 )
                 .await
             });
@@ -85,40 +100,55 @@ fn cfd_bench(c: &mut Criterion) {
         elements_surrounding_elements,
         normals,
     } = read_domain_geometry(data_file, block_size);
-    let mut variables = initialize_variables(nelr, ff_variable.as_slice());
-    let mut variables = HerculesMutBox::from(variables.as_mut_slice());
+    let mut variables = initialize_variables(nelr, &ff_variable);
+
+    let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+    let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+    let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+    let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+    let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+
     let areas = HerculesImmBox::from(areas.as_slice());
     let elements_surrounding_elements =
         HerculesImmBox::from(elements_surrounding_elements.as_slice());
-    let normals = HerculesImmBox::from(normals.as_slice());
-    let ff_variable = HerculesImmBox::from(ff_variable.as_slice());
-    let ff_fc_density_energy = vec![
-        ff_fc_density_energy.x,
-        ff_fc_density_energy.y,
-        ff_fc_density_energy.z,
-    ];
-    let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
-    let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
-    let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
-    let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
-    let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
-    let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
-    let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
+
+    let normals_x = HerculesImmBox::from(normals.x.as_slice());
+    let normals_y = HerculesImmBox::from(normals.y.as_slice());
+    let normals_z = HerculesImmBox::from(normals.z.as_slice());
+
     group.bench_function("cfd bench pre-euler", |b| {
         b.iter(|| {
             async_std::task::block_on(async {
                 r.run(
                     nelr as u64,
                     iterations as u64,
-                    variables.to(),
+                    v_density.to(),
+                    v_momentum_x.to(),
+                    v_momentum_y.to(),
+                    v_momentum_z.to(),
+                    v_energy.to(),
                     areas.to(),
                     elements_surrounding_elements.to(),
-                    normals.to(),
-                    ff_variable.to(),
-                    ff_fc_density_energy.to(),
-                    ff_fc_momentum_x.to(),
-                    ff_fc_momentum_y.to(),
-                    ff_fc_momentum_z.to(),
+                    normals_x.to(),
+                    normals_y.to(),
+                    normals_z.to(),
+                    ff_variable.density,
+                    ff_variable.momentum.x,
+                    ff_variable.momentum.y,
+                    ff_variable.momentum.z,
+                    ff_variable.energy,
+                    ff_fc_density_energy.x,
+                    ff_fc_density_energy.y,
+                    ff_fc_density_energy.z,
+                    ff_fc_momentum_x.x,
+                    ff_fc_momentum_x.y,
+                    ff_fc_momentum_x.z,
+                    ff_fc_momentum_y.x,
+                    ff_fc_momentum_y.y,
+                    ff_fc_momentum_y.z,
+                    ff_fc_momentum_z.x,
+                    ff_fc_momentum_z.y,
+                    ff_fc_momentum_z.z,
                 )
                 .await
             });
diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch
index 1244f80e54fdad43f58e5c5a5af44646b7a83e89..7a284a9a3bed0c1800366c64ceeca20265d845a6 100644
--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
@@ -12,10 +12,11 @@ macro simpl!(X) {
 
 simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
 simpl!(*);
-ip-sroa[false](*);
-sroa[false](*);
+ip-sroa[true](*);
+sroa[true](*);
 predication(*);
 const-inline(*);
 simpl!(*);
@@ -24,8 +25,31 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
-no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
-parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
+unforkify(compute_flux@inner_loop);
+
+fork-tile[32, 0, false, false](compute_step_factor);
+let split = fork-split(compute_step_factor);
+let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+fork-coalesce(compute_step_factor, compute_step_factor_body);
+simpl!(compute_step_factor, compute_step_factor_body);
+
+fork-tile[32, 0, false, false](compute_flux);
+let split = fork-split(compute_flux);
+let compute_flux_body = outline(split._6_compute_flux.fj1);
+fork-coalesce(compute_flux, compute_flux_body);
+simpl!(compute_flux, compute_flux_body);
+
+fork-tile[32, 0, false, false](time_step);
+let split = fork-split(time_step);
+let time_step_body = outline(split._7_time_step.fj1);
+fork-coalesce(time_step, time_step_body);
+simpl!(time_step, time_step_body);
+
+fork-tile[32, 0, false, false](copy_vars);
+let split = fork-split(copy_vars);
+let copy_vars_body = outline(split._8_copy_vars.fj1);
+fork-coalesce(copy_vars, copy_vars_body);
+simpl!(copy_vars, copy_vars_body);
 
-unforkify(*);
+unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
index 6329c5046e15ca0bce646f4a778dcf8c8d781656..518c656d99652dd9813e6aa5c8e8e84388836d47 100644
--- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
@@ -14,8 +14,8 @@ simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
 delete-uncalled(*);
 simpl!(*);
-ip-sroa[false](*);
-sroa[false](*);
+ip-sroa[true](*);
+sroa[true](*);
 predication(*);
 const-inline(*);
 simpl!(*);
@@ -24,7 +24,38 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
+no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
+unforkify(compute_flux@inner_loop);
 
-unforkify(*);
+fork-tile[32, 0, false, false](compute_step_factor);
+let split = fork-split(compute_step_factor);
+let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+fork-coalesce(compute_step_factor, compute_step_factor_body);
+simpl!(compute_step_factor, compute_step_factor_body);
 
+fork-tile[32, 0, false, false](compute_flux_contributions);
+let split = fork-split(compute_flux_contributions);
+let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
+fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
+simpl!(compute_flux_contributions, compute_flux_contributions_body);
+
+fork-tile[32, 0, false, false](compute_flux);
+let split = fork-split(compute_flux);
+let compute_flux_body = outline(split._7_compute_flux.fj1);
+fork-coalesce(compute_flux, compute_flux_body);
+simpl!(compute_flux, compute_flux_body);
+
+fork-tile[32, 0, false, false](time_step);
+let split = fork-split(time_step);
+let time_step_body = outline(split._8_time_step.fj1);
+fork-coalesce(time_step, time_step_body);
+simpl!(time_step, time_step_body);
+
+fork-tile[32, 0, false, false](copy_vars);
+let split = fork-split(copy_vars);
+let copy_vars_body = outline(split._9_copy_vars.fj1);
+fork-coalesce(copy_vars, copy_vars_body);
+simpl!(copy_vars, copy_vars_body);
+
+unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/gpu_euler.sch b/juno_samples/rodinia/cfd/src/gpu_euler.sch
index 7f7ee42ca9af7e6026b304c7b27d00bbb9a3f035..3700f79d24aa22657020ace6a37931feb49048a4 100644
--- a/juno_samples/rodinia/cfd/src/gpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/gpu_euler.sch
@@ -1,23 +1,44 @@
-gvn(*);
-dce(*);
-phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  crc(X);
+  slf(X);
+  dce(X);
+  infer-schedules(X);
+}
 
-let auto = auto-outline(euler);
-gpu(auto.euler);
-
-inline(auto.euler);
-inline(auto.euler);
+simpl!(*);
+inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
+gpu(copy_vars, compute_step_factor, compute_flux, time_step);
 
-sroa[false](auto.euler);
-dce(*);
-float-collections(*);
-dce(*);
+simpl!(*);
+ip-sroa[true](*);
+sroa[true](*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
+unforkify(compute_flux@inner_loop);
 
-gcm(*);
+fork-tile[32, 0, false, true](compute_step_factor);
+fork-split(compute_step_factor);
+
+fork-tile[32, 0, false, true](compute_flux);
+fork-split(compute_flux);
 
+fork-tile[32, 0, false, true](time_step);
+fork-split(time_step);
+
+fork-tile[32, 0, false, true](copy_vars);
+fork-split(copy_vars);
+
+gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
index 33c46dabe92854850f55fe0f0101cfe022947cf0..d6db675b0682490a84ba30408c3dd96993dce0f9 100644
--- a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
@@ -1,23 +1,47 @@
-gvn(*);
-dce(*);
-phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
-
-let auto = auto-outline(pre_euler);
-gpu(auto.pre_euler);
-
-inline(auto.pre_euler);
-inline(auto.pre_euler);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  crc(X);
+  slf(X);
+  dce(X);
+  infer-schedules(X);
+}
+
+simpl!(*);
+inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
+gpu(copy_vars, compute_step_factor, compute_flux_contributions, compute_flux, time_step);
 
-sroa[false](auto.pre_euler);
-dce(*);
-float-collections(*);
-dce(*);
+simpl!(*);
+ip-sroa[true](*);
+sroa[true](*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
+unforkify(compute_flux@inner_loop);
 
-gcm(*);
+fork-tile[32, 0, false, true](compute_step_factor);
+fork-split(compute_step_factor);
+
+fork-tile[32, 0, false, true](compute_flux_contributions);
+fork-split(compute_flux_contributions);
+
+fork-tile[32, 0, false, true](compute_flux);
+fork-split(compute_flux);
 
+fork-tile[32, 0, false, true](time_step);
+fork-split(time_step);
+
+fork-tile[32, 0, false, true](copy_vars);
+fork-split(copy_vars);
+
+gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs
index 39384c0d6a322f22a7c7b7ef7e51348d80a36107..d61df4c5b7bb0c7915d8605debd72e375a5482b0 100644
--- a/juno_samples/rodinia/cfd/src/lib.rs
+++ b/juno_samples/rodinia/cfd/src/lib.rs
@@ -24,123 +24,193 @@ pub struct CFDInputs {
 fn run_euler(
     nelr: usize,
     iterations: usize,
-    mut variables: AlignedSlice<f32>,
+    mut variables: Variables,
     areas: &[f32],
     elements_surrounding_elements: &[i32],
-    normals: &[f32],
-    ff_variable: &[f32],
+    normals: &Normals,
+    ff_variable: &Variable,
     ff_fc_density_energy: &Float3,
     ff_fc_momentum_x: &Float3,
     ff_fc_momentum_y: &Float3,
     ff_fc_momentum_z: &Float3,
-) -> Vec<f32> {
-    let mut variables = HerculesMutBox::from(variables.as_mut_slice());
+) -> Variables {
+    let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+    let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+    let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+    let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+    let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+
     let areas = HerculesImmBox::from(areas);
     let elements_surrounding_elements = HerculesImmBox::from(elements_surrounding_elements);
-    let normals = HerculesImmBox::from(normals);
-    let ff_variable = HerculesImmBox::from(ff_variable);
-
-    // TODO: Make hercules box handle structs, for now we'll copy into a vec
-    let ff_fc_density_energy = vec![
-        ff_fc_density_energy.x,
-        ff_fc_density_energy.y,
-        ff_fc_density_energy.z,
-    ];
-    let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
-    let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
-    let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
-    let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
-    let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
-    let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
-    let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
 
-    let mut runner = runner!(euler);
+    let normals_x = HerculesImmBox::from(normals.x.as_slice());
+    let normals_y = HerculesImmBox::from(normals.y.as_slice());
+    let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
-    HerculesMutBox::from(async_std::task::block_on(async {
+    let mut runner = runner!(euler);
+    let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
         runner
             .run(
                 nelr as u64,
                 iterations as u64,
-                variables.to(),
+                v_density.to(),
+                v_momentum_x.to(),
+                v_momentum_y.to(),
+                v_momentum_z.to(),
+                v_energy.to(),
                 areas.to(),
                 elements_surrounding_elements.to(),
-                normals.to(),
-                ff_variable.to(),
-                ff_fc_density_energy.to(),
-                ff_fc_momentum_x.to(),
-                ff_fc_momentum_y.to(),
-                ff_fc_momentum_z.to(),
+                normals_x.to(),
+                normals_y.to(),
+                normals_z.to(),
+                ff_variable.density,
+                ff_variable.momentum.x,
+                ff_variable.momentum.y,
+                ff_variable.momentum.z,
+                ff_variable.energy,
+                ff_fc_density_energy.x,
+                ff_fc_density_energy.y,
+                ff_fc_density_energy.z,
+                ff_fc_momentum_x.x,
+                ff_fc_momentum_x.y,
+                ff_fc_momentum_x.z,
+                ff_fc_momentum_y.x,
+                ff_fc_momentum_y.y,
+                ff_fc_momentum_y.z,
+                ff_fc_momentum_z.x,
+                ff_fc_momentum_z.y,
+                ff_fc_momentum_z.z,
             )
             .await
-    }))
-    .as_slice()
-    .to_vec()
+    });
+
+    Variables {
+        density: AlignedSlice::from_slice(HerculesMutBox::from(density).as_slice()),
+        momentum: Momentum {
+            x: AlignedSlice::from_slice(HerculesMutBox::from(momentum_x).as_slice()),
+            y: AlignedSlice::from_slice(HerculesMutBox::from(momentum_y).as_slice()),
+            z: AlignedSlice::from_slice(HerculesMutBox::from(momentum_z).as_slice()),
+        },
+        energy: AlignedSlice::from_slice(HerculesMutBox::from(energy).as_slice()),
+    }
 }
 
 fn run_pre_euler(
     nelr: usize,
     iterations: usize,
-    mut variables: AlignedSlice<f32>,
+    mut variables: Variables,
     areas: &[f32],
     elements_surrounding_elements: &[i32],
-    normals: &[f32],
-    ff_variable: &[f32],
+    normals: &Normals,
+    ff_variable: &Variable,
     ff_fc_density_energy: &Float3,
     ff_fc_momentum_x: &Float3,
     ff_fc_momentum_y: &Float3,
     ff_fc_momentum_z: &Float3,
-) -> Vec<f32> {
-    let mut variables = HerculesMutBox::from(variables.as_mut_slice());
+) -> Variables {
+    let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+    let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+    let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+    let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+    let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+
     let areas = HerculesImmBox::from(areas);
     let elements_surrounding_elements = HerculesImmBox::from(elements_surrounding_elements);
-    let normals = HerculesImmBox::from(normals);
-    let ff_variable = HerculesImmBox::from(ff_variable);
-
-    // TODO: Make hercules box handle structs, for now we'll copy into a vec
-    let ff_fc_density_energy = vec![
-        ff_fc_density_energy.x,
-        ff_fc_density_energy.y,
-        ff_fc_density_energy.z,
-    ];
-    let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
-    let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
-    let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
-    let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
-    let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
-    let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
-    let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
-
-    let mut runner = runner!(pre_euler);
 
-    let variables = variables.to();
+    let normals_x = HerculesImmBox::from(normals.x.as_slice());
+    let normals_y = HerculesImmBox::from(normals.y.as_slice());
+    let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
-    HerculesMutBox::from(async_std::task::block_on(async {
+    let mut runner = runner!(pre_euler);
+    let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
         runner
             .run(
                 nelr as u64,
                 iterations as u64,
-                variables,
+                v_density.to(),
+                v_momentum_x.to(),
+                v_momentum_y.to(),
+                v_momentum_z.to(),
+                v_energy.to(),
                 areas.to(),
                 elements_surrounding_elements.to(),
-                normals.to(),
-                ff_variable.to(),
-                ff_fc_density_energy.to(),
-                ff_fc_momentum_x.to(),
-                ff_fc_momentum_y.to(),
-                ff_fc_momentum_z.to(),
+                normals_x.to(),
+                normals_y.to(),
+                normals_z.to(),
+                ff_variable.density,
+                ff_variable.momentum.x,
+                ff_variable.momentum.y,
+                ff_variable.momentum.z,
+                ff_variable.energy,
+                ff_fc_density_energy.x,
+                ff_fc_density_energy.y,
+                ff_fc_density_energy.z,
+                ff_fc_momentum_x.x,
+                ff_fc_momentum_x.y,
+                ff_fc_momentum_x.z,
+                ff_fc_momentum_y.x,
+                ff_fc_momentum_y.y,
+                ff_fc_momentum_y.z,
+                ff_fc_momentum_z.x,
+                ff_fc_momentum_z.y,
+                ff_fc_momentum_z.z,
             )
             .await
-    }))
-    .as_slice()
-    .to_vec()
+    });
+
+    Variables {
+        density: AlignedSlice::from_slice(HerculesMutBox::from(density).as_slice()),
+        momentum: Momentum {
+            x: AlignedSlice::from_slice(HerculesMutBox::from(momentum_x).as_slice()),
+            y: AlignedSlice::from_slice(HerculesMutBox::from(momentum_y).as_slice()),
+            z: AlignedSlice::from_slice(HerculesMutBox::from(momentum_z).as_slice()),
+        },
+        energy: AlignedSlice::from_slice(HerculesMutBox::from(energy).as_slice()),
+    }
 }
 
 fn compare_float(x: f32, y: f32) -> bool {
     (x - y).abs() < 1e-5
 }
 
-fn compare_floats(xs: &[f32], ys: &[f32]) -> bool {
-    xs.len() == ys.len() && xs.iter().zip(ys.iter()).all(|(x, y)| compare_float(*x, *y))
+fn compare_floats(xs: &Variables, ys: &Variables) -> bool {
+    let xs_density = xs.density.as_slice();
+    let xs_momentum_x = xs.momentum.x.as_slice();
+    let xs_momentum_y = xs.momentum.y.as_slice();
+    let xs_momentum_z = xs.momentum.z.as_slice();
+    let xs_energy = xs.energy.as_slice();
+
+    let ys_density = ys.density.as_slice();
+    let ys_momentum_x = ys.momentum.x.as_slice();
+    let ys_momentum_y = ys.momentum.y.as_slice();
+    let ys_momentum_z = ys.momentum.z.as_slice();
+    let ys_energy = ys.energy.as_slice();
+
+    xs_density.len() == ys_density.len()
+        && xs_density
+            .iter()
+            .zip(ys_density.iter())
+            .all(|(x, y)| compare_float(*x, *y))
+        && xs_momentum_x.len() == ys_momentum_x.len()
+        && xs_momentum_x
+            .iter()
+            .zip(ys_momentum_x.iter())
+            .all(|(x, y)| compare_float(*x, *y))
+        && xs_momentum_y.len() == ys_momentum_y.len()
+        && xs_momentum_y
+            .iter()
+            .zip(ys_momentum_y.iter())
+            .all(|(x, y)| compare_float(*x, *y))
+        && xs_momentum_z.len() == ys_momentum_z.len()
+        && xs_momentum_z
+            .iter()
+            .zip(ys_momentum_z.iter())
+            .all(|(x, y)| compare_float(*x, *y))
+        && xs_energy.len() == ys_energy.len()
+        && xs_energy
+            .iter()
+            .zip(ys_energy.iter())
+            .all(|(x, y)| compare_float(*x, *y))
 }
 
 pub fn cfd_harness(args: CFDInputs) {
@@ -151,8 +221,6 @@ pub fn cfd_harness(args: CFDInputs) {
         pre_euler,
     } = args;
 
-    assert!(block_size % 16 == 0, "Hercules expects all arrays to be 64-byte aligned, cfd uses structs of arrays that are annoying to deal with if the block_size is not a multiple of 16");
-
     let FarFieldConditions {
         ff_variable,
         ff_fc_momentum_x,
@@ -168,7 +236,8 @@ pub fn cfd_harness(args: CFDInputs) {
         normals,
     } = read_domain_geometry(data_file, block_size);
 
-    let variables = initialize_variables(nelr, ff_variable.as_slice());
+    let variables = initialize_variables(nelr, &ff_variable);
+    println!("Running CFD with nelr = {}.", nelr);
 
     let res_juno = if pre_euler {
         run_pre_euler(
@@ -177,8 +246,8 @@ pub fn cfd_harness(args: CFDInputs) {
             variables.clone(),
             areas.as_slice(),
             elements_surrounding_elements.as_slice(),
-            normals.as_slice(),
-            ff_variable.as_slice(),
+            &normals,
+            &ff_variable,
             &ff_fc_density_energy,
             &ff_fc_momentum_x,
             &ff_fc_momentum_y,
@@ -191,8 +260,8 @@ pub fn cfd_harness(args: CFDInputs) {
             variables.clone(),
             areas.as_slice(),
             elements_surrounding_elements.as_slice(),
-            normals.as_slice(),
-            ff_variable.as_slice(),
+            &normals,
+            &ff_variable,
             &ff_fc_density_energy,
             &ff_fc_momentum_x,
             &ff_fc_momentum_y,
@@ -206,8 +275,8 @@ pub fn cfd_harness(args: CFDInputs) {
             variables,
             areas.as_slice(),
             elements_surrounding_elements.as_slice(),
-            normals.as_slice(),
-            ff_variable.as_slice(),
+            &normals,
+            &ff_variable,
             &ff_fc_density_energy,
             &ff_fc_momentum_x,
             &ff_fc_momentum_y,
@@ -220,8 +289,8 @@ pub fn cfd_harness(args: CFDInputs) {
             variables,
             areas.as_slice(),
             elements_surrounding_elements.as_slice(),
-            normals.as_slice(),
-            ff_variable.as_slice(),
+            &normals,
+            &ff_variable,
             &ff_fc_density_energy,
             &ff_fc_momentum_x,
             &ff_fc_momentum_y,
@@ -229,8 +298,7 @@ pub fn cfd_harness(args: CFDInputs) {
         )
     };
 
-    if !compare_floats(&res_juno, res_rust.as_slice()) {
-        assert_eq!(res_juno.len(), res_rust.as_slice().len());
+    if !compare_floats(&res_juno, &res_rust) {
         panic!("Mismatch in results");
     }
 }
diff --git a/juno_samples/rodinia/cfd/src/pre_euler.jn b/juno_samples/rodinia/cfd/src/pre_euler.jn
index c200f2db9a2be4708c4cf1c90c60eef0b7d7a11e..979c2e9a16d3ff2d0b5c690af7645dbf10462139 100644
--- a/juno_samples/rodinia/cfd/src/pre_euler.jn
+++ b/juno_samples/rodinia/cfd/src/pre_euler.jn
@@ -58,7 +58,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 {
 }
 
 fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] {
-  let step_factors : f32[nelr];
+  @res let step_factors : f32[nelr];
 
   for i in 0..nelr {
     let density = variables.density[i];
@@ -109,10 +109,10 @@ fn compute_flux_contribution(
 fn compute_flux_contributions<nelr: usize>(
   variables: Variables::<nelr>,
 ) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) {
-  let fc_momentum_x: Momentum::<nelr>;
-  let fc_momentum_y: Momentum::<nelr>;
-  let fc_momentum_z: Momentum::<nelr>;
-  let fc_density_energy: Momentum::<nelr>;
+  @res let fc_momentum_x: Momentum::<nelr>;
+  @res let fc_momentum_y: Momentum::<nelr>;
+  @res let fc_momentum_z: Momentum::<nelr>;
+  @res let fc_density_energy: Momentum::<nelr>;
 
   for i in 0..nelr {
     let density_i = variables.density[i];
@@ -167,9 +167,9 @@ fn compute_flux<nelr: usize>(
   ff_fc_momentum_z: float3,
 ) -> Variables::<nelr> {
   const smoothing_coefficient : f32 = 0.2;
-  let fluxes: Variables::<nelr>;
+  @res let fluxes: Variables::<nelr>;
 
-  for i in 0..nelr {
+  @outer_loop for i in 0..nelr {
     let density_i = variables.density[i];
 
     let momentum_i = float3 { x: variables.momentum.x[i],
@@ -201,7 +201,7 @@ fn compute_flux<nelr: usize>(
     let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 };
     let flux_i_density_energy : f32 = 0.0;
 
-    for j in 0..NNB {
+    @inner_loop for j in 0..NNB {
       let nb = elements_surrounding_elements[j, i];
       let normal = float3 {
         x: normals.x[j, i],
@@ -328,7 +328,7 @@ fn time_step<nelr: usize>(
 }
 
 fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> {
-  let result : Variables::<nelr>;
+  @res let result : Variables::<nelr>;
 
   for i in 0..nelr {
     result.density[i] = variables.density[i];
diff --git a/juno_samples/rodinia/cfd/src/rust_cfd.rs b/juno_samples/rodinia/cfd/src/rust_cfd.rs
index 42479df8eb2cd5cf86e70cafd212050a07d729b4..6d92c2b5501198513ab3e8bc7314df2eec3aff0f 100644
--- a/juno_samples/rodinia/cfd/src/rust_cfd.rs
+++ b/juno_samples/rodinia/cfd/src/rust_cfd.rs
@@ -58,19 +58,19 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 {
     (GAMMA * pressure / density).sqrt()
 }
 
-fn compute_step_factor(nelr: usize, variables: &[f32], areas: &[f32]) -> Vec<f32> {
+fn compute_step_factor(nelr: usize, variables: &Variables, areas: &[f32]) -> Vec<f32> {
     let mut step_factors = vec![0.0; nelr];
 
     for i in 0..nelr {
-        let density = variables[i + VAR_DENSITY * nelr];
+        let density = variables.density[i];
 
         let momentum = Float3 {
-            x: variables[i + (VAR_MOMENTUM + 0) * nelr],
-            y: variables[i + (VAR_MOMENTUM + 1) * nelr],
-            z: variables[i + (VAR_MOMENTUM + 2) * nelr],
+            x: variables.momentum.x[i],
+            y: variables.momentum.y[i],
+            z: variables.momentum.z[i],
         };
 
-        let density_energy = variables[i + VAR_DENSITY_ENERGY * nelr];
+        let density_energy = variables.energy[i];
 
         let velocity = compute_velocity(density, &momentum);
         let speed_sqd = compute_speed_sqd(&velocity);
@@ -85,28 +85,36 @@ fn compute_step_factor(nelr: usize, variables: &[f32], areas: &[f32]) -> Vec<f32
 
 fn compute_flux_euler(
     nelr: usize,
-    variables: &[f32],
+    variables: &Variables,
     elements_surrounding_elements: &[i32],
-    normals: &[f32],
-    ff_variable: &[f32],
+    normals: &Normals,
+    ff_variable: &Variable,
     ff_fc_momentum_x: &Float3,
     ff_fc_momentum_y: &Float3,
     ff_fc_momentum_z: &Float3,
     ff_fc_density_energy: &Float3,
-) -> Vec<f32> {
+) -> Variables {
     let smoothing_coefficient: f32 = 0.2;
 
-    let mut fluxes = vec![0.0; nelr * NVAR];
+    let mut fluxes = Variables {
+        density: AlignedSlice::of_len(nelr),
+        momentum: Momentum {
+            x: AlignedSlice::of_len(nelr),
+            y: AlignedSlice::of_len(nelr),
+            z: AlignedSlice::of_len(nelr),
+        },
+        energy: AlignedSlice::of_len(nelr),
+    };
 
     for i in 0..nelr {
-        let density_i = variables[i + VAR_DENSITY * nelr];
+        let density_i = variables.density[i];
         let momentum_i = Float3 {
-            x: variables[i + (VAR_MOMENTUM + 0) * nelr],
-            y: variables[i + (VAR_MOMENTUM + 1) * nelr],
-            z: variables[i + (VAR_MOMENTUM + 2) * nelr],
+            x: variables.momentum.x[i],
+            y: variables.momentum.y[i],
+            z: variables.momentum.z[i],
         };
 
-        let density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
+        let density_energy_i = variables.energy[i];
 
         let velocity_i = compute_velocity(density_i, &momentum_i);
         let speed_sqd_i = compute_speed_sqd(&velocity_i);
@@ -134,9 +142,9 @@ fn compute_flux_euler(
         for j in 0..NNB {
             let nb = elements_surrounding_elements[i + j * nelr];
             let normal = Float3 {
-                x: normals[i + (j + 0 * NNB) * nelr],
-                y: normals[i + (j + 1 * NNB) * nelr],
-                z: normals[i + (j + 2 * NNB) * nelr],
+                x: normals.x[i + j * nelr],
+                y: normals.y[i + j * nelr],
+                z: normals.z[i + j * nelr],
             };
             let normal_len =
                 (normal.x * normal.x + normal.y * normal.y + normal.z * normal.z).sqrt();
@@ -144,13 +152,13 @@ fn compute_flux_euler(
             if nb >= 0 {
                 // legitimate neighbor
                 let nb = nb as usize;
-                let density_nb = variables[nb + VAR_DENSITY * nelr];
+                let density_nb = variables.density[nb];
                 let momentum_nb = Float3 {
-                    x: variables[nb + (VAR_MOMENTUM + 0) * nelr],
-                    y: variables[nb + (VAR_MOMENTUM + 1) * nelr],
-                    z: variables[nb + (VAR_MOMENTUM + 2) * nelr],
+                    x: variables.momentum.x[nb],
+                    y: variables.momentum.y[nb],
+                    z: variables.momentum.z[nb],
                 };
-                let density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];
+                let density_energy_nb = variables.energy[nb];
                 let velocity_nb = compute_velocity(density_nb, &momentum_nb);
                 let speed_sqd_nb = compute_speed_sqd(&velocity_nb);
                 let pressure_nb = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
@@ -205,21 +213,21 @@ fn compute_flux_euler(
             } else if nb == -2 {
                 // a far field boundary
                 let factor = 0.5 * normal.x;
-                flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x);
+                flux_i_density += factor * (ff_variable.momentum.x + momentum_i.x);
                 flux_i_density_energy += factor * (ff_fc_density_energy.x + fc_i_density_energy.x);
                 flux_i_momentum.x += factor * (ff_fc_momentum_x.x + fc_i_momentum_x.x);
                 flux_i_momentum.y += factor * (ff_fc_momentum_y.x + fc_i_momentum_y.x);
                 flux_i_momentum.z += factor * (ff_fc_momentum_z.x + fc_i_momentum_z.x);
 
                 let factor = 0.5 * normal.y;
-                flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y);
+                flux_i_density += factor * (ff_variable.momentum.y + momentum_i.y);
                 flux_i_density_energy += factor * (ff_fc_density_energy.y + fc_i_density_energy.y);
                 flux_i_momentum.x += factor * (ff_fc_momentum_x.y + fc_i_momentum_x.y);
                 flux_i_momentum.y += factor * (ff_fc_momentum_y.y + fc_i_momentum_y.y);
                 flux_i_momentum.z += factor * (ff_fc_momentum_z.y + fc_i_momentum_z.y);
 
                 let factor = 0.5 * normal.z;
-                flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z);
+                flux_i_density += factor * (ff_variable.momentum.z + momentum_i.z);
                 flux_i_density_energy += factor * (ff_fc_density_energy.z + fc_i_density_energy.z);
                 flux_i_momentum.x += factor * (ff_fc_momentum_x.z + fc_i_momentum_x.z);
                 flux_i_momentum.y += factor * (ff_fc_momentum_y.z + fc_i_momentum_y.z);
@@ -227,11 +235,11 @@ fn compute_flux_euler(
             }
         }
 
-        fluxes[i + VAR_DENSITY * nelr] = flux_i_density;
-        fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x;
-        fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y;
-        fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z;
-        fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy;
+        fluxes.density[i] = flux_i_density;
+        fluxes.momentum.x[i] = flux_i_momentum.x;
+        fluxes.momentum.y[i] = flux_i_momentum.y;
+        fluxes.momentum.z[i] = flux_i_momentum.z;
+        fluxes.energy[i] = flux_i_density_energy;
     }
 
     fluxes
@@ -239,7 +247,7 @@ fn compute_flux_euler(
 
 fn compute_flux_contributions(
     nelr: usize,
-    variables: &[f32],
+    variables: &Variables,
 ) -> (Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
     let mut fc_momentum_x = vec![0.0; nelr * NDIM];
     let mut fc_momentum_y = vec![0.0; nelr * NDIM];
@@ -247,14 +255,14 @@ fn compute_flux_contributions(
     let mut fc_density_energy = vec![0.0; nelr * NDIM];
 
     for i in 0..nelr {
-        let density_i = variables[i + VAR_DENSITY * nelr];
+        let density_i = variables.density[i];
         let momentum_i = Float3 {
-            x: variables[i + (VAR_MOMENTUM + 0) * nelr],
-            y: variables[i + (VAR_MOMENTUM + 1) * nelr],
-            z: variables[i + (VAR_MOMENTUM + 2) * nelr],
+            x: variables.momentum.x[i],
+            y: variables.momentum.y[i],
+            z: variables.momentum.z[i],
         };
 
-        let density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
+        let density_energy_i = variables.energy[i];
 
         let velocity_i = compute_velocity(density_i, &momentum_i);
         let speed_sqd_i = compute_speed_sqd(&velocity_i);
@@ -298,32 +306,40 @@ fn compute_flux_contributions(
 
 fn compute_flux_pre_euler(
     nelr: usize,
-    variables: &[f32],
+    variables: &Variables,
     elements_surrounding_elements: &[i32],
-    normals: &[f32],
+    normals: &Normals,
     fc_momentum_x: &[f32],
     fc_momentum_y: &[f32],
     fc_momentum_z: &[f32],
     fc_density_energy: &[f32],
-    ff_variable: &[f32],
+    ff_variable: &Variable,
     ff_fc_momentum_x: &Float3,
     ff_fc_momentum_y: &Float3,
     ff_fc_momentum_z: &Float3,
     ff_fc_density_energy: &Float3,
-) -> Vec<f32> {
+) -> Variables {
     let smoothing_coefficient: f32 = 0.2;
 
-    let mut fluxes = vec![0.0; nelr * NVAR];
+    let mut fluxes = Variables {
+        density: AlignedSlice::of_len(nelr),
+        momentum: Momentum {
+            x: AlignedSlice::of_len(nelr),
+            y: AlignedSlice::of_len(nelr),
+            z: AlignedSlice::of_len(nelr),
+        },
+        energy: AlignedSlice::of_len(nelr),
+    };
 
     for i in 0..nelr {
-        let density_i = variables[i + VAR_DENSITY * nelr];
+        let density_i = variables.density[i];
         let momentum_i = Float3 {
-            x: variables[i + (VAR_MOMENTUM + 0) * nelr],
-            y: variables[i + (VAR_MOMENTUM + 1) * nelr],
-            z: variables[i + (VAR_MOMENTUM + 2) * nelr],
+            x: variables.momentum.x[i],
+            y: variables.momentum.y[i],
+            z: variables.momentum.z[i],
         };
 
-        let density_energy_i = variables[i + VAR_DENSITY_ENERGY * nelr];
+        let density_energy_i = variables.energy[i];
 
         let velocity_i = compute_velocity(density_i, &momentum_i);
         let speed_sqd_i = compute_speed_sqd(&velocity_i);
@@ -363,22 +379,22 @@ fn compute_flux_pre_euler(
         for j in 0..NNB {
             let nb = elements_surrounding_elements[i + j * nelr];
             let normal = Float3 {
-                x: normals[i + (j + 0 * NNB) * nelr],
-                y: normals[i + (j + 1 * NNB) * nelr],
-                z: normals[i + (j + 2 * NNB) * nelr],
+                x: normals.x[i + j * nelr],
+                y: normals.y[i + j * nelr],
+                z: normals.z[i + j * nelr],
             };
             let normal_len =
                 (normal.x * normal.x + normal.y * normal.y + normal.z * normal.z).sqrt();
 
             if nb >= 0 {
                 let nb = nb as usize;
-                let density_nb = variables[nb + VAR_DENSITY * nelr];
+                let density_nb = variables.density[nb];
                 let momentum_nb = Float3 {
-                    x: variables[nb + (VAR_MOMENTUM + 0) * nelr],
-                    y: variables[nb + (VAR_MOMENTUM + 1) * nelr],
-                    z: variables[nb + (VAR_MOMENTUM + 2) * nelr],
+                    x: variables.momentum.x[nb],
+                    y: variables.momentum.y[nb],
+                    z: variables.momentum.z[nb],
                 };
-                let density_energy_nb = variables[nb + VAR_DENSITY_ENERGY * nelr];
+                let density_energy_nb = variables.energy[nb];
                 let velocity_nb = compute_velocity(density_nb, &momentum_nb);
                 let speed_sqd_nb = compute_speed_sqd(&velocity_nb);
                 let pressure_nb = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
@@ -443,21 +459,21 @@ fn compute_flux_pre_euler(
                 flux_i_momentum.z += normal.z * pressure_i;
             } else if nb == -2 {
                 let factor = 0.5 * normal.x;
-                flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 0] + momentum_i.x);
+                flux_i_density += factor * (ff_variable.momentum.x + momentum_i.x);
                 flux_i_density_energy += factor * (ff_fc_density_energy.x + fc_i_density_energy.x);
                 flux_i_momentum.x += factor * (ff_fc_momentum_x.x + fc_i_momentum_x.x);
                 flux_i_momentum.y += factor * (ff_fc_momentum_y.x + fc_i_momentum_y.x);
                 flux_i_momentum.z += factor * (ff_fc_momentum_z.x + fc_i_momentum_z.x);
 
                 let factor = 0.5 * normal.y;
-                flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 1] + momentum_i.y);
+                flux_i_density += factor * (ff_variable.momentum.y + momentum_i.y);
                 flux_i_density_energy += factor * (ff_fc_density_energy.y + fc_i_density_energy.y);
                 flux_i_momentum.x += factor * (ff_fc_momentum_x.y + fc_i_momentum_x.y);
                 flux_i_momentum.y += factor * (ff_fc_momentum_y.y + fc_i_momentum_y.y);
                 flux_i_momentum.z += factor * (ff_fc_momentum_z.y + fc_i_momentum_z.y);
 
                 let factor = 0.5 * normal.z;
-                flux_i_density += factor * (ff_variable[VAR_MOMENTUM + 2] + momentum_i.z);
+                flux_i_density += factor * (ff_variable.momentum.z + momentum_i.z);
                 flux_i_density_energy += factor * (ff_fc_density_energy.z + fc_i_density_energy.z);
                 flux_i_momentum.x += factor * (ff_fc_momentum_x.z + fc_i_momentum_x.z);
                 flux_i_momentum.y += factor * (ff_fc_momentum_y.z + fc_i_momentum_y.z);
@@ -465,11 +481,11 @@ fn compute_flux_pre_euler(
             }
         }
 
-        fluxes[i + VAR_DENSITY * nelr] = flux_i_density;
-        fluxes[i + (VAR_MOMENTUM + 0) * nelr] = flux_i_momentum.x;
-        fluxes[i + (VAR_MOMENTUM + 1) * nelr] = flux_i_momentum.y;
-        fluxes[i + (VAR_MOMENTUM + 2) * nelr] = flux_i_momentum.z;
-        fluxes[i + VAR_DENSITY_ENERGY * nelr] = flux_i_density_energy;
+        fluxes.density[i] = flux_i_density;
+        fluxes.momentum.x[i] = flux_i_momentum.x;
+        fluxes.momentum.y[i] = flux_i_momentum.y;
+        fluxes.momentum.z[i] = flux_i_momentum.z;
+        fluxes.energy[i] = flux_i_density_energy;
     }
 
     fluxes
@@ -478,47 +494,42 @@ fn compute_flux_pre_euler(
 fn time_step(
     j: usize,
     nelr: usize,
-    old_variables: &[f32],
-    variables: &mut AlignedSlice<f32>,
+    old_variables: &Variables,
+    variables: &mut Variables,
     step_factors: &[f32],
-    fluxes: &[f32],
+    fluxes: &Variables,
 ) {
     for i in 0..nelr {
         let factor = step_factors[i] / (RK + 1 - j) as f32;
-        variables[i + VAR_DENSITY * nelr] =
-            old_variables[i + VAR_DENSITY * nelr] + factor * fluxes[i + VAR_DENSITY * nelr];
-        variables[i + (VAR_MOMENTUM + 0) * nelr] = old_variables[i + (VAR_MOMENTUM + 0) * nelr]
-            + factor * fluxes[i + (VAR_MOMENTUM + 0) * nelr];
-        variables[i + (VAR_MOMENTUM + 1) * nelr] = old_variables[i + (VAR_MOMENTUM + 1) * nelr]
-            + factor * fluxes[i + (VAR_MOMENTUM + 1) * nelr];
-        variables[i + (VAR_MOMENTUM + 2) * nelr] = old_variables[i + (VAR_MOMENTUM + 2) * nelr]
-            + factor * fluxes[i + (VAR_MOMENTUM + 2) * nelr];
-        variables[i + VAR_DENSITY_ENERGY * nelr] = old_variables[i + VAR_DENSITY_ENERGY * nelr]
-            + factor * fluxes[i + VAR_DENSITY_ENERGY * nelr];
+        variables.density[i] = old_variables.density[i] + factor * fluxes.density[i];
+        variables.momentum.x[i] = old_variables.momentum.x[i] + factor * fluxes.momentum.x[i];
+        variables.momentum.y[i] = old_variables.momentum.y[i] + factor * fluxes.momentum.y[i];
+        variables.momentum.z[i] = old_variables.momentum.z[i] + factor * fluxes.momentum.z[i];
+        variables.energy[i] = old_variables.energy[i] + factor * fluxes.energy[i];
     }
 }
 
 pub fn euler(
     nelr: usize,
     iterations: usize,
-    mut variables: AlignedSlice<f32>,
+    mut variables: Variables,
     areas: &[f32],
     elements_surrounding_elements: &[i32],
-    normals: &[f32],
-    ff_variable: &[f32],
+    normals: &Normals,
+    ff_variable: &Variable,
     ff_fc_density_energy: &Float3,
     ff_fc_momentum_x: &Float3,
     ff_fc_momentum_y: &Float3,
     ff_fc_momentum_z: &Float3,
-) -> AlignedSlice<f32> {
+) -> Variables {
     for i in 0..iterations {
         let old_variables = variables.clone();
-        let step_factors = compute_step_factor(nelr, variables.as_slice(), areas);
+        let step_factors = compute_step_factor(nelr, &variables, areas);
 
         for j in 0..RK {
             let fluxes = compute_flux_euler(
                 nelr,
-                variables.as_slice(),
+                &variables,
                 elements_surrounding_elements,
                 normals,
                 ff_variable,
@@ -530,7 +541,7 @@ pub fn euler(
             time_step(
                 j,
                 nelr,
-                old_variables.as_slice(),
+                &old_variables,
                 &mut variables,
                 &step_factors,
                 &fluxes,
@@ -544,26 +555,26 @@ pub fn euler(
 pub fn pre_euler(
     nelr: usize,
     iterations: usize,
-    mut variables: AlignedSlice<f32>,
+    mut variables: Variables,
     areas: &[f32],
     elements_surrounding_elements: &[i32],
-    normals: &[f32],
-    ff_variable: &[f32],
+    normals: &Normals,
+    ff_variable: &Variable,
     ff_fc_density_energy: &Float3,
     ff_fc_momentum_x: &Float3,
     ff_fc_momentum_y: &Float3,
     ff_fc_momentum_z: &Float3,
-) -> AlignedSlice<f32> {
+) -> Variables {
     for i in 0..iterations {
         let old_variables = variables.clone();
-        let step_factors = compute_step_factor(nelr, variables.as_slice(), areas);
+        let step_factors = compute_step_factor(nelr, &variables, areas);
 
         for j in 0..RK {
             let (fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy) =
-                compute_flux_contributions(nelr, variables.as_slice());
+                compute_flux_contributions(nelr, &variables);
             let fluxes = compute_flux_pre_euler(
                 nelr,
-                variables.as_slice(),
+                &variables,
                 elements_surrounding_elements,
                 normals,
                 &fc_momentum_x,
@@ -579,7 +590,7 @@ pub fn pre_euler(
             time_step(
                 j,
                 nelr,
-                old_variables.as_slice(),
+                &old_variables,
                 &mut variables,
                 &step_factors,
                 &fluxes,
diff --git a/juno_samples/rodinia/cfd/src/setup.rs b/juno_samples/rodinia/cfd/src/setup.rs
index b996f057c50c6eb404ea8a9db2d0d92dc738ac2a..0da0025162c919d2e420d6bbb0e69f98d83e52ff 100644
--- a/juno_samples/rodinia/cfd/src/setup.rs
+++ b/juno_samples/rodinia/cfd/src/setup.rs
@@ -1,10 +1,10 @@
+use std::fmt::{Debug, Error, Formatter};
 use std::fs::File;
 use std::io::Read;
+use std::ops::{Index, IndexMut};
 use std::path::Path;
 use std::str::FromStr;
 
-use std::ops::{Index, IndexMut};
-
 use nom::Parser;
 
 use crate::rust_cfd;
@@ -14,10 +14,6 @@ pub const ff_mach: f32 = 1.2;
 pub const NDIM: usize = 3;
 pub const NNB: usize = 4;
 pub const RK: usize = 3;
-pub const VAR_DENSITY: usize = 0;
-pub const VAR_MOMENTUM: usize = 1;
-pub const VAR_DENSITY_ENERGY: usize = VAR_MOMENTUM + NDIM;
-pub const NVAR: usize = VAR_DENSITY_ENERGY + 1;
 pub const deg_angle_of_attack: f32 = 0.0;
 
 #[repr(align(64))]
@@ -31,6 +27,15 @@ pub struct AlignedSlice<T> {
     length: usize,
 }
 
+impl<T> Debug for AlignedSlice<T>
+where
+    T: Debug
+{
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
+        write!(f, "{:?}", self.as_slice())
+    }
+}
+
 impl<T> AlignedSlice<T>
 where
     T: Default,
@@ -53,6 +58,17 @@ where
     }
 }
 
+impl<T> AlignedSlice<T>
+where
+    T: Clone + Default,
+{
+    pub fn from_slice(slice: &[T]) -> Self {
+        let mut res = Self::of_len(slice.len());
+        res.as_mut_slice().clone_from_slice(slice);
+        res
+    }
+}
+
 impl<T> AlignedSlice<T> {
     pub fn as_slice(&self) -> &[T] {
         &self.slice[self.offset..self.offset + self.length]
@@ -90,15 +106,20 @@ where
     }
 }
 
-#[repr(C)]
 pub struct Float3 {
     pub x: f32,
     pub y: f32,
     pub z: f32,
 }
 
+pub struct Variable {
+    pub density: f32,
+    pub momentum: Float3,
+    pub energy: f32,
+}
+
 pub struct FarFieldConditions {
-    pub ff_variable: AlignedSlice<f32>,
+    pub ff_variable: Variable,
     pub ff_fc_momentum_x: Float3,
     pub ff_fc_momentum_y: Float3,
     pub ff_fc_momentum_z: Float3,
@@ -106,14 +127,14 @@ pub struct FarFieldConditions {
 }
 
 pub fn set_far_field_conditions() -> FarFieldConditions {
-    let mut ff_variable: AlignedSlice<f32> = AlignedSlice::of_len(NVAR);
+    let mut ff_variable = Variable { density: 0.0, momentum: Float3 { x: 0.0, y: 0.0, z: 0.0 }, energy: 0.0 };
 
     let angle_of_attack = std::f32::consts::PI / 180.0 * deg_angle_of_attack;
 
-    ff_variable[VAR_DENSITY] = 1.4;
+    ff_variable.density = 1.4;
 
     let ff_pressure = 1.0;
-    let ff_speed_of_sound = (GAMMA * ff_pressure / ff_variable[VAR_DENSITY]).sqrt();
+    let ff_speed_of_sound = (GAMMA * ff_pressure / ff_variable.density).sqrt();
     let ff_speed = ff_mach * ff_speed_of_sound;
 
     let ff_velocity = Float3 {
@@ -122,23 +143,23 @@ pub fn set_far_field_conditions() -> FarFieldConditions {
         z: 0.0,
     };
 
-    ff_variable[VAR_MOMENTUM + 0] = ff_variable[VAR_DENSITY] * ff_velocity.x;
-    ff_variable[VAR_MOMENTUM + 1] = ff_variable[VAR_DENSITY] * ff_velocity.y;
-    ff_variable[VAR_MOMENTUM + 2] = ff_variable[VAR_DENSITY] * ff_velocity.z;
+    ff_variable.momentum.x = ff_variable.density * ff_velocity.x;
+    ff_variable.momentum.y = ff_variable.density * ff_velocity.y;
+    ff_variable.momentum.z = ff_variable.density * ff_velocity.z;
 
-    ff_variable[VAR_DENSITY_ENERGY] =
-        ff_variable[VAR_DENSITY] * (0.5 * (ff_speed * ff_speed)) + (ff_pressure / (GAMMA - 1.0));
+    ff_variable.energy =
+        ff_variable.density * (0.5 * (ff_speed * ff_speed)) + (ff_pressure / (GAMMA - 1.0));
 
     let ff_momentum = Float3 {
-        x: ff_variable[VAR_MOMENTUM + 0],
-        y: ff_variable[VAR_MOMENTUM + 1],
-        z: ff_variable[VAR_MOMENTUM + 2],
+        x: ff_variable.momentum.x,
+        y: ff_variable.momentum.y,
+        z: ff_variable.momentum.z,
     };
     let (ff_fc_momentum_x, ff_fc_momentum_y, ff_fc_momentum_z, ff_fc_density_energy) =
         rust_cfd::compute_flux_contribution(
-            ff_variable[VAR_DENSITY],
+            ff_variable.density,
             &ff_momentum,
-            ff_variable[VAR_DENSITY_ENERGY],
+            ff_variable.energy,
             ff_pressure,
             &ff_velocity,
         );
@@ -152,11 +173,17 @@ pub fn set_far_field_conditions() -> FarFieldConditions {
     }
 }
 
+pub struct Normals {
+    pub x: AlignedSlice<f32>,
+    pub y: AlignedSlice<f32>,
+    pub z: AlignedSlice<f32>,
+}
+
 pub struct GeometryData {
     pub nelr: usize,
     pub areas: AlignedSlice<f32>,
     pub elements_surrounding_elements: AlignedSlice<i32>,
-    pub normals: AlignedSlice<f32>,
+    pub normals: Normals,
 }
 
 pub fn read_domain_geometry<P>(path: P, block_size: usize) -> GeometryData
@@ -183,7 +210,11 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo
 
     let mut areas: AlignedSlice<f32> = AlignedSlice::of_len(nelr);
     let mut elements_surrounding_elements: AlignedSlice<i32> = AlignedSlice::of_len(nelr * NNB);
-    let mut normals: AlignedSlice<f32> = AlignedSlice::of_len(NDIM * NNB * nelr);
+    let mut normals = Normals {
+        x: AlignedSlice::of_len(NNB * nelr),
+        y: AlignedSlice::of_len(NNB * nelr),
+        z: AlignedSlice::of_len(NNB * nelr),
+    };
 
     // read in data
     let mut text = text;
@@ -209,14 +240,21 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo
             let val = i32::from_str(val).unwrap();
             let val = if neg { -val } else { val };
             elements_surrounding_elements[i + j * nelr] = if val < 0 { -1 } else { val } - 1; // it's coming in with Fortran numbering
-
-            for k in 0..NDIM {
-                let t = nom::character::complete::multispace0(text)?.0;
-                let (t, val) = nom::number::complete::float(t)?;
-                text = t;
-
-                normals[i + (j + k * NNB) * nelr] = -val;
-            }
+                
+            let t = nom::character::complete::multispace0(text)?.0;
+            let (t, val) = nom::number::complete::float(t)?;
+            text = t;
+            normals.x[i + j * nelr] = -val;
+                
+            let t = nom::character::complete::multispace0(text)?.0;
+            let (t, val) = nom::number::complete::float(t)?;
+            text = t;
+            normals.y[i + j * nelr] = -val;
+                
+            let t = nom::character::complete::multispace0(text)?.0;
+            let (t, val) = nom::number::complete::float(t)?;
+            text = t;
+            normals.z[i + j * nelr] = -val;
         }
     }
 
@@ -227,9 +265,9 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo
         for j in 0..NNB {
             elements_surrounding_elements[i + j * nelr] =
                 elements_surrounding_elements[last + j * nelr];
-            for k in 0..NDIM {
-                normals[i + (j + k * NNB) * nelr] = normals[last + (j + k * NNB) * nelr];
-            }
+            normals.x[i + j * nelr] = normals.x[last + j * nelr];
+            normals.y[i + j * nelr] = normals.y[last + j * nelr];
+            normals.z[i + j * nelr] = normals.z[last + j * nelr];
         }
     }
 
@@ -245,13 +283,37 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo
     ))
 }
 
-pub fn initialize_variables(nelr: usize, ff_variable: &[f32]) -> AlignedSlice<f32> {
-    let mut variables = AlignedSlice::of_len(nelr * NVAR);
+#[derive(Clone)]
+pub struct Momentum {
+    pub x: AlignedSlice<f32>,
+    pub y: AlignedSlice<f32>,
+    pub z: AlignedSlice<f32>,
+}
+
+#[derive(Clone)]
+pub struct Variables {
+    pub density: AlignedSlice<f32>,
+    pub momentum: Momentum,
+    pub energy: AlignedSlice<f32>,
+}
+
+pub fn initialize_variables(nelr: usize, ff_variable: &Variable) -> Variables {
+    let mut variables = Variables {
+        density: AlignedSlice::of_len(nelr),
+        momentum: Momentum {
+            x: AlignedSlice::of_len(nelr),
+            y: AlignedSlice::of_len(nelr),
+            z: AlignedSlice::of_len(nelr),
+        },
+        energy: AlignedSlice::of_len(nelr),
+    };
 
     for i in 0..nelr {
-        for j in 0..NVAR {
-            variables[i + j * nelr] = ff_variable[j];
-        }
+        variables.density[i] = ff_variable.density;
+        variables.momentum.x[i] = ff_variable.momentum.x;
+        variables.momentum.y[i] = ff_variable.momentum.y;
+        variables.momentum.z[i] = ff_variable.momentum.z;
+        variables.energy[i] = ff_variable.energy;
     }
 
     variables
diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch
index 289548f9e01cdf402a3e1b1057fa52d4029f6173..f736c0b776759596cfc93a5025fb4f280c3fc932 100644
--- a/juno_samples/rodinia/srad/src/gpu.sch
+++ b/juno_samples/rodinia/srad/src/gpu.sch
@@ -54,4 +54,10 @@ ip-sroa(*);
 sroa(*);
 simpl!(*);
 
+fork-dim-merge(main_loops);
+fork-tile[32, 0, false, true](main_loops);
+dce(main_loops);
+fork-split(main_loops);
+simpl!(main_loops);
+
 gcm(*);
diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs
index 9d020c64ccef3b9c0a79694876c5b0ace606f938..8b68ed7111de9009458dbdc26b511e66ee13db58 100644
--- a/juno_scheduler/src/compile.rs
+++ b/juno_scheduler/src/compile.rs
@@ -22,6 +22,7 @@ pub enum ScheduleCompilerError {
         actual: usize,
         loc: Location,
     },
+    SemanticError(String, Location),
 }
 
 impl fmt::Display for ScheduleCompilerError {
@@ -46,6 +47,11 @@ impl fmt::Display for ScheduleCompilerError {
                 "({}, {}) -- ({}, {}): Expected {} arguments, found {}",
                 loc.0 .0, loc.0 .1, loc.1 .0, loc.1 .1, expected, actual
             ),
+            ScheduleCompilerError::SemanticError(msg, loc) => write!(
+                f,
+                "({}, {}) -- ({}, {}): {}",
+                loc.0 .0, loc.0 .1, loc.1 .0, loc.1 .1, msg,
+            ),
         }
     }
 }
@@ -76,6 +82,8 @@ enum Appliable {
     // DeleteUncalled requires special handling because it changes FunctionIDs, so it is not
     // treated like a pass
     DeleteUncalled,
+    // Test whether a feature is enabled
+    Feature,
     Schedule(Schedule),
     Device(Device),
 }
@@ -85,6 +93,8 @@ impl Appliable {
     fn is_valid_num_args(&self, num: usize) -> bool {
         match self {
             Appliable::Pass(pass) => pass.is_valid_num_args(num),
+            // Testing whether a feature is enabled takes the feature instead of a selection, so it
+            // has 0 arguments
             // Delete uncalled, Schedules, and devices do not take arguments
             _ => num == 0,
         }
@@ -158,6 +168,8 @@ impl FromStr for Appliable {
             "serialize" => Ok(Appliable::Pass(ir::Pass::Serialize)),
             "write-predication" => Ok(Appliable::Pass(ir::Pass::WritePredication)),
 
+            "feature" => Ok(Appliable::Feature),
+
             "print" => Ok(Appliable::Pass(ir::Pass::Print)),
 
             "cpu" | "llvm" => Ok(Appliable::Device(Device::LLVM)),
@@ -275,6 +287,35 @@ fn compile_stmt(
                 limit,
             }])
         }
+        parser::Stmt::IfThenElse {
+            span: _,
+            cond,
+            thn,
+            els,
+        } => {
+            let cond = compile_exp_as_expr(cond, lexer, macrostab, macros)?;
+
+            macros.open_scope();
+            let thn = ir::ScheduleStmt::Block {
+                body: compile_ops_as_block(*thn, lexer, macrostab, macros)?,
+            };
+            macros.close_scope();
+
+            macros.open_scope();
+            let els = match els {
+                Some(els) => ir::ScheduleStmt::Block {
+                    body: compile_ops_as_block(*els, lexer, macrostab, macros)?,
+                },
+                None => ir::ScheduleStmt::Block { body: vec![] },
+            };
+            macros.close_scope();
+
+            Ok(vec![ir::ScheduleStmt::IfThenElse {
+                cond,
+                thn: Box::new(thn),
+                els: Box::new(els),
+            }])
+        }
         parser::Stmt::MacroDecl { span: _, def } => {
             let parser::MacroDecl {
                 name,
@@ -380,6 +421,17 @@ fn compile_expr(
                         on: selection,
                     }))
                 }
+                Appliable::Feature => match selection {
+                    ir::Selector::Selection(mut args) if args.len() == 1 => {
+                        Ok(ExprResult::Expr(ir::ScheduleExp::Feature {
+                            feature: Box::new(args.pop().unwrap()),
+                        }))
+                    }
+                    _ => Err(ScheduleCompilerError::SemanticError(
+                        "feature requires exactly one argument as its selection".to_string(),
+                        lexer.line_col(span),
+                    )),
+                },
                 Appliable::Schedule(sched) => Ok(ExprResult::Stmt(ir::ScheduleStmt::AddSchedule {
                     sched,
                     on: selection,
diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs
index ab1495b816c99452560d03c0addf77a5aec18974..bacb4142c62df5b0bf974bcb1703400dd3caa02c 100644
--- a/juno_scheduler/src/ir.rs
+++ b/juno_scheduler/src/ir.rs
@@ -121,6 +121,9 @@ pub enum ScheduleExp {
     DeleteUncalled {
         on: Selector,
     },
+    Feature {
+        feature: Box<ScheduleExp>,
+    },
     Record {
         fields: Vec<(String, ScheduleExp)>,
     },
@@ -180,4 +183,9 @@ pub enum ScheduleStmt {
         device: Device,
         on: Selector,
     },
+    IfThenElse {
+        cond: ScheduleExp,
+        thn: Box<ScheduleStmt>,
+        els: Box<ScheduleStmt>,
+    },
 }
diff --git a/juno_scheduler/src/lang.l b/juno_scheduler/src/lang.l
index af154fce3d489b7b078607e52888d76d08f8e5fe..1f4f8723de4d950920d503a11c9145ad1d15d24a 100644
--- a/juno_scheduler/src/lang.l
+++ b/juno_scheduler/src/lang.l
@@ -20,12 +20,15 @@
 \.              "."
 
 apply           "apply"
+else            "else"
 fixpoint        "fixpoint"
+if              "if"
 let             "let"
 macro           "macro_keyword"
 on              "on"
 set             "set"
 target          "target"
+then            "then"
 
 true            "true"
 false           "false"
diff --git a/juno_scheduler/src/lang.y b/juno_scheduler/src/lang.y
index 451f035b8122ac1a694567327a44776c9326fff2..55c82b9dc368e7cade9500fb852c9d48320be7d2 100644
--- a/juno_scheduler/src/lang.y
+++ b/juno_scheduler/src/lang.y
@@ -27,6 +27,10 @@ Stmt -> Stmt
       { Stmt::ExprStmt { span: $span, exp: $1 } }
   | 'fixpoint' FixpointLimit '{' Schedule '}'
       { Stmt::Fixpoint { span: $span, limit: $2, body: Box::new($4) } }
+  | 'if' Expr '{' Schedule '}'
+      { Stmt::IfThenElse { span: $span, cond: $2, thn: Box::new($4), els: None } }
+  | 'if' Expr '{' Schedule '}' 'else' '{' Schedule '}'
+      { Stmt::IfThenElse { span: $span, cond: $2, thn: Box::new($4), els: Some(Box::new($8)) } }
   | MacroDecl
       { Stmt::MacroDecl { span: $span, def: $1 } }
   ;
@@ -163,6 +167,7 @@ pub enum Stmt {
   AssignStmt { span: Span, var: Span, rhs: Expr },
   ExprStmt   { span: Span, exp: Expr },
   Fixpoint   { span: Span, limit: FixpointLimit, body: Box<OperationList> },
+  IfThenElse { span: Span, cond: Expr, thn: Box<OperationList>, els: Option<Box<OperationList>> },
   MacroDecl  { span: Span, def: MacroDecl },
 }
 
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 456df2eda49b93a6c80327a090b6f6606ae711bb..4c981bd98ebd8eb605352cd3a78128c72e0e4ae9 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -16,6 +16,7 @@ use juno_utils::stringtab::StringTable;
 
 use std::cell::RefCell;
 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
+use std::env;
 use std::fmt;
 use std::fs::File;
 use std::io::Write;
@@ -1090,7 +1091,9 @@ impl PassManager {
 
             let mut nvcc_process = Command::new("nvcc")
                 .arg("-c")
+                .arg("-Xptxas")
                 .arg("-O3")
+                .arg("-use_fast_math")
                 .arg("-diag-suppress")
                 .arg("177")
                 .arg("-o")
@@ -1197,6 +1200,23 @@ fn schedule_interpret(
             // were made
             Ok(i > 1)
         }
+        ScheduleStmt::IfThenElse { cond, thn, els } => {
+            let (cond, modified) = interp_expr(pm, cond, stringtab, env, functions)?;
+            let Value::Boolean { val: cond } = cond else {
+                return Err(SchedulerError::SemanticError(
+                    "Condition must be a boolean value".to_string(),
+                ));
+            };
+            let changed = schedule_interpret(
+                pm,
+                if cond { &*thn } else { &*els },
+                stringtab,
+                env,
+                functions,
+            )?;
+
+            Ok(modified || changed)
+        }
         ScheduleStmt::Block { body } => {
             let mut modified = false;
             env.open_scope();
@@ -1441,6 +1461,23 @@ fn interp_expr(
                 changed,
             ))
         }
+        ScheduleExp::Feature { feature } => {
+            let (feature, modified) = interp_expr(pm, &*feature, stringtab, env, functions)?;
+            let Value::String { val } = feature else {
+                return Err(SchedulerError::SemanticError(
+                    "Feature expects a single string argument (instead of a selection)".to_string(),
+                ));
+            };
+            // To test for features, the scheduler needs to be invoked from a build script so that
+            // Cargo provides the enabled features via environment variables
+            let key = "CARGO_FEATURE_".to_string() + &val.to_uppercase().replace("-", "_");
+            Ok((
+                Value::Boolean {
+                    val: env::var(key).is_ok(),
+                },
+                modified,
+            ))
+        }
         ScheduleExp::Record { fields } => {
             let mut result = HashMap::new();
             let mut changed = false;