Compare revisions

rarbore2 · rarbore2 · rarbore2 · rarbore2 · Aaron Councilman · Aaron Councilman
--- a/.gitignore
+++ b/.gitignore
-/target
+**/target
 *.dot
 !paper_resources/*.dot
 *.bc
@@ -14,3 +14,4 @@
 .vscode
 *_env
 *.txt
+*ncu-rep
\ No newline at end of file
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups;
                            // because Fork basic block's init section already does gating
                            write!(
                                w,
-                                "{}{} = (threadIdx.x % {}) / {};\n",
+                                "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n",
                                tabs,
                                define_variable,
                                use_thread_quota.unwrap(),
-                                use_thread_quota.unwrap() / parallel_factor.unwrap()
+                                use_thread_quota.unwrap() / parallel_factor.unwrap(),
+                                divide,
+                                modulo,
                            )?;
                        }
                    }

--- a/hercules_ir/src/typecheck.rs
+++ b/hercules_ir/src/typecheck.rs
@@ -716,8 +716,10 @@ fn typeflow(
            // Check number of run-time arguments.
            if inputs.len() - 1 != callee.param_types.len() {
                return Error(format!(
-                    "Call node has {} inputs, but calls a function with {} parameters.",
+                    "Call node in {} has {} inputs, but calls a function ({}) with {} parameters.",
+                    function.name,
                    inputs.len() - 1,
+                    callee.name,
                    callee.param_types.len(),
                ));
            }
@@ -725,8 +727,10 @@ fn typeflow(
            // Check number of dynamic constant arguments.
            if dc_args.len() != callee.num_dynamic_constants as usize {
                return Error(format!(
-                    "Call node references {} dynamic constants, but calls a function expecting {} dynamic constants.",
+                    "Call node in {} references {} dynamic constants, but calls a function ({}) expecting {} dynamic constants.",
+                    function.name,
                    dc_args.len(),
+                    callee.name,
                    callee.num_dynamic_constants
                ));
            }

--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge(
                    op: BinaryOperator::Rem,
                });
                edit.sub_edit(tid, rem);
+                edit.sub_edit(tid, outer_tid);
                edit = edit.replace_all_uses(tid, rem)?;
            } else if tid_dim == inner_idx {
                let outer_tid = Node::ThreadID {
@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge(
                    op: BinaryOperator::Div,
                });
                edit.sub_edit(tid, div);
+                edit.sub_edit(tid, outer_tid);
                edit = edit.replace_all_uses(tid, div)?;
            }
        }
@@ -1479,7 +1481,12 @@ fn fork_fusion(
    }

    // Perform the fusion.
+    let bottom_tids: Vec<_> = editor
+        .get_users(bottom_fork)
+        .filter(|id| nodes[id.idx()].is_thread_id())
+        .collect();
    editor.edit(|mut edit| {
+        edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?;
        if bottom_join_pred != bottom_fork {
            // If there is control flow in the bottom fork-join, stitch it into
            // the top fork-join.

--- a/hercules_opt/src/interprocedural_sroa.rs
+++ b/hercules_opt/src/interprocedural_sroa.rs
-use std::collections::HashMap;
-use std::iter::zip;
-
 use hercules_ir::ir::*;

 use crate::*;
@@ -39,15 +36,30 @@ pub fn interprocedural_sroa(
        }

        let editor: &mut FunctionEditor = &mut editors[func_id.idx()];
+        let param_types = &editor.func().param_types.to_vec();
        let return_types = &editor.func().return_types.to_vec();

-        // We determine the new return types of the function and track a map
-        // that tells us how the old return values are constructed from the
-        // new ones
+        // We determine the new param/return types of the function and track a
+        // map that tells us how the old param/return values are constructed
+        // from the new ones.
+        let mut new_param_types = vec![];
+        let mut old_param_type_map = vec![];
        let mut new_return_types = vec![];
        let mut old_return_type_map = vec![];
        let mut changed = false;

+        for par_typ in param_types.iter() {
+            if !can_sroa_type(editor, *par_typ) {
+                old_param_type_map.push(IndexTree::Leaf(new_param_types.len()));
+                new_param_types.push(*par_typ);
+            } else {
+                let (types, index) = sroa_type(editor, *par_typ, new_param_types.len());
+                old_param_type_map.push(index);
+                new_param_types.extend(types);
+                changed = true;
+            }
+        }
+
        for ret_typ in return_types.iter() {
            if !can_sroa_type(editor, *ret_typ) {
                old_return_type_map.push(IndexTree::Leaf(new_return_types.len()));
@@ -60,25 +72,54 @@ pub fn interprocedural_sroa(
            }
        }

-        // If the return type is not changed by IP SROA, skip to the next function
+        // If the param/return types aren't changed by IP SROA, skip to the next
+        // function.
        if !changed {
            continue;
        }

-        // Now, modify each return in the current function and the return type
-        let return_nodes = editor
-            .func()
-            .nodes
-            .iter()
-            .enumerate()
-            .filter_map(|(idx, node)| {
-                if node.try_return().is_some() {
-                    Some(NodeID::new(idx))
+        // Modify each parameter in the current function and the param types.
+        let mut param_nodes: Vec<_> = vec![vec![]; param_types.len()];
+        for id in editor.node_ids() {
+            if let Some(idx) = editor.func().nodes[id.idx()].try_parameter() {
+                param_nodes[idx].push(id);
+            }
+        }
+        let success = editor.edit(|mut edit| {
+            for (idx, ids) in param_nodes.into_iter().enumerate() {
+                let new_indices = &old_param_type_map[idx];
+                let built = if let IndexTree::Leaf(new_idx) = new_indices {
+                    edit.add_node(Node::Parameter { index: *new_idx })
                } else {
-                    None
+                    let prod_ty = param_types[idx];
+                    let cons = edit.add_zero_constant(prod_ty);
+                    let mut cons = edit.add_node(Node::Constant { id: cons });
+                    new_indices.for_each(|idx: &Vec<Index>, param_idx: &usize| {
+                        let param = edit.add_node(Node::Parameter { index: *param_idx });
+                        cons = edit.add_node(Node::Write {
+                            collect: cons,
+                            data: param,
+                            indices: idx.clone().into_boxed_slice(),
+                        });
+                    });
+                    cons
+                };
+                for id in ids {
+                    edit = edit.replace_all_uses(id, built)?;
+                    edit = edit.delete_node(id)?;
                }
-            })
-            .collect::<Vec<_>>();
+            }
+
+            edit.set_param_types(new_param_types);
+            Ok(edit)
+        });
+        assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument");
+
+        // Modify each return in the current function and the return types.
+        let return_nodes: Vec<_> = editor
+            .node_ids()
+            .filter(|id| editor.func().nodes[id.idx()].is_return())
+            .collect();
        let success = editor.edit(|mut edit| {
            for node in return_nodes {
                let Node::Return { control, data } = edit.get_node(node) else {
@@ -114,17 +155,15 @@ pub fn interprocedural_sroa(
            }

            edit.set_return_types(new_return_types);
-
            Ok(edit)
        });
        assert!(success, "IP SROA expects to be able to edit everything, specify what functions to IP SROA via the func_selection argument");

-        // Finally, update calls of this function
-        // In particular, we actually don't have to update the call node at all but have to update
-        // its DataProjection users
+        // Finally, update calls of this function.
        for (caller, callsite) in callsites {
            let editor = &mut editors[caller.idx()];
            assert!(editor.func_id() == caller);
+
            let projs = editor.get_users(callsite).collect::<Vec<_>>();
            for proj_id in projs {
                let Node::DataProjection { data: _, selection } = editor.node(proj_id) else {
@@ -134,6 +173,40 @@ pub fn interprocedural_sroa(
                let typ = types[caller.idx()][proj_id.idx()];
                replace_returned_value(editor, proj_id, typ, new_return_info, callsite);
            }
+
+            let (control, callee, dc_args, args) =
+                editor.func().nodes[callsite.idx()].try_call().unwrap();
+            let dc_args = dc_args.clone();
+            let args = args.clone();
+            let success = editor.edit(|mut edit| {
+                let mut new_args = vec![];
+                for (idx, (data_id, update_info)) in
+                    args.iter().zip(old_param_type_map.iter()).enumerate()
+                {
+                    if let IndexTree::Leaf(new_idx) = update_info {
+                        // Unchanged parameter value
+                        assert!(new_args.len() == *new_idx);
+                        new_args.push(*data_id);
+                    } else {
+                        // SROA'd parameter value
+                        let reads = generate_reads_edit(&mut edit, param_types[idx], *data_id);
+                        reads.zip(update_info).for_each(|_, (read_id, ret_idx)| {
+                            assert!(new_args.len() == **ret_idx);
+                            new_args.push(*read_id);
+                        });
+                    }
+                }
+                let new_call = edit.add_node(Node::Call {
+                    control,
+                    function: callee,
+                    dynamic_constants: dc_args,
+                    args: new_args.into_boxed_slice(),
+                });
+                edit = edit.replace_all_uses(callsite, new_call)?;
+                edit = edit.delete_node(callsite)?;
+                Ok(edit)
+            });
+            assert!(success);
        }
    }
 }

--- a/hercules_opt/src/sroa.rs
+++ b/hercules_opt/src/sroa.rs
@@ -447,7 +447,7 @@ pub fn sroa(
                field_map.insert(node, generate_reads(editor, types[&node], node));
            }
            Node::Constant { id } => {
-                field_map.insert(node, generate_constant_fields(editor, id));
+                field_map.insert(node, generate_constant_fields(editor, id, node));
                to_delete.push(node);
            }
            _ => {
@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID

 // Given a constant cnst adds node to the function which are the constant values of each field and
 // returns a list of pairs of indices and the node that holds that index
-fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> {
+fn generate_constant_fields(
+    editor: &mut FunctionEditor,
+    cnst: ConstantID,
+    old_node: NodeID,
+) -> IndexTree<NodeID> {
    let cs: Option<Vec<ConstantID>> =
        if let Some(cs) = editor.get_constant(cnst).try_product_fields() {
            Some(cs.into())
@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In
    if let Some(cs) = cs {
        let mut fields = vec![];
        for c in cs {
-            fields.push(generate_constant_fields(editor, c));
+            fields.push(generate_constant_fields(editor, c, old_node));
        }
        IndexTree::Node(fields)
    } else {
        let mut node = None;
        editor.edit(|mut edit| {
            node = Some(edit.add_node(Node::Constant { id: cnst }));
+            edit.sub_edit(old_node, node.unwrap());
            Ok(edit)
        });
        IndexTree::Leaf(node.expect("Add node cannot fail"))

--- a/juno_samples/cava/src/cava.jn
+++ b/juno_samples/cava/src/cava.jn
@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
  @image_loop for r = 0 to row {
    for c = 0 to col {
      @l2 let l2_dist : f32[num_ctrl_pts];
-      for cp = 0 to num_ctrl_pts {
+      @cp_loop for cp = 0 to num_ctrl_pts {
        let v1 = input[0, r, c] - ctrl_pts[cp, 0];
        let v2 = input[1, r, c] - ctrl_pts[cp, 1];
        let v3 = input[2, r, c] - ctrl_pts[cp, 2];
@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
     
      @channel_loop for chan = 0 to CHAN {
        let chan_val : f32 = 0.0;
-        for cp = 0 to num_ctrl_pts {
+        @cp_loop for cp = 0 to num_ctrl_pts {
          chan_val += l2_dist[cp] * weights[cp, chan];
        }


--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -115,7 +115,7 @@ array-slf(fuse4);
 simpl!(fuse4);
 let par = fuse4@image_loop \ fuse4@channel_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let fuse4_body = outline(split.cava_3.fj2);

--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -117,9 +117,9 @@ fixpoint {
 simpl!(fuse4);
 array-slf(fuse4);
 simpl!(fuse4);
-//fork-tile[2, 0, false, true](fuse4@channel_loop);
-//fork-split(fuse4@channel_loop);
-//clean-monoid-reduces(fuse4);
+fork-tile[2, 0, false, true](fuse4@channel_loop);
+let out = fork-split(fuse4@channel_loop);
+fork-unroll(out.cava_3.fj1);
 unforkify(fuse4@channel_loop);

 no-memset(fuse5@res1);
@@ -133,6 +133,13 @@ simpl!(fuse5);
 array-slf(fuse5);
 simpl!(fuse5);

+fork-tile[4, 1, false, true](fuse4);
+fork-tile[8, 0, false, true](fuse4);
+fork-interchange[1, 2](fuse4);
+let split = fork-split(fuse4);
+fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
+fork-coalesce(split.cava_3.fj2);
+
 delete-uncalled(*);
 simpl!(*);


--- a/juno_samples/cava/src/lib.rs
+++ b/juno_samples/cava/src/lib.rs
@@ -124,9 +124,9 @@ pub struct CavaInputs {
    #[clap(long = "output-verify", value_name = "PATH")]
    pub output_verify: Option<String>,
    pub cam_model: String,
-    #[clap(short, long)]
+    #[clap(long)]
    pub crop_rows: Option<usize>,
-    #[clap(short, long)]
+    #[clap(long)]
    pub crop_cols: Option<usize>,
 }


--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -26,14 +26,14 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
 fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
 simpl!(gaussian_smoothing, gaussian_smoothing_body);

-no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
+no-memset(laplacian_estimate@res);
 fixpoint {
  forkify(laplacian_estimate);
  fork-guard-elim(laplacian_estimate);
@@ -42,15 +42,15 @@ fixpoint {
 simpl!(laplacian_estimate);
 let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2;
+let body = split._1_laplacian_estimate.fj2;
 let laplacian_estimate_body = outline(body);
 fork-coalesce(laplacian_estimate, laplacian_estimate_body);
 simpl!(laplacian_estimate, laplacian_estimate_body);

-no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
+no-memset(zero_crossings@res);
 fixpoint {
  forkify(zero_crossings);
  fork-guard-elim(zero_crossings);
@@ -59,10 +59,10 @@ fixpoint {
 simpl!(zero_crossings);
 let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2;
+let body = split._2_zero_crossings.fj2;
 let zero_crossings_body = outline(body);
 fork-coalesce(zero_crossings, zero_crossings_body);
 simpl!(zero_crossings, zero_crossings_body);
@@ -86,7 +86,7 @@ fixpoint {
 simpl!(max_gradient);
 fork-dim-merge(max_gradient);
 simpl!(max_gradient);
-fork-tile[16, 0, false, false](max_gradient);
+fork-tile[32, 0, false, false](max_gradient);
 let split = fork-split(max_gradient);
 clean-monoid-reduces(max_gradient);
 let out = outline(split._4_max_gradient.fj1);
@@ -105,7 +105,7 @@ fixpoint {
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);
 fork-tile[4, 1, false, false](reject_zero_crossings);
-fork-tile[4, 0, false, false](reject_zero_crossings);
+fork-tile[8, 0, false, false](reject_zero_crossings);
 fork-interchange[1, 2](reject_zero_crossings);
 let split = fork-split(reject_zero_crossings);
 let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);

--- a/juno_samples/edge_detection/src/edge_detection.jn
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>(

  @image_loop for row = 0 to n {
    for col = 0 to m {
-      // Copy data for dilation filter
-      @shr1 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-	                       || row + i - r > n - 1
-			       || col + j < r
-			       || col + j - r > m - 1 then MIN_BR
-			    else input[row + i - r, col + j - r];
-        }
-      }
-
      // Compute pixel of dilated image
      let dilated = MIN_BR;
      @filter_loop for i = 0 to sz {
        for j = 0 to sz {
-          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
-        }
-      }
-
-      // Data copy for erotion filter
-      @shr2 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-	                       || row + i - r > n - 1
-			       || col + j < r
-			       || col + j - r > m - 1 then MAX_BR
-			    else input[row + i - r, col + j - r];
+	  let filter = if row + i < r
+                         || row + i - r > n - 1
+                         || col + j < r
+                         || col + j - r > m - 1 then MIN_BR
+                      else input[row + i - r, col + j - r];
+          dilated = max!(dilated, filter * structure[i, j]);
        }
      }

@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>(
      let eroded = MAX_BR;
      @filter_loop for i = 0 to sz {
        for j = 0 to sz {
-          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+	  let filter = if row + i < r
+                         || row + i - r > n - 1
+                         || col + j < r
+                         || col + j - r > m - 1 then MAX_BR
+                      else input[row + i - r, col + j - r];
+          eroded = min!(eroded, filter * structure[i, j]);
        }
      }

@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>(

  @image_loop for row = 0 to n {
    for col = 0 to m {
-      // Data copy for dilation filter
-      @shr1 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-                               || row + i - r > n - 1
-                               || col + j < r
-                               || col + j - r > m - 1 then MIN_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
-                            else MIN_BR;
-        }
-      }
-
      // Compute the pixel of dilated image
      let dilated = MIN_BR;
      @filter_loop for i = 0 to sz {
        for j = 0 to sz {
-          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
-        }
-      }
-
-      // Data copy for erotion filter
-      @shr2 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-                               || row + i - r > n - 1
-                               || col + j < r
-                               || col + j - r > m - 1 then MAX_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
-                            else MIN_BR;
+	  let filter = if row + i < r
+                          || row + i - r > n - 1
+                          || col + j < r
+                          || col + j - r > m - 1 then MIN_BR
+                       else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
+                       else MIN_BR;
+          dilated = max!(dilated, filter * structure[i, j]);
        }
      }

@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>(
      let eroded = MAX_BR;
      @filter_loop for i = 0 to sz {
        for j = 0 to sz {
-          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+	  let filter = if row + i < r
+                          || row + i - r > n - 1
+                          || col + j < r
+                          || col + j - r > m - 1 then MAX_BR
+                       else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
+                       else MIN_BR;
+          eroded = min!(eroded, filter * structure[i, j]);
        }
      }

@@ -166,7 +138,7 @@ fn gradient<n, m, sb: usize>(
      let gx = 0;
      let gy = 0;

-      for i = 0 to sb {
+      @filter_loop for i = 0 to sb {
        for j = 0 to sb {
          let val = input[if row + i < sbr              then 0
                          else if row + i - sbr > n - 1 then n - 1

--- a/juno_samples/edge_detection/src/gpu.sch
+++ b/juno_samples/edge_detection/src/gpu.sch
@@ -26,22 +26,49 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
+fork-dim-merge(gaussian_smoothing@filter_loop);
+unforkify(gaussian_smoothing@filter_loop);
+simpl!(gaussian_smoothing);

-no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
+fork-dim-merge(gaussian_smoothing);
+fork-tile[32, 0, false, true](gaussian_smoothing);
+simpl!(gaussian_smoothing);
+fork-split(gaussian_smoothing);
+simpl!(gaussian_smoothing);
+
+no-memset(laplacian_estimate@res);
 fixpoint {
  forkify(laplacian_estimate);
  fork-guard-elim(laplacian_estimate);
  fork-coalesce(laplacian_estimate);
 }
 simpl!(laplacian_estimate);
+fork-dim-merge(laplacian_estimate@filter_loop);
+unforkify(laplacian_estimate@filter_loop);
+simpl!(laplacian_estimate);

-no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
+fork-dim-merge(laplacian_estimate);
+fork-tile[32, 0, false, true](laplacian_estimate);
+simpl!(laplacian_estimate);
+fork-split(laplacian_estimate);
+simpl!(laplacian_estimate);
+
+no-memset(zero_crossings@res);
 fixpoint {
  forkify(zero_crossings);
  fork-guard-elim(zero_crossings);
  fork-coalesce(zero_crossings);
 }
 simpl!(zero_crossings);
+fork-dim-merge(zero_crossings@filter_loop);
+unforkify(zero_crossings@filter_loop);
+simpl!(zero_crossings);
+
+fork-dim-merge(zero_crossings);
+fork-tile[32, 0, false, true](zero_crossings);
+simpl!(zero_crossings);
+fork-split(zero_crossings);
+simpl!(zero_crossings);

 no-memset(gradient@res);
 fixpoint {
@@ -53,6 +80,15 @@ predication(gradient);
 simpl!(gradient);
 predication(gradient);
 simpl!(gradient);
+fork-dim-merge(gradient@filter_loop);
+unforkify(gradient@filter_loop);
+simpl!(gradient);
+
+fork-dim-merge(gradient);
+fork-tile[32, 0, false, true](gradient);
+simpl!(gradient);
+fork-split(gradient);
+simpl!(gradient);

 fixpoint {
  forkify(max_gradient);
@@ -88,6 +124,12 @@ fixpoint {
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);

+fork-dim-merge(reject_zero_crossings);
+fork-tile[32, 0, false, true](reject_zero_crossings);
+simpl!(reject_zero_crossings);
+fork-split(reject_zero_crossings);
+simpl!(reject_zero_crossings);
+
 async-call(edge_detection@le, edge_detection@zc);

 simpl!(*);

--- a/juno_samples/matmul/build.rs
+++ b/juno_samples/matmul/build.rs
 use juno_build::JunoCompiler;

 fn main() {
-    #[cfg(not(feature = "cuda"))]
-    {
-        JunoCompiler::new()
-            .file_in_src("matmul.jn")
-            .unwrap()
-            .schedule_in_src("cpu.sch")
-            .unwrap()
-            .build()
-            .unwrap();
-    }
-    #[cfg(feature = "cuda")]
-    {
-        JunoCompiler::new()
-            .file_in_src("matmul.jn")
-            .unwrap()
-            .schedule_in_src("gpu.sch")
-            .unwrap()
-            .build()
-            .unwrap();
-    }
+    JunoCompiler::new()
+        .file_in_src("matmul.jn")
+        .unwrap()
+        .schedule_in_src("matmul.sch")
+        .unwrap()
+        .build()
+        .unwrap();
 }
--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
-phi-elim(*);
-
-forkify(*);
-fork-guard-elim(*);
-dce(*);
-
-fixpoint {
-  reduce-slf(*);
-  slf(*);
-  infer-schedules(*);
-}
-fork-coalesce(*);
-infer-schedules(*);
-dce(*);
-rewrite(*);
-fixpoint {
-  simplify-cfg(*);
-  dce(*);
-}
-
-ip-sroa(*);
-sroa(*);
-dce(*);
-
-float-collections(*);
-gcm(*);
--- a/juno_samples/matmul/src/cpu.sch
+++ b/juno_samples/matmul/src/cpu.sch
@@ -41,21 +41,41 @@ macro unforkify!(X) {

 optimize!(*);
 forkify!(*);
-associative(matmul@outer);

-// Parallelize by computing output array as 16 chunks
-let par = matmul@outer \ matmul@inner;
-fork-tile![4](par);
-let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
-parallelize!(outer \ inner);
+if feature("cuda") {
+  fixpoint {
+    reduce-slf(*);
+    slf(*);
+    infer-schedules(*);
+  }
+  fork-coalesce(*);
+  infer-schedules(*);
+  dce(*);
+  rewrite(*);
+  fixpoint {
+    simplify-cfg(*);
+    dce(*);
+  }

-let body = outline(inner);
-cpu(body);
+  optimize!(*);
+  codegen-prep!(*);
+} else {
+  associative(matmul@outer);

-// Tile for cache, assuming 64B cache lines
-fork-tile![16](body);
-let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
+  // Parallelize by computing output array as 16 chunks
+  let par = matmul@outer \ matmul@inner;
+  fork-tile![4](par);
+  let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
+  parallelize!(outer \ inner);

-reduce-slf(inner);
-unforkify!(body);
-codegen-prep!(*);
+  let body = outline(inner);
+  cpu(body);
+
+  // Tile for cache, assuming 64B cache lines
+  fork-tile![16](body);
+  let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
+
+  reduce-slf(inner);
+  unforkify!(body);
+  codegen-prep!(*);
+}
--- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs
+++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
@@ -28,40 +28,55 @@ fn cfd_bench(c: &mut Criterion) {
        elements_surrounding_elements,
        normals,
    } = read_domain_geometry(data_file, block_size);
-    let mut variables = initialize_variables(nelr, ff_variable.as_slice());
-    let mut variables = HerculesMutBox::from(variables.as_mut_slice());
+    let mut variables = initialize_variables(nelr, &ff_variable);
+
+    let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+    let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+    let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+    let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+    let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+
    let areas = HerculesImmBox::from(areas.as_slice());
    let elements_surrounding_elements =
        HerculesImmBox::from(elements_surrounding_elements.as_slice());
-    let normals = HerculesImmBox::from(normals.as_slice());
-    let ff_variable = HerculesImmBox::from(ff_variable.as_slice());
-    let ff_fc_density_energy = vec![
-        ff_fc_density_energy.x,
-        ff_fc_density_energy.y,
-        ff_fc_density_energy.z,
-    ];
-    let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
-    let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
-    let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
-    let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
-    let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
-    let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
-    let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
+
+    let normals_x = HerculesImmBox::from(normals.x.as_slice());
+    let normals_y = HerculesImmBox::from(normals.y.as_slice());
+    let normals_z = HerculesImmBox::from(normals.z.as_slice());
+
    group.bench_function("cfd bench euler", |b| {
        b.iter(|| {
            async_std::task::block_on(async {
                r.run(
                    nelr as u64,
                    iterations as u64,
-                    variables.to(),
+                    v_density.to(),
+                    v_momentum_x.to(),
+                    v_momentum_y.to(),
+                    v_momentum_z.to(),
+                    v_energy.to(),
                    areas.to(),
                    elements_surrounding_elements.to(),
-                    normals.to(),
-                    ff_variable.to(),
-                    ff_fc_density_energy.to(),
-                    ff_fc_momentum_x.to(),
-                    ff_fc_momentum_y.to(),
-                    ff_fc_momentum_z.to(),
+                    normals_x.to(),
+                    normals_y.to(),
+                    normals_z.to(),
+                    ff_variable.density,
+                    ff_variable.momentum.x,
+                    ff_variable.momentum.y,
+                    ff_variable.momentum.z,
+                    ff_variable.energy,
+                    ff_fc_density_energy.x,
+                    ff_fc_density_energy.y,
+                    ff_fc_density_energy.z,
+                    ff_fc_momentum_x.x,
+                    ff_fc_momentum_x.y,
+                    ff_fc_momentum_x.z,
+                    ff_fc_momentum_y.x,
+                    ff_fc_momentum_y.y,
+                    ff_fc_momentum_y.z,
+                    ff_fc_momentum_z.x,
+                    ff_fc_momentum_z.y,
+                    ff_fc_momentum_z.z,
                )
                .await
            });
@@ -85,40 +100,55 @@ fn cfd_bench(c: &mut Criterion) {
        elements_surrounding_elements,
        normals,
    } = read_domain_geometry(data_file, block_size);
-    let mut variables = initialize_variables(nelr, ff_variable.as_slice());
-    let mut variables = HerculesMutBox::from(variables.as_mut_slice());
+    let mut variables = initialize_variables(nelr, &ff_variable);
+
+    let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+    let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+    let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+    let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+    let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+
    let areas = HerculesImmBox::from(areas.as_slice());
    let elements_surrounding_elements =
        HerculesImmBox::from(elements_surrounding_elements.as_slice());
-    let normals = HerculesImmBox::from(normals.as_slice());
-    let ff_variable = HerculesImmBox::from(ff_variable.as_slice());
-    let ff_fc_density_energy = vec![
-        ff_fc_density_energy.x,
-        ff_fc_density_energy.y,
-        ff_fc_density_energy.z,
-    ];
-    let ff_fc_density_energy = HerculesImmBox::from(ff_fc_density_energy.as_slice());
-    let ff_fc_momentum_x = vec![ff_fc_momentum_x.x, ff_fc_momentum_x.y, ff_fc_momentum_x.z];
-    let ff_fc_momentum_x = HerculesImmBox::from(ff_fc_momentum_x.as_slice());
-    let ff_fc_momentum_y = vec![ff_fc_momentum_y.x, ff_fc_momentum_y.y, ff_fc_momentum_y.z];
-    let ff_fc_momentum_y = HerculesImmBox::from(ff_fc_momentum_y.as_slice());
-    let ff_fc_momentum_z = vec![ff_fc_momentum_z.x, ff_fc_momentum_z.y, ff_fc_momentum_z.z];
-    let ff_fc_momentum_z = HerculesImmBox::from(ff_fc_momentum_z.as_slice());
+
+    let normals_x = HerculesImmBox::from(normals.x.as_slice());
+    let normals_y = HerculesImmBox::from(normals.y.as_slice());
+    let normals_z = HerculesImmBox::from(normals.z.as_slice());
+
    group.bench_function("cfd bench pre-euler", |b| {
        b.iter(|| {
            async_std::task::block_on(async {
                r.run(
                    nelr as u64,
                    iterations as u64,
-                    variables.to(),
+                    v_density.to(),
+                    v_momentum_x.to(),
+                    v_momentum_y.to(),
+                    v_momentum_z.to(),
+                    v_energy.to(),
                    areas.to(),
                    elements_surrounding_elements.to(),
-                    normals.to(),
-                    ff_variable.to(),
-                    ff_fc_density_energy.to(),
-                    ff_fc_momentum_x.to(),
-                    ff_fc_momentum_y.to(),
-                    ff_fc_momentum_z.to(),
+                    normals_x.to(),
+                    normals_y.to(),
+                    normals_z.to(),
+                    ff_variable.density,
+                    ff_variable.momentum.x,
+                    ff_variable.momentum.y,
+                    ff_variable.momentum.z,
+                    ff_variable.energy,
+                    ff_fc_density_energy.x,
+                    ff_fc_density_energy.y,
+                    ff_fc_density_energy.z,
+                    ff_fc_momentum_x.x,
+                    ff_fc_momentum_x.y,
+                    ff_fc_momentum_x.z,
+                    ff_fc_momentum_y.x,
+                    ff_fc_momentum_y.y,
+                    ff_fc_momentum_y.z,
+                    ff_fc_momentum_z.x,
+                    ff_fc_momentum_z.y,
+                    ff_fc_momentum_z.z,
                )
                .await
            });

--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
@@ -12,10 +12,11 @@ macro simpl!(X) {

 simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
 simpl!(*);
-ip-sroa[false](*);
-sroa[false](*);
+ip-sroa[true](*);
+sroa[true](*);
 predication(*);
 const-inline(*);
 simpl!(*);
@@ -24,8 +25,31 @@ fixpoint {
  fork-guard-elim(*);
 }
 simpl!(*);
-no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
-parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
+unforkify(compute_flux@inner_loop);
+
+fork-tile[32, 0, false, false](compute_step_factor);
+let split = fork-split(compute_step_factor);
+let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+fork-coalesce(compute_step_factor, compute_step_factor_body);
+simpl!(compute_step_factor, compute_step_factor_body);
+
+fork-tile[32, 0, false, false](compute_flux);
+let split = fork-split(compute_flux);
+let compute_flux_body = outline(split._6_compute_flux.fj1);
+fork-coalesce(compute_flux, compute_flux_body);
+simpl!(compute_flux, compute_flux_body);
+
+fork-tile[32, 0, false, false](time_step);
+let split = fork-split(time_step);
+let time_step_body = outline(split._7_time_step.fj1);
+fork-coalesce(time_step, time_step_body);
+simpl!(time_step, time_step_body);
+
+fork-tile[32, 0, false, false](copy_vars);
+let split = fork-split(copy_vars);
+let copy_vars_body = outline(split._8_copy_vars.fj1);
+fork-coalesce(copy_vars, copy_vars_body);
+simpl!(copy_vars, copy_vars_body);

-unforkify(*);
+unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body);
 gcm(*);
--- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
@@ -14,8 +14,8 @@ simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
 delete-uncalled(*);
 simpl!(*);
-ip-sroa[false](*);
-sroa[false](*);
+ip-sroa[true](*);
+sroa[true](*);
 predication(*);
 const-inline(*);
 simpl!(*);
@@ -24,7 +24,38 @@ fixpoint {
  fork-guard-elim(*);
 }
 simpl!(*);
+no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
+unforkify(compute_flux@inner_loop);

-unforkify(*);
+fork-tile[32, 0, false, false](compute_step_factor);
+let split = fork-split(compute_step_factor);
+let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+fork-coalesce(compute_step_factor, compute_step_factor_body);
+simpl!(compute_step_factor, compute_step_factor_body);

+fork-tile[32, 0, false, false](compute_flux_contributions);
+let split = fork-split(compute_flux_contributions);
+let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
+fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
+simpl!(compute_flux_contributions, compute_flux_contributions_body);
+
+fork-tile[32, 0, false, false](compute_flux);
+let split = fork-split(compute_flux);
+let compute_flux_body = outline(split._7_compute_flux.fj1);
+fork-coalesce(compute_flux, compute_flux_body);
+simpl!(compute_flux, compute_flux_body);
+
+fork-tile[32, 0, false, false](time_step);
+let split = fork-split(time_step);
+let time_step_body = outline(split._8_time_step.fj1);
+fork-coalesce(time_step, time_step_body);
+simpl!(time_step, time_step_body);
+
+fork-tile[32, 0, false, false](copy_vars);
+let split = fork-split(copy_vars);
+let copy_vars_body = outline(split._9_copy_vars.fj1);
+fork-coalesce(copy_vars, copy_vars_body);
+simpl!(copy_vars, copy_vars_body);
+
+unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body);
 gcm(*);
--- a/juno_samples/rodinia/cfd/src/gpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/gpu_euler.sch
-gvn(*);
-dce(*);
-phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  crc(X);
+  slf(X);
+  dce(X);
+  infer-schedules(X);
+}

-let auto = auto-outline(euler);
-gpu(auto.euler);
-
-inline(auto.euler);
-inline(auto.euler);
+simpl!(*);
+inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
+gpu(copy_vars, compute_step_factor, compute_flux, time_step);

-sroa[false](auto.euler);
-dce(*);
-float-collections(*);
-dce(*);
+simpl!(*);
+ip-sroa[true](*);
+sroa[true](*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
+unforkify(compute_flux@inner_loop);

-gcm(*);
+fork-tile[32, 0, false, true](compute_step_factor);
+fork-split(compute_step_factor);
+
+fork-tile[32, 0, false, true](compute_flux);
+fork-split(compute_flux);

+fork-tile[32, 0, false, true](time_step);
+fork-split(time_step);
+
+fork-tile[32, 0, false, true](copy_vars);
+fork-split(copy_vars);
+
+gcm(*);
No results found