From b1970233ee1f807a133e7badf7b7b3a4eda7f803 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 17:53:32 -0600
Subject: [PATCH 01/25] verify flag for cfd

---
 juno_samples/rodinia/cfd/src/lib.rs  | 67 +++++++++++++++-------------
 juno_samples/rodinia/cfd/src/main.rs |  2 +
 2 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs
index d61df4c5..a9800ed0 100644
--- a/juno_samples/rodinia/cfd/src/lib.rs
+++ b/juno_samples/rodinia/cfd/src/lib.rs
@@ -19,6 +19,8 @@ pub struct CFDInputs {
     pub block_size: usize,
     #[clap(short = None, long = Some("pre-euler"))]
     pub pre_euler: bool,
+    #[clap(short, long)]
+    pub verify: bool,
 }
 
 fn run_euler(
@@ -219,6 +221,7 @@ pub fn cfd_harness(args: CFDInputs) {
         iterations,
         block_size,
         pre_euler,
+        verify,
     } = args;
 
     let FarFieldConditions {
@@ -268,37 +271,39 @@ pub fn cfd_harness(args: CFDInputs) {
             &ff_fc_momentum_z,
         )
     };
-    let res_rust = if pre_euler {
-        rust_cfd::pre_euler(
-            nelr,
-            iterations,
-            variables,
-            areas.as_slice(),
-            elements_surrounding_elements.as_slice(),
-            &normals,
-            &ff_variable,
-            &ff_fc_density_energy,
-            &ff_fc_momentum_x,
-            &ff_fc_momentum_y,
-            &ff_fc_momentum_z,
-        )
-    } else {
-        rust_cfd::euler(
-            nelr,
-            iterations,
-            variables,
-            areas.as_slice(),
-            elements_surrounding_elements.as_slice(),
-            &normals,
-            &ff_variable,
-            &ff_fc_density_energy,
-            &ff_fc_momentum_x,
-            &ff_fc_momentum_y,
-            &ff_fc_momentum_z,
-        )
-    };
+    if verify {
+        let res_rust = if pre_euler {
+            rust_cfd::pre_euler(
+                nelr,
+                iterations,
+                variables,
+                areas.as_slice(),
+                elements_surrounding_elements.as_slice(),
+                &normals,
+                &ff_variable,
+                &ff_fc_density_energy,
+                &ff_fc_momentum_x,
+                &ff_fc_momentum_y,
+                &ff_fc_momentum_z,
+            )
+        } else {
+            rust_cfd::euler(
+                nelr,
+                iterations,
+                variables,
+                areas.as_slice(),
+                elements_surrounding_elements.as_slice(),
+                &normals,
+                &ff_variable,
+                &ff_fc_density_energy,
+                &ff_fc_momentum_x,
+                &ff_fc_momentum_y,
+                &ff_fc_momentum_z,
+            )
+        };
 
-    if !compare_floats(&res_juno, &res_rust) {
-        panic!("Mismatch in results");
+        if !compare_floats(&res_juno, &res_rust) {
+            panic!("Mismatch in results");
+        }
     }
 }
diff --git a/juno_samples/rodinia/cfd/src/main.rs b/juno_samples/rodinia/cfd/src/main.rs
index 277a3edb..2dd9a364 100644
--- a/juno_samples/rodinia/cfd/src/main.rs
+++ b/juno_samples/rodinia/cfd/src/main.rs
@@ -14,6 +14,7 @@ fn test_euler() {
         iterations: 1,
         block_size: 16,
         pre_euler: false,
+        verify: true,
     });
 }
 
@@ -24,5 +25,6 @@ fn test_pre_euler() {
         iterations: 1,
         block_size: 16,
         pre_euler: true,
+        verify: true,
     });
 }
-- 
GitLab


From 54438766872b92e3ae3e279ae0ab142d3efedafd Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 18:41:31 -0600
Subject: [PATCH 02/25] some backprop opt

---
 juno_samples/rodinia/backprop/src/backprop.jn | 31 +++++----
 juno_samples/rodinia/backprop/src/cpu.sch     |  4 +-
 juno_samples/rodinia/backprop/src/gpu.sch     | 63 +++++++++++++------
 3 files changed, 66 insertions(+), 32 deletions(-)

diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn
index 94c4334c..2ca57c9f 100644
--- a/juno_samples/rodinia/backprop/src/backprop.jn
+++ b/juno_samples/rodinia/backprop/src/backprop.jn
@@ -6,10 +6,9 @@ fn squash(x: f32) -> f32 {
 fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] {
   @res let result : f32[m + 1];
   result[0] = 1.0;
-
   @outer_loop for j in 1..=m {
-    let sum = 0.0;
-    @inner_loop for k in 0..=n {
+    let sum = weights[0, j] * vals[0];
+    @inner_loop for k in 1..=n {
       sum += weights[k, j] * vals[k];
     }
     result[j] = squash(sum);
@@ -19,13 +18,16 @@ fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f
 }
 
 fn output_error<n: usize>(target: f32[n + 1], actual: f32[n + 1]) -> f32, f32[n + 1] {
-  let errsum = 0.0;
-  let delta : f32[n + 1];
-
-  for j in 1..=n {
+  @loop1 @res let delta : f32[n + 1];
+  @loop1 delta[0] = 0.0;
+  @loop1 for j in 1..=n {
     let a = actual[j];
     let t = target[j];
     delta[j] = a * (1.0 - a) * (t - a);
+  }
+
+  let errsum = 0.0;
+  @loop2 for j in 1..=n {
     errsum += abs!(delta[j]);
   }
 
@@ -37,10 +39,9 @@ fn hidden_error<hidden_n, output_n: usize>(
   hidden_weights: f32[hidden_n + 1, output_n + 1],
   hidden_vals: f32[hidden_n + 1],
 ) -> f32, f32[hidden_n + 1] {
-  let errsum = 0.0;
-  let delta : f32[hidden_n + 1];
-
-  for j in 1..=hidden_n {
+  @loop1 @res let delta : f32[hidden_n + 1];
+  @loop1 delta[0] = 0.0;
+  @loop1 for j in 1..=hidden_n {
     let h = hidden_vals[j];
 
     let sum = 0.0;
@@ -49,6 +50,10 @@ fn hidden_error<hidden_n, output_n: usize>(
     }
 
     delta[j] = h * (1.0 - h) * sum;
+  }
+
+  let errsum = 0.0;
+  @loop2 for j in 1..=hidden_n {
     errsum += abs!(delta[j]);
   }
 
@@ -89,8 +94,8 @@ fn backprop<input_n, hidden_n, output_n: usize>(
   let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights);
   let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights);
 
-  let out_err, out_delta = output_error::<output_n>(target, output_vals);
-  let hid_err, hid_delta = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals);
+  @output_error let out_err, out_delta = output_error::<output_n>(target, output_vals);
+  @hidden_error let hid_err, hid_delta = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals);
 
   let hidden_weights, hidden_prev_weights
     = adjust_weights::<hidden_n, output_n>(out_delta, hidden_vals, hidden_weights, hidden_prev_weights);
diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index de34d660..661ec531 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -12,7 +12,7 @@ simpl!(*);
 inline(layer_forward);
 delete-uncalled(*);
 
-no-memset(layer_forward@res);
+no-memset(layer_forward@res, output_error@res, hidden_error@res);
 lift-dc-math(*);
 loop-bound-canon(*);
 simpl!(*);
@@ -25,6 +25,8 @@ fixpoint {
 }
 reduce-slf(*);
 simpl!(*);
+fork-interchange[0, 1](adjust_weights);
+simpl!(*);
 
 fork-split(*);
 unforkify(*);
diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch
index 2011860d..d0be79db 100644
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
@@ -1,24 +1,51 @@
-gvn(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  infer-schedules(X);
+}
+
+no-memset(layer_forward@res, output_error@res, hidden_error@res);
 phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
+let output_loop1 = outline(output_error@loop1);
+let output_loop2 = outline(output_error@loop2);
+let hidden_loop1 = outline(hidden_error@loop1);
+let hidden_loop2 = outline(hidden_error@loop2);
+simpl!(*);
+inline(layer_forward, backprop@output_error, backprop@hidden_error);
+delete-uncalled(*);
+gpu(layer_forward, output_loop1, output_loop2, hidden_loop1, hidden_loop2, adjust_weights);
+const-inline(*);
 
-let auto = auto-outline(backprop);
-gpu(auto.backprop);
+lift-dc-math(*);
+loop-bound-canon(*);
+simpl!(*);
+lift-dc-math(*);
+slf(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+  fork-coalesce(*);
+}
+reduce-slf(*);
+simpl!(*);
 
-inline(auto.backprop);
-inline(auto.backprop);
-delete-uncalled(*);
+fork-tile[16, 0, false, true](layer_forward@inner_loop);
+let out = fork-split(layer_forward@inner_loop);
+clean-monoid-reduces(layer_forward);
+simpl!(layer_forward);
+let fission = fork-fission[out._1_layer_forward.fj0](layer_forward);
+simpl!(layer_forward);
 
-sroa[true](*);
-dce(*);
-float-collections(*);
-reuse-products(*);
-dce(*);
+fork-dim-merge(adjust_weights);
+simpl!(adjust_weights);
+fork-extend[32](adjust_weights);
+fork-tile[32, 0, false, true](adjust_weights);
+fork-split(adjust_weights);
+simpl!(adjust_weights);
 
+xdot[true](*);
 gcm(*);
-
-- 
GitLab


From f092ae383b280a6df9778e62198c4341d2f1e8ad Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 19:15:45 -0600
Subject: [PATCH 03/25] more backprop opt

---
 hercules_opt/src/fork_transforms.rs       | 102 ++++++++++++++++++----
 hercules_opt/src/simplify_cfg.rs          |  15 +++-
 juno_samples/rodinia/backprop/src/gpu.sch |   7 +-
 3 files changed, 106 insertions(+), 18 deletions(-)

diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index 1c220b99..e1598463 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -319,12 +319,12 @@ pub fn fork_fission<'a>(
         .collect();
 
     let mut created_forks = Vec::new();
-    
-    // This does the reduction fission 
+
+    // This does the reduction fission
     for fork in forks {
         let join = fork_join_map[&fork.0];
 
-        // FIXME: Don't make multiple forks for reduces that are in cycles with each other. 
+        // FIXME: Don't make multiple forks for reduces that are in cycles with each other.
         let reduce_partition = default_reduce_partition(editor, fork.0, join);
 
         if !editor.func().labels[fork.0.idx()].contains(&fork_label) {
@@ -332,14 +332,19 @@ pub fn fork_fission<'a>(
         }
 
         if editor.is_mutable(fork.0) {
-            created_forks = fork_reduce_fission_helper(editor, fork_join_map, reduce_partition, nodes_in_fork_joins, fork.0);
+            created_forks = fork_reduce_fission_helper(
+                editor,
+                fork_join_map,
+                reduce_partition,
+                nodes_in_fork_joins,
+                fork.0,
+            );
             if created_forks.is_empty() {
                 continue;
             } else {
                 return created_forks;
             }
         }
-            
     }
 
     created_forks
@@ -503,13 +508,17 @@ pub fn fork_reduce_fission_helper<'a>(
 
     let mut new_forks = Vec::new();
 
-    let mut new_control_pred: NodeID = editor.get_uses(fork).filter(|n| editor.node(n).is_control()).next().unwrap();
+    let mut new_control_pred: NodeID = editor
+        .get_uses(fork)
+        .filter(|n| editor.node(n).is_control())
+        .next()
+        .unwrap();
 
     let mut new_fork = NodeID::new(0);
     let mut new_join = NodeID::new(0);
 
-    let subgraph = &nodes_in_fork_joins[&fork]; 
-    
+    let subgraph = &nodes_in_fork_joins[&fork];
+
     // Gets everything between fork & join that this reduce needs. (ALL CONTROL)
     editor.edit(|mut edit| {
         for reduce in reduce_partition {
@@ -522,7 +531,7 @@ pub fn fork_reduce_fission_helper<'a>(
             new_fork = mapping[&fork];
             new_forks.push(new_fork);
             new_join = mapping[&join];
-            
+
             // Atttach new_fork after control_pred
             let (old_control_pred, _) = edit.get_node(new_fork).try_fork().unwrap().clone();
             edit = edit.replace_all_uses_where(old_control_pred, new_control_pred, |usee| {
@@ -532,7 +541,7 @@ pub fn fork_reduce_fission_helper<'a>(
             // Replace uses of reduce
             edit = edit.replace_all_uses(reduce, mapping[&reduce])?;
             new_control_pred = new_join;
-        };
+        }
 
         // Replace original join w/ new final join
         edit = edit.replace_all_uses_where(join, new_join, |_| true)?;
@@ -1502,6 +1511,10 @@ fn fork_fusion(
  * element. This aides in parallelizing outer loops. Looks only at reduces with
  * the monoid reduce schedule, since that indicates a particular structure which
  * is annoying to check for again.
+ *
+ * Looks for would-be monoid reduces, if not for a gate on the reduction.
+ * Partially predicate the gated reduction to allow for a proper monoid
+ * reduction.
  */
 pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
     for id in editor.node_ids() {
@@ -1512,7 +1525,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
         let Some((_, init, reduct)) = nodes[id.idx()].try_reduce() else {
             continue;
         };
-        let out_uses: Vec<_> = editor.get_users(id).filter(|id| *id != reduct).collect();
+        let out_users: Vec<_> = editor.get_users(id).filter(|id| *id != reduct).collect();
 
         match nodes[reduct.idx()] {
             Node::Binary {
@@ -1532,7 +1545,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
                         left: init,
                         right: id,
                     });
-                    for u in out_uses {
+                    for u in out_users {
                         edit.sub_edit(u, final_op);
                     }
                     edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op)
@@ -1555,7 +1568,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
                         left: init,
                         right: id,
                     });
-                    for u in out_uses {
+                    for u in out_users {
                         edit.sub_edit(u, final_op);
                     }
                     edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op)
@@ -1574,7 +1587,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
                         intrinsic: Intrinsic::Max,
                         args: Box::new([init, id]),
                     });
-                    for u in out_uses {
+                    for u in out_users {
                         edit.sub_edit(u, final_op);
                     }
                     edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op)
@@ -1593,7 +1606,7 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
                         intrinsic: Intrinsic::Min,
                         args: Box::new([init, id]),
                     });
-                    for u in out_uses {
+                    for u in out_users {
                         edit.sub_edit(u, final_op);
                     }
                     edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op)
@@ -1602,6 +1615,65 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
             _ => {}
         }
     }
+
+    for id in editor.node_ids() {
+        if !editor.func().schedules[id.idx()].contains(&Schedule::MonoidReduce) {
+            continue;
+        }
+        let nodes = &editor.func().nodes;
+        let Some((control, init, reduct)) = nodes[id.idx()].try_reduce() else {
+            continue;
+        };
+        if let Node::Phi {
+            control: phi_control,
+            ref data,
+        } = nodes[reduct.idx()]
+            && data.len() == 2
+            && data.contains(&id)
+            && let other = *data
+                .into_iter()
+                .filter(|other| **other != id)
+                .next()
+                .unwrap()
+            && let Node::Binary {
+                op: BinaryOperator::Add,
+                left,
+                right,
+            } = nodes[other.idx()]
+            && ((left == id) ^ (right == id))
+        {
+            let gated_input = if left == id { right } else { left };
+            let data = data.clone();
+            editor.edit(|mut edit| {
+                let zero = edit.add_zero_constant(typing[id.idx()]);
+                let zero = edit.add_node(Node::Constant { id: zero });
+                let phi = edit.add_node(Node::Phi {
+                    control: phi_control,
+                    data: data
+                        .iter()
+                        .map(|phi_use| if *phi_use == id { zero } else { gated_input })
+                        .collect(),
+                });
+                let new_reduce_id = NodeID::new(edit.num_node_ids());
+                let new_reduct_id = NodeID::new(edit.num_node_ids() + 1);
+                let new_reduce = Node::Reduce {
+                    control,
+                    init,
+                    reduct: new_reduct_id,
+                };
+                let new_add = Node::Binary {
+                    op: BinaryOperator::Add,
+                    left: new_reduce_id,
+                    right: phi,
+                };
+                let new_reduce = edit.add_node(new_reduce);
+                edit.add_node(new_add);
+                edit = edit.replace_all_uses(id, new_reduce)?;
+                edit = edit.delete_node(id)?;
+                Ok(edit)
+            });
+        }
+    }
 }
 
 /*
diff --git a/hercules_opt/src/simplify_cfg.rs b/hercules_opt/src/simplify_cfg.rs
index cf39db2b..b13cf0c3 100644
--- a/hercules_opt/src/simplify_cfg.rs
+++ b/hercules_opt/src/simplify_cfg.rs
@@ -126,11 +126,24 @@ fn remove_useless_fork_joins(
 
     // Third, get rid of fork-joins.
     for (fork, join) in fork_join_map {
-        if editor.get_users(*fork).len() == 1 && editor.get_users(*join).len() == 1 {
+        if editor.get_users(*join).len() == 1 {
             let fork_use = get_uses(&editor.func().nodes[fork.idx()]).as_ref()[0];
             let join_use = get_uses(&editor.func().nodes[join.idx()]).as_ref()[0];
+            let tids: Vec<_> = editor
+                .get_users(*fork)
+                .filter(|id| editor.func().nodes[id.idx()].is_thread_id())
+                .collect();
 
             editor.edit(|mut edit| {
+                if !tids.is_empty() {
+                    let u64_ty = edit.add_type(Type::UnsignedInteger64);
+                    let zero = edit.add_zero_constant(u64_ty);
+                    let zero = edit.add_node(Node::Constant { id: zero });
+                    for tid in tids {
+                        edit = edit.replace_all_uses(tid, zero)?;
+                        edit = edit.delete_node(tid)?;
+                    }
+                }
                 edit = edit.replace_all_uses(*join, join_use)?;
                 edit = edit.replace_all_uses(*fork, fork_use)?;
                 edit = edit.delete_node(*fork)?;
diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch
index d0be79db..f8cc84a3 100644
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
@@ -33,7 +33,11 @@ fixpoint {
 reduce-slf(*);
 simpl!(*);
 
-fork-tile[16, 0, false, true](layer_forward@inner_loop);
+fork-extend[32](layer_forward@inner_loop);
+clean-monoid-reduces(layer_forward);
+simpl!(layer_forward);
+fork-tile[32, 0, false, true](layer_forward@inner_loop);
+clean-monoid-reduces(layer_forward);
 let out = fork-split(layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 simpl!(layer_forward);
@@ -47,5 +51,4 @@ fork-tile[32, 0, false, true](adjust_weights);
 fork-split(adjust_weights);
 simpl!(adjust_weights);
 
-xdot[true](*);
 gcm(*);
-- 
GitLab


From 3b1aa5d426c2c43032abef5a0505fc935da2eadc Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 21:19:31 -0600
Subject: [PATCH 04/25] parallelize main loop in bfs

---
 juno_samples/rodinia/bfs/src/cpu.sch | 13 +++++++++++--
 juno_samples/rodinia/bfs/src/lib.rs  |  1 +
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index ae67fdd9..f94c473f 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -10,13 +10,15 @@ macro simpl!(X) {
 
 phi-elim(bfs);
 no-memset(bfs@cost);
-outline(bfs@cost_init);
+let init = outline(bfs@cost_init);
 let loop1 = outline(bfs@loop1);
 let loop2 = outline(bfs@loop2);
+parallel-reduce(loop1);
 
 simpl!(*);
 predication(*);
 const-inline(*);
+loop-bound-canon(*);
 simpl!(*);
 fixpoint {
   forkify(*);
@@ -26,5 +28,12 @@ simpl!(*);
 predication(*);
 simpl!(*);
 
-unforkify(*);
+fork-tile[32, 0, false, false](loop1);
+let split = fork-split(loop1);
+let out = outline(split.bfs_1.fj1);
+unforkify(out);
+inline(bfs@loop1);
+delete-uncalled(*);
+
+unforkify(init);
 gcm(*);
diff --git a/juno_samples/rodinia/bfs/src/lib.rs b/juno_samples/rodinia/bfs/src/lib.rs
index 218e9bb0..f6403540 100644
--- a/juno_samples/rodinia/bfs/src/lib.rs
+++ b/juno_samples/rodinia/bfs/src/lib.rs
@@ -19,6 +19,7 @@ pub struct BFSInputs {
 fn run_bfs(nodes: &[Node], source: u32, edges: &[u32]) -> Vec<i32> {
     let n = nodes.len() as u64;
     let m = edges.len() as u64;
+    println!("Running with {} nodes and {} edges.", n, m);
 
     let nodes = HerculesImmBox::from(nodes);
     let edges = HerculesImmBox::from(edges);
-- 
GitLab


From 2598806c43fe6e518657b2c6bffb866695fdcab5 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 21:50:22 -0600
Subject: [PATCH 05/25] work on bfs

---
 juno_samples/rodinia/bfs/src/bfs.jn  | 11 ++++++-----
 juno_samples/rodinia/bfs/src/cpu.sch | 22 ++++++++++++----------
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn
index 2534a89c..3d0280f1 100644
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
@@ -44,11 +44,12 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
 
     @loop2 for i in 0..n {
       stop = stop && !updated[i];
-      if updated[i] {
-        mask[i] = true;
-        visited[i] = true;
-        updated[i] = false;
-      }
+    }
+
+    @loop3 for i in 0..n {
+      mask[i] = mask[i] || updated[i];
+      visited[i] = visited[i] || updated[i];
+      updated[i] = false;
     }
   }
 
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index f94c473f..a33e361d 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -11,9 +11,9 @@ macro simpl!(X) {
 phi-elim(bfs);
 no-memset(bfs@cost);
 let init = outline(bfs@cost_init);
-let loop1 = outline(bfs@loop1);
-let loop2 = outline(bfs@loop2);
-parallel-reduce(loop1);
+let traverse = outline(bfs@loop1);
+let collect = outline(bfs@loop2 | bfs@loop3);
+parallel-reduce(traverse);
 
 simpl!(*);
 predication(*);
@@ -27,13 +27,15 @@ fixpoint {
 simpl!(*);
 predication(*);
 simpl!(*);
+reduce-slf(*);
+simpl!(*);
 
-fork-tile[32, 0, false, false](loop1);
-let split = fork-split(loop1);
-let out = outline(split.bfs_1.fj1);
-unforkify(out);
-inline(bfs@loop1);
-delete-uncalled(*);
+fixpoint {
+  forkify(collect);
+  fork-guard-elim(collect);
+}
+fork-fusion(collect);
+simpl!(collect);
 
-unforkify(init);
+unforkify(init, traverse, collect);
 gcm(*);
-- 
GitLab


From 84d2978f414a3ba57f70bd359753a3184eae4f49 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 22:37:06 -0600
Subject: [PATCH 06/25] bfs w/ and reduction

---
 juno_samples/rodinia/bfs/src/gpu.sch | 50 +++++++++++++++++++++++-----
 1 file changed, 41 insertions(+), 9 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 6c4d027b..56489a23 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -10,14 +10,16 @@ macro simpl!(X) {
 
 phi-elim(bfs);
 no-memset(bfs@cost);
-let cost_init = outline(bfs@cost_init);
+let init = outline(bfs@cost_init);
 let loop1 = outline(bfs@loop1);
 let loop2 = outline(bfs@loop2);
-gpu(loop1, loop2);
+let loop3 = outline(bfs@loop3);
+parallel-reduce(loop1);
 
 simpl!(*);
 predication(*);
 const-inline(*);
+loop-bound-canon(*);
 simpl!(*);
 fixpoint {
   forkify(*);
@@ -26,14 +28,44 @@ fixpoint {
 simpl!(*);
 predication(*);
 simpl!(*);
-
-unforkify(cost_init);
-parallel-reduce(loop1);
-forkify(*);
-fork-guard-elim(*);
-simpl!(*);
-predication(*);
 reduce-slf(*);
 simpl!(*);
 
+fork-tile[32, 0, false, true](loop1);
+fork-split(loop1);
+gpu(loop1);
+
+fixpoint {
+  forkify(loop2, loop3);
+  fork-guard-elim(loop2, loop3);
+}
+
+simpl!(loop2, loop3);
+fork-tile[32, 0, false, true](loop2, loop3);
+let out = fork-split(loop2, loop3);
+clean-monoid-reduces(loop2, loop3);
+simpl!(loop2, loop3);
+gpu(loop3);
+
+let fission1 = fork-fission[out.bfs_2.fj0](loop2);
+simpl!(loop2);
+fork-tile[32, 0, false, true](fission1.bfs_2.fj_bottom);
+let out = fork-split(fission1.bfs_2.fj_bottom);
+clean-monoid-reduces(loop2);
+simpl!(loop2);
+let fission2 = fork-fission[out.bfs_2.fj0](loop2);
+simpl!(loop2);
+fork-tile[32, 0, false, true](fission2.bfs_2.fj_bottom);
+let out = fork-split(fission2.bfs_2.fj_bottom);
+clean-monoid-reduces(loop2);
+simpl!(loop2);
+let top = outline(fission1.bfs_2.fj_top);
+let middle = outline(fission2.bfs_2.fj_top);
+let bottom = outline(out.bfs_2.fj0);
+const-inline(loop2, top, middle, bottom);
+no-memset(top, middle);
+gpu(top, middle, bottom);
+simpl!(loop2, top, middle, bottom);
+
+unforkify(init);
 gcm(*);
-- 
GitLab


From 76a82eaf47b1dba9bbd74c2aa540b0b0bfa2b6dc Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 22:38:53 -0600
Subject: [PATCH 07/25] ignore too small test bfs for now...

---
 juno_samples/rodinia/bfs/src/main.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/juno_samples/rodinia/bfs/src/main.rs b/juno_samples/rodinia/bfs/src/main.rs
index 0ad23b00..b0a74bbd 100644
--- a/juno_samples/rodinia/bfs/src/main.rs
+++ b/juno_samples/rodinia/bfs/src/main.rs
@@ -8,6 +8,7 @@ fn main() {
 }
 
 #[test]
+#[ignore]
 fn bfs_test_4096() {
     bfs_harness(BFSInputs {
         input: "data/graph4096.txt".to_string(),
-- 
GitLab


From 375199f5369a1237f52edd82af8c0167219f2cce Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 3 Mar 2025 11:39:24 -0600
Subject: [PATCH 08/25] some tweaks

---
 hercules_opt/src/fork_transforms.rs |  6 ++++--
 hercules_opt/src/gcm.rs             | 14 +++++++++++---
 hercules_opt/src/utils.rs           | 18 ++++++++++++++++++
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index e1598463..6998f879 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1533,7 +1533,8 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
                 left: _,
                 right: _,
             } if (op == BinaryOperator::Add || op == BinaryOperator::Or)
-                && !is_zero(editor, init) =>
+                && !is_zero(editor, init)
+                && !is_false(editor, init) =>
             {
                 editor.edit(|mut edit| {
                     let zero = edit.add_zero_constant(typing[init.idx()]);
@@ -1556,7 +1557,8 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
                 left: _,
                 right: _,
             } if (op == BinaryOperator::Mul || op == BinaryOperator::And)
-                && !is_one(editor, init) =>
+                && !is_one(editor, init)
+                && !is_true(editor, init) =>
             {
                 editor.edit(|mut edit| {
                     let one = edit.add_one_constant(typing[init.idx()]);
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index d950941a..4a6365c8 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -212,7 +212,8 @@ fn preliminary_fixups(
             let (_, init, _) = nodes[reduce.idx()].try_reduce().unwrap();
 
             // Replace uses of the reduce in its cycle with the init.
-            let success = editor.edit(|edit| {
+            let success = editor.edit(|mut edit| {
+                edit = edit.add_schedule(init, Schedule::ParallelReduce)?;
                 edit.replace_all_uses_where(reduce, init, |id| reduce_cycles[&reduce].contains(id))
             });
             assert!(success);
@@ -870,7 +871,7 @@ fn spill_clones(
     // Step 2: filter edges (A, B) to just see edges where A uses B and A
     // mutates B. These are the edges that may require a spill.
     let mut spill_edges = edges.into_iter().filter(|(a, b)| {
-        mutating_writes(editor.func(), *a, objects).any(|id| id == *b)
+        (mutating_writes(editor.func(), *a, objects).any(|id| id == *b)
             || (get_uses(&editor.func().nodes[a.idx()])
                 .as_ref()
                 .into_iter()
@@ -890,7 +891,14 @@ fn spill_clones(
                         data.contains(b)
                             && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce)
                     })
-                    .unwrap_or(false))
+                    .unwrap_or(false)))
+            && !editor.func().nodes[a.idx()]
+                .try_write()
+                .map(|(collect, _, _)| {
+                    collect == *b
+                        && editor.func().schedules[b.idx()].contains(&Schedule::ParallelReduce)
+                })
+                .unwrap_or(false)
     });
 
     // Step 3: if there is a spill edge, spill it and return true. Otherwise,
diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs
index b910a128..351abc2b 100644
--- a/hercules_opt/src/utils.rs
+++ b/hercules_opt/src/utils.rs
@@ -598,6 +598,24 @@ pub fn is_one(editor: &FunctionEditor, id: NodeID) -> bool {
         || nodes[id.idx()].is_undef()
 }
 
+pub fn is_false(editor: &FunctionEditor, id: NodeID) -> bool {
+    let nodes = &editor.func().nodes;
+    nodes[id.idx()]
+        .try_constant()
+        .map(|id| editor.get_constant(id).is_false())
+        .unwrap_or(false)
+        || nodes[id.idx()].is_undef()
+}
+
+pub fn is_true(editor: &FunctionEditor, id: NodeID) -> bool {
+    let nodes = &editor.func().nodes;
+    nodes[id.idx()]
+        .try_constant()
+        .map(|id| editor.get_constant(id).is_true())
+        .unwrap_or(false)
+        || nodes[id.idx()].is_undef()
+}
+
 pub fn is_largest(editor: &FunctionEditor, id: NodeID) -> bool {
     let nodes = &editor.func().nodes;
     nodes[id.idx()]
-- 
GitLab


From 4cd1d6108d7c37701d0136818cc643304837aa5f Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 3 Mar 2025 12:28:33 -0600
Subject: [PATCH 09/25] opt bfs

---
 juno_samples/rodinia/bfs/src/bfs.jn  | 24 +++++++++------
 juno_samples/rodinia/bfs/src/cpu.sch |  6 ++--
 juno_samples/rodinia/bfs/src/gpu.sch | 46 ++++++----------------------
 3 files changed, 28 insertions(+), 48 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn
index 3d0280f1..f82d9d80 100644
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
@@ -1,4 +1,11 @@
 type Node = struct { edge_start: u32; num_edges: u32; };
+type StopProd = struct { stop: bool; };
+
+fn make_stop_prod() -> StopProd {
+  let ret : StopProd;
+  ret.stop = true;
+  return ret;
+}
 
 #[entry]
 fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] {
@@ -23,8 +30,6 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
   let updated: bool[n];
 
   while !stop {
-    stop = true;
-
     @loop1 for i in 0..n {
       if mask[i] {
         mask[i] = false;
@@ -42,15 +47,16 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
       }
     }
 
+    @make let stop_prod = make_stop_prod();
     @loop2 for i in 0..n {
-      stop = stop && !updated[i];
-    }
-
-    @loop3 for i in 0..n {
-      mask[i] = mask[i] || updated[i];
-      visited[i] = visited[i] || updated[i];
-      updated[i] = false;
+      if updated[i] {
+        mask[i] = true;
+        visited[i] = true;
+        updated[i] = false;
+	stop_prod.stop = updated[i];
+      }
     }
+    stop = stop_prod.stop;
   }
 
   return cost;
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index a33e361d..589b93b1 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -12,8 +12,7 @@ phi-elim(bfs);
 no-memset(bfs@cost);
 let init = outline(bfs@cost_init);
 let traverse = outline(bfs@loop1);
-let collect = outline(bfs@loop2 | bfs@loop3);
-parallel-reduce(traverse);
+let collect = outline(bfs@loop2);
 
 simpl!(*);
 predication(*);
@@ -29,12 +28,13 @@ predication(*);
 simpl!(*);
 reduce-slf(*);
 simpl!(*);
+slf(*);
+simpl!(*);
 
 fixpoint {
   forkify(collect);
   fork-guard-elim(collect);
 }
-fork-fusion(collect);
 simpl!(collect);
 
 unforkify(init, traverse, collect);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 56489a23..d5c8dee6 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -11,10 +11,11 @@ macro simpl!(X) {
 phi-elim(bfs);
 no-memset(bfs@cost);
 let init = outline(bfs@cost_init);
-let loop1 = outline(bfs@loop1);
-let loop2 = outline(bfs@loop2);
-let loop3 = outline(bfs@loop3);
-parallel-reduce(loop1);
+let traverse = outline(bfs@loop1);
+let collect = outline(bfs@loop2);
+parallel-reduce(traverse, collect);
+no-memset(make_stop_prod);
+gpu(traverse, make_stop_prod, collect);
 
 simpl!(*);
 predication(*);
@@ -31,41 +32,14 @@ simpl!(*);
 reduce-slf(*);
 simpl!(*);
 
-fork-tile[32, 0, false, true](loop1);
-fork-split(loop1);
-gpu(loop1);
-
 fixpoint {
-  forkify(loop2, loop3);
-  fork-guard-elim(loop2, loop3);
+  forkify(collect);
+  fork-guard-elim(collect);
 }
+simpl!(collect);
 
-simpl!(loop2, loop3);
-fork-tile[32, 0, false, true](loop2, loop3);
-let out = fork-split(loop2, loop3);
-clean-monoid-reduces(loop2, loop3);
-simpl!(loop2, loop3);
-gpu(loop3);
-
-let fission1 = fork-fission[out.bfs_2.fj0](loop2);
-simpl!(loop2);
-fork-tile[32, 0, false, true](fission1.bfs_2.fj_bottom);
-let out = fork-split(fission1.bfs_2.fj_bottom);
-clean-monoid-reduces(loop2);
-simpl!(loop2);
-let fission2 = fork-fission[out.bfs_2.fj0](loop2);
-simpl!(loop2);
-fork-tile[32, 0, false, true](fission2.bfs_2.fj_bottom);
-let out = fork-split(fission2.bfs_2.fj_bottom);
-clean-monoid-reduces(loop2);
-simpl!(loop2);
-let top = outline(fission1.bfs_2.fj_top);
-let middle = outline(fission2.bfs_2.fj_top);
-let bottom = outline(out.bfs_2.fj0);
-const-inline(loop2, top, middle, bottom);
-no-memset(top, middle);
-gpu(top, middle, bottom);
-simpl!(loop2, top, middle, bottom);
+fork-tile[32, 0, false, true](traverse, collect);
+fork-split(traverse, collect);
 
 unforkify(init);
 gcm(*);
-- 
GitLab


From 7b6ba729fdeaf8e76a2bb2aa2631f931a4525070 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 3 Mar 2025 13:25:13 -0600
Subject: [PATCH 10/25] more bfs opt

---
 hercules_cg/src/gpu.rs               | 18 +++++++++++-------
 juno_samples/rodinia/bfs/src/gpu.sch |  2 +-
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 07dd3ebf..3a00e547 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -354,6 +354,7 @@ impl GPUContext<'_> {
         write!(
             w,
             "
+#define _CG_ABI_EXPERIMENTAL
 #include <assert.h>
 #include <stdio.h>
 #include <stddef.h>
@@ -1294,7 +1295,7 @@ namespace cg = cooperative_groups;
                 }
                 if !is_primitive && state != KernelState::OutBlock {
                     write!(w, "{}}}\n", tabs)?;
-                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
                     *num_tabs -= 1;
                 }
                 if !is_primitive && state == KernelState::OutBlock {
@@ -1311,6 +1312,7 @@ namespace cg = cooperative_groups;
                 }
                 if !is_primitive
                     && (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false))
+                    && !self.function.schedules[id.idx()].contains(&Schedule::NoResetConstant)
                 {
                     let data_size = self.get_size(self.typing[id.idx()], None);
                     write!(
@@ -1320,7 +1322,8 @@ namespace cg = cooperative_groups;
                     )?;
                     write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
                     write!(w, "{}}}\n", tabs)?;
-                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    write!(w, "__syncthreads\n")?;
                 }
             }
             // Dynamic constants emitted at top
@@ -1595,7 +1598,7 @@ namespace cg = cooperative_groups;
                     write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?;
                     write!(w, "{}}}\n", tabs)?;
                 }
-                write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
                 let collect_variable = self.get_value(*collect, false, false);
                 write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?;
             }
@@ -1705,20 +1708,20 @@ namespace cg = cooperative_groups;
                     };
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
                         use_thread_per_id, cg_tile, use_thread_per_id
                     )?;
                     let cg_tile_use = self.get_cg_tile(id, CGType::Use);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
                         use_thread_quota, cg_tile_use, use_thread_quota
                     )?;
                     let available_thread_quota = available_thread_quota.unwrap();
                     let cg_tile_available = self.get_cg_tile(id, CGType::Available);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
                     if parallel_factor.is_none() {
@@ -1780,7 +1783,8 @@ namespace cg = cooperative_groups;
                     }
                     let fork = self.join_fork_map.get(&id).unwrap();
                     let cg_tile_available = self.get_cg_tile(*fork, CGType::Available);
-                    write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
+                    //write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
+                    write!(w_term, "\t__syncthreads;\n")?;
                 }
                 // If the Fork was parallelized, each thread or UsedPerId tile of
                 // threads only runs one ThreadID, so we can jump straight to the
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index d5c8dee6..4e5c1f74 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -38,7 +38,7 @@ fixpoint {
 }
 simpl!(collect);
 
-fork-tile[32, 0, false, true](traverse, collect);
+fork-tile[1024, 0, false, true](traverse, collect);
 fork-split(traverse, collect);
 
 unforkify(init);
-- 
GitLab


From d303b1850dfd142871ee9cb819f0edff33334d6a Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Mon, 3 Mar 2025 13:35:02 -0600
Subject: [PATCH 11/25] fix syncthreads hack

---
 hercules_cg/src/gpu.rs | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 3a00e547..1e6067a3 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -562,8 +562,9 @@ namespace cg = cooperative_groups;
      * and writes.
      */
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
+        write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?;
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
-        write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?;
+        write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?;
         Ok(())
     }
 
@@ -1322,8 +1323,8 @@ namespace cg = cooperative_groups;
                     )?;
                     write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
                     write!(w, "{}}}\n", tabs)?;
-                    //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
-                    write!(w, "__syncthreads\n")?;
+                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    //write!(w, "__syncthreads\n")?;
                 }
             }
             // Dynamic constants emitted at top
@@ -1783,8 +1784,8 @@ namespace cg = cooperative_groups;
                     }
                     let fork = self.join_fork_map.get(&id).unwrap();
                     let cg_tile_available = self.get_cg_tile(*fork, CGType::Available);
-                    //write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
-                    write!(w_term, "\t__syncthreads;\n")?;
+                    write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
+                    //write!(w_term, "\t__syncthreads;\n")?;
                 }
                 // If the Fork was parallelized, each thread or UsedPerId tile of
                 // threads only runs one ThreadID, so we can jump straight to the
-- 
GitLab


From 416ed7b1e6595eb1dd2313e4d46b7d38b4b6ff68 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 3 Mar 2025 15:55:56 -0600
Subject: [PATCH 12/25] Parallelize both main loops in srad

---
 juno_samples/rodinia/srad/src/cpu.sch | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index 7b7a6c9e..8917f03d 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -40,10 +40,15 @@ let split = fork-split(loop2);
 let loop2_body = outline(split.srad_1.fj1);
 simpl!(loop2, loop2_body);
 
-inline(srad@loop2);
+fork-tile[32, 0, false, false](loop3);
+let split = fork-split(loop3);
+let loop3_body = outline(split.srad_2.fj1);
+simpl!(loop3, loop3_body);
+
+inline(srad@loop2, srad@loop3);
 delete-uncalled(*);
 
-fork-split(extract, compress, loop1, loop2_body, loop3);
-unforkify(extract, compress, loop1, loop2_body, loop3);
+fork-split(extract, compress, loop1, loop2_body, loop3_body);
+unforkify(extract, compress, loop1, loop2_body, loop3_body);
 
 gcm(*);
-- 
GitLab


From b156fad632e6d0a7746b81c8bcf9e9dda0ef9977 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 3 Mar 2025 15:56:23 -0600
Subject: [PATCH 13/25] Parallelize backprop

---
 juno_samples/rodinia/backprop/src/backprop.jn |  4 +--
 juno_samples/rodinia/backprop/src/cpu.sch     | 27 +++++++++++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn
index 2ca57c9f..70894c17 100644
--- a/juno_samples/rodinia/backprop/src/backprop.jn
+++ b/juno_samples/rodinia/backprop/src/backprop.jn
@@ -69,8 +69,8 @@ fn adjust_weights<n, m: usize>(
   weights: f32[n + 1, m + 1],
   prev_weights: f32[n + 1, m + 1]
 ) -> f32[n + 1, m + 1], f32[n + 1, m + 1] {
-  for j in 1..=m {
-    for k in 0..=n {
+  @outer_loop for j in 1..=m {
+    @inner_loop for k in 0..=n {
       let new_dw = ETA * delta[j] * vals[k] + MOMENTUM * prev_weights[k, j];
       weights[k, j] += new_dw;
       prev_weights[k, j] = new_dw;
diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index 661ec531..865cc1a2 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -28,6 +28,29 @@ simpl!(*);
 fork-interchange[0, 1](adjust_weights);
 simpl!(*);
 
-fork-split(*);
-unforkify(*);
+infer-schedules(*);
+
+fork-tile[32, 0, false, true](layer_forward@outer_loop \ layer_forward@inner_loop);
+let (forward_outer, forward_inner) = fork-reshape[[1], [0]](layer_forward@outer_loop \ layer_forward@inner_loop);
+
+fork-tile[32, 0, false, true](adjust_weights);
+let (adjust_outer, adjust_inner) = fork-reshape[[1], [0, 2]](adjust_weights);
+
+let forward_body = outline(forward_inner);
+let adjust_body = outline(adjust_inner);
+
+rename["output_error"](output_error);
+rename["hidden_error"](hidden_error);
+
+let output_error_body = auto-outline(output_error).output_error;
+let hidden_error_body = auto-outline(hidden_error).hidden_error;
+
+inline(backprop);
+delete-uncalled(*);
+const-inline(*);
+
+simpl!(*);
+fork-split(forward_body, adjust_body, output_error_body, hidden_error_body);
+unforkify(forward_body, adjust_body, output_error_body, hidden_error_body);
+
 gcm(*);
-- 
GitLab


From 17d338421b68e66c3c6e784a6a2e3ea901373383 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 3 Mar 2025 16:11:19 -0600
Subject: [PATCH 14/25] Unparallelize backprop

---
 juno_samples/rodinia/backprop/src/cpu.sch | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index 865cc1a2..6899523e 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -30,27 +30,11 @@ simpl!(*);
 
 infer-schedules(*);
 
-fork-tile[32, 0, false, true](layer_forward@outer_loop \ layer_forward@inner_loop);
-let (forward_outer, forward_inner) = fork-reshape[[1], [0]](layer_forward@outer_loop \ layer_forward@inner_loop);
-
-fork-tile[32, 0, false, true](adjust_weights);
-let (adjust_outer, adjust_inner) = fork-reshape[[1], [0, 2]](adjust_weights);
-
-let forward_body = outline(forward_inner);
-let adjust_body = outline(adjust_inner);
-
-rename["output_error"](output_error);
-rename["hidden_error"](hidden_error);
-
-let output_error_body = auto-outline(output_error).output_error;
-let hidden_error_body = auto-outline(hidden_error).hidden_error;
-
-inline(backprop);
 delete-uncalled(*);
 const-inline(*);
 
 simpl!(*);
-fork-split(forward_body, adjust_body, output_error_body, hidden_error_body);
-unforkify(forward_body, adjust_body, output_error_body, hidden_error_body);
+fork-split(*);
+unforkify(*);
 
 gcm(*);
-- 
GitLab


From be6c3a90a2d3f35f251915e216c81bb9a85f2449 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 3 Mar 2025 16:36:31 -0600
Subject: [PATCH 15/25] New backprop schedule

---
 juno_samples/rodinia/backprop/src/backprop.jn |  8 ++++----
 juno_samples/rodinia/backprop/src/cpu.sch     | 15 ++++++++++++++-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn
index 70894c17..7851cf47 100644
--- a/juno_samples/rodinia/backprop/src/backprop.jn
+++ b/juno_samples/rodinia/backprop/src/backprop.jn
@@ -91,15 +91,15 @@ fn backprop<input_n, hidden_n, output_n: usize>(
 ) -> f32, f32,
      f32[input_n + 1, hidden_n + 1], f32[input_n + 1, hidden_n + 1],
      f32[hidden_n + 1, output_n + 1], f32[hidden_n + 1, output_n + 1] {
-  let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights);
-  let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights);
+  @forward_input let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights);
+  @forward_hidden let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights);
 
   @output_error let out_err, out_delta = output_error::<output_n>(target, output_vals);
   @hidden_error let hid_err, hid_delta = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals);
 
-  let hidden_weights, hidden_prev_weights
+  @adjust_hidden let hidden_weights, hidden_prev_weights
     = adjust_weights::<hidden_n, output_n>(out_delta, hidden_vals, hidden_weights, hidden_prev_weights);
-  let input_weights, input_prev_weights
+  @adjust_input let input_weights, input_prev_weights
     = adjust_weights::<input_n, hidden_n>(hid_delta, input_vals, input_weights, input_prev_weights);
 
   return out_err, hid_err, input_weights, input_prev_weights, hidden_weights, hidden_prev_weights;
diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index 6899523e..9a5f4d75 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -30,11 +30,24 @@ simpl!(*);
 
 infer-schedules(*);
 
+// The first call to layer_forward can be parallelized by 16 (the size of the
+// hidden layer) and the second can't be parallelized at all (the size of the
+// output layer is 1)
+inline(backprop@forward_input, backprop@forward_hidden);
+let forward_input = outline(backprop@forward_input);
+let forward_hidden = outline(backprop@forward_hidden);
+
+fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop);
+let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop);
+let forward_input = outline(inner);
+inline(backprop@forward_input);
+
 delete-uncalled(*);
 const-inline(*);
 
 simpl!(*);
 fork-split(*);
-unforkify(*);
+unforkify(output_error, hidden_error, adjust_weights, forward_hidden, forward_input);
+simpl!(*);
 
 gcm(*);
-- 
GitLab


From 634e17e9bf1aa169b384c06d0f7fb3591a0afa10 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 3 Mar 2025 16:39:40 -0600
Subject: [PATCH 16/25] More parallelism for backprop

---
 juno_samples/rodinia/backprop/src/cpu.sch | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index 9a5f4d75..3c7f7d5f 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -42,12 +42,25 @@ let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_i
 let forward_input = outline(inner);
 inline(backprop@forward_input);
 
+// The first call to adjust_weights has total loop dimensions of 1 * 17, so not
+// worth parallelizing (given that the body is trivial)
+// The second call to adjust_weights has a total dimension of 16 * (input + 1)
+// which is worth parallelizing, we'll do it by 16
+inline(backprop@adjust_hidden, backprop@adjust_input);
+let adjust_hidden = outline(backprop@adjust_hidden);
+let adjust_input = outline(backprop@adjust_input);
+
+fork-tile[16, 0, false, true](adjust_input);
+let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input);
+let adjust_input = outline(inner);
+inline(backprop@adjust_input);
+
 delete-uncalled(*);
 const-inline(*);
 
 simpl!(*);
 fork-split(*);
-unforkify(output_error, hidden_error, adjust_weights, forward_hidden, forward_input);
+unforkify(output_error, hidden_error, adjust_hidden, adjust_input, forward_hidden, forward_input);
 simpl!(*);
 
 gcm(*);
-- 
GitLab


From 53b31022651e397dbae56491c5b9447399b9af42 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 3 Mar 2025 16:41:00 -0600
Subject: [PATCH 17/25] Broken bfs schedule

---
 juno_samples/rodinia/bfs/src/bfs.jn  |  2 +-
 juno_samples/rodinia/bfs/src/cpu.sch | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn
index f82d9d80..d6ec25f2 100644
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
@@ -53,7 +53,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
         mask[i] = true;
         visited[i] = true;
         updated[i] = false;
-	stop_prod.stop = updated[i];
+        stop_prod.stop = updated[i];
       }
     }
     stop = stop_prod.stop;
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index 589b93b1..55d25095 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -37,5 +37,21 @@ fixpoint {
 }
 simpl!(collect);
 
-unforkify(init, traverse, collect);
+parallel-fork(traverse, collect);
+parallel-reduce(traverse, collect);
+
+fork-tile[32, 0, false, true](init, traverse, collect);
+let (outer, inner) = fork-reshape[[1], [0]](init);
+let init_body = outline(inner);
+let (outer, inner) = fork-reshape[[1], [0]](traverse);
+let traverse_body = outline(inner);
+let (outer, inner) = fork-reshape[[1], [0]](collect);
+let collect_body = outline(inner);
+
+inline(bfs);
+delete-uncalled(*);
+const-inline(*);
+simpl!(*);
+
+unforkify(init_body, traverse_body, collect_body);
 gcm(*);
-- 
GitLab


From bcd279ad741b348012a3fa7461957411d8b52e43 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 3 Mar 2025 17:19:36 -0600
Subject: [PATCH 18/25] Fix bfs

---
 juno_samples/rodinia/bfs/src/cpu.sch | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index 55d25095..e7d88a29 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -48,10 +48,8 @@ let traverse_body = outline(inner);
 let (outer, inner) = fork-reshape[[1], [0]](collect);
 let collect_body = outline(inner);
 
-inline(bfs);
-delete-uncalled(*);
 const-inline(*);
 simpl!(*);
 
 unforkify(init_body, traverse_body, collect_body);
-gcm(*);
+gcm(*);
\ No newline at end of file
-- 
GitLab


From 085d0a3a11b1ea232af70b03c58f17c0e28cb42e Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 3 Mar 2025 17:48:16 -0600
Subject: [PATCH 19/25] Parallel BFS schedule

---
 juno_samples/rodinia/bfs/src/cpu.sch | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index e7d88a29..648e2be9 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -48,8 +48,11 @@ let traverse_body = outline(inner);
 let (outer, inner) = fork-reshape[[1], [0]](collect);
 let collect_body = outline(inner);
 
+inline(bfs@cost_init, bfs@loop1, bfs@loop2);
+delete-uncalled(*);
 const-inline(*);
-simpl!(*);
 
 unforkify(init_body, traverse_body, collect_body);
-gcm(*);
\ No newline at end of file
+simpl!(*);
+gcm(*);
+xdot[true](bfs);
-- 
GitLab


From 31523c2ea072b90a56b521ac33455a0ec0f34ed0 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 3 Mar 2025 17:58:19 -0600
Subject: [PATCH 20/25] Parallel bfs

---
 juno_samples/rodinia/bfs/src/cpu.sch | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index 648e2be9..e5100561 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -40,19 +40,23 @@ simpl!(collect);
 parallel-fork(traverse, collect);
 parallel-reduce(traverse, collect);
 
-fork-tile[32, 0, false, true](init, traverse, collect);
-let (outer, inner) = fork-reshape[[1], [0]](init);
-let init_body = outline(inner);
+fork-tile[32, 0, false, true](traverse, collect);
 let (outer, inner) = fork-reshape[[1], [0]](traverse);
 let traverse_body = outline(inner);
 let (outer, inner) = fork-reshape[[1], [0]](collect);
 let collect_body = outline(inner);
 
-inline(bfs@cost_init, bfs@loop1, bfs@loop2);
+let init_body = init;
+// Following code seems to generate breaking RT code
+//fork-tile[32, 0, false, true](init);
+//let (outer, inner) = fork-reshape[[1], [0]](init);
+//let init_body = outline(inner);
+//inline(bfs@cost_init);
+
+inline(bfs@loop1, bfs@loop2);
 delete-uncalled(*);
 const-inline(*);
 
 unforkify(init_body, traverse_body, collect_body);
 simpl!(*);
 gcm(*);
-xdot[true](bfs);
-- 
GitLab


From f5fe6b01eec57eeb770cfa273aebcf49d7cd0166 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 3 Mar 2025 22:51:55 -0600
Subject: [PATCH 21/25] Larger rodinia benches

---
 .../rodinia/backprop/benches/backprop_bench.rs      |  4 ++--
 juno_samples/rodinia/bfs/benches/bfs_bench.rs       | 13 +++++++++++++
 juno_samples/rodinia/cfd/benches/cfd_bench.rs       | 13 +++++++------
 juno_samples/rodinia/srad/benches/srad_bench.rs     |  9 +++++----
 4 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
index 17bdf6a7..98d582b5 100644
--- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs
+++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
@@ -18,7 +18,7 @@ fn backprop_bench(c: &mut Criterion) {
 
     let mut rng = StdRng::seed_from_u64(7);
 
-    let input_n = 65536;
+    let input_n = 33554432;
     let hidden_n = 16;
     let output_n = 1;
 
@@ -46,7 +46,7 @@ fn backprop_bench(c: &mut Criterion) {
     let mut input_prev_weights = HerculesMutBox::from(input_prev_weights.to_vec());
     let mut hidden_prev_weights = HerculesMutBox::from(hidden_prev_weights.to_vec());
 
-    group.bench_function("backprop bench", |b| {
+    group.bench_function("backprop bench large", |b| {
         b.iter(|| {
             async_std::task::block_on(async {
                 r.run(
diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
index bf39a0fc..dbe6db38 100644
--- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs
+++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
@@ -9,6 +9,7 @@ use juno_bfs::graph_parser::*;
 
 fn bfs_bench(c: &mut Criterion) {
     let mut group = c.benchmark_group("bfs bench");
+    group.sample_size(10);
 
     let mut r = runner!(bfs);
 
@@ -35,6 +36,18 @@ fn bfs_bench(c: &mut Criterion) {
             async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
         })
     });
+
+    let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt";
+    let (nodes, source, edges) = parse_graph(input.into());
+    let n = nodes.len() as u64;
+    let m = edges.len() as u64;
+    let nodes = HerculesImmBox::from(&nodes as &[Node]);
+    let edges = HerculesImmBox::from(&edges as &[u32]);
+    group.bench_function("bfs bench 64M", |b| {
+        b.iter(|| {
+            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
+        })
+    });
 }
 
 criterion_group!(benches, bfs_bench);
diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
index 5fc73db9..962c5249 100644
--- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs
+++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
@@ -10,10 +10,11 @@ use juno_cfd::*;
 
 fn cfd_bench(c: &mut Criterion) {
     let mut group = c.benchmark_group("cfd bench");
+    group.sample_size(10);
 
     let mut r = runner!(euler);
-    let data_file = "data/fvcorr.domn.097K".to_string();
-    let iterations = 1;
+    let data_file = "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M".to_string();
+    let iterations = 150;
     let block_size = 16;
     let FarFieldConditions {
         ff_variable,
@@ -44,7 +45,7 @@ fn cfd_bench(c: &mut Criterion) {
     let normals_y = HerculesImmBox::from(normals.y.as_slice());
     let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
-    group.bench_function("cfd bench euler", |b| {
+    group.bench_function("cfd bench euler large", |b| {
         b.iter(|| {
             async_std::task::block_on(async {
                 r.run(
@@ -84,8 +85,8 @@ fn cfd_bench(c: &mut Criterion) {
     });
 
     let mut r = runner!(pre_euler);
-    let data_file = "data/fvcorr.domn.097K".to_string();
-    let iterations = 1;
+    let data_file = "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M".to_string();
+    let iterations = 150;
     let block_size = 16;
     let FarFieldConditions {
         ff_variable,
@@ -116,7 +117,7 @@ fn cfd_bench(c: &mut Criterion) {
     let normals_y = HerculesImmBox::from(normals.y.as_slice());
     let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
-    group.bench_function("cfd bench pre-euler", |b| {
+    group.bench_function("cfd bench pre-euler large", |b| {
         b.iter(|| {
             async_std::task::block_on(async {
                 r.run(
diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs
index 6af13aae..23dc0643 100644
--- a/juno_samples/rodinia/srad/benches/srad_bench.rs
+++ b/juno_samples/rodinia/srad/benches/srad_bench.rs
@@ -9,12 +9,13 @@ use juno_srad::*;
 
 fn srad_bench(c: &mut Criterion) {
     let mut group = c.benchmark_group("srad bench");
+    group.sample_size(10);
 
     let mut r = runner!(srad);
-    let niter = 100;
+    let niter = 30;
     let lambda = 0.5;
-    let nrows = 512;
-    let ncols = 512;
+    let nrows = 2048;
+    let ncols = 2048;
     let image = "data/image.pgm".to_string();
     let Image {
         image: image_ori,
@@ -24,7 +25,7 @@ fn srad_bench(c: &mut Criterion) {
     } = read_graphics(image);
     let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
     let mut image_h = HerculesMutBox::from(image.clone());
-    group.bench_function("srad bench", |b| {
+    group.bench_function("srad bench large", |b| {
         b.iter(|| {
             async_std::task::block_on(async {
                 r.run(
-- 
GitLab


From cf91c1033bd3c6ed71e46ad93998daba178c307d Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 09:35:43 -0600
Subject: [PATCH 22/25] Create two benches in backprop, one old size, one new
 size

---
 .../backprop/benches/backprop_bench.rs        | 87 ++++++++++---------
 1 file changed, 46 insertions(+), 41 deletions(-)

diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
index 98d582b5..492bce3b 100644
--- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs
+++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
@@ -9,7 +9,8 @@ juno_build::juno!("backprop");
 
 // We need this even though we don't use anything from the library because of
 // Rust build scripts only linking static libraries into the library, and not
-// into the benchmark binary. Ugh!
+// into the benchmark binary. Yuck!
+#[allow(unused_imports)]
 use juno_backprop::*;
 
 fn backprop_bench(c: &mut Criterion) {
@@ -18,52 +19,56 @@ fn backprop_bench(c: &mut Criterion) {
 
     let mut rng = StdRng::seed_from_u64(7);
 
-    let input_n = 33554432;
-    let hidden_n = 16;
-    let output_n = 1;
+    let mut bench = |name, input_n: usize| {
+        let hidden_n = 16;
+        let output_n = 1;
 
-    let mut input_vals = vec![0.0f32; input_n + 1];
-    input_vals[0] = 1.0;
+        let mut input_vals = vec![0.0f32; input_n + 1];
+        input_vals[0] = 1.0;
 
-    // For some reason the bpnn_randomize_row function used on target just sets it to 0.1
-    let target = vec![0.1f32; output_n + 1];
+        // For some reason the bpnn_randomize_row function used on target just sets it to 0.1
+        let target = vec![0.1f32; output_n + 1];
 
-    let input_weights = (0..(input_n + 1) * (hidden_n + 1))
-        .map(|_| rng.random::<f32>())
-        .collect::<Vec<_>>();
-    let hidden_weights = (0..(hidden_n + 1) * (output_n + 1))
-        .map(|_| rng.random::<f32>())
-        .collect::<Vec<_>>();
+        let input_weights = (0..(input_n + 1) * (hidden_n + 1))
+            .map(|_| rng.random::<f32>())
+            .collect::<Vec<_>>();
+        let hidden_weights = (0..(hidden_n + 1) * (output_n + 1))
+            .map(|_| rng.random::<f32>())
+            .collect::<Vec<_>>();
 
-    let input_prev_weights = vec![0.0; (input_n + 1) * (hidden_n + 1)];
-    let hidden_prev_weights = vec![0.0; (hidden_n + 1) * (output_n + 1)];
+        let input_prev_weights = vec![0.0; (input_n + 1) * (hidden_n + 1)];
+        let hidden_prev_weights = vec![0.0; (hidden_n + 1) * (output_n + 1)];
 
-    let mut r = runner!(backprop);
-    let input_vals = HerculesImmBox::from(&input_vals as &[f32]);
-    let target = HerculesImmBox::from(&target as &[f32]);
-    let mut input_weights = HerculesMutBox::from(input_weights.to_vec());
-    let mut hidden_weights = HerculesMutBox::from(hidden_weights.to_vec());
-    let mut input_prev_weights = HerculesMutBox::from(input_prev_weights.to_vec());
-    let mut hidden_prev_weights = HerculesMutBox::from(hidden_prev_weights.to_vec());
+        let mut r = runner!(backprop);
+        let input_vals = HerculesImmBox::from(&input_vals as &[f32]);
+        let target = HerculesImmBox::from(&target as &[f32]);
+        let mut input_weights = HerculesMutBox::from(input_weights.to_vec());
+        let mut hidden_weights = HerculesMutBox::from(hidden_weights.to_vec());
+        let mut input_prev_weights = HerculesMutBox::from(input_prev_weights.to_vec());
+        let mut hidden_prev_weights = HerculesMutBox::from(hidden_prev_weights.to_vec());
 
-    group.bench_function("backprop bench large", |b| {
-        b.iter(|| {
-            async_std::task::block_on(async {
-                r.run(
-                    input_n as u64,
-                    hidden_n as u64,
-                    output_n as u64,
-                    input_vals.to(),
-                    input_weights.to(),
-                    hidden_weights.to(),
-                    target.to(),
-                    input_prev_weights.to(),
-                    hidden_prev_weights.to(),
-                )
-                .await
-            });
-        })
-    });
+        group.bench_function(name, |b| {
+            b.iter(|| {
+                async_std::task::block_on(async {
+                    r.run(
+                        input_n as u64,
+                        hidden_n as u64,
+                        output_n as u64,
+                        input_vals.to(),
+                        input_weights.to(),
+                        hidden_weights.to(),
+                        target.to(),
+                        input_prev_weights.to(),
+                        hidden_prev_weights.to(),
+                    )
+                    .await
+                });
+            })
+        });
+    };
+
+    bench("backprop bench small", 65536);
+    bench("backprop bench large", 33554432);
 }
 
 criterion_group!(benches, backprop_bench);
-- 
GitLab


From d5af9dea72cfa307940f6e1904032245c71d15f2 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 09:41:38 -0600
Subject: [PATCH 23/25] BFS bench add useful error messages

---
 juno_samples/rodinia/bfs/benches/bfs_bench.rs |  6 +++---
 juno_samples/rodinia/bfs/src/graph_parser.rs  | 12 +++++++-----
 juno_samples/rodinia/bfs/src/lib.rs           |  2 +-
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
index dbe6db38..9c09c180 100644
--- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs
+++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
@@ -14,7 +14,7 @@ fn bfs_bench(c: &mut Criterion) {
     let mut r = runner!(bfs);
 
     let input = "data/graph4096.txt";
-    let (nodes, source, edges) = parse_graph(input.into());
+    let (nodes, source, edges) = parse_graph(input.into()).unwrap();
     let n = nodes.len() as u64;
     let m = edges.len() as u64;
     let nodes = HerculesImmBox::from(&nodes as &[Node]);
@@ -26,7 +26,7 @@ fn bfs_bench(c: &mut Criterion) {
     });
 
     let input = "data/graph65536.txt";
-    let (nodes, source, edges) = parse_graph(input.into());
+    let (nodes, source, edges) = parse_graph(input.into()).unwrap();
     let n = nodes.len() as u64;
     let m = edges.len() as u64;
     let nodes = HerculesImmBox::from(&nodes as &[Node]);
@@ -38,7 +38,7 @@ fn bfs_bench(c: &mut Criterion) {
     });
 
     let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt";
-    let (nodes, source, edges) = parse_graph(input.into());
+    let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines.");
     let n = nodes.len() as u64;
     let m = edges.len() as u64;
     let nodes = HerculesImmBox::from(&nodes as &[Node]);
diff --git a/juno_samples/rodinia/bfs/src/graph_parser.rs b/juno_samples/rodinia/bfs/src/graph_parser.rs
index fecd2a3e..f761d8ea 100644
--- a/juno_samples/rodinia/bfs/src/graph_parser.rs
+++ b/juno_samples/rodinia/bfs/src/graph_parser.rs
@@ -11,16 +11,18 @@ pub struct Node {
     pub num_edges: u32,
 }
 
-pub fn parse_graph(file: String) -> (Vec<Node>, u32, Vec<u32>) {
-    let mut file = File::open(file).expect("Error opening input file");
+pub fn parse_graph(file: String) -> Result<(Vec<Node>, u32, Vec<u32>), String> {
+    let mut file = File::open(file).map_err(|err| format!("Error opening input file: {}", err))?;
     let mut contents = String::new();
     file.read_to_string(&mut contents)
-        .expect("Error reading input file");
+        .map_err(|err| format!("Error reading input file: {}", err))?;
 
     let mut parser = nom::combinator::all_consuming(graph_parser);
-    let (_, result) = parser.parse(&contents).expect("Parser error");
+    let (_, result) = parser
+        .parse(&contents)
+        .map_err(|err| format!("Parsing error: {}", err))?;
 
-    result
+    Ok(result)
 }
 
 fn graph_parser<'a>(text: &'a str) -> nom::IResult<&'a str, (Vec<Node>, u32, Vec<u32>)> {
diff --git a/juno_samples/rodinia/bfs/src/lib.rs b/juno_samples/rodinia/bfs/src/lib.rs
index f6403540..e48dfe22 100644
--- a/juno_samples/rodinia/bfs/src/lib.rs
+++ b/juno_samples/rodinia/bfs/src/lib.rs
@@ -36,7 +36,7 @@ fn run_bfs(nodes: &[Node], source: u32, edges: &[u32]) -> Vec<i32> {
 pub fn bfs_harness(args: BFSInputs) {
     let BFSInputs { input } = args;
 
-    let (nodes, source, edges) = parse_graph(input);
+    let (nodes, source, edges) = parse_graph(input).unwrap();
 
     let costs_juno = run_bfs(&nodes, source, &edges);
     let costs_ref = rust_bfs::bfs(&nodes, source, &edges);
-- 
GitLab


From 65a5416a9e3989502542e03d4e7a4ff867008772 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 09:56:13 -0600
Subject: [PATCH 24/25] cleanup cfd bench

---
 juno_samples/rodinia/bfs/benches/bfs_bench.rs |  36 +--
 juno_samples/rodinia/cfd/benches/cfd_bench.rs | 280 +++++++++---------
 juno_samples/rodinia/cfd/src/lib.rs           |   2 +-
 juno_samples/rodinia/cfd/src/setup.rs         |  30 +-
 4 files changed, 185 insertions(+), 163 deletions(-)

diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
index 9c09c180..f87b4e09 100644
--- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs
+++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
@@ -13,37 +13,37 @@ fn bfs_bench(c: &mut Criterion) {
 
     let mut r = runner!(bfs);
 
-    let input = "data/graph4096.txt";
-    let (nodes, source, edges) = parse_graph(input.into()).unwrap();
-    let n = nodes.len() as u64;
-    let m = edges.len() as u64;
-    let nodes = HerculesImmBox::from(&nodes as &[Node]);
-    let edges = HerculesImmBox::from(&edges as &[u32]);
     group.bench_function("bfs bench 4096", |b| {
+        let input = "data/graph4096.txt";
+        let (nodes, source, edges) = parse_graph(input.into()).unwrap();
+        let n = nodes.len() as u64;
+        let m = edges.len() as u64;
+        let nodes = HerculesImmBox::from(&nodes as &[Node]);
+        let edges = HerculesImmBox::from(&edges as &[u32]);
         b.iter(|| {
             async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
         })
     });
 
-    let input = "data/graph65536.txt";
-    let (nodes, source, edges) = parse_graph(input.into()).unwrap();
-    let n = nodes.len() as u64;
-    let m = edges.len() as u64;
-    let nodes = HerculesImmBox::from(&nodes as &[Node]);
-    let edges = HerculesImmBox::from(&edges as &[u32]);
     group.bench_function("bfs bench 65536", |b| {
+        let input = "data/graph65536.txt";
+        let (nodes, source, edges) = parse_graph(input.into()).unwrap();
+        let n = nodes.len() as u64;
+        let m = edges.len() as u64;
+        let nodes = HerculesImmBox::from(&nodes as &[Node]);
+        let edges = HerculesImmBox::from(&edges as &[u32]);
         b.iter(|| {
             async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
         })
     });
 
-    let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt";
-    let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines.");
-    let n = nodes.len() as u64;
-    let m = edges.len() as u64;
-    let nodes = HerculesImmBox::from(&nodes as &[Node]);
-    let edges = HerculesImmBox::from(&edges as &[u32]);
     group.bench_function("bfs bench 64M", |b| {
+        let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt";
+        let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines.");
+        let n = nodes.len() as u64;
+        let m = edges.len() as u64;
+        let nodes = HerculesImmBox::from(&nodes as &[Node]);
+        let edges = HerculesImmBox::from(&edges as &[u32]);
         b.iter(|| {
             async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
         })
diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
index 962c5249..aa6d7727 100644
--- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs
+++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
@@ -12,149 +12,161 @@ fn cfd_bench(c: &mut Criterion) {
     let mut group = c.benchmark_group("cfd bench");
     group.sample_size(10);
 
-    let mut r = runner!(euler);
-    let data_file = "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M".to_string();
-    let iterations = 150;
-    let block_size = 16;
-    let FarFieldConditions {
-        ff_variable,
-        ff_fc_momentum_x,
-        ff_fc_momentum_y,
-        ff_fc_momentum_z,
-        ff_fc_density_energy,
-    } = set_far_field_conditions();
-    let GeometryData {
-        nelr,
-        areas,
-        elements_surrounding_elements,
-        normals,
-    } = read_domain_geometry(data_file, block_size);
-    let mut variables = initialize_variables(nelr, &ff_variable);
+    let mut euler_bench = |name, data_file, iterations| {
+        group.bench_function(name, |b| {
+            let mut r = runner!(euler);
+            let block_size = 16;
+            let FarFieldConditions {
+                ff_variable,
+                ff_fc_momentum_x,
+                ff_fc_momentum_y,
+                ff_fc_momentum_z,
+                ff_fc_density_energy,
+            } = set_far_field_conditions();
+            let GeometryData {
+                nelr,
+                areas,
+                elements_surrounding_elements,
+                normals,
+            } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
+            let mut variables = initialize_variables(nelr, &ff_variable);
 
-    let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
-    let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
-    let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
-    let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
-    let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+            let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+            let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+            let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+            let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+            let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
 
-    let areas = HerculesImmBox::from(areas.as_slice());
-    let elements_surrounding_elements =
-        HerculesImmBox::from(elements_surrounding_elements.as_slice());
+            let areas = HerculesImmBox::from(areas.as_slice());
+            let elements_surrounding_elements =
+                HerculesImmBox::from(elements_surrounding_elements.as_slice());
 
-    let normals_x = HerculesImmBox::from(normals.x.as_slice());
-    let normals_y = HerculesImmBox::from(normals.y.as_slice());
-    let normals_z = HerculesImmBox::from(normals.z.as_slice());
+            let normals_x = HerculesImmBox::from(normals.x.as_slice());
+            let normals_y = HerculesImmBox::from(normals.y.as_slice());
+            let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
-    group.bench_function("cfd bench euler large", |b| {
-        b.iter(|| {
-            async_std::task::block_on(async {
-                r.run(
-                    nelr as u64,
-                    iterations as u64,
-                    v_density.to(),
-                    v_momentum_x.to(),
-                    v_momentum_y.to(),
-                    v_momentum_z.to(),
-                    v_energy.to(),
-                    areas.to(),
-                    elements_surrounding_elements.to(),
-                    normals_x.to(),
-                    normals_y.to(),
-                    normals_z.to(),
-                    ff_variable.density,
-                    ff_variable.momentum.x,
-                    ff_variable.momentum.y,
-                    ff_variable.momentum.z,
-                    ff_variable.energy,
-                    ff_fc_density_energy.x,
-                    ff_fc_density_energy.y,
-                    ff_fc_density_energy.z,
-                    ff_fc_momentum_x.x,
-                    ff_fc_momentum_x.y,
-                    ff_fc_momentum_x.z,
-                    ff_fc_momentum_y.x,
-                    ff_fc_momentum_y.y,
-                    ff_fc_momentum_y.z,
-                    ff_fc_momentum_z.x,
-                    ff_fc_momentum_z.y,
-                    ff_fc_momentum_z.z,
-                )
-                .await
-            });
-        })
-    });
+            b.iter(|| {
+                async_std::task::block_on(async {
+                    r.run(
+                        nelr as u64,
+                        iterations as u64,
+                        v_density.to(),
+                        v_momentum_x.to(),
+                        v_momentum_y.to(),
+                        v_momentum_z.to(),
+                        v_energy.to(),
+                        areas.to(),
+                        elements_surrounding_elements.to(),
+                        normals_x.to(),
+                        normals_y.to(),
+                        normals_z.to(),
+                        ff_variable.density,
+                        ff_variable.momentum.x,
+                        ff_variable.momentum.y,
+                        ff_variable.momentum.z,
+                        ff_variable.energy,
+                        ff_fc_density_energy.x,
+                        ff_fc_density_energy.y,
+                        ff_fc_density_energy.z,
+                        ff_fc_momentum_x.x,
+                        ff_fc_momentum_x.y,
+                        ff_fc_momentum_x.z,
+                        ff_fc_momentum_y.x,
+                        ff_fc_momentum_y.y,
+                        ff_fc_momentum_y.z,
+                        ff_fc_momentum_z.x,
+                        ff_fc_momentum_z.y,
+                        ff_fc_momentum_z.z,
+                    )
+                    .await
+                });
+            })
+        });
+    };
+    euler_bench("cfd bench euler small", "data/fvcorr.domn.097K", 1);
+    euler_bench(
+        "cfd bench euler large",
+        "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M",
+        150,
+    );
 
-    let mut r = runner!(pre_euler);
-    let data_file = "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M".to_string();
-    let iterations = 150;
-    let block_size = 16;
-    let FarFieldConditions {
-        ff_variable,
-        ff_fc_momentum_x,
-        ff_fc_momentum_y,
-        ff_fc_momentum_z,
-        ff_fc_density_energy,
-    } = set_far_field_conditions();
-    let GeometryData {
-        nelr,
-        areas,
-        elements_surrounding_elements,
-        normals,
-    } = read_domain_geometry(data_file, block_size);
-    let mut variables = initialize_variables(nelr, &ff_variable);
+    let mut pre_euler_bench = |name, data_file, iterations| {
+        group.bench_function(name, |b| {
+            let mut r = runner!(pre_euler);
+            let block_size = 16;
+            let FarFieldConditions {
+                ff_variable,
+                ff_fc_momentum_x,
+                ff_fc_momentum_y,
+                ff_fc_momentum_z,
+                ff_fc_density_energy,
+            } = set_far_field_conditions();
+            let GeometryData {
+                nelr,
+                areas,
+                elements_surrounding_elements,
+                normals,
+            } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
+            let mut variables = initialize_variables(nelr, &ff_variable);
 
-    let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
-    let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
-    let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
-    let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
-    let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+            let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+            let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+            let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+            let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+            let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
 
-    let areas = HerculesImmBox::from(areas.as_slice());
-    let elements_surrounding_elements =
-        HerculesImmBox::from(elements_surrounding_elements.as_slice());
+            let areas = HerculesImmBox::from(areas.as_slice());
+            let elements_surrounding_elements =
+                HerculesImmBox::from(elements_surrounding_elements.as_slice());
 
-    let normals_x = HerculesImmBox::from(normals.x.as_slice());
-    let normals_y = HerculesImmBox::from(normals.y.as_slice());
-    let normals_z = HerculesImmBox::from(normals.z.as_slice());
+            let normals_x = HerculesImmBox::from(normals.x.as_slice());
+            let normals_y = HerculesImmBox::from(normals.y.as_slice());
+            let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
-    group.bench_function("cfd bench pre-euler large", |b| {
-        b.iter(|| {
-            async_std::task::block_on(async {
-                r.run(
-                    nelr as u64,
-                    iterations as u64,
-                    v_density.to(),
-                    v_momentum_x.to(),
-                    v_momentum_y.to(),
-                    v_momentum_z.to(),
-                    v_energy.to(),
-                    areas.to(),
-                    elements_surrounding_elements.to(),
-                    normals_x.to(),
-                    normals_y.to(),
-                    normals_z.to(),
-                    ff_variable.density,
-                    ff_variable.momentum.x,
-                    ff_variable.momentum.y,
-                    ff_variable.momentum.z,
-                    ff_variable.energy,
-                    ff_fc_density_energy.x,
-                    ff_fc_density_energy.y,
-                    ff_fc_density_energy.z,
-                    ff_fc_momentum_x.x,
-                    ff_fc_momentum_x.y,
-                    ff_fc_momentum_x.z,
-                    ff_fc_momentum_y.x,
-                    ff_fc_momentum_y.y,
-                    ff_fc_momentum_y.z,
-                    ff_fc_momentum_z.x,
-                    ff_fc_momentum_z.y,
-                    ff_fc_momentum_z.z,
-                )
-                .await
-            });
-        })
-    });
+            b.iter(|| {
+                async_std::task::block_on(async {
+                    r.run(
+                        nelr as u64,
+                        iterations as u64,
+                        v_density.to(),
+                        v_momentum_x.to(),
+                        v_momentum_y.to(),
+                        v_momentum_z.to(),
+                        v_energy.to(),
+                        areas.to(),
+                        elements_surrounding_elements.to(),
+                        normals_x.to(),
+                        normals_y.to(),
+                        normals_z.to(),
+                        ff_variable.density,
+                        ff_variable.momentum.x,
+                        ff_variable.momentum.y,
+                        ff_variable.momentum.z,
+                        ff_variable.energy,
+                        ff_fc_density_energy.x,
+                        ff_fc_density_energy.y,
+                        ff_fc_density_energy.z,
+                        ff_fc_momentum_x.x,
+                        ff_fc_momentum_x.y,
+                        ff_fc_momentum_x.z,
+                        ff_fc_momentum_y.x,
+                        ff_fc_momentum_y.y,
+                        ff_fc_momentum_y.z,
+                        ff_fc_momentum_z.x,
+                        ff_fc_momentum_z.y,
+                        ff_fc_momentum_z.z,
+                    )
+                    .await
+                });
+            })
+        });
+    };
+    pre_euler_bench("cfd bench pre-euler small", "data/fvcorr.domn.097K", 1);
+    pre_euler_bench(
+        "cfd bench pre-euler large",
+        "/scratch/aaronjc4/rodinia_3.1/data/cfd/missile.domn.0.2M",
+        150,
+    );
 }
 
 criterion_group!(benches, cfd_bench);
diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs
index a9800ed0..3b1e21e6 100644
--- a/juno_samples/rodinia/cfd/src/lib.rs
+++ b/juno_samples/rodinia/cfd/src/lib.rs
@@ -237,7 +237,7 @@ pub fn cfd_harness(args: CFDInputs) {
         areas,
         elements_surrounding_elements,
         normals,
-    } = read_domain_geometry(data_file, block_size);
+    } = read_domain_geometry(data_file, block_size).unwrap();
 
     let variables = initialize_variables(nelr, &ff_variable);
     println!("Running CFD with nelr = {}.", nelr);
diff --git a/juno_samples/rodinia/cfd/src/setup.rs b/juno_samples/rodinia/cfd/src/setup.rs
index 0da00251..669810d6 100644
--- a/juno_samples/rodinia/cfd/src/setup.rs
+++ b/juno_samples/rodinia/cfd/src/setup.rs
@@ -29,7 +29,7 @@ pub struct AlignedSlice<T> {
 
 impl<T> Debug for AlignedSlice<T>
 where
-    T: Debug
+    T: Debug,
 {
     fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), Error> {
         write!(f, "{:?}", self.as_slice())
@@ -127,7 +127,15 @@ pub struct FarFieldConditions {
 }
 
 pub fn set_far_field_conditions() -> FarFieldConditions {
-    let mut ff_variable = Variable { density: 0.0, momentum: Float3 { x: 0.0, y: 0.0, z: 0.0 }, energy: 0.0 };
+    let mut ff_variable = Variable {
+        density: 0.0,
+        momentum: Float3 {
+            x: 0.0,
+            y: 0.0,
+            z: 0.0,
+        },
+        energy: 0.0,
+    };
 
     let angle_of_attack = std::f32::consts::PI / 180.0 * deg_angle_of_attack;
 
@@ -186,19 +194,21 @@ pub struct GeometryData {
     pub normals: Normals,
 }
 
-pub fn read_domain_geometry<P>(path: P, block_size: usize) -> GeometryData
+pub fn read_domain_geometry<P>(path: P, block_size: usize) -> Result<GeometryData, String>
 where
     P: AsRef<Path>,
 {
-    let mut file = File::open(path).expect("Error opening input file");
+    let mut file = File::open(path).map_err(|err| format!("Error opening input file: {}", err))?;
     let mut contents = String::new();
     file.read_to_string(&mut contents)
-        .expect("Error reading input file");
+        .map_err(|err| format!("Error reading input file: {}", err))?;
 
     let mut parser = nom::combinator::all_consuming(|s| cfd_parser(block_size, s));
-    let (_, result) = parser.parse(&contents).expect("Parser error");
+    let (_, result) = parser
+        .parse(&contents)
+        .map_err(|err| format!("Parser error: {}", err))?;
 
-    result
+    Ok(result)
 }
 
 fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, GeometryData> {
@@ -240,17 +250,17 @@ fn cfd_parser<'a>(block_size: usize, text: &'a str) -> nom::IResult<&'a str, Geo
             let val = i32::from_str(val).unwrap();
             let val = if neg { -val } else { val };
             elements_surrounding_elements[i + j * nelr] = if val < 0 { -1 } else { val } - 1; // it's coming in with Fortran numbering
-                
+
             let t = nom::character::complete::multispace0(text)?.0;
             let (t, val) = nom::number::complete::float(t)?;
             text = t;
             normals.x[i + j * nelr] = -val;
-                
+
             let t = nom::character::complete::multispace0(text)?.0;
             let (t, val) = nom::number::complete::float(t)?;
             text = t;
             normals.y[i + j * nelr] = -val;
-                
+
             let t = nom::character::complete::multispace0(text)?.0;
             let (t, val) = nom::number::complete::float(t)?;
             text = t;
-- 
GitLab


From 4cf9d708a9dddef41b1519cfe0cdec3da10b8d93 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 09:58:06 -0600
Subject: [PATCH 25/25] cleanup srad bench

---
 .../rodinia/srad/benches/srad_bench.rs        | 62 ++++++++++---------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs
index 23dc0643..bf0e4ad4 100644
--- a/juno_samples/rodinia/srad/benches/srad_bench.rs
+++ b/juno_samples/rodinia/srad/benches/srad_bench.rs
@@ -1,7 +1,7 @@
 #![feature(concat_idents)]
 use criterion::{criterion_group, criterion_main, Criterion};
 
-use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox, HerculesMutBoxTo};
+use hercules_rt::{runner, HerculesMutBox, HerculesMutBoxTo};
 
 juno_build::juno!("srad");
 
@@ -11,35 +11,37 @@ fn srad_bench(c: &mut Criterion) {
     let mut group = c.benchmark_group("srad bench");
     group.sample_size(10);
 
-    let mut r = runner!(srad);
-    let niter = 30;
-    let lambda = 0.5;
-    let nrows = 2048;
-    let ncols = 2048;
-    let image = "data/image.pgm".to_string();
-    let Image {
-        image: image_ori,
-        max,
-        rows: image_ori_rows,
-        cols: image_ori_cols,
-    } = read_graphics(image);
-    let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
-    let mut image_h = HerculesMutBox::from(image.clone());
-    group.bench_function("srad bench large", |b| {
-        b.iter(|| {
-            async_std::task::block_on(async {
-                r.run(
-                    nrows as u64,
-                    ncols as u64,
-                    niter as u64,
-                    image_h.to(),
-                    max,
-                    lambda,
-                )
-                .await
-            });
-        })
-    });
+    let mut bench = |name, niter, nrows, ncols| {
+        let mut r = runner!(srad);
+        let lambda = 0.5;
+        let image = "data/image.pgm".to_string();
+        let Image {
+            image: image_ori,
+            max,
+            rows: image_ori_rows,
+            cols: image_ori_cols,
+        } = read_graphics(image);
+        let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
+        let mut image_h = HerculesMutBox::from(image.clone());
+        group.bench_function(name, |b| {
+            b.iter(|| {
+                async_std::task::block_on(async {
+                    r.run(
+                        nrows as u64,
+                        ncols as u64,
+                        niter as u64,
+                        image_h.to(),
+                        max,
+                        lambda,
+                    )
+                    .await
+                });
+            })
+        });
+    };
+
+    bench("srad bench small", 100, 512, 512);
+    bench("srad bench large", 30, 2048, 2048);
 }
 
 criterion_group!(benches, srad_bench);
-- 
GitLab