From c5063c59ec17ef6d1ae9eddb5f27a0a1ca9b09a2 Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Mon, 24 Feb 2025 11:18:30 -0600
Subject: [PATCH 01/17] panic if xdot can't be run

---
 hercules_ir/src/dot.rs | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs
index aff1f9c5..f21137f8 100644
--- a/hercules_ir/src/dot.rs
+++ b/hercules_ir/src/dot.rs
@@ -44,10 +44,17 @@ pub fn xdot_module(
     file.write_all(contents.as_bytes())
         .expect("PANIC: Unable to write output file contents.");
     println!("Graphviz written to: {}", tmp_path.display());
-    Command::new("xdot")
+    let mut xdot_process = Command::new("xdot")
         .args([&tmp_path])
-        .output()
+        .spawn()
         .expect("PANIC: Couldn't execute xdot. Is xdot installed?");
+    assert!(
+        xdot_process
+            .wait()
+            .map(|status| status.success())
+            .unwrap_or(false),
+        "PANIC: Xdot failed to execute."
+    )
 }
 
 /*
-- 
GitLab


From 937e9b7fce419024cf399742c3f18904b0bdd68a Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Mon, 24 Feb 2025 12:14:48 -0600
Subject: [PATCH 02/17] make imm refs copy

---
 hercules_rt/src/lib.rs                  |  4 +--
 juno_samples/cava/benches/cava_bench.rs | 36 ++++++++++++++++---------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index d19a0a5a..df53a0e9 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -150,7 +150,7 @@ extern "C" {
     fn ___cublas_sgemm(i: u64, j: u64, k: u64, c: *mut u8, a: *const u8, b: *const u8);
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Copy)]
 pub struct HerculesCPURef<'a> {
     ptr: NonNull<u8>,
     size: usize,
@@ -165,7 +165,7 @@ pub struct HerculesCPURefMut<'a> {
 }
 
 #[cfg(feature = "cuda")]
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Copy)]
 pub struct HerculesCUDARef<'a> {
     ptr: NonNull<u8>,
     size: usize,
diff --git a/juno_samples/cava/benches/cava_bench.rs b/juno_samples/cava/benches/cava_bench.rs
index b8dd3ce2..41b34230 100644
--- a/juno_samples/cava/benches/cava_bench.rs
+++ b/juno_samples/cava/benches/cava_bench.rs
@@ -25,6 +25,12 @@ fn cava_bench(c: &mut Criterion) {
     let (rows, cols, num_ctrl_pts, image, tstw, ctrl_pts, weights, coefs, tonemap) =
         prepare_hercules_inputs(&raw_image, &cam_model);
     let mut r = runner!(cava);
+    let image = image.to();
+    let tstw = tstw.to();
+    let ctrl_pts = ctrl_pts.to();
+    let weights = weights.to();
+    let coefs = coefs.to();
+    let tonemap = tonemap.to();
 
     group.bench_function("cava bench small", |b| {
         b.iter(|| {
@@ -32,12 +38,12 @@ fn cava_bench(c: &mut Criterion) {
                 rows as u64,
                 cols as u64,
                 num_ctrl_pts as u64,
-                image.to(),
-                tstw.to(),
-                ctrl_pts.to(),
-                weights.to(),
-                coefs.to(),
-                tonemap.to(),
+                image,
+                tstw,
+                ctrl_pts,
+                weights,
+                coefs,
+                tonemap,
             ));
         })
     });
@@ -55,6 +61,12 @@ fn cava_bench(c: &mut Criterion) {
     let (rows, cols, num_ctrl_pts, image, tstw, ctrl_pts, weights, coefs, tonemap) =
         prepare_hercules_inputs(&raw_image, &cam_model);
     let mut r = runner!(cava);
+    let image = image.to();
+    let tstw = tstw.to();
+    let ctrl_pts = ctrl_pts.to();
+    let weights = weights.to();
+    let coefs = coefs.to();
+    let tonemap = tonemap.to();
 
     group.bench_function("cava bench full", |b| {
         b.iter(|| {
@@ -62,12 +74,12 @@ fn cava_bench(c: &mut Criterion) {
                 rows as u64,
                 cols as u64,
                 num_ctrl_pts as u64,
-                image.to(),
-                tstw.to(),
-                ctrl_pts.to(),
-                weights.to(),
-                coefs.to(),
-                tonemap.to(),
+                image,
+                tstw,
+                ctrl_pts,
+                weights,
+                coefs,
+                tonemap,
             ));
         })
     });
-- 
GitLab


From 41be4f79755fa867e19cbf46d16b5adbf4035e4e Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Mon, 24 Feb 2025 13:48:01 -0600
Subject: [PATCH 03/17] fix bench

---
 juno_samples/edge_detection/Cargo.toml               |  1 +
 .../edge_detection/benches/edge_detection_bench.rs   | 12 ++++++++----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/juno_samples/edge_detection/Cargo.toml b/juno_samples/edge_detection/Cargo.toml
index 483724d8..fa4ca1ff 100644
--- a/juno_samples/edge_detection/Cargo.toml
+++ b/juno_samples/edge_detection/Cargo.toml
@@ -33,3 +33,4 @@ criterion = { version = "0.5", features = ["html_reports"] }
 [[bench]]
 name = "edge_detection_bench"
 harness = false
+required-features = ["opencv"]
diff --git a/juno_samples/edge_detection/benches/edge_detection_bench.rs b/juno_samples/edge_detection/benches/edge_detection_bench.rs
index 806a8865..76035275 100644
--- a/juno_samples/edge_detection/benches/edge_detection_bench.rs
+++ b/juno_samples/edge_detection/benches/edge_detection_bench.rs
@@ -63,6 +63,10 @@ fn edge_detection_bench(c: &mut Criterion) {
     let num_frames = 5;
 
     let mut r = runner!(edge_detection);
+    let gaussian_filter_h = gaussian_filter_h.to();
+    let structure_h = structure_h.to();
+    let sx_h = sx_h.to();
+    let sy_h = sy_h.to();
 
     let frames: Vec<_> = (0..num_frames).map(|_| load_frame(&mut video)).collect();
 
@@ -87,10 +91,10 @@ fn edge_detection_bench(c: &mut Criterion) {
                         sz as u64,
                         sb as u64,
                         input_h.to(),
-                        gaussian_filter_h.to(),
-                        structure_h.to(),
-                        sx_h.to(),
-                        sy_h.to(),
+                        gaussian_filter_h,
+                        structure_h,
+                        sx_h,
+                        sy_h,
                         theta,
                     )
                     .await
-- 
GitLab


From d4280e531b346797827cc23653b23b196cdc08f4 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 24 Feb 2025 14:22:44 -0600
Subject: [PATCH 04/17] Simple conv test

---
 juno_samples/fork_join_tests/src/cpu.sch      |  9 ++++---
 .../fork_join_tests/src/fork_join_tests.jn    | 24 +++++++++++++++++++
 juno_samples/fork_join_tests/src/gpu.sch      |  6 +++--
 juno_samples/fork_join_tests/src/main.rs      | 19 ++++++++++++++-
 4 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/juno_samples/fork_join_tests/src/cpu.sch b/juno_samples/fork_join_tests/src/cpu.sch
index 76dcbdf6..f46c91d6 100644
--- a/juno_samples/fork_join_tests/src/cpu.sch
+++ b/juno_samples/fork_join_tests/src/cpu.sch
@@ -3,7 +3,7 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8);
+let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9);
 cpu(auto.test1);
 cpu(auto.test2);
 cpu(auto.test3);
@@ -11,6 +11,7 @@ cpu(auto.test4);
 cpu(auto.test5);
 cpu(auto.test7);
 cpu(auto.test8);
+cpu(auto.test9);
 
 let test1_cpu = auto.test1;
 rename["test1_cpu"](test1_cpu);
@@ -51,11 +52,11 @@ fixpoint panic after 20 {
   unroll(auto.test1);
 }
 
-fork-split(auto.test2, auto.test3, auto.test4, auto.test5);
+fork-split(auto.test2, auto.test3, auto.test4, auto.test5, auto.test9);
 gvn(*);
 phi-elim(*);
 dce(*);
-unforkify(auto.test2, auto.test3, auto.test4, auto.test5);
+unforkify(auto.test2, auto.test3, auto.test4, auto.test5, auto.test9);
 ccp(*);
 gvn(*);
 phi-elim(*);
@@ -93,4 +94,6 @@ dce(auto.test8);
 simplify-cfg(auto.test8);
 dce(auto.test8);
 
+no-memset(test9@const);
+
 gcm(*);
diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn
index bfb5564b..3b7c7833 100644
--- a/juno_samples/fork_join_tests/src/fork_join_tests.jn
+++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn
@@ -122,3 +122,27 @@ fn test8(input : i32) -> i32[8] {
   }
   return out;
 }
+
+#[entry]
+fn test9<r, c, z : usize>(input : i32[r, c]) -> i32[r, c] {
+  const rad = z / 2;
+  @const let out : i32[r, c];
+
+  for ir = 0 to r {
+    for ic = 0 to c {
+      let acc = 0;
+      @filter_loop for sr = 0 to z {
+        for sc = 0 to z {
+	  acc += if ir + sr < rad then 0
+	         else if ir + sr - rad > r - 1 then 0
+	         else if ic + sc < rad then 0
+	         else if ic + sc - rad > c - 1 then 0
+		 else input[ir + sr - rad, ic + sc - rad];
+	}
+      }
+      out[ir, ic] = acc;
+    }
+  }
+
+  return out;
+}
diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch
index 364673cd..c554fd50 100644
--- a/juno_samples/fork_join_tests/src/gpu.sch
+++ b/juno_samples/fork_join_tests/src/gpu.sch
@@ -7,12 +7,13 @@ no-memset(test3@const3);
 no-memset(test6@const);
 no-memset(test8@const1);
 no-memset(test8@const2);
+no-memset(test9@const);
 
 gvn(*);
 phi-elim(*);
 dce(*);
 
-let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8);
+let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9);
 gpu(auto.test1);
 gpu(auto.test2);
 gpu(auto.test3);
@@ -20,6 +21,7 @@ gpu(auto.test4);
 gpu(auto.test5);
 gpu(auto.test7);
 gpu(auto.test8);
+gpu(auto.test9);
 
 ip-sroa(*);
 sroa(*);
@@ -34,7 +36,7 @@ fixpoint panic after 20 {
 }
 
 fixpoint panic after 20 {
-  fork-coalesce(auto.test1, auto.test3, auto.test4, auto.test5, auto.test7, auto.test8);
+  fork-coalesce(auto.test1, auto.test3, auto.test4, auto.test5, auto.test7, auto.test8, auto.test9);
 }
 
 gvn(*);
diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs
index cd715cac..fa99f759 100644
--- a/juno_samples/fork_join_tests/src/main.rs
+++ b/juno_samples/fork_join_tests/src/main.rs
@@ -1,6 +1,6 @@
 #![feature(concat_idents)]
 
-use hercules_rt::runner;
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
 
 juno_build::juno!("fork_join_tests");
 
@@ -57,6 +57,23 @@ fn main() {
         let output = r.run(0).await;
         let correct = vec![10, 17, 24, 31, 38, 45, 52, 59];
         assert(&correct, output);
+
+        let mut r = runner!(test9);
+        let input = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
+        let input = HerculesImmBox::from(&input as &[i32]);
+        let output = r.run(3, 3, 3, input.to()).await;
+        let correct = vec![
+            1 + 2 + 4 + 5,
+            1 + 2 + 3 + 4 + 5 + 6,
+            2 + 3 + 5 + 6,
+            1 + 2 + 4 + 5 + 7 + 8,
+            1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
+            2 + 3 + 5 + 6 + 8 + 9,
+            4 + 5 + 7 + 8,
+            4 + 5 + 6 + 7 + 8 + 9,
+            5 + 6 + 8 + 9,
+        ];
+        assert(&correct, output);
     });
 }
 
-- 
GitLab


From 31db6796a6f4dc36b5343be69b38e3e53baf5e25 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 24 Feb 2025 16:14:26 -0600
Subject: [PATCH 05/17] Transform conv test

---
 hercules_opt/src/fork_transforms.rs           |  8 +++++++-
 .../fork_join_tests/src/fork_join_tests.jn    |  3 ++-
 juno_samples/fork_join_tests/src/gpu.sch      | 20 ++++++++++++++++++-
 juno_samples/fork_join_tests/src/main.rs      |  2 +-
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index e635b3c0..ae3dfe22 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -695,7 +695,8 @@ pub fn fork_coalesce_helper(
 
         editor.edit(|mut edit| {
             let new_tid = edit.add_node(new_tid);
-            let edit = edit.replace_all_uses(tid, new_tid)?;
+            let mut edit = edit.replace_all_uses(tid, new_tid)?;
+            edit.sub_edit(tid, new_tid);
             Ok(edit)
         });
     }
@@ -1598,3 +1599,8 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
         }
     }
 }
+
+/*
+ * Looks for reads in fork-joins that are linear in the thread IDs for the fork-
+ * join.
+ */
diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn
index 3b7c7833..334fc2bf 100644
--- a/juno_samples/fork_join_tests/src/fork_join_tests.jn
+++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn
@@ -124,7 +124,8 @@ fn test8(input : i32) -> i32[8] {
 }
 
 #[entry]
-fn test9<r, c, z : usize>(input : i32[r, c]) -> i32[r, c] {
+fn test9<r, c : usize>(input : i32[r, c]) -> i32[r, c] {
+  const z = 3;
   const rad = z / 2;
   @const let out : i32[r, c];
 
diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch
index c554fd50..ca17f692 100644
--- a/juno_samples/fork_join_tests/src/gpu.sch
+++ b/juno_samples/fork_join_tests/src/gpu.sch
@@ -73,6 +73,24 @@ dce(auto.test8);
 simplify-cfg(auto.test8);
 dce(auto.test8);
 
+fork-split(auto.test9@filter_loop);
+fork-unroll(auto.test9);
+fork-unroll(auto.test9);
+dce(auto.test9);
+ccp(auto.test9);
+gvn(auto.test9);
+phi-elim(auto.test9);
+dce(auto.test9);
+fixpoint {
+  predication(auto.test9);
+  simplify-cfg(auto.test9);
+}
+ccp(auto.test9);
+gvn(auto.test9);
+phi-elim(auto.test9);
+lift-dc-math(auto.test9);
+dce(auto.test9);
+
 ip-sroa(*);
 sroa(*);
 dce(*);
@@ -80,7 +98,7 @@ ccp(*);
 gvn(*);
 phi-elim(*);
 dce(*);
-gcm(*);
 
+gcm(*);
 float-collections(test2, auto.test2, test4, auto.test4, test5, auto.test5);
 gcm(*);
diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs
index fa99f759..e66309b2 100644
--- a/juno_samples/fork_join_tests/src/main.rs
+++ b/juno_samples/fork_join_tests/src/main.rs
@@ -61,7 +61,7 @@ fn main() {
         let mut r = runner!(test9);
         let input = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
         let input = HerculesImmBox::from(&input as &[i32]);
-        let output = r.run(3, 3, 3, input.to()).await;
+        let output = r.run(3, 3, input.to()).await;
         let correct = vec![
             1 + 2 + 4 + 5,
             1 + 2 + 3 + 4 + 5 + 6,
-- 
GitLab


From 87445e17a4e5defc180b560468e5ac5879848cea Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 24 Feb 2025 19:58:03 -0600
Subject: [PATCH 06/17] Fork extend pass

---
 hercules_opt/src/fork_transforms.rs | 126 +++++++++++++++++++++++++++-
 juno_samples/cava/src/cpu.sch       |   1 +
 juno_scheduler/src/compile.rs       |   1 +
 juno_scheduler/src/ir.rs            |   3 +
 juno_scheduler/src/pm.rs            |  24 ++++++
 5 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index ae3dfe22..0e943973 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1601,6 +1601,128 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
 }
 
 /*
- * Looks for reads in fork-joins that are linear in the thread IDs for the fork-
- * join.
+ * Extends the dimensions of a fork-join to be a multiple of a number and gates
+ * the execution of the body.
  */
+pub fn extend_all_forks(
+    editor: &mut FunctionEditor,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    multiple: usize,
+) {
+    for (fork, join) in fork_join_map {
+        if editor.is_mutable(*fork) {
+            extend_fork(editor, *fork, *join, multiple);
+        }
+    }
+}
+
+fn extend_fork(editor: &mut FunctionEditor, fork: NodeID, join: NodeID, multiple: usize) {
+    let nodes = &editor.func().nodes;
+    let (fork_pred, factors) = nodes[fork.idx()].try_fork().unwrap();
+    let factors = factors.to_vec();
+    let fork_succ = editor
+        .get_users(fork)
+        .filter(|id| nodes[id.idx()].is_control())
+        .next()
+        .unwrap();
+    let join_pred = nodes[join.idx()].try_join().unwrap();
+    let ctrl_between = fork != join_pred;
+    let reduces: Vec<_> = editor
+        .get_users(join)
+        .filter_map(|id| nodes[id.idx()].try_reduce().map(|x| (id, x)))
+        .collect();
+
+    editor.edit(|mut edit| {
+        // We can round up a dynamic constant A to a multiple of another dynamic
+        // constant B via the following math:
+        // ((A + B - 1) / B) * B
+        let new_factors: Vec<_> = factors
+            .iter()
+            .map(|factor| {
+                let b = edit.add_dynamic_constant(DynamicConstant::Constant(multiple));
+                let apb = edit.add_dynamic_constant(DynamicConstant::add(*factor, b));
+                let o = edit.add_dynamic_constant(DynamicConstant::Constant(1));
+                let apbmo = edit.add_dynamic_constant(DynamicConstant::sub(apb, o));
+                let apbmodb = edit.add_dynamic_constant(DynamicConstant::div(apbmo, b));
+                edit.add_dynamic_constant(DynamicConstant::mul(apbmodb, b))
+            })
+            .collect();
+
+        // Create the new control structure.
+        let new_fork = edit.add_node(Node::Fork {
+            control: fork_pred,
+            factors: new_factors.into_boxed_slice(),
+        });
+        edit = edit.replace_all_uses_where(fork, new_fork, |id| *id != fork_succ)?;
+        edit.sub_edit(fork, new_fork);
+        let conds: Vec<_> = factors
+            .iter()
+            .enumerate()
+            .map(|(idx, old_factor)| {
+                let tid = edit.add_node(Node::ThreadID {
+                    control: new_fork,
+                    dimension: idx,
+                });
+                let old_bound = edit.add_node(Node::DynamicConstant { id: *old_factor });
+                edit.add_node(Node::Binary {
+                    op: BinaryOperator::LT,
+                    left: tid,
+                    right: old_bound,
+                })
+            })
+            .collect();
+        let cond = conds
+            .into_iter()
+            .reduce(|left, right| {
+                edit.add_node(Node::Binary {
+                    op: BinaryOperator::And,
+                    left,
+                    right,
+                })
+            })
+            .unwrap();
+        let branch = edit.add_node(Node::If {
+            control: new_fork,
+            cond,
+        });
+        let false_proj = edit.add_node(Node::ControlProjection {
+            control: branch,
+            selection: 0,
+        });
+        let true_proj = edit.add_node(Node::ControlProjection {
+            control: branch,
+            selection: 1,
+        });
+        if ctrl_between {
+            edit = edit.replace_all_uses_where(fork, true_proj, |id| *id == fork_succ)?;
+        }
+        let bottom_region = edit.add_node(Node::Region {
+            preds: Box::new([false_proj, if ctrl_between { join_pred } else { true_proj }]),
+        });
+        let new_join = edit.add_node(Node::Join {
+            control: bottom_region,
+        });
+        edit = edit.replace_all_uses(join, new_join)?;
+        edit.sub_edit(join, new_join);
+        edit = edit.delete_node(fork)?;
+        edit = edit.delete_node(join)?;
+
+        // Update the reduces to use phis on the region node to gate their execution.
+        for (reduce, (_, init, reduct)) in reduces {
+            let phi = edit.add_node(Node::Phi {
+                control: bottom_region,
+                data: Box::new([reduce, reduct]),
+            });
+            let new_reduce = edit.add_node(Node::Reduce {
+                control: new_join,
+                init,
+                reduct: phi,
+            });
+            edit = edit.replace_all_uses(reduce, new_reduce)?;
+            edit.sub_edit(reduce, new_reduce);
+            edit = edit.delete_node(reduce)?;
+        }
+
+        Ok(edit)
+    });
+}
diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index 3ac2f326..efa7302e 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -49,6 +49,7 @@ simpl!(fuse1);
 write-predication(fuse1);
 simpl!(fuse1);
 parallel-reduce(fuse1@loop);
+fork-extend[8](fuse1);
 
 inline(fuse2);
 no-memset(fuse2@res);
diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs
index 13990ef9..3c288ca7 100644
--- a/juno_scheduler/src/compile.rs
+++ b/juno_scheduler/src/compile.rs
@@ -131,6 +131,7 @@ impl FromStr for Appliable {
             "fork-dim-merge" => Ok(Appliable::Pass(ir::Pass::ForkDimMerge)),
             "fork-interchange" => Ok(Appliable::Pass(ir::Pass::ForkInterchange)),
             "fork-chunk" | "fork-tile" => Ok(Appliable::Pass(ir::Pass::ForkChunk)),
+            "fork-extend" => Ok(Appliable::Pass(ir::Pass::ForkExtend)),
             "fork-unroll" | "unroll" => Ok(Appliable::Pass(ir::Pass::ForkUnroll)),
             "fork-fusion" | "fusion" => Ok(Appliable::Pass(ir::Pass::ForkFusion)),
             "lift-dc-math" => Ok(Appliable::Pass(ir::Pass::LiftDCMath)),
diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs
index bbecc6ff..3a087c0d 100644
--- a/juno_scheduler/src/ir.rs
+++ b/juno_scheduler/src/ir.rs
@@ -15,6 +15,7 @@ pub enum Pass {
     ForkChunk,
     ForkCoalesce,
     ForkDimMerge,
+    ForkExtend,
     ForkFissionBufferize,
     ForkFusion,
     ForkGuardElim,
@@ -53,6 +54,7 @@ impl Pass {
         match self {
             Pass::ArrayToProduct => num == 0 || num == 1,
             Pass::ForkChunk => num == 4,
+            Pass::ForkExtend => num == 1,
             Pass::ForkFissionBufferize => num == 2 || num == 1,
             Pass::ForkInterchange => num == 2,
             Pass::Print => num == 1,
@@ -68,6 +70,7 @@ impl Pass {
         match self {
             Pass::ArrayToProduct => "0 or 1",
             Pass::ForkChunk => "4",
+            Pass::ForkExtend => "1",
             Pass::ForkFissionBufferize => "1 or 2",
             Pass::ForkInterchange => "2",
             Pass::Print => "1",
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index d5e280b4..4656d841 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -2642,6 +2642,30 @@ fn run_pass(
             pm.delete_gravestones();
             pm.clear_analyses();
         }
+        Pass::ForkExtend => {
+            assert_eq!(args.len(), 1);
+            let Some(Value::Integer { val: multiple }) = args.get(0) else {
+                return Err(SchedulerError::PassError {
+                    pass: "forkExtend".to_string(),
+                    error: "expected integer argument".to_string(),
+                });
+            };
+
+            pm.make_fork_join_maps();
+            let fork_join_maps = pm.fork_join_maps.take().unwrap();
+            for (func, fork_join_map) in build_selection(pm, selection, false)
+                .into_iter()
+                .zip(fork_join_maps.iter())
+            {
+                let Some(mut func) = func else {
+                    continue;
+                };
+                extend_all_forks(&mut func, fork_join_map, *multiple);
+                changed |= func.modified();
+            }
+            pm.delete_gravestones();
+            pm.clear_analyses();
+        }
         Pass::ForkFissionBufferize => {
             assert!(args.len() == 1 || args.len() == 2);
             let Some(Value::Label {
-- 
GitLab


From 1aceb18f344505c2433cc54ab40e0a4538318f5e Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 24 Feb 2025 20:35:12 -0600
Subject: [PATCH 07/17] parallelize gamut in cava on cpu

---
 hercules_opt/src/fork_transforms.rs |  4 +++-
 juno_samples/cava/src/cava.jn       |  2 +-
 juno_samples/cava/src/cpu.sch       | 12 ++++++++++--
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index 0e943973..ff0f0283 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -916,7 +916,9 @@ pub fn chunk_all_forks_unguarded(
     };
 
     for (fork, _) in fork_join_map {
-        chunk_fork_unguarded(editor, *fork, dim_idx, dc_id, order);
+        if editor.is_mutable(*fork) {
+            chunk_fork_unguarded(editor, *fork, dim_idx, dc_id, order);
+        }
     }
 }
 // Splits a dimension of a single fork join into multiple.
diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn
index dbe799f9..4d02b2cd 100644
--- a/juno_samples/cava/src/cava.jn
+++ b/juno_samples/cava/src/cava.jn
@@ -142,7 +142,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
 ) -> f32[CHAN, row, col] {
   @res let result : f32[CHAN, row, col];
 
-  for r = 0 to row {
+  @image_loop for r = 0 to row {
     for c = 0 to col {
       @l2 let l2_dist : f32[num_ctrl_pts];
       for cp = 0 to num_ctrl_pts {
diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index efa7302e..8f22b37d 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -113,6 +113,14 @@ fixpoint {
 simpl!(fuse4);
 array-slf(fuse4);
 simpl!(fuse4);
+let par = fuse4@image_loop \ fuse4@channel_loop;
+fork-tile[4, 1, false, false](par);
+fork-tile[4, 0, false, false](par);
+fork-interchange[1, 2](par);
+let split = fork-split(par);
+let fuse4_body = outline(split.cava_3.fj2);
+fork-coalesce(fuse4, fuse4_body);
+simpl!(fuse4, fuse4_body);
 
 no-memset(fuse5@res1);
 no-memset(fuse5@res2);
@@ -128,8 +136,8 @@ simpl!(fuse5);
 delete-uncalled(*);
 simpl!(*);
 
-fork-split(fuse1, fuse2, fuse3, fuse4, fuse5);
-unforkify(fuse1, fuse2, fuse3, fuse4, fuse5);
+fork-split(fuse1, fuse2, fuse3, fuse4_body, fuse5);
+unforkify(fuse1, fuse2, fuse3, fuse4_body, fuse5);
 
 simpl!(*);
 
-- 
GitLab


From d83d48cf583e9e0454f048f2b9ebaff4f9c73ab8 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 24 Feb 2025 20:55:23 -0600
Subject: [PATCH 08/17] parallelize parts of edge

---
 juno_samples/edge_detection/src/cpu.sch       | 28 +++++++++++++++++--
 .../edge_detection/src/edge_detection.jn      | 24 ++++++++--------
 2 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index ead722ce..cb65d183 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -24,6 +24,14 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
+let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
+fork-tile[4, 1, false, false](par);
+fork-tile[4, 0, false, false](par);
+fork-interchange[1, 2](par);
+let split = fork-split(par);
+let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
+fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
+simpl!(gaussian_smoothing, gaussian_smoothing_body);
 
 no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
 fixpoint {
@@ -32,6 +40,14 @@ fixpoint {
   fork-coalesce(laplacian_estimate);
 }
 simpl!(laplacian_estimate);
+let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
+fork-tile[4, 1, false, false](par);
+fork-tile[4, 0, false, false](par);
+fork-interchange[1, 2](par);
+let split = fork-split(par);
+let laplacian_estimate_body = outline(split._1_laplacian_estimate.fj2);
+fork-coalesce(laplacian_estimate, laplacian_estimate_body);
+simpl!(laplacian_estimate, laplacian_estimate_body);
 
 no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
 fixpoint {
@@ -40,6 +56,14 @@ fixpoint {
   fork-coalesce(zero_crossings);
 }
 simpl!(zero_crossings);
+let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
+fork-tile[4, 1, false, false](par);
+fork-tile[4, 0, false, false](par);
+fork-interchange[1, 2](par);
+let split = fork-split(par);
+let zero_crossings_body = outline(split._2_zero_crossings.fj2);
+fork-coalesce(zero_crossings, zero_crossings_body);
+simpl!(zero_crossings, zero_crossings_body);
 
 no-memset(gradient@res);
 fixpoint {
@@ -81,8 +105,8 @@ simpl!(reject_zero_crossings);
 
 async-call(edge_detection@le, edge_detection@zc);
 
-fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
-unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
+fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings);
+unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings);
 
 simpl!(*);
 
diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn
index e1413488..0b8e71da 100644
--- a/juno_samples/edge_detection/src/edge_detection.jn
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -7,11 +7,11 @@ fn gaussian_smoothing<n, m, gs : usize>(
   // Define the gaussian radius as half the gaussian size
   const gr = gs / 2;
 
-  for row = 0 to n {
+  @image_loop for row = 0 to n {
     for col = 0 to m {
       let smoothed = 0.0;
 
-      for i = 0 to gs {
+      @filter_loop for i = 0 to gs {
         for j = 0 to gs {
           let val = input[if row + i < gr               then 0
                           else if row + i - gr > n - 1  then n - 1
@@ -41,11 +41,11 @@ fn laplacian_estimate<n, m, sz: usize>(
 
   @res let result : f32[n, m];
 
-  for row = 0 to n {
+  @image_loop for row = 0 to n {
     for col = 0 to m {
       // Copy data for dilation filter
       @shr1 let imageArea : f32[sz, sz];
-      for i = 0 to sz {
+      @filter_loop for i = 0 to sz {
         for j = 0 to sz {
           imageArea[i, j] = if row + i < r              then MIN_BR
                             else if row + i - r > n - 1 then MIN_BR
@@ -57,7 +57,7 @@ fn laplacian_estimate<n, m, sz: usize>(
 
       // Compute pixel of dilated image
       let dilated = MIN_BR;
-      for i = 0 to sz {
+      @filter_loop for i = 0 to sz {
         for j = 0 to sz {
           dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
         }
@@ -65,7 +65,7 @@ fn laplacian_estimate<n, m, sz: usize>(
 
       // Data copy for erotion filter
       @shr2 let imageArea : f32[sz, sz];
-      for i = 0 to sz {
+      @filter_loop for i = 0 to sz {
         for j = 0 to sz {
           imageArea[i, j] = if row + i < r              then MAX_BR
                             else if row + i - r > n - 1 then MAX_BR
@@ -77,7 +77,7 @@ fn laplacian_estimate<n, m, sz: usize>(
 
       // Compute pixel of eroded image
       let eroded = MAX_BR;
-      for i = 0 to sz {
+      @filter_loop for i = 0 to sz {
         for j = 0 to sz {
           eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
         }
@@ -99,11 +99,11 @@ fn zero_crossings<n, m, sz: usize>(
 
   @res let result : f32[n, m];
 
-  for row = 0 to n {
+  @image_loop for row = 0 to n {
     for col = 0 to m {
       // Data copy for dilation filter
       @shr1 let imageArea : f32[sz, sz];
-      for i = 0 to sz {
+      @filter_loop for i = 0 to sz {
         for j = 0 to sz {
           imageArea[i, j] = if row + i < r              then MIN_BR
                             else if row + i - r > n - 1 then MIN_BR
@@ -117,7 +117,7 @@ fn zero_crossings<n, m, sz: usize>(
 
       // Compute the pixel of dilated image
       let dilated = MIN_BR;
-      for i = 0 to sz {
+      @filter_loop for i = 0 to sz {
         for j = 0 to sz {
           dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
         }
@@ -125,7 +125,7 @@ fn zero_crossings<n, m, sz: usize>(
 
       // Data copy for erotion filter
       @shr2 let imageArea : f32[sz, sz];
-      for i = 0 to sz {
+      @filter_loop for i = 0 to sz {
         for j = 0 to sz {
           imageArea[i, j] = if row + i < r              then MAX_BR
                             else if row + i - r > n - 1 then MAX_BR
@@ -139,7 +139,7 @@ fn zero_crossings<n, m, sz: usize>(
 
       // Compute the pixel of eroded image
       let eroded = MAX_BR;
-      for i = 0 to sz {
+      @filter_loop for i = 0 to sz {
         for j = 0 to sz {
           eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
         }
-- 
GitLab


From 789d72b182445316e8789584ee57449372d3e9c0 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 24 Feb 2025 21:12:27 -0600
Subject: [PATCH 09/17] whoops

---
 juno_samples/edge_detection/src/cpu.sch | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index cb65d183..8f715c39 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -40,14 +40,6 @@ fixpoint {
   fork-coalesce(laplacian_estimate);
 }
 simpl!(laplacian_estimate);
-let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
-fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
-fork-interchange[1, 2](par);
-let split = fork-split(par);
-let laplacian_estimate_body = outline(split._1_laplacian_estimate.fj2);
-fork-coalesce(laplacian_estimate, laplacian_estimate_body);
-simpl!(laplacian_estimate, laplacian_estimate_body);
 
 no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
 fixpoint {
@@ -56,14 +48,6 @@ fixpoint {
   fork-coalesce(zero_crossings);
 }
 simpl!(zero_crossings);
-let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
-fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
-fork-interchange[1, 2](par);
-let split = fork-split(par);
-let zero_crossings_body = outline(split._2_zero_crossings.fj2);
-fork-coalesce(zero_crossings, zero_crossings_body);
-simpl!(zero_crossings, zero_crossings_body);
 
 no-memset(gradient@res);
 fixpoint {
@@ -105,8 +89,8 @@ simpl!(reject_zero_crossings);
 
 async-call(edge_detection@le, edge_detection@zc);
 
-fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings);
-unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings);
+fork-split(gaussian_smoothing_body, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
+unforkify(gaussian_smoothing_body, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
 
 simpl!(*);
 
-- 
GitLab


From 3fe286ccd2289d714d0518a19de50b67dfccd13d Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 24 Feb 2025 21:13:39 -0600
Subject: [PATCH 10/17] whoops

---
 juno_samples/fork_join_tests/src/gpu.sch | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch
index ca17f692..81dc8d98 100644
--- a/juno_samples/fork_join_tests/src/gpu.sch
+++ b/juno_samples/fork_join_tests/src/gpu.sch
@@ -73,23 +73,7 @@ dce(auto.test8);
 simplify-cfg(auto.test8);
 dce(auto.test8);
 
-fork-split(auto.test9@filter_loop);
-fork-unroll(auto.test9);
-fork-unroll(auto.test9);
-dce(auto.test9);
-ccp(auto.test9);
-gvn(auto.test9);
-phi-elim(auto.test9);
-dce(auto.test9);
-fixpoint {
-  predication(auto.test9);
-  simplify-cfg(auto.test9);
-}
-ccp(auto.test9);
-gvn(auto.test9);
-phi-elim(auto.test9);
-lift-dc-math(auto.test9);
-dce(auto.test9);
+no-memset(test9@const);
 
 ip-sroa(*);
 sroa(*);
-- 
GitLab


From 90a737cf55d2915bb262a646adca17cd94ec1725 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 24 Feb 2025 21:24:15 -0600
Subject: [PATCH 11/17] oops, parallel launch of functions containing
 allocations causes data races. TODO: fix in gcm

This reverts commit 789d72b182445316e8789584ee57449372d3e9c0.

This reverts commit 789d72b182445316e8789584ee57449372d3e9c0.
---
 hercules_cg/src/rt.rs                         |  2 +-
 juno_samples/edge_detection/src/cpu.sch       | 20 ++++++++-
 .../edge_detection/src/edge_detection.jn      | 42 +++++++++----------
 juno_samples/edge_detection/src/lib.rs        |  9 +++-
 4 files changed, 47 insertions(+), 26 deletions(-)

diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 884129c7..ddfa9503 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -529,7 +529,7 @@ impl<'a> RTContext<'a> {
                 write!(block, "{} = ", self.get_value(id, bb, true))?;
                 let mut size_and_device = None;
                 match self.module.constants[cons_id.idx()] {
-                    Constant::Boolean(val) => write!(block, "{}bool", val)?,
+                    Constant::Boolean(val) => write!(block, "{}", val)?,
                     Constant::Integer8(val) => write!(block, "{}i8", val)?,
                     Constant::Integer16(val) => write!(block, "{}i16", val)?,
                     Constant::Integer32(val) => write!(block, "{}i32", val)?,
diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index 8f715c39..a1974d05 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -40,6 +40,14 @@ fixpoint {
   fork-coalesce(laplacian_estimate);
 }
 simpl!(laplacian_estimate);
+let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
+fork-tile[4, 1, false, false](par);
+fork-tile[4, 0, false, false](par);
+fork-interchange[1, 2](par);
+let split = fork-split(par);
+let laplacian_estimate_body = outline(split._1_laplacian_estimate.fj2);
+fork-coalesce(laplacian_estimate, laplacian_estimate_body);
+simpl!(laplacian_estimate, laplacian_estimate_body);
 
 no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
 fixpoint {
@@ -48,6 +56,14 @@ fixpoint {
   fork-coalesce(zero_crossings);
 }
 simpl!(zero_crossings);
+let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
+fork-tile[4, 1, false, false](par);
+fork-tile[4, 0, false, false](par);
+fork-interchange[1, 2](par);
+let split = fork-split(par);
+let zero_crossings_body = outline(split._2_zero_crossings.fj2);
+fork-coalesce(zero_crossings, zero_crossings_body);
+simpl!(zero_crossings, zero_crossings_body);
 
 no-memset(gradient@res);
 fixpoint {
@@ -89,8 +105,8 @@ simpl!(reject_zero_crossings);
 
 async-call(edge_detection@le, edge_detection@zc);
 
-fork-split(gaussian_smoothing_body, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
-unforkify(gaussian_smoothing_body, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
+fork-split(gaussian_smoothing_body, laplacian_estimate, laplacian_estimate_body, zero_crossings, zero_crossings_body, gradient, reject_zero_crossings);
+unforkify(gaussian_smoothing_body, laplacian_estimate, laplacian_estimate_body, zero_crossings, zero_crossings_body, gradient, reject_zero_crossings);
 
 simpl!(*);
 
diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn
index 0b8e71da..58f364dc 100644
--- a/juno_samples/edge_detection/src/edge_detection.jn
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -47,11 +47,11 @@ fn laplacian_estimate<n, m, sz: usize>(
       @shr1 let imageArea : f32[sz, sz];
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          imageArea[i, j] = if row + i < r              then MIN_BR
-                            else if row + i - r > n - 1 then MIN_BR
-                            else if col + j < r         then MIN_BR
-                            else if col + j - r > m - 1 then MIN_BR
-                                 else input[row + i - r, col + j - r];
+          imageArea[i, j] = if row + i < r
+	                       || row + i - r > n - 1
+			       || col + j < r
+			       || col + j - r > m - 1 then MIN_BR
+			    else input[row + i - r, col + j - r];
         }
       }
 
@@ -67,11 +67,11 @@ fn laplacian_estimate<n, m, sz: usize>(
       @shr2 let imageArea : f32[sz, sz];
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          imageArea[i, j] = if row + i < r              then MAX_BR
-                            else if row + i - r > n - 1 then MAX_BR
-                            else if col + j < r         then MAX_BR
-                            else if col + j - r > m - 1 then MAX_BR
-                                 else input[row + i - r, col + j - r];
+          imageArea[i, j] = if row + i < r
+	                       || row + i - r > n - 1
+			       || col + j < r
+			       || col + j - r > m - 1 then MAX_BR
+			    else input[row + i - r, col + j - r];
         }
       }
 
@@ -105,12 +105,11 @@ fn zero_crossings<n, m, sz: usize>(
       @shr1 let imageArea : f32[sz, sz];
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          imageArea[i, j] = if row + i < r              then MIN_BR
-                            else if row + i - r > n - 1 then MIN_BR
-                            else if col + j < r         then MIN_BR
-                            else if col + j - r > m - 1 then MIN_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR
-                            then MAX_BR
+          imageArea[i, j] = if row + i < r
+                               || row + i - r > n - 1
+                               || col + j < r
+                               || col + j - r > m - 1 then MIN_BR
+                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
                             else MIN_BR;
         }
       }
@@ -127,12 +126,11 @@ fn zero_crossings<n, m, sz: usize>(
       @shr2 let imageArea : f32[sz, sz];
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          imageArea[i, j] = if row + i < r              then MAX_BR
-                            else if row + i - r > n - 1 then MAX_BR
-                            else if col + j < r         then MAX_BR
-                            else if col + j - r > m - 1 then MAX_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR
-                            then MAX_BR
+          imageArea[i, j] = if row + i < r
+                               || row + i - r > n - 1
+                               || col + j < r
+                               || col + j - r > m - 1 then MAX_BR
+                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
                             else MIN_BR;
         }
       }
diff --git a/juno_samples/edge_detection/src/lib.rs b/juno_samples/edge_detection/src/lib.rs
index dab84cf6..aa44e2e7 100644
--- a/juno_samples/edge_detection/src/lib.rs
+++ b/juno_samples/edge_detection/src/lib.rs
@@ -234,7 +234,14 @@ pub fn edge_detection_harness(args: EdgeDetectionInputs) {
                 theta,
             );
 
-            assert_eq!(result, rust_result);
+            let mut all = true;
+            for idx in 0..rust_result.len() {
+                if result[idx] != rust_result[idx] {
+                    all = false;
+                    println!("Found mismatch in images at {}.", idx);
+                }
+            }
+            assert!(all);
             println!("Frames {} match", i);
 
             if display_verify {
-- 
GitLab


From ae8f17eb21ca53bbd1d9318518c4c2d62a3e1437 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 10:53:22 -0600
Subject: [PATCH 12/17] Look for arrays that are used in parallel

---
 hercules_cg/src/lib.rs         |  2 +-
 hercules_ir/src/collections.rs |  3 +-
 hercules_opt/src/gcm.rs        | 94 ++++++++++++++++++++++++++++++++++
 hercules_rt/src/lib.rs         |  2 +-
 juno_scheduler/src/pm.rs       |  3 ++
 5 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs
index 446231de..98f91e1f 100644
--- a/hercules_cg/src/lib.rs
+++ b/hercules_cg/src/lib.rs
@@ -16,7 +16,7 @@ use std::collections::BTreeMap;
 
 use hercules_ir::*;
 
-pub const LARGEST_ALIGNMENT: usize = 32;
+pub const LARGEST_ALIGNMENT: usize = 64;
 
 /*
  * The alignment of a type does not depend on dynamic constants.
diff --git a/hercules_ir/src/collections.rs b/hercules_ir/src/collections.rs
index 60f4fb1c..a8fc3c67 100644
--- a/hercules_ir/src/collections.rs
+++ b/hercules_ir/src/collections.rs
@@ -202,7 +202,7 @@ pub fn collection_objects(
                     let fco = &collection_objects[&callee];
                     if fco.returned[*selection]
                         .iter()
-                        .any(|returned| fco.origins[returned.idx()].try_parameter().is_some())
+                        .any(|returned| fco.origins[returned.idx()].try_parameter().is_none())
                     {
                         // If the callee may return a new collection object, then
                         // this data projection node originates a single collection object. The
@@ -412,6 +412,7 @@ pub fn collection_objects(
             }
         }
 
+        assert_eq!(objects_per_node.len(), func.nodes.len());
         let fco = FunctionCollectionObjects {
             objects_per_node,
             mutated,
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index d3119705..ae8801f6 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -82,6 +82,7 @@ pub fn gcm(
     control_subgraph: &Subgraph,
     dom: &DomTree,
     fork_join_map: &HashMap<NodeID, NodeID>,
+    fork_join_nest: &HashMap<NodeID, Vec<NodeID>>,
     loops: &LoopTree,
     reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>,
     objects: &CollectionObjects,
@@ -120,6 +121,18 @@ pub fn gcm(
         return None;
     }
 
+    if add_extra_collection_dims(
+        editor,
+        typing,
+        fork_join_map,
+        fork_join_nest,
+        objects,
+        devices,
+        &bbs,
+    ) {
+        return None;
+    }
+
     let Some(node_colors) = color_nodes(editor, typing, &objects, &devices, node_colors) else {
         return None;
     };
@@ -1027,6 +1040,87 @@ fn spill_clones(
     }
 }
 
+/*
+ * Look for mutated collections placed inside fork-joins in AsyncRust functions.
+ * These collections should be duplicated across the size of the fork-join.
+ */
+fn add_extra_collection_dims(
+    editor: &mut FunctionEditor,
+    typing: &Vec<TypeID>,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    fork_join_nest: &HashMap<NodeID, Vec<NodeID>>,
+    objects: &CollectionObjects,
+    devices: &Vec<Device>,
+    bbs: &BasicBlocks,
+) -> bool {
+    if devices[editor.func_id().idx()] == Device::AsyncRust
+        && editor.func().name == "_1_laplacian_estimate"
+    {
+        // Look for collection constant nodes inside fork-joins that are mutated
+        // inside the fork-join, aren't involved in any of the reduces of the
+        // fork-join, and have a user that isn't a direct read based on all of
+        // the thread IDs.
+        let nodes = &editor.func().nodes;
+        let fco = &objects[&editor.func_id()];
+        for id in editor.node_ids().filter(|id| {
+            nodes[id.idx()].is_constant() && !editor.get_type(typing[id.idx()]).is_primitive()
+        }) {
+            // Check all of the above conditions.
+            if editor.get_users(id).len() != 1 {
+                continue;
+            }
+            let forks = &fork_join_nest[&bbs.0[id.idx()]];
+            if forks.is_empty() {
+                continue;
+            }
+            let object = fco.objects(id)[0];
+            let mutated_inside = fco
+                .mutators(object)
+                .into_iter()
+                .any(|id| &fork_join_nest[&bbs.0[id.idx()]] == forks);
+            if !mutated_inside {
+                continue;
+            }
+            let in_reduce = forks.into_iter().any(|id| {
+                let join = fork_join_map[id];
+                let mut reduces = editor
+                    .get_users(join)
+                    .filter(|id| nodes[id.idx()].is_reduce());
+                reduces.any(|id| fco.objects(id).contains(&object))
+            });
+            if in_reduce {
+                continue;
+            }
+            if let Node::Read {
+                collect: _,
+                ref indices,
+            } = nodes[editor.get_users(id).next().unwrap().idx()]
+                && let Index::Position(ref pos) = indices[0]
+                && {
+                    let tid_pos: BTreeSet<(NodeID, usize)> = pos
+                        .into_iter()
+                        .filter_map(|id| nodes[id.idx()].try_thread_id())
+                        .collect();
+                    let reference: BTreeSet<(NodeID, usize)> = forks
+                        .into_iter()
+                        .flat_map(|id| {
+                            (0..nodes[id.idx()].try_fork().unwrap().1.len()).map(|dim| (*id, dim))
+                        })
+                        .collect();
+                    tid_pos == reference
+                }
+            {
+                continue;
+            }
+
+            // We know that this collection needs to be replicated across the
+            // fork-join dimensions, so do that.
+            todo!()
+        }
+    }
+    false
+}
+
 type Liveness = BTreeMap<NodeID, Vec<BTreeSet<NodeID>>>;
 
 /*
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index df53a0e9..a5954ca0 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -13,7 +13,7 @@ use std::sync::OnceLock;
  * src/rt.rs (the RT backend).
  */
 
-pub const LARGEST_ALIGNMENT: usize = 32;
+pub const LARGEST_ALIGNMENT: usize = 64;
 
 pub unsafe fn __cpu_alloc(size: usize) -> *mut u8 {
     let ptr = alloc(Layout::from_size_align(size, LARGEST_ALIGNMENT).unwrap());
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 4656d841..77437a61 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -2105,6 +2105,7 @@ fn run_pass(
                 pm.make_control_subgraphs();
                 pm.make_doms();
                 pm.make_fork_join_maps();
+                pm.make_fork_join_nests();
                 pm.make_loops();
                 pm.make_reduce_cycles();
                 pm.make_collection_objects();
@@ -2115,6 +2116,7 @@ fn run_pass(
                 let typing = pm.typing.take().unwrap();
                 let doms = pm.doms.take().unwrap();
                 let fork_join_maps = pm.fork_join_maps.take().unwrap();
+                let fork_join_nests = pm.fork_join_nests.take().unwrap();
                 let loops = pm.loops.take().unwrap();
                 let reduce_cycles = pm.reduce_cycles.take().unwrap();
                 let control_subgraphs = pm.control_subgraphs.take().unwrap();
@@ -2136,6 +2138,7 @@ fn run_pass(
                         &control_subgraphs[id.idx()],
                         &doms[id.idx()],
                         &fork_join_maps[id.idx()],
+                        &fork_join_nests[id.idx()],
                         &loops[id.idx()],
                         &reduce_cycles[id.idx()],
                         &collection_objects,
-- 
GitLab


From 65aae3b11da86b12f108fda7264becf6f2af5816 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 10:59:45 -0600
Subject: [PATCH 13/17] fix cfd to use proper alignment

---
 juno_samples/rodinia/cfd/src/main.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/juno_samples/rodinia/cfd/src/main.rs b/juno_samples/rodinia/cfd/src/main.rs
index 1ce6b89a..fab241fa 100644
--- a/juno_samples/rodinia/cfd/src/main.rs
+++ b/juno_samples/rodinia/cfd/src/main.rs
@@ -151,7 +151,7 @@ fn cfd_harness(args: CFDInputs) {
         pre_euler,
     } = args;
 
-    assert!(block_size % 8 == 0, "Hercules expects all arrays to be 32-byte aligned, cfd uses structs of arrays that are annoying to deal with if the block_size is not a multiple of 8");
+    assert!(block_size % 16 == 0, "Hercules expects all arrays to be 64-byte aligned, cfd uses structs of arrays that are annoying to deal with if the block_size is not a multiple of 16");
 
     let FarFieldConditions {
         ff_variable,
@@ -245,7 +245,7 @@ fn test_euler() {
     cfd_harness(CFDInputs {
         data_file: "data/fvcorr.domn.097K".to_string(),
         iterations: 1,
-        block_size: 8,
+        block_size: 16,
         pre_euler: false,
     });
 }
@@ -255,7 +255,7 @@ fn test_pre_euler() {
     cfd_harness(CFDInputs {
         data_file: "data/fvcorr.domn.097K".to_string(),
         iterations: 1,
-        block_size: 8,
+        block_size: 16,
         pre_euler: true,
     });
 }
-- 
GitLab


From 37567e75c3f7adbb96248a12084bb0ed51914f5b Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 11:30:47 -0600
Subject: [PATCH 14/17] Add extra dimensions to arrays when they'd cause a race
 otherwise

---
 hercules_opt/src/gcm.rs | 54 +++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index ae8801f6..939f1502 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -1053,19 +1053,22 @@ fn add_extra_collection_dims(
     devices: &Vec<Device>,
     bbs: &BasicBlocks,
 ) -> bool {
-    if devices[editor.func_id().idx()] == Device::AsyncRust
-        && editor.func().name == "_1_laplacian_estimate"
-    {
+    if devices[editor.func_id().idx()] == Device::AsyncRust {
         // Look for collection constant nodes inside fork-joins that are mutated
         // inside the fork-join, aren't involved in any of the reduces of the
         // fork-join, and have a user that isn't a direct read based on all of
         // the thread IDs.
-        let nodes = &editor.func().nodes;
         let fco = &objects[&editor.func_id()];
-        for id in editor.node_ids().filter(|id| {
-            nodes[id.idx()].is_constant() && !editor.get_type(typing[id.idx()]).is_primitive()
-        }) {
+        let candidates: Vec<_> = editor
+            .node_ids()
+            .filter(|id| {
+                editor.func().nodes[id.idx()].is_constant()
+                    && !editor.get_type(typing[id.idx()]).is_primitive()
+            })
+            .collect();
+        for id in candidates {
             // Check all of the above conditions.
+            let nodes = &editor.func().nodes;
             if editor.get_users(id).len() != 1 {
                 continue;
             }
@@ -1115,7 +1118,42 @@ fn add_extra_collection_dims(
 
             // We know that this collection needs to be replicated across the
             // fork-join dimensions, so do that.
-            todo!()
+            let ty = typing[id.idx()];
+            let num_dims: Vec<_> = forks
+                .into_iter()
+                .rev()
+                .map(|id| nodes[id.idx()].try_fork().unwrap().1.len())
+                .collect();
+            let factors = forks
+                .into_iter()
+                .rev()
+                .flat_map(|id| nodes[id.idx()].try_fork().unwrap().1.into_iter())
+                .map(|dc| *dc)
+                .collect();
+            let array_ty = Type::Array(ty, factors);
+            let success = editor.edit(|mut edit| {
+                let new_ty = edit.add_type(array_ty);
+                let new_cons = edit.add_zero_constant(new_ty);
+                let new_cons = edit.add_node(Node::Constant { id: new_cons });
+                let mut tids = vec![];
+                for (fork, num_dims) in forks.into_iter().rev().zip(num_dims) {
+                    for dim in 0..num_dims {
+                        tids.push(edit.add_node(Node::ThreadID {
+                            control: *fork,
+                            dimension: dim,
+                        }));
+                    }
+                }
+                let read = edit.add_node(Node::Read {
+                    collect: new_cons,
+                    indices: Box::new([Index::Position(tids.into_boxed_slice())]),
+                });
+                edit = edit.replace_all_uses(id, read)?;
+                edit = edit.delete_node(id)?;
+                Ok(edit)
+            });
+            assert!(success);
+            return true;
         }
     }
     false
-- 
GitLab


From 3d1e5b15ead87939b8f0d941c88d7c412202ee51 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 11:37:51 -0600
Subject: [PATCH 15/17] Outline constants as well

---
 juno_samples/edge_detection/src/cpu.sch | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index a1974d05..6f1ee14b 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -45,7 +45,8 @@ fork-tile[4, 1, false, false](par);
 fork-tile[4, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let laplacian_estimate_body = outline(split._1_laplacian_estimate.fj2);
+let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2;
+let laplacian_estimate_body = outline(body);
 fork-coalesce(laplacian_estimate, laplacian_estimate_body);
 simpl!(laplacian_estimate, laplacian_estimate_body);
 
@@ -61,7 +62,8 @@ fork-tile[4, 1, false, false](par);
 fork-tile[4, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let zero_crossings_body = outline(split._2_zero_crossings.fj2);
+let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2;
+let zero_crossings_body = outline(body);
 fork-coalesce(zero_crossings, zero_crossings_body);
 simpl!(zero_crossings, zero_crossings_body);
 
-- 
GitLab


From 2b60a19fddaf6cb7f3f2c46cff3212e73ca5e2a7 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 11:50:59 -0600
Subject: [PATCH 16/17] Allocate more memory for calls inside fork-joins

---
 hercules_opt/src/gcm.rs | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index 939f1502..b415371f 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -152,6 +152,7 @@ pub fn gcm(
     let backing_allocation = object_allocation(
         editor,
         typing,
+        fork_join_nest,
         &node_colors,
         &alignments,
         &liveness,
@@ -1148,6 +1149,7 @@ fn add_extra_collection_dims(
                     collect: new_cons,
                     indices: Box::new([Index::Position(tids.into_boxed_slice())]),
                 });
+                edit.sub_edit(id, new_cons);
                 edit = edit.replace_all_uses(id, read)?;
                 edit = edit.delete_node(id)?;
                 Ok(edit)
@@ -1639,6 +1641,7 @@ fn type_size(edit: &mut FunctionEdit, ty_id: TypeID, alignments: &Vec<usize>) ->
 fn object_allocation(
     editor: &mut FunctionEditor,
     typing: &Vec<TypeID>,
+    fork_join_nest: &HashMap<NodeID, Vec<NodeID>>,
     node_colors: &FunctionNodeColors,
     alignments: &Vec<usize>,
     _liveness: &Liveness,
@@ -1664,7 +1667,7 @@ fn object_allocation(
                     }
                 }
                 Node::Call {
-                    control: _,
+                    control,
                     function: callee,
                     ref dynamic_constants,
                     args: _,
@@ -1694,9 +1697,25 @@ fn object_allocation(
                                 callee_backing_size,
                                 &mut edit,
                             );
+                            // Multiply the backing allocation size of the
+                            // callee by the number of parallel threads that
+                            // will call the function.
+                            let forks = &fork_join_nest[&control];
+                            let factors: Vec<_> = forks
+                                .into_iter()
+                                .rev()
+                                .flat_map(|id| edit.get_node(*id).try_fork().unwrap().1.into_iter())
+                                .map(|dc| *dc)
+                                .collect();
+                            let mut multiplied_callee_backing_size = callee_backing_size;
+                            for factor in factors {
+                                multiplied_callee_backing_size = edit.add_dynamic_constant(
+                                    DynamicConstant::mul(multiplied_callee_backing_size, factor),
+                                );
+                            }
                             *total = edit.add_dynamic_constant(DynamicConstant::add(
                                 *total,
-                                callee_backing_size,
+                                multiplied_callee_backing_size,
                             ));
                         }
                     }
-- 
GitLab


From af3175773528d77fa06b280e027029943e2539da Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 12:34:37 -0600
Subject: [PATCH 17/17] Use different allocations in parallel calls

---
 hercules_cg/src/cpu.rs                  |  2 +-
 hercules_cg/src/gpu.rs                  |  2 +-
 hercules_cg/src/lib.rs                  | 11 +++++++---
 hercules_cg/src/rt.rs                   | 29 +++++++++++++++++++------
 hercules_opt/src/gcm.rs                 |  6 ++---
 juno_samples/edge_detection/src/cpu.sch |  4 ++--
 juno_scheduler/src/pm.rs                |  4 +++-
 7 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index 6ad38fc0..552dc3a3 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -334,7 +334,7 @@ impl<'a> CPUContext<'a> {
                     }
                 } else {
                     let (_, offsets) = &self.backing_allocation[&Device::LLVM];
-                    let offset = offsets[&id];
+                    let offset = offsets[&id].0;
                     write!(
                         body,
                         "  {} = getelementptr i8, ptr %backing, i64 %dc{}\n",
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 76aba7e0..5f2feedd 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1290,7 +1290,7 @@ namespace cg = cooperative_groups;
                 if !is_primitive && state == KernelState::OutBlock {
                     assert!(self.function.schedules[id.idx()].contains(&Schedule::NoResetConstant), "PANIC: The CUDA backend cannot lower a global memory constant that has to be reset to zero. This is because we cannot efficiently implement a memset to the underlying memory of the constant due to the need for a grid level sync. Consider floating this collection outside the CUDA function and into an AsyncRust function, or attaching the NoResetConstant schedule to indicate that no memset is semantically necessary.");
                     let (_, offsets) = &self.backing_allocation[&Device::CUDA];
-                    let offset = offsets[&id];
+                    let offset = offsets[&id].0;
                     write!(
                         w,
                         "{}{} = backing + dc{};\n",
diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs
index 98f91e1f..9866400c 100644
--- a/hercules_cg/src/lib.rs
+++ b/hercules_cg/src/lib.rs
@@ -53,10 +53,15 @@ pub type NodeColors = BTreeMap<FunctionID, FunctionNodeColors>;
 /*
  * The allocation information of each function is a size of the backing memory
  * needed and offsets into that backing memory per constant object and call node
- * in the function.
+ * in the function (as well as their individual sizes).
  */
-pub type FunctionBackingAllocation =
-    BTreeMap<Device, (DynamicConstantID, BTreeMap<NodeID, DynamicConstantID>)>;
+pub type FunctionBackingAllocation = BTreeMap<
+    Device,
+    (
+        DynamicConstantID,
+        BTreeMap<NodeID, (DynamicConstantID, DynamicConstantID)>,
+    ),
+>;
 pub type BackingAllocations = BTreeMap<FunctionID, FunctionBackingAllocation>;
 pub const BACKED_DEVICES: [Device; 2] = [Device::LLVM, Device::CUDA];
 
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index ddfa9503..3db0f16f 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -80,7 +80,7 @@ pub fn rt_codegen<W: Write>(
     typing: &Vec<TypeID>,
     control_subgraph: &Subgraph,
     fork_join_map: &HashMap<NodeID, NodeID>,
-    fork_control_map: &HashMap<NodeID, HashSet<NodeID>>,
+    fork_join_nest: &HashMap<NodeID, Vec<NodeID>>,
     fork_tree: &HashMap<NodeID, HashSet<NodeID>>,
     nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>,
     collection_objects: &CollectionObjects,
@@ -103,7 +103,7 @@ pub fn rt_codegen<W: Write>(
         control_subgraph,
         fork_join_map,
         join_fork_map: &join_fork_map,
-        fork_control_map,
+        fork_join_nest,
         fork_tree,
         nodes_in_fork_joins,
         collection_objects,
@@ -124,7 +124,7 @@ struct RTContext<'a> {
     control_subgraph: &'a Subgraph,
     fork_join_map: &'a HashMap<NodeID, NodeID>,
     join_fork_map: &'a HashMap<NodeID, NodeID>,
-    fork_control_map: &'a HashMap<NodeID, HashSet<NodeID>>,
+    fork_join_nest: &'a HashMap<NodeID, Vec<NodeID>>,
     fork_tree: &'a HashMap<NodeID, HashSet<NodeID>>,
     nodes_in_fork_joins: &'a HashMap<NodeID, HashSet<NodeID>>,
     collection_objects: &'a CollectionObjects,
@@ -559,7 +559,7 @@ impl<'a> RTContext<'a> {
                     Constant::Product(ty, _)
                     | Constant::Summation(ty, _, _)
                     | Constant::Array(ty) => {
-                        let (device, offset) = self.backing_allocations[&self.func_id]
+                        let (device, (offset, _)) = self.backing_allocations[&self.func_id]
                             .iter()
                             .filter_map(|(device, (_, offsets))| {
                                 offsets.get(&id).map(|id| (*device, *id))
@@ -676,13 +676,28 @@ impl<'a> RTContext<'a> {
                     prefix,
                     self.module.functions[callee_id.idx()].name
                 )?;
-                for (device, offset) in self.backing_allocations[&self.func_id]
+                for (device, (offset, size)) in self.backing_allocations[&self.func_id]
                     .iter()
                     .filter_map(|(device, (_, offsets))| offsets.get(&id).map(|id| (*device, *id)))
                 {
-                    write!(block, "backing_{}.byte_add(", device.name())?;
+                    write!(block, "backing_{}.byte_add(((", device.name())?;
                     self.codegen_dynamic_constant(offset, block)?;
-                    write!(block, " as usize), ")?
+                    let forks = &self.fork_join_nest[&bb];
+                    if !forks.is_empty() {
+                        write!(block, ") + ")?;
+                        let mut linear_thread = "0".to_string();
+                        for fork in forks {
+                            let factors = func.nodes[fork.idx()].try_fork().unwrap().1;
+                            for (factor_idx, factor) in factors.into_iter().enumerate() {
+                                linear_thread = format!("({} *", linear_thread);
+                                self.codegen_dynamic_constant(*factor, &mut linear_thread)?;
+                                write!(linear_thread, " + tid_{}_{})", fork.idx(), factor_idx)?;
+                            }
+                        }
+                        write!(block, "{} * (", linear_thread)?;
+                        self.codegen_dynamic_constant(size, block)?;
+                    }
+                    write!(block, ")) as usize), ")?
                 }
                 for dc in dynamic_constants {
                     self.codegen_dynamic_constant(*dc, block)?;
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index b415371f..c612acac 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -1647,7 +1647,7 @@ fn object_allocation(
     _liveness: &Liveness,
     backing_allocations: &BackingAllocations,
 ) -> FunctionBackingAllocation {
-    let mut fba = BTreeMap::new();
+    let mut fba = FunctionBackingAllocation::new();
 
     let node_ids = editor.node_ids();
     editor.edit(|mut edit| {
@@ -1661,8 +1661,8 @@ fn object_allocation(
                         let (total, offsets) =
                             fba.entry(device).or_insert_with(|| (zero, BTreeMap::new()));
                         *total = align(&mut edit, *total, alignments[typing[id.idx()].idx()]);
-                        offsets.insert(id, *total);
                         let type_size = type_size(&mut edit, typing[id.idx()], alignments);
+                        offsets.insert(id, (*total, type_size));
                         *total = edit.add_dynamic_constant(DynamicConstant::add(*total, type_size));
                     }
                 }
@@ -1689,7 +1689,6 @@ fn object_allocation(
                             // We don't know the alignment requirement of the memory
                             // in the callee, so just assume the largest alignment.
                             *total = align(&mut edit, *total, LARGEST_ALIGNMENT);
-                            offsets.insert(id, *total);
                             // Substitute the dynamic constant parameters in the
                             // callee's backing size.
                             callee_backing_size = substitute_dynamic_constants(
@@ -1697,6 +1696,7 @@ fn object_allocation(
                                 callee_backing_size,
                                 &mut edit,
                             );
+                            offsets.insert(id, (*total, callee_backing_size));
                             // Multiply the backing allocation size of the
                             // callee by the number of parallel threads that
                             // will call the function.
diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index 6f1ee14b..4bd3254b 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -107,8 +107,8 @@ simpl!(reject_zero_crossings);
 
 async-call(edge_detection@le, edge_detection@zc);
 
-fork-split(gaussian_smoothing_body, laplacian_estimate, laplacian_estimate_body, zero_crossings, zero_crossings_body, gradient, reject_zero_crossings);
-unforkify(gaussian_smoothing_body, laplacian_estimate, laplacian_estimate_body, zero_crossings, zero_crossings_body, gradient, reject_zero_crossings);
+fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings);
+unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings);
 
 simpl!(*);
 
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 77437a61..5f2fa4cc 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -900,6 +900,7 @@ impl PassManager {
         self.make_typing();
         self.make_control_subgraphs();
         self.make_fork_join_maps();
+        self.make_fork_join_nests();
         self.make_fork_control_maps();
         self.make_fork_trees();
         self.make_nodes_in_fork_joins();
@@ -917,6 +918,7 @@ impl PassManager {
             typing: Some(typing),
             control_subgraphs: Some(control_subgraphs),
             fork_join_maps: Some(fork_join_maps),
+            fork_join_nests: Some(fork_join_nests),
             fork_control_maps: Some(fork_control_maps),
             fork_trees: Some(fork_trees),
             nodes_in_fork_joins: Some(nodes_in_fork_joins),
@@ -990,7 +992,7 @@ impl PassManager {
                     &typing[idx],
                     &control_subgraphs[idx],
                     &fork_join_maps[idx],
-                    &fork_control_maps[idx],
+                    &fork_join_nests[idx],
                     &fork_trees[idx],
                     &nodes_in_fork_joins[idx],
                     &collection_objects,
-- 
GitLab