From 72a61f40f290d9b5ced10b2c2caa406998b7c861 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 13:15:40 -0600
Subject: [PATCH 1/9] Optimize srad

---
 juno_samples/rodinia/srad/src/gpu.sch | 35 ++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch
index f736c0b7..f89b7ab8 100644
--- a/juno_samples/rodinia/srad/src/gpu.sch
+++ b/juno_samples/rodinia/srad/src/gpu.sch
@@ -41,15 +41,26 @@ fork-tile[32, 0, false, true](sum_loop);
 let out = fork-split(sum_loop);
 clean-monoid-reduces(sum_loop);
 simpl!(sum_loop);
-let fission = fork-fission[out.srad_0.fj0](sum_loop);
+
+let fission1 = fork-fission[out.srad_0.fj0](sum_loop);
+simpl!(sum_loop);
+fork-tile[32, 0, false, true](fission1.srad_0.fj_bottom);
+let out = fork-split(fission1.srad_0.fj_bottom);
+clean-monoid-reduces(sum_loop);
+simpl!(sum_loop);
+
+let fission2 = fork-fission[out.srad_0.fj0](sum_loop);
 simpl!(sum_loop);
-fork-tile[32, 0, false, true](fission.srad_0.fj_bottom);
-let out = fork-split(fission.srad_0.fj_bottom);
+fork-tile[32, 0, false, true](fission2.srad_0.fj_bottom);
+let out = fork-split(fission2.srad_0.fj_bottom);
 clean-monoid-reduces(sum_loop);
 simpl!(sum_loop);
-let top = outline(fission.srad_0.fj_top);
-let bottom = outline(out.srad_0.fj0);
-gpu(top, bottom);
+
+let first = outline(fission1.srad_0.fj_top);
+let second = outline(fission2.srad_0.fj_top);
+let third = outline(out.srad_0.fj0);
+gpu(first, second, third);
+const-inline[false](*);
 ip-sroa(*);
 sroa(*);
 simpl!(*);
@@ -60,4 +71,16 @@ dce(main_loops);
 fork-split(main_loops);
 simpl!(main_loops);
 
+fork-dim-merge(extract);
+fork-tile[32, 0, false, true](extract);
+dce(extract);
+fork-split(extract);
+simpl!(extract);
+
+fork-dim-merge(compress);
+fork-tile[32, 0, false, true](compress);
+dce(compress);
+fork-split(compress);
+simpl!(compress);
+
 gcm(*);
-- 
GitLab


From 82c77fe26f79846416cbf29553b9893ac176e936 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 13:50:00 -0600
Subject: [PATCH 2/9] Include record contents in UndefinedField scheduler error

---
 juno_scheduler/src/pm.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 62bdaf73..5c1dd477 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -1339,7 +1339,10 @@ fn interp_expr(
                     }
                 }
                 Value::Record { fields } => match fields.get(field) {
-                    None => Err(SchedulerError::UndefinedField(field.clone())),
+                    None => Err(SchedulerError::UndefinedField(format!(
+                        "{} not in {:?}",
+                        field, fields
+                    ))),
                     Some(v) => Ok((v.clone(), changed)),
                 },
             }
-- 
GitLab


From c3e539f9c0743647f2bff0d66c35e1b56d3cb563 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 14:23:45 -0600
Subject: [PATCH 3/9] Stop debug_print_math_expr from recursing on itself for intrinsics

---
 hercules_ir/src/einsum.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hercules_ir/src/einsum.rs b/hercules_ir/src/einsum.rs
index 6c2ca31b..3922506b 100644
--- a/hercules_ir/src/einsum.rs
+++ b/hercules_ir/src/einsum.rs
@@ -435,10 +435,9 @@ pub fn debug_print_math_expr(id: MathID, env: &MathEnv) {
         }
         MathExpr::IntrinsicFunc(intrinsic, ref args) => {
             print!("{}(", intrinsic.lower_case_name());
-            debug_print_math_expr(id, env);
             for arg in args {
-                print!(", ");
                 debug_print_math_expr(*arg, env);
+                print!(", ");
             }
             print!(")");
         }
-- 
GitLab


From 620f17d3dede3e401f4b3d3f0156457725663e30 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 14:34:22 -0600
Subject: [PATCH 4/9] fork join test for array slf

---
 juno_samples/fork_join_tests/src/cpu.sch           | 10 ++++++++--
 .../fork_join_tests/src/fork_join_tests.jn         | 13 +++++++++++++
 juno_samples/fork_join_tests/src/gpu.sch           |  4 +++-
 juno_samples/fork_join_tests/src/main.rs           | 14 ++++++++++++++
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/juno_samples/fork_join_tests/src/cpu.sch b/juno_samples/fork_join_tests/src/cpu.sch
index f46c91d6..5f3ff94e 100644
--- a/juno_samples/fork_join_tests/src/cpu.sch
+++ b/juno_samples/fork_join_tests/src/cpu.sch
@@ -3,7 +3,7 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9);
+let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10);
 cpu(auto.test1);
 cpu(auto.test2);
 cpu(auto.test3);
@@ -12,6 +12,7 @@ cpu(auto.test5);
 cpu(auto.test7);
 cpu(auto.test8);
 cpu(auto.test9);
+cpu(auto.test10);
 
 let test1_cpu = auto.test1;
 rename["test1_cpu"](test1_cpu);
@@ -94,6 +95,11 @@ dce(auto.test8);
 simplify-cfg(auto.test8);
 dce(auto.test8);
 
-no-memset(test9@const);
+array-slf(auto.test10);
+ccp(auto.test10);
+dce(auto.test10);
+simplify-cfg(auto.test10);
+dce(auto.test10);
+unforkify(auto.test10);
 
 gcm(*);
diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn
index 334fc2bf..2eab56b9 100644
--- a/juno_samples/fork_join_tests/src/fork_join_tests.jn
+++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn
@@ -147,3 +147,16 @@ fn test9<r, c : usize>(input : i32[r, c]) -> i32[r, c] {
 
   return out;
 }
+
+#[entry]
+fn test10(k1 : i32[8], k2 : i32[8], v : i32[8]) -> i32 {
+  @const let s : i32[8];
+  for i = 0 to 8 {
+    s[i] = v[k1[i] as u64];
+  }
+  let sum = 0;
+  for i = 0 to 8 {
+    sum += s[k2[i] as u64];
+  }
+  return sum;
+}
\ No newline at end of file
diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch
index 81dc8d98..43b28e34 100644
--- a/juno_samples/fork_join_tests/src/gpu.sch
+++ b/juno_samples/fork_join_tests/src/gpu.sch
@@ -8,12 +8,13 @@ no-memset(test6@const);
 no-memset(test8@const1);
 no-memset(test8@const2);
 no-memset(test9@const);
+no-memset(test10@const);
 
 gvn(*);
 phi-elim(*);
 dce(*);
 
-let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9);
+let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10);
 gpu(auto.test1);
 gpu(auto.test2);
 gpu(auto.test3);
@@ -22,6 +23,7 @@ gpu(auto.test5);
 gpu(auto.test7);
 gpu(auto.test8);
 gpu(auto.test9);
+gpu(auto.test10);
 
 ip-sroa(*);
 sroa(*);
diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs
index e66309b2..0b37a99d 100644
--- a/juno_samples/fork_join_tests/src/main.rs
+++ b/juno_samples/fork_join_tests/src/main.rs
@@ -74,6 +74,20 @@ fn main() {
             5 + 6 + 8 + 9,
         ];
         assert(&correct, output);
+
+        let mut r = runner!(test10);
+        let k1 = vec![0, 4, 3, 7, 3, 4, 2, 1];
+        let k2 = vec![6, 4, 3, 2, 4, 1, 0, 5];
+        let v = vec![3, -499, 4, 32, -2, 55, -74, 10];
+        let mut correct = 0;
+        for i in 0..8 {
+            correct += v[k1[k2[i] as usize] as usize];
+        }
+        let k1 = HerculesImmBox::from(&k1 as &[i32]);
+        let k2 = HerculesImmBox::from(&k2 as &[i32]);
+        let v = HerculesImmBox::from(&v as &[i32]);
+        let output = r.run(k1.to(), k2.to(), v.to()).await;
+        assert_eq!(output, correct);
     });
 }
 
-- 
GitLab


From a8e41c3e7fb395e2f2d4a6b39832b08447f43d6c Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 14:58:35 -0600
Subject: [PATCH 5/9] Label srad direction arrays @scratch and skip their memset

---
 juno_samples/rodinia/srad/src/cpu.sch | 1 +
 juno_samples/rodinia/srad/src/gpu.sch | 1 +
 juno_samples/rodinia/srad/src/srad.jn | 8 ++++----
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index a4cd4956..44007fca 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -8,6 +8,7 @@ macro simpl!(X) {
   infer-schedules(X);
 }
 
+no-memset(srad@scratch);
 phi-elim(*);
 let loop1 = outline(srad@loop1);
 let loop2 = outline(srad@loop2);
diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch
index f89b7ab8..c5a30527 100644
--- a/juno_samples/rodinia/srad/src/gpu.sch
+++ b/juno_samples/rodinia/srad/src/gpu.sch
@@ -8,6 +8,7 @@ macro simpl!(X) {
   infer-schedules(X);
 }
 
+no-memset(srad@scratch);
 phi-elim(*);
 let sum_loop = outline(srad@loop1);
 let main_loops = outline(srad@loop2 | srad@loop3);
diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn
index 6074bf8c..176778be 100644
--- a/juno_samples/rodinia/srad/src/srad.jn
+++ b/juno_samples/rodinia/srad/src/srad.jn
@@ -50,10 +50,10 @@ fn srad<nrows, ncols: usize>(
     let varROI  = (sum2 / nelems as f32) - meanROI * meanROI;
     let q0sqr   = varROI / (meanROI * meanROI);
 
-    @dirs let dN : f32[ncols, nrows];
-    @dirs let dS : f32[ncols, nrows];
-    @dirs let dE : f32[ncols, nrows];
-    @dirs let dW : f32[ncols, nrows];
+    @scratch let dN : f32[ncols, nrows];
+    @scratch let dS : f32[ncols, nrows];
+    @scratch let dE : f32[ncols, nrows];
+    @scratch let dW : f32[ncols, nrows];
 
     let c : f32[ncols, nrows];
 
-- 
GitLab


From b1f50f47a8ddaf6081065312e63cd3f8d9d78965 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 15:01:03 -0600
Subject: [PATCH 6/9] Run slf after reduce-slf in srad CPU schedule

---
 juno_samples/rodinia/srad/src/cpu.sch | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index 44007fca..b3188b60 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -32,6 +32,8 @@ simpl!(*);
 fork-interchange[0, 1](loop1);
 reduce-slf(*);
 simpl!(*);
+slf!(*);
+simpl!(*);
 
 fork-split(*);
 unforkify(*);
-- 
GitLab


From 5f96fddc185d0b5a39a2063af815984a606cd68b Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 15:09:24 -0600
Subject: [PATCH 7/9] Compute srad neighbor indices inline instead of passing index arrays

---
 hercules_ir/src/typecheck.rs                  |  4 +--
 .../rodinia/srad/benches/srad_bench.rs        | 17 ----------
 juno_samples/rodinia/srad/src/cpu.sch         |  2 +-
 juno_samples/rodinia/srad/src/lib.rs          | 33 +------------------
 juno_samples/rodinia/srad/src/rust_srad.rs    | 31 ++++++++---------
 juno_samples/rodinia/srad/src/srad.jn         | 24 ++++++++------
 6 files changed, 30 insertions(+), 81 deletions(-)

diff --git a/hercules_ir/src/typecheck.rs b/hercules_ir/src/typecheck.rs
index b2567b8f..1ecebf11 100644
--- a/hercules_ir/src/typecheck.rs
+++ b/hercules_ir/src/typecheck.rs
@@ -822,7 +822,7 @@ fn typeflow(
             // We also return the return type from here
             match intrinsic {
                 // Intrinsics that take any numeric type and return the same
-                Intrinsic::Abs => {
+                Intrinsic::Abs | Intrinsic::Max | Intrinsic::Min => {
                     if let Concrete(id) = inputs[0] {
                         if types[id.idx()].is_arithmetic() {
                             Concrete(*id)
@@ -856,8 +856,6 @@ fn typeflow(
                 | Intrinsic::Ln1P
                 | Intrinsic::Log10
                 | Intrinsic::Log2
-                | Intrinsic::Max
-                | Intrinsic::Min
                 | Intrinsic::Round
                 | Intrinsic::Sin
                 | Intrinsic::Sinh
diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs
index 728702d9..6af13aae 100644
--- a/juno_samples/rodinia/srad/benches/srad_bench.rs
+++ b/juno_samples/rodinia/srad/benches/srad_bench.rs
@@ -24,19 +24,6 @@ fn srad_bench(c: &mut Criterion) {
     } = read_graphics(image);
     let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
     let mut image_h = HerculesMutBox::from(image.clone());
-    let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>();
-    let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>();
-    let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>();
-    let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>();
-    // Fix boundary conditions
-    iN[0] = 0;
-    iS[nrows - 1] = (nrows - 1) as i32;
-    jW[0] = 0;
-    jE[ncols - 1] = (ncols - 1) as i32;
-    let iN_h = HerculesImmBox::from(iN.as_slice());
-    let iS_h = HerculesImmBox::from(iS.as_slice());
-    let jW_h = HerculesImmBox::from(jW.as_slice());
-    let jE_h = HerculesImmBox::from(jE.as_slice());
     group.bench_function("srad bench", |b| {
         b.iter(|| {
             async_std::task::block_on(async {
@@ -45,10 +32,6 @@ fn srad_bench(c: &mut Criterion) {
                     ncols as u64,
                     niter as u64,
                     image_h.to(),
-                    iN_h.to(),
-                    iS_h.to(),
-                    jW_h.to(),
-                    jE_h.to(),
                     max,
                     lambda,
                 )
diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index b3188b60..5a8c180e 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -32,7 +32,7 @@ simpl!(*);
 fork-interchange[0, 1](loop1);
 reduce-slf(*);
 simpl!(*);
-slf!(*);
+slf(*);
 simpl!(*);
 
 fork-split(*);
diff --git a/juno_samples/rodinia/srad/src/lib.rs b/juno_samples/rodinia/srad/src/lib.rs
index a647b94a..cb156d9d 100644
--- a/juno_samples/rodinia/srad/src/lib.rs
+++ b/juno_samples/rodinia/srad/src/lib.rs
@@ -48,22 +48,6 @@ pub fn srad_harness(args: SRADInputs) {
         let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
         let mut image_h = HerculesMutBox::from(image.clone());
 
-        let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>();
-        let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>();
-        let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>();
-        let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>();
-
-        // Fix boundary conditions
-        iN[0] = 0;
-        iS[nrows - 1] = (nrows - 1) as i32;
-        jW[0] = 0;
-        jE[ncols - 1] = (ncols - 1) as i32;
-
-        let iN_h = HerculesImmBox::from(iN.as_slice());
-        let iS_h = HerculesImmBox::from(iS.as_slice());
-        let jW_h = HerculesImmBox::from(jW.as_slice());
-        let jE_h = HerculesImmBox::from(jE.as_slice());
-
         let mut runner = runner!(srad);
         let result: Vec<f32> = HerculesMutBox::from(
             runner
@@ -72,10 +56,6 @@ pub fn srad_harness(args: SRADInputs) {
                     ncols as u64,
                     niter as u64,
                     image_h.to(),
-                    iN_h.to(),
-                    iS_h.to(),
-                    jW_h.to(),
-                    jE_h.to(),
                     max,
                     lambda,
                 )
@@ -90,18 +70,7 @@ pub fn srad_harness(args: SRADInputs) {
 
         if verify {
             let mut rust_result = image;
-            rust_srad::srad(
-                nrows,
-                ncols,
-                niter,
-                &mut rust_result,
-                &iN,
-                &iS,
-                &jW,
-                &jE,
-                max,
-                lambda,
-            );
+            rust_srad::srad(nrows, ncols, niter, &mut rust_result, max, lambda);
 
             if let Some(output) = output_verify {
                 write_graphics(output, &rust_result, nrows, ncols, max);
diff --git a/juno_samples/rodinia/srad/src/rust_srad.rs b/juno_samples/rodinia/srad/src/rust_srad.rs
index 3226e35f..f25d382a 100644
--- a/juno_samples/rodinia/srad/src/rust_srad.rs
+++ b/juno_samples/rodinia/srad/src/rust_srad.rs
@@ -1,15 +1,4 @@
-pub fn srad(
-    nrows: usize,
-    ncols: usize,
-    niter: usize,
-    image: &mut Vec<f32>,
-    iN: &[i32],
-    iS: &[i32],
-    jW: &[i32],
-    jE: &[i32],
-    max: f32,
-    lambda: f32,
-) {
+pub fn srad(nrows: usize, ncols: usize, niter: usize, image: &mut Vec<f32>, max: f32, lambda: f32) {
     let nelems = nrows * ncols;
 
     // EXTRACT
@@ -44,11 +33,15 @@ pub fn srad(
             for i in 0..nrows {
                 let k = i + nrows * j;
                 let Jc = image[k];
+                let iN = i.saturating_sub(1); // i - 1, clamped at 0
+                let iS = std::cmp::min(i + 1, nrows - 1); // i + 1, clamped; safe for nrows == 1
+                let jW = j.saturating_sub(1); // j - 1, clamped at 0
+                let jE = std::cmp::min(j + 1, ncols - 1); // j + 1, clamped; safe for ncols == 1
 
-                dN[k] = image[iN[i] as usize + nrows * j] - Jc;
-                dS[k] = image[iS[i] as usize + nrows * j] - Jc;
-                dW[k] = image[i + nrows * jW[j] as usize] - Jc;
-                dE[k] = image[i + nrows * jE[j] as usize] - Jc;
+                dN[k] = image[iN as usize + nrows * j] - Jc;
+                dS[k] = image[iS as usize + nrows * j] - Jc;
+                dW[k] = image[i + nrows * jW as usize] - Jc;
+                dE[k] = image[i + nrows * jE as usize] - Jc;
 
                 let G2 =
                     (dN[k] * dN[k] + dS[k] * dS[k] + dW[k] * dW[k] + dE[k] * dE[k]) / (Jc * Jc);
@@ -72,11 +65,13 @@ pub fn srad(
         for j in 0..ncols {
             for i in 0..nrows {
                 let k = i + nrows * j;
+                let iS = std::cmp::min(i + 1, nrows - 1); // i + 1, clamped; safe for nrows == 1
+                let jE = std::cmp::min(j + 1, ncols - 1); // j + 1, clamped; safe for ncols == 1
 
                 let cN = c[k];
-                let cS = c[iS[i] as usize + nrows * j];
+                let cS = c[iS as usize + nrows * j];
                 let cW = c[k];
-                let cE = c[i + nrows * jE[j] as usize];
+                let cE = c[i + nrows * jE as usize];
 
                 let D = cN * dN[k] + cS * dS[k] + cW * dW[k] + cE * dE[k];
 
diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn
index 176778be..b055b296 100644
--- a/juno_samples/rodinia/srad/src/srad.jn
+++ b/juno_samples/rodinia/srad/src/srad.jn
@@ -21,10 +21,6 @@ fn compress<nrows, ncols: usize>(inout image: f32[ncols, nrows], max: f32) {
 fn srad<nrows, ncols: usize>(
   niter: usize,
   inout image: f32[ncols, nrows],
-  iN: i32[nrows],
-  iS: i32[nrows],
-  jW: i32[ncols],
-  jE: i32[ncols],
   max: f32,
   lambda: f32,
 ) {
@@ -60,10 +56,15 @@ fn srad<nrows, ncols: usize>(
     @loop2 for j in 0..ncols {
       for i in 0..nrows {
         let Jc = image[j, i];
-        dN[j, i] = image[j, iN[i] as u64] - Jc;
-        dS[j, i] = image[j, iS[i] as u64] - Jc;
-        dW[j, i] = image[jW[j] as u64, i] - Jc;
-        dE[j, i] = image[jE[j] as u64, i] - Jc;
+        let iN = max!(i, 1) - 1;
+        let iS = min!(i + 1, nrows - 1);
+        let jW = max!(j, 1) - 1;
+        let jE = min!(j + 1, ncols - 1);
+
+        dN[j, i] = image[j, iN as u64] - Jc;
+        dS[j, i] = image[j, iS as u64] - Jc;
+        dW[j, i] = image[jW as u64, i] - Jc;
+        dE[j, i] = image[jE as u64, i] - Jc;
 
         let G2 = (dN[j, i] * dN[j, i] + dS[j, i] * dS[j, i]
                 + dW[j, i] * dW[j, i] + dE[j, i] * dE[j, i]) / (Jc * Jc);
@@ -85,10 +86,13 @@ fn srad<nrows, ncols: usize>(
 
     @loop3 for j in 0..ncols {
       for i in 0..nrows {
+        let iS = min!(i + 1, nrows - 1);
+        let jE = min!(j + 1, ncols - 1);
+
         let cN = c[j, i];
-        let cS = c[j, iS[i] as u64];
+        let cS = c[j, iS as u64];
         let cW = c[j, i];
-        let cE = c[jE[j] as u64, i];
+        let cE = c[jE as u64, i];
 
         let D = cN * dN[j, i] + cS * dS[j, i] + cW * dW[j, i] + cE * dE[j, i];
         image[j, i] = image[j, i] + 0.25 * lambda * D;
-- 
GitLab


From 7c445a64d396f3e924659392deaf0854b5c255e9 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 15:29:54 -0600
Subject: [PATCH 8/9] parallelize loop2 in srad

---
 juno_samples/rodinia/srad/src/cpu.sch | 9 +++++++--
 juno_samples/rodinia/srad/src/srad.jn | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index 5a8c180e..43d8ceac 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -35,7 +35,12 @@ simpl!(*);
 slf(*);
 simpl!(*);
 
-fork-split(*);
-unforkify(*);
+fork-tile[32, 0, false, false](loop2);
+let split = fork-split(loop2);
+let loop2_body = outline(split.srad_1.fj1);
+simpl!(loop2, loop2_body);
+
+fork-split(extract, compress, loop1, loop2_body, loop3);
+unforkify(extract, compress, loop1, loop2_body, loop3);
 
 gcm(*);
diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn
index b055b296..024be598 100644
--- a/juno_samples/rodinia/srad/src/srad.jn
+++ b/juno_samples/rodinia/srad/src/srad.jn
@@ -51,7 +51,7 @@ fn srad<nrows, ncols: usize>(
     @scratch let dE : f32[ncols, nrows];
     @scratch let dW : f32[ncols, nrows];
 
-    let c : f32[ncols, nrows];
+    @scratch let c : f32[ncols, nrows];
 
     @loop2 for j in 0..ncols {
       for i in 0..nrows {
-- 
GitLab


From 98c8cee0ca70828873c13b5a2426c00425c0edcd Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 16:35:26 -0600
Subject: [PATCH 9/9] Inline to not re-allocate vec too much

---
 juno_samples/rodinia/srad/src/cpu.sch | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index 43d8ceac..7b7a6c9e 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -40,6 +40,9 @@ let split = fork-split(loop2);
 let loop2_body = outline(split.srad_1.fj1);
 simpl!(loop2, loop2_body);
 
+inline(srad@loop2);
+delete-uncalled(*);
+
 fork-split(extract, compress, loop1, loop2_body, loop3);
 unforkify(extract, compress, loop1, loop2_body, loop3);
 
-- 
GitLab