From e37ad2b482dfe3fd23723b5b0f8015754e447bea Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Sat, 1 Mar 2025 10:42:33 -0600
Subject: [PATCH 01/14] some tuning

---
 juno_samples/cava/src/cava.jn           |  4 ++--
 juno_samples/cava/src/cpu.sch           |  2 +-
 juno_samples/cava/src/gpu.sch           |  6 +++---
 juno_samples/edge_detection/src/cpu.sch | 10 +++++-----
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn
index 4d02b2cd..931e78f8 100644
--- a/juno_samples/cava/src/cava.jn
+++ b/juno_samples/cava/src/cava.jn
@@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
   @image_loop for r = 0 to row {
     for c = 0 to col {
       @l2 let l2_dist : f32[num_ctrl_pts];
-      for cp = 0 to num_ctrl_pts {
+      @cp_loop for cp = 0 to num_ctrl_pts {
         let v1 = input[0, r, c] - ctrl_pts[cp, 0];
         let v2 = input[1, r, c] - ctrl_pts[cp, 1];
         let v3 = input[2, r, c] - ctrl_pts[cp, 2];
@@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>(
      
       @channel_loop for chan = 0 to CHAN {
         let chan_val : f32 = 0.0;
-        for cp = 0 to num_ctrl_pts {
+        @cp_loop for cp = 0 to num_ctrl_pts {
           chan_val += l2_dist[cp] * weights[cp, chan];
         }
 
diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index 8f22b37d..6fc8adbb 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -115,7 +115,7 @@ array-slf(fuse4);
 simpl!(fuse4);
 let par = fuse4@image_loop \ fuse4@channel_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let fuse4_body = outline(split.cava_3.fj2);
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index bacfd3ab..aa1df390 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -117,9 +117,9 @@ fixpoint {
 simpl!(fuse4);
 array-slf(fuse4);
 simpl!(fuse4);
-//fork-tile[2, 0, false, true](fuse4@channel_loop);
-//fork-split(fuse4@channel_loop);
-//clean-monoid-reduces(fuse4);
+fork-tile[2, 0, false, true](fuse4@channel_loop);
+let out = fork-split(fuse4@channel_loop);
+fork-unroll(out.cava_3.fj1);
 unforkify(fuse4@channel_loop);
 
 no-memset(fuse5@res1);
diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index ec9e423d..b46523f3 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -26,7 +26,7 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
@@ -42,7 +42,7 @@ fixpoint {
 simpl!(laplacian_estimate);
 let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2;
@@ -59,7 +59,7 @@ fixpoint {
 simpl!(zero_crossings);
 let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
 fork-tile[4, 1, false, false](par);
-fork-tile[4, 0, false, false](par);
+fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
 let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2;
@@ -86,7 +86,7 @@ fixpoint {
 simpl!(max_gradient);
 fork-dim-merge(max_gradient);
 simpl!(max_gradient);
-fork-tile[16, 0, false, false](max_gradient);
+fork-tile[32, 0, false, false](max_gradient);
 let split = fork-split(max_gradient);
 clean-monoid-reduces(max_gradient);
 let out = outline(split._4_max_gradient.fj1);
@@ -105,7 +105,7 @@ fixpoint {
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);
 fork-tile[4, 1, false, false](reject_zero_crossings);
-fork-tile[4, 0, false, false](reject_zero_crossings);
+fork-tile[8, 0, false, false](reject_zero_crossings);
 fork-interchange[1, 2](reject_zero_crossings);
 let split = fork-split(reject_zero_crossings);
 let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
-- 
GitLab


From d0fd55bba80835ecdd77ccb224a550f4b4b4a8fe Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sat, 1 Mar 2025 10:49:16 -0600
Subject: [PATCH 02/14] optimize ptx

---
 juno_scheduler/src/pm.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 456df2ed..62bdaf73 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -1090,7 +1090,9 @@ impl PassManager {
 
             let mut nvcc_process = Command::new("nvcc")
                 .arg("-c")
+                .arg("-Xptxas")
                 .arg("-O3")
+                .arg("-use_fast_math")
                 .arg("-diag-suppress")
                 .arg("177")
                 .arg("-o")
-- 
GitLab


From 59410bb3d8e491845c9d19533cbccc7326f5329d Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sat, 1 Mar 2025 11:33:40 -0600
Subject: [PATCH 03/14] whoops

---
 juno_samples/cava/src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/juno_samples/cava/src/lib.rs b/juno_samples/cava/src/lib.rs
index 1810a246..47e3b1b3 100644
--- a/juno_samples/cava/src/lib.rs
+++ b/juno_samples/cava/src/lib.rs
@@ -124,9 +124,9 @@ pub struct CavaInputs {
     #[clap(long = "output-verify", value_name = "PATH")]
     pub output_verify: Option<String>,
     pub cam_model: String,
-    #[clap(short, long)]
+    #[clap(long)]
     pub crop_rows: Option<usize>,
-    #[clap(short, long)]
+    #[clap(long)]
     pub crop_cols: Option<usize>,
 }
 
-- 
GitLab


From a8af6c9bcec6102e327f0d76fc554892458a9aec Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sat, 1 Mar 2025 16:00:59 -0600
Subject: [PATCH 04/14] Fix gpu backend, tile on gpu is good actually :shocked:

---
 hercules_cg/src/gpu.rs        | 6 ++++--
 juno_samples/cava/src/gpu.sch | 6 ++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index c9720273..07dd3ebf 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups;
                             // because Fork basic block's init section already does gating
                             write!(
                                 w,
-                                "{}{} = (threadIdx.x % {}) / {};\n",
+                                "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n",
                                 tabs,
                                 define_variable,
                                 use_thread_quota.unwrap(),
-                                use_thread_quota.unwrap() / parallel_factor.unwrap()
+                                use_thread_quota.unwrap() / parallel_factor.unwrap(),
+                                divide,
+                                modulo,
                             )?;
                         }
                     }
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index aa1df390..92a1835d 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -121,6 +121,12 @@ fork-tile[2, 0, false, true](fuse4@channel_loop);
 let out = fork-split(fuse4@channel_loop);
 fork-unroll(out.cava_3.fj1);
 unforkify(fuse4@channel_loop);
+fork-tile[4, 1, false, true](fuse4);
+fork-tile[8, 0, false, true](fuse4);
+fork-interchange[1, 2](fuse4);
+let split = fork-split(fuse4);
+fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
+fork-coalesce(split.cava_3.fj2);
 
 no-memset(fuse5@res1);
 no-memset(fuse5@res2);
-- 
GitLab


From 776a69d6007c1e68b0f7aeb884a60c09548cd120 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sat, 1 Mar 2025 16:12:51 -0600
Subject: [PATCH 05/14] reorg

---
 juno_samples/cava/src/gpu.sch | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index 92a1835d..0ef466c0 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -121,12 +121,6 @@ fork-tile[2, 0, false, true](fuse4@channel_loop);
 let out = fork-split(fuse4@channel_loop);
 fork-unroll(out.cava_3.fj1);
 unforkify(fuse4@channel_loop);
-fork-tile[4, 1, false, true](fuse4);
-fork-tile[8, 0, false, true](fuse4);
-fork-interchange[1, 2](fuse4);
-let split = fork-split(fuse4);
-fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
-fork-coalesce(split.cava_3.fj2);
 
 no-memset(fuse5@res1);
 no-memset(fuse5@res2);
@@ -139,6 +133,13 @@ simpl!(fuse5);
 array-slf(fuse5);
 simpl!(fuse5);
 
+fork-tile[4, 1, false, true](fuse4);
+fork-tile[8, 0, false, true](fuse4);
+fork-interchange[1, 2](fuse4);
+let split = fork-split(fuse4);
+fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
+fork-coalesce(split.cava_3.fj2);
+
 delete-uncalled(*);
 simpl!(*);
 
-- 
GitLab


From f10b9cc746326fb781e0c0e08582796b640c17ed Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sat, 1 Mar 2025 16:33:50 -0600
Subject: [PATCH 06/14] simplify edge code

---
 .../edge_detection/src/edge_detection.jn      | 80 ++++++-------------
 juno_samples/edge_detection/src/gpu.sch       |  4 +-
 2 files changed, 28 insertions(+), 56 deletions(-)

diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn
index 58f364dc..ebd58206 100644
--- a/juno_samples/edge_detection/src/edge_detection.jn
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>(
 
   @image_loop for row = 0 to n {
     for col = 0 to m {
-      // Copy data for dilation filter
-      @shr1 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-	                       || row + i - r > n - 1
-			       || col + j < r
-			       || col + j - r > m - 1 then MIN_BR
-			    else input[row + i - r, col + j - r];
-        }
-      }
-
       // Compute pixel of dilated image
       let dilated = MIN_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
-        }
-      }
-
-      // Data copy for erotion filter
-      @shr2 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-	                       || row + i - r > n - 1
-			       || col + j < r
-			       || col + j - r > m - 1 then MAX_BR
-			    else input[row + i - r, col + j - r];
+	  let filter = if row + i < r
+                         || row + i - r > n - 1
+                         || col + j < r
+                         || col + j - r > m - 1 then MIN_BR
+                      else input[row + i - r, col + j - r];
+          dilated = max!(dilated, filter * structure[i, j]);
         }
       }
 
@@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>(
       let eroded = MAX_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+	  let filter = if row + i < r
+                         || row + i - r > n - 1
+                         || col + j < r
+                         || col + j - r > m - 1 then MAX_BR
+                      else input[row + i - r, col + j - r];
+          eroded = min!(eroded, filter * structure[i, j]);
         }
       }
 
@@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>(
 
   @image_loop for row = 0 to n {
     for col = 0 to m {
-      // Data copy for dilation filter
-      @shr1 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-                               || row + i - r > n - 1
-                               || col + j < r
-                               || col + j - r > m - 1 then MIN_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
-                            else MIN_BR;
-        }
-      }
-
       // Compute the pixel of dilated image
       let dilated = MIN_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
-        }
-      }
-
-      // Data copy for erotion filter
-      @shr2 let imageArea : f32[sz, sz];
-      @filter_loop for i = 0 to sz {
-        for j = 0 to sz {
-          imageArea[i, j] = if row + i < r
-                               || row + i - r > n - 1
-                               || col + j < r
-                               || col + j - r > m - 1 then MAX_BR
-                            else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
-                            else MIN_BR;
+	  let filter = if row + i < r
+                          || row + i - r > n - 1
+                          || col + j < r
+                          || col + j - r > m - 1 then MIN_BR
+                       else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
+                       else MIN_BR;
+          dilated = max!(dilated, filter * structure[i, j]);
         }
       }
 
@@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>(
       let eroded = MAX_BR;
       @filter_loop for i = 0 to sz {
         for j = 0 to sz {
-          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+	  let filter = if row + i < r
+                          || row + i - r > n - 1
+                          || col + j < r
+                          || col + j - r > m - 1 then MAX_BR
+                       else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR
+                       else MIN_BR;
+          eroded = min!(eroded, filter * structure[i, j]);
         }
       }
 
diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch
index 7ee2904f..065a78f2 100644
--- a/juno_samples/edge_detection/src/gpu.sch
+++ b/juno_samples/edge_detection/src/gpu.sch
@@ -27,7 +27,7 @@ simpl!(gaussian_smoothing);
 predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 
-no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
+no-memset(laplacian_estimate@res);
 fixpoint {
   forkify(laplacian_estimate);
   fork-guard-elim(laplacian_estimate);
@@ -35,7 +35,7 @@ fixpoint {
 }
 simpl!(laplacian_estimate);
 
-no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
+no-memset(zero_crossings@res);
 fixpoint {
   forkify(zero_crossings);
   fork-guard-elim(zero_crossings);
-- 
GitLab


From 0c7afae76803f306266fc489e0cb8d9c6edf75fb Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sat, 1 Mar 2025 17:04:11 -0600
Subject: [PATCH 07/14] tile part of edge

---
 hercules_opt/src/fork_transforms.rs     |  7 +++++++
 juno_samples/edge_detection/src/gpu.sch | 27 +++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index e6db0345..8bd3f735 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1169,6 +1169,7 @@ pub fn fork_dim_merge(
                     op: BinaryOperator::Rem,
                 });
                 edit.sub_edit(tid, rem);
+                edit.sub_edit(tid, outer_tid);
                 edit = edit.replace_all_uses(tid, rem)?;
             } else if tid_dim == inner_idx {
                 let outer_tid = Node::ThreadID {
@@ -1185,6 +1186,7 @@ pub fn fork_dim_merge(
                     op: BinaryOperator::Div,
                 });
                 edit.sub_edit(tid, div);
+                edit.sub_edit(tid, outer_tid);
                 edit = edit.replace_all_uses(tid, div)?;
             }
         }
@@ -1479,7 +1481,12 @@ fn fork_fusion(
     }
 
     // Perform the fusion.
+    let bottom_tids: Vec<_> = editor
+        .get_users(bottom_fork)
+        .filter(|id| nodes[id.idx()].is_thread_id())
+        .collect();
     editor.edit(|mut edit| {
+        edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?;
         if bottom_join_pred != bottom_fork {
             // If there is control flow in the bottom fork-join, stitch it into
             // the top fork-join.
diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch
index 065a78f2..f8da90d0 100644
--- a/juno_samples/edge_detection/src/gpu.sch
+++ b/juno_samples/edge_detection/src/gpu.sch
@@ -26,6 +26,15 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
+fork-dim-merge(gaussian_smoothing@filter_loop);
+unforkify(gaussian_smoothing@filter_loop);
+simpl!(gaussian_smoothing);
+
+fork-dim-merge(gaussian_smoothing);
+fork-tile[32, 0, false, true](gaussian_smoothing);
+simpl!(gaussian_smoothing);
+fork-split(gaussian_smoothing);
+simpl!(gaussian_smoothing);
 
 no-memset(laplacian_estimate@res);
 fixpoint {
@@ -34,6 +43,15 @@ fixpoint {
   fork-coalesce(laplacian_estimate);
 }
 simpl!(laplacian_estimate);
+fork-dim-merge(laplacian_estimate@filter_loop);
+unforkify(laplacian_estimate@filter_loop);
+simpl!(laplacian_estimate);
+
+fork-dim-merge(laplacian_estimate);
+fork-tile[32, 0, false, true](laplacian_estimate);
+simpl!(laplacian_estimate);
+fork-split(laplacian_estimate);
+simpl!(laplacian_estimate);
 
 no-memset(zero_crossings@res);
 fixpoint {
@@ -42,6 +60,15 @@ fixpoint {
   fork-coalesce(zero_crossings);
 }
 simpl!(zero_crossings);
+fork-dim-merge(zero_crossings@filter_loop);
+unforkify(zero_crossings@filter_loop);
+simpl!(zero_crossings);
+
+fork-dim-merge(zero_crossings);
+fork-tile[32, 0, false, true](zero_crossings);
+simpl!(zero_crossings);
+fork-split(zero_crossings);
+simpl!(zero_crossings);
 
 no-memset(gradient@res);
 fixpoint {
-- 
GitLab


From 79f6f3ae99a7e4c3baddc5618ece21830ee9bd2c Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sat, 1 Mar 2025 17:08:23 -0600
Subject: [PATCH 08/14] fix cpu edge

---
 juno_samples/edge_detection/src/cpu.sch | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index b46523f3..3e1321c5 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -33,7 +33,7 @@ let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
 fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
 simpl!(gaussian_smoothing, gaussian_smoothing_body);
 
-no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
+no-memset(laplacian_estimate@res);
 fixpoint {
   forkify(laplacian_estimate);
   fork-guard-elim(laplacian_estimate);
@@ -45,12 +45,12 @@ fork-tile[4, 1, false, false](par);
 fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2;
+let body = split._1_laplacian_estimate.fj2;
 let laplacian_estimate_body = outline(body);
 fork-coalesce(laplacian_estimate, laplacian_estimate_body);
 simpl!(laplacian_estimate, laplacian_estimate_body);
 
-no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
+no-memset(zero_crossings@res);
 fixpoint {
   forkify(zero_crossings);
   fork-guard-elim(zero_crossings);
@@ -62,7 +62,7 @@ fork-tile[4, 1, false, false](par);
 fork-tile[8, 0, false, false](par);
 fork-interchange[1, 2](par);
 let split = fork-split(par);
-let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2;
+let body = split._2_zero_crossings.fj2;
 let zero_crossings_body = outline(body);
 fork-coalesce(zero_crossings, zero_crossings_body);
 simpl!(zero_crossings, zero_crossings_body);
-- 
GitLab


From 30964185a4bd1567d3e57595c4b88f17ad7d1a67 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sat, 1 Mar 2025 17:20:58 -0600
Subject: [PATCH 09/14] update gitignore

---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 516108dd..6298d7bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
-/target
+**/target
 *.dot
 !paper_resources/*.dot
 *.bc
@@ -14,3 +14,4 @@
 .vscode
 *_env
 *.txt
+*ncu-rep
\ No newline at end of file
-- 
GitLab


From f3a018e80c5e4cf1a0a4d56720c3bf4f33e049b2 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sat, 1 Mar 2025 22:34:42 -0600
Subject: [PATCH 10/14] .

---
 hercules_opt/src/interprocedural_sroa.rs |  1 -
 juno_samples/rodinia/cfd/src/lib.rs      | 31 +++++++++++++++++-------
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/hercules_opt/src/interprocedural_sroa.rs b/hercules_opt/src/interprocedural_sroa.rs
index c7ea5836..a01b0f55 100644
--- a/hercules_opt/src/interprocedural_sroa.rs
+++ b/hercules_opt/src/interprocedural_sroa.rs
@@ -85,7 +85,6 @@ pub fn interprocedural_sroa(
                 param_nodes[idx].push(id);
             }
         }
-        println!("{}", editor.func().name);
         let success = editor.edit(|mut edit| {
             for (idx, ids) in param_nodes.into_iter().enumerate() {
                 let new_indices = &old_param_type_map[idx];
diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs
index 62ee59f4..f9a5dd76 100644
--- a/juno_samples/rodinia/cfd/src/lib.rs
+++ b/juno_samples/rodinia/cfd/src/lib.rs
@@ -48,8 +48,7 @@ fn run_euler(
     let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
     let mut runner = runner!(euler);
-    let (density, momentum_x, momentum_y, momentum_z, energy) = 
-    async_std::task::block_on(async {
+    let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
         runner
             .run(
                 nelr as u64,
@@ -123,8 +122,7 @@ fn run_pre_euler(
     let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
     let mut runner = runner!(pre_euler);
-    let (density, momentum_x, momentum_y, momentum_z, energy) = 
-    async_std::task::block_on(async {
+    let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async {
         runner
             .run(
                 nelr as u64,
@@ -189,15 +187,30 @@ fn compare_floats(xs: &Variables, ys: &Variables) -> bool {
     let ys_energy = ys.energy.as_slice();
 
     xs_density.len() == ys_density.len()
-        && xs_density.iter().zip(ys_density.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_density
+            .iter()
+            .zip(ys_density.iter())
+            .all(|(x, y)| compare_float(*x, *y))
         && xs_momentum_x.len() == ys_momentum_x.len()
-        && xs_momentum_x.iter().zip(ys_momentum_x.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_momentum_x
+            .iter()
+            .zip(ys_momentum_x.iter())
+            .all(|(x, y)| compare_float(*x, *y))
         && xs_momentum_y.len() == ys_momentum_y.len()
-        && xs_momentum_y.iter().zip(ys_momentum_y.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_momentum_y
+            .iter()
+            .zip(ys_momentum_y.iter())
+            .all(|(x, y)| compare_float(*x, *y))
         && xs_momentum_z.len() == ys_momentum_z.len()
-        && xs_momentum_z.iter().zip(ys_momentum_z.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_momentum_z
+            .iter()
+            .zip(ys_momentum_z.iter())
+            .all(|(x, y)| compare_float(*x, *y))
         && xs_energy.len() == ys_energy.len()
-        && xs_energy.iter().zip(ys_energy.iter()).all(|(x, y)| compare_float(*x, *y))
+        && xs_energy
+            .iter()
+            .zip(ys_energy.iter())
+            .all(|(x, y)| compare_float(*x, *y))
 }
 
 pub fn cfd_harness(args: CFDInputs) {
-- 
GitLab


From a3483234d1a5b7607cea444d8be67180bbc36816 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 09:59:23 -0600
Subject: [PATCH 11/14] Multi-core cfd

---
 juno_samples/rodinia/cfd/src/cpu_euler.sch    | 28 ++++++++++++++--
 .../rodinia/cfd/src/cpu_pre_euler.sch         | 33 ++++++++++++++++++-
 juno_samples/rodinia/cfd/src/lib.rs           |  1 +
 juno_samples/rodinia/cfd/src/pre_euler.jn     | 18 +++++-----
 4 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch
index 4cf320a6..4e7ee3cf 100644
--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
@@ -25,7 +25,31 @@ fixpoint {
 }
 simpl!(*);
 no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
-parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
+unforkify(compute_flux@inner_loop);
 
-unforkify(*);
+fork-tile[32, 0, false, false](compute_step_factor);
+let split = fork-split(compute_step_factor);
+let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+fork-coalesce(compute_step_factor, compute_step_factor_body);
+simpl!(compute_step_factor, compute_step_factor_body);
+
+fork-tile[32, 0, false, false](compute_flux);
+let split = fork-split(compute_flux);
+let compute_flux_body = outline(split._6_compute_flux.fj1);
+fork-coalesce(compute_flux, compute_flux_body);
+simpl!(compute_flux, compute_flux_body);
+
+fork-tile[32, 0, false, false](time_step);
+let split = fork-split(time_step);
+let time_step_body = outline(split._7_time_step.fj1);
+fork-coalesce(time_step, time_step_body);
+simpl!(time_step, time_step_body);
+
+fork-tile[32, 0, false, false](copy_vars);
+let split = fork-split(copy_vars);
+let copy_vars_body = outline(split._8_copy_vars.fj1);
+fork-coalesce(copy_vars, copy_vars_body);
+simpl!(copy_vars, copy_vars_body);
+
+unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
index 14eb6906..518c656d 100644
--- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
@@ -24,7 +24,38 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
+no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
+unforkify(compute_flux@inner_loop);
 
-unforkify(*);
+fork-tile[32, 0, false, false](compute_step_factor);
+let split = fork-split(compute_step_factor);
+let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+fork-coalesce(compute_step_factor, compute_step_factor_body);
+simpl!(compute_step_factor, compute_step_factor_body);
 
+fork-tile[32, 0, false, false](compute_flux_contributions);
+let split = fork-split(compute_flux_contributions);
+let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
+fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
+simpl!(compute_flux_contributions, compute_flux_contributions_body);
+
+fork-tile[32, 0, false, false](compute_flux);
+let split = fork-split(compute_flux);
+let compute_flux_body = outline(split._7_compute_flux.fj1);
+fork-coalesce(compute_flux, compute_flux_body);
+simpl!(compute_flux, compute_flux_body);
+
+fork-tile[32, 0, false, false](time_step);
+let split = fork-split(time_step);
+let time_step_body = outline(split._8_time_step.fj1);
+fork-coalesce(time_step, time_step_body);
+simpl!(time_step, time_step_body);
+
+fork-tile[32, 0, false, false](copy_vars);
+let split = fork-split(copy_vars);
+let copy_vars_body = outline(split._9_copy_vars.fj1);
+fork-coalesce(copy_vars, copy_vars_body);
+simpl!(copy_vars, copy_vars_body);
+
+unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs
index f9a5dd76..d61df4c5 100644
--- a/juno_samples/rodinia/cfd/src/lib.rs
+++ b/juno_samples/rodinia/cfd/src/lib.rs
@@ -237,6 +237,7 @@ pub fn cfd_harness(args: CFDInputs) {
     } = read_domain_geometry(data_file, block_size);
 
     let variables = initialize_variables(nelr, &ff_variable);
+    println!("Running CFD with nelr = {}.", nelr);
 
     let res_juno = if pre_euler {
         run_pre_euler(
diff --git a/juno_samples/rodinia/cfd/src/pre_euler.jn b/juno_samples/rodinia/cfd/src/pre_euler.jn
index c200f2db..979c2e9a 100644
--- a/juno_samples/rodinia/cfd/src/pre_euler.jn
+++ b/juno_samples/rodinia/cfd/src/pre_euler.jn
@@ -58,7 +58,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 {
 }
 
 fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] {
-  let step_factors : f32[nelr];
+  @res let step_factors : f32[nelr];
 
   for i in 0..nelr {
     let density = variables.density[i];
@@ -109,10 +109,10 @@ fn compute_flux_contribution(
 fn compute_flux_contributions<nelr: usize>(
   variables: Variables::<nelr>,
 ) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) {
-  let fc_momentum_x: Momentum::<nelr>;
-  let fc_momentum_y: Momentum::<nelr>;
-  let fc_momentum_z: Momentum::<nelr>;
-  let fc_density_energy: Momentum::<nelr>;
+  @res let fc_momentum_x: Momentum::<nelr>;
+  @res let fc_momentum_y: Momentum::<nelr>;
+  @res let fc_momentum_z: Momentum::<nelr>;
+  @res let fc_density_energy: Momentum::<nelr>;
 
   for i in 0..nelr {
     let density_i = variables.density[i];
@@ -167,9 +167,9 @@ fn compute_flux<nelr: usize>(
   ff_fc_momentum_z: float3,
 ) -> Variables::<nelr> {
   const smoothing_coefficient : f32 = 0.2;
-  let fluxes: Variables::<nelr>;
+  @res let fluxes: Variables::<nelr>;
 
-  for i in 0..nelr {
+  @outer_loop for i in 0..nelr {
     let density_i = variables.density[i];
 
     let momentum_i = float3 { x: variables.momentum.x[i],
@@ -201,7 +201,7 @@ fn compute_flux<nelr: usize>(
     let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 };
     let flux_i_density_energy : f32 = 0.0;
 
-    for j in 0..NNB {
+    @inner_loop for j in 0..NNB {
       let nb = elements_surrounding_elements[j, i];
       let normal = float3 {
         x: normals.x[j, i],
@@ -328,7 +328,7 @@ fn time_step<nelr: usize>(
 }
 
 fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> {
-  let result : Variables::<nelr>;
+  @res let result : Variables::<nelr>;
 
   for i in 0..nelr {
     result.density[i] = variables.density[i];
-- 
GitLab


From badf7e6a5cf86ebf02f3e1462a59a121ec78d87f Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 10:21:11 -0600
Subject: [PATCH 12/14] Tile srad gpu

---
 juno_samples/rodinia/srad/src/gpu.sch | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch
index 289548f9..f736c0b7 100644
--- a/juno_samples/rodinia/srad/src/gpu.sch
+++ b/juno_samples/rodinia/srad/src/gpu.sch
@@ -54,4 +54,10 @@ ip-sroa(*);
 sroa(*);
 simpl!(*);
 
+fork-dim-merge(main_loops);
+fork-tile[32, 0, false, true](main_loops);
+dce(main_loops);
+fork-split(main_loops);
+simpl!(main_loops);
+
 gcm(*);
-- 
GitLab


From d5f4aacedf2ee39ce4fe2e254cad5197df092877 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 10:45:57 -0600
Subject: [PATCH 13/14] CFD opt

---
 hercules_opt/src/sroa.rs                      | 11 ++++++++---
 juno_samples/rodinia/cfd/src/cpu_euler.sch    |  2 +-
 juno_samples/rodinia/cfd/src/gpu_euler.sch    | 18 ++++++++++++++----
 .../rodinia/cfd/src/gpu_pre_euler.sch         | 19 +++++++++++++++++--
 4 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/hercules_opt/src/sroa.rs b/hercules_opt/src/sroa.rs
index e658ff88..2718f99d 100644
--- a/hercules_opt/src/sroa.rs
+++ b/hercules_opt/src/sroa.rs
@@ -447,7 +447,7 @@ pub fn sroa(
                 field_map.insert(node, generate_reads(editor, types[&node], node));
             }
             Node::Constant { id } => {
-                field_map.insert(node, generate_constant_fields(editor, id));
+                field_map.insert(node, generate_constant_fields(editor, id, node));
                 to_delete.push(node);
             }
             _ => {
@@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID
 
 // Given a constant cnst adds node to the function which are the constant values of each field and
 // returns a list of pairs of indices and the node that holds that index
-fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> {
+fn generate_constant_fields(
+    editor: &mut FunctionEditor,
+    cnst: ConstantID,
+    old_node: NodeID,
+) -> IndexTree<NodeID> {
     let cs: Option<Vec<ConstantID>> =
         if let Some(cs) = editor.get_constant(cnst).try_product_fields() {
             Some(cs.into())
@@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In
     if let Some(cs) = cs {
         let mut fields = vec![];
         for c in cs {
-            fields.push(generate_constant_fields(editor, c));
+            fields.push(generate_constant_fields(editor, c, old_node));
         }
         IndexTree::Node(fields)
     } else {
         let mut node = None;
         editor.edit(|mut edit| {
             node = Some(edit.add_node(Node::Constant { id: cnst }));
+            edit.sub_edit(old_node, node.unwrap());
             Ok(edit)
         });
         IndexTree::Leaf(node.expect("Add node cannot fail"))
diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch
index 4e7ee3cf..7a284a9a 100644
--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
@@ -12,6 +12,7 @@ macro simpl!(X) {
 
 simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
 simpl!(*);
 ip-sroa[true](*);
@@ -24,7 +25,6 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
-no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
 unforkify(compute_flux@inner_loop);
 
 fork-tile[32, 0, false, false](compute_step_factor);
diff --git a/juno_samples/rodinia/cfd/src/gpu_euler.sch b/juno_samples/rodinia/cfd/src/gpu_euler.sch
index aed6115e..3700f79d 100644
--- a/juno_samples/rodinia/cfd/src/gpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/gpu_euler.sch
@@ -12,6 +12,7 @@ macro simpl!(X) {
 
 simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
 gpu(copy_vars, compute_step_factor, compute_flux, time_step);
 
@@ -26,9 +27,18 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
-no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res);
-parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop);
+unforkify(compute_flux@inner_loop);
+
+fork-tile[32, 0, false, true](compute_step_factor);
+fork-split(compute_step_factor);
+
+fork-tile[32, 0, false, true](compute_flux);
+fork-split(compute_flux);
+
+fork-tile[32, 0, false, true](time_step);
+fork-split(time_step);
+
+fork-tile[32, 0, false, true](copy_vars);
+fork-split(copy_vars);
 
-unforkify(*);
-float-collections(*);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
index d91f1b00..d6db675b 100644
--- a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch
@@ -12,6 +12,7 @@ macro simpl!(X) {
 
 simpl!(*);
 inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step);
+no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
 delete-uncalled(*);
 gpu(copy_vars, compute_step_factor, compute_flux_contributions, compute_flux, time_step);
 
@@ -26,7 +27,21 @@ fixpoint {
   fork-guard-elim(*);
 }
 simpl!(*);
+unforkify(compute_flux@inner_loop);
+
+fork-tile[32, 0, false, true](compute_step_factor);
+fork-split(compute_step_factor);
+
+fork-tile[32, 0, false, true](compute_flux_contributions);
+fork-split(compute_flux_contributions);
+
+fork-tile[32, 0, false, true](compute_flux);
+fork-split(compute_flux);
+
+fork-tile[32, 0, false, true](time_step);
+fork-split(time_step);
+
+fork-tile[32, 0, false, true](copy_vars);
+fork-split(copy_vars);
 
-unforkify(*);
-float-collections(*);
 gcm(*);
-- 
GitLab


From 78a7c974850eac226b541126265aaebfdebbfe02 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 2 Mar 2025 11:18:17 -0600
Subject: [PATCH 14/14] optimize edge

---
 juno_samples/edge_detection/src/edge_detection.jn |  2 +-
 juno_samples/edge_detection/src/gpu.sch           | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn
index ebd58206..3e49cb36 100644
--- a/juno_samples/edge_detection/src/edge_detection.jn
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -138,7 +138,7 @@ fn gradient<n, m, sb: usize>(
       let gx = 0;
       let gy = 0;
 
-      for i = 0 to sb {
+      @filter_loop for i = 0 to sb {
         for j = 0 to sb {
           let val = input[if row + i < sbr              then 0
                           else if row + i - sbr > n - 1 then n - 1
diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch
index f8da90d0..666f6cef 100644
--- a/juno_samples/edge_detection/src/gpu.sch
+++ b/juno_samples/edge_detection/src/gpu.sch
@@ -80,6 +80,15 @@ predication(gradient);
 simpl!(gradient);
 predication(gradient);
 simpl!(gradient);
+fork-dim-merge(gradient@filter_loop);
+unforkify(gradient@filter_loop);
+simpl!(gradient);
+
+fork-dim-merge(gradient);
+fork-tile[32, 0, false, true](gradient);
+simpl!(gradient);
+fork-split(gradient);
+simpl!(gradient);
 
 fixpoint {
   forkify(max_gradient);
@@ -115,6 +124,12 @@ fixpoint {
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);
 
+fork-dim-merge(reject_zero_crossings);
+fork-tile[32, 0, false, true](reject_zero_crossings);
+simpl!(reject_zero_crossings);
+fork-split(reject_zero_crossings);
+simpl!(reject_zero_crossings);
+
 async-call(edge_detection@le, edge_detection@zc);
 
 simpl!(*);
-- 
GitLab