From 77bb4cf7b80d66f750383e4d30232b4fa688ee23 Mon Sep 17 00:00:00 2001
From: xavierrouth <xavierrouth@gmail.com>
Date: Wed, 5 Mar 2025 22:05:06 -0600
Subject: [PATCH 1/5] cava gamut fusion debug

---
 hercules_opt/src/unforkify.rs                 |  6 ++
 juno_samples/cava/Cargo.toml                  |  1 +
 juno_samples/cava/src/cpu.sch                 |  8 ++-
 juno_samples/cava/src/gpu.sch                 | 32 ++++++----
 juno_samples/edge_detection/Cargo.toml        |  1 +
 .../benches/edge_detection_bench.rs           |  1 +
 juno_samples/edge_detection/src/gpu.sch       | 64 ++++++++++++++-----
 7 files changed, 83 insertions(+), 30 deletions(-)

diff --git a/hercules_opt/src/unforkify.rs b/hercules_opt/src/unforkify.rs
index 2d6cf7b3..2deb7165 100644
--- a/hercules_opt/src/unforkify.rs
+++ b/hercules_opt/src/unforkify.rs
@@ -313,6 +313,12 @@ pub fn unforkify(
             edit = edit.delete_node(*reduce)?;
         }
 
+        edit.sub_edit(fork, proj_exit_id);
+        edit.sub_edit(fork, proj_back_id);
+        edit.sub_edit(fork, neq_id);
+        edit.sub_edit(fork, add_id);
+
+
         edit = edit.delete_node(fork)?;
         edit = edit.delete_node(join)?;
         for tid in tids {
diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml
index bdf144da..279a81f0 100644
--- a/juno_samples/cava/Cargo.toml
+++ b/juno_samples/cava/Cargo.toml
@@ -14,6 +14,7 @@ path = "src/lib.rs"
 [features]
 cuda = ["juno_build/cuda", "hercules_rt/cuda"]
 seq = []
+dont_fuse_gamut = []
 
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index ada2f552..7d788c2f 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -111,11 +111,15 @@ fixpoint {
   fork-fusion(fuse4@channel_loop);
 }
 simpl!(fuse4);
-array-slf(fuse4);
-simpl!(fuse4);
+
+if !feature("dont_fuse_gamut") {
+  array-slf(fuse4);
+  simpl!(fuse4);
+}
 
 if !feature("seq") {
   let par = fuse4@image_loop \ fuse4@channel_loop;
+  let par = par \ fuse4@cp_loop; 
   fork-tile[4, 1, false, false](par);
   fork-tile[8, 0, false, false](par);
   fork-interchange[1, 2](par);
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index 0ef466c0..daf52339 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -115,12 +115,25 @@ fixpoint {
   fork-fusion(fuse4@channel_loop);
 }
 simpl!(fuse4);
-array-slf(fuse4);
-simpl!(fuse4);
-fork-tile[2, 0, false, true](fuse4@channel_loop);
-let out = fork-split(fuse4@channel_loop);
-fork-unroll(out.cava_3.fj1);
-unforkify(fuse4@channel_loop);
+
+if !feature("dont_fuse_gamut") {
+  array-slf(fuse4);
+  simpl!(fuse4);
+  fork-tile[2, 0, false, true](fuse4@channel_loop);
+  let out = fork-split(fuse4@channel_loop);
+  fork-unroll(out.cava_3.fj1);
+  unforkify(fuse4@channel_loop);
+}
+
+let par = fuse4@image_loop \ fuse4@channel_loop;
+let par = par \ fuse4@cp_loop; 
+
+fork-tile[4, 1, false, true](par);
+fork-tile[8, 0, false, true](par);
+fork-interchange[1, 2](par);
+let split = fork-split(par);
+fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
+fork-coalesce(split.cava_3.fj2);
 
 no-memset(fuse5@res1);
 no-memset(fuse5@res2);
@@ -133,12 +146,7 @@ simpl!(fuse5);
 array-slf(fuse5);
 simpl!(fuse5);
 
-fork-tile[4, 1, false, true](fuse4);
-fork-tile[8, 0, false, true](fuse4);
-fork-interchange[1, 2](fuse4);
-let split = fork-split(fuse4);
-fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
-fork-coalesce(split.cava_3.fj2);
+
 
 delete-uncalled(*);
 simpl!(*);
diff --git a/juno_samples/edge_detection/Cargo.toml b/juno_samples/edge_detection/Cargo.toml
index 8def7500..6e53500a 100644
--- a/juno_samples/edge_detection/Cargo.toml
+++ b/juno_samples/edge_detection/Cargo.toml
@@ -8,6 +8,7 @@ edition = "2021"
 opencv = ["dep:opencv"]
 cuda = ["juno_build/cuda", "hercules_rt/cuda"]
 seq = []
+warp_tile = []
 
 [[bin]]
 name = "juno_edge_detection"
diff --git a/juno_samples/edge_detection/benches/edge_detection_bench.rs b/juno_samples/edge_detection/benches/edge_detection_bench.rs
index 76035275..1fa08506 100644
--- a/juno_samples/edge_detection/benches/edge_detection_bench.rs
+++ b/juno_samples/edge_detection/benches/edge_detection_bench.rs
@@ -17,6 +17,7 @@ juno_build::juno!("edge_detection");
 fn edge_detection_bench(c: &mut Criterion) {
     let mut group = c.benchmark_group("edge detection bench");
     group.sample_size(10);
+    group.measurement_time(std::time::Duration::from_secs(25));
 
     let input = "examples/formula1_scaled.mp4";
 
diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch
index 666f6cef..3a541dab 100644
--- a/juno_samples/edge_detection/src/gpu.sch
+++ b/juno_samples/edge_detection/src/gpu.sch
@@ -95,22 +95,54 @@ fixpoint {
   fork-guard-elim(max_gradient);
   fork-coalesce(max_gradient);
 }
-simpl!(max_gradient);
-fork-dim-merge(max_gradient);
-simpl!(max_gradient);
-fork-tile[32, 0, false, true](max_gradient);
-let out = fork-split(max_gradient);
-clean-monoid-reduces(max_gradient);
-simpl!(max_gradient);
-let fission = fork-fission[out._4_max_gradient.fj0](max_gradient);
-simpl!(max_gradient);
-fork-tile[32, 0, false, true](fission._4_max_gradient.fj_bottom);
-let out = fork-split(fission._4_max_gradient.fj_bottom);
-clean-monoid-reduces(max_gradient);
-simpl!(max_gradient);
-let top = outline(fission._4_max_gradient.fj_top);
-let bottom = outline(out._4_max_gradient.fj0);
-gpu(top, bottom);
+
+if !feature("seq") {
+  if !feature("warp_tile") {
+    simpl!(max_gradient);
+    fork-dim-merge(max_gradient);
+    simpl!(max_gradient);
+    fork-tile[32, 0, false, true](max_gradient);
+    let out1 = fork-split(max_gradient);
+    clean-monoid-reduces(max_gradient);
+    simpl!(max_gradient);
+    let fission = fork-fission[out1._4_max_gradient.fj0](max_gradient);
+    simpl!(max_gradient);
+    fork-tile[32, 0, false, true](fission._4_max_gradient.fj_bottom);
+    let out2 = fork-split(fission._4_max_gradient.fj_bottom);
+    clean-monoid-reduces(max_gradient);
+    simpl!(max_gradient);
+    unforkify(out1._4_max_gradient.fj1);
+
+    unforkify(out2._4_max_gradient.fj1);
+    simpl!(max_gradient);
+    let top = outline(fission._4_max_gradient.fj_top);
+    let bottom = outline(out2._4_max_gradient.fj0);
+    gpu(top, bottom);
+  } else {
+    simpl!(max_gradient);
+    fork-dim-merge(max_gradient);
+    simpl!(max_gradient);
+    fork-tile[32, 0, false, true](max_gradient);
+    let out = fork-split(max_gradient);
+    clean-monoid-reduces(max_gradient);
+    simpl!(max_gradient);
+    let fission = fork-fission[out._4_max_gradient.fj0](max_gradient);
+    simpl!(max_gradient);
+    fork-tile[32, 0, false, true](fission._4_max_gradient.fj_bottom);
+    let out = fork-split(fission._4_max_gradient.fj_bottom);
+    clean-monoid-reduces(max_gradient);
+    simpl!(max_gradient);
+    let top = outline(fission._4_max_gradient.fj_top);
+    let bottom = outline(out._4_max_gradient.fj0);
+    gpu(top, bottom);
+  }
+} else {
+  simpl!(max_gradient);
+  fork-split(max_gradient);
+  unforkify(max_gradient);
+  gpu(max_gradient);
+}
+
 ip-sroa(*);
 sroa(*);
 simpl!(*);
-- 
GitLab


From 4578b15e2ec553c6dc4beaefc55f69737bbbb9ed Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Wed, 5 Mar 2025 22:23:10 -0600
Subject: [PATCH 2/5] revert cava schedules

---
 juno_samples/cava/src/cpu.sch | 10 ++++------
 juno_samples/cava/src/gpu.sch | 32 ++++++++++++--------------------
 2 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index 7d788c2f..32e2b63b 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -111,15 +111,11 @@ fixpoint {
   fork-fusion(fuse4@channel_loop);
 }
 simpl!(fuse4);
-
-if !feature("dont_fuse_gamut") {
-  array-slf(fuse4);
-  simpl!(fuse4);
-}
+array-slf(fuse4);
+simpl!(fuse4);
 
 if !feature("seq") {
   let par = fuse4@image_loop \ fuse4@channel_loop;
-  let par = par \ fuse4@cp_loop; 
   fork-tile[4, 1, false, false](par);
   fork-tile[8, 0, false, false](par);
   fork-interchange[1, 2](par);
@@ -128,6 +124,8 @@ if !feature("seq") {
   fork-coalesce(fuse4, fuse4_body);
   simpl!(fuse4, fuse4_body);
   fuse4 = fuse4_body;
+} else {
+  fork-tile[6, 0, false, true](fuse4@channel_loop);
 }
 
 no-memset(fuse5@res1);
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index daf52339..0ef466c0 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -115,25 +115,12 @@ fixpoint {
   fork-fusion(fuse4@channel_loop);
 }
 simpl!(fuse4);
-
-if !feature("dont_fuse_gamut") {
-  array-slf(fuse4);
-  simpl!(fuse4);
-  fork-tile[2, 0, false, true](fuse4@channel_loop);
-  let out = fork-split(fuse4@channel_loop);
-  fork-unroll(out.cava_3.fj1);
-  unforkify(fuse4@channel_loop);
-}
-
-let par = fuse4@image_loop \ fuse4@channel_loop;
-let par = par \ fuse4@cp_loop; 
-
-fork-tile[4, 1, false, true](par);
-fork-tile[8, 0, false, true](par);
-fork-interchange[1, 2](par);
-let split = fork-split(par);
-fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
-fork-coalesce(split.cava_3.fj2);
+array-slf(fuse4);
+simpl!(fuse4);
+fork-tile[2, 0, false, true](fuse4@channel_loop);
+let out = fork-split(fuse4@channel_loop);
+fork-unroll(out.cava_3.fj1);
+unforkify(fuse4@channel_loop);
 
 no-memset(fuse5@res1);
 no-memset(fuse5@res2);
@@ -146,7 +133,12 @@ simpl!(fuse5);
 array-slf(fuse5);
 simpl!(fuse5);
 
-
+fork-tile[4, 1, false, true](fuse4);
+fork-tile[8, 0, false, true](fuse4);
+fork-interchange[1, 2](fuse4);
+let split = fork-split(fuse4);
+fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
+fork-coalesce(split.cava_3.fj2);
 
 delete-uncalled(*);
 simpl!(*);
-- 
GitLab


From 7031edf9b7c1d3c9d8982fc868b75d18afcc7751 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Wed, 5 Mar 2025 22:30:15 -0600
Subject: [PATCH 3/5] cava cpu dont_fuse_gamut

---
 juno_samples/cava/src/cpu.sch | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index 32e2b63b..16beceeb 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -105,17 +105,20 @@ fixpoint {
   fork-coalesce(fuse4);
 }
 simpl!(fuse4);
-fork-unroll(fuse4@channel_loop);
-simpl!(fuse4);
-fixpoint {
-  fork-fusion(fuse4@channel_loop);
+
+if !feature("dont_fuse_gamut") {
+  fork-unroll(fuse4@channel_loop);
+  simpl!(fuse4);
+  fixpoint {
+    fork-fusion(fuse4@channel_loop);
+  }
+  simpl!(fuse4);
+  array-slf(fuse4);
+  simpl!(fuse4);
 }
-simpl!(fuse4);
-array-slf(fuse4);
-simpl!(fuse4);
 
 if !feature("seq") {
-  let par = fuse4@image_loop \ fuse4@channel_loop;
+  let par = fuse4@image_loop \ fuse4@channel_loop \ fuse4@cp_loop;
   fork-tile[4, 1, false, false](par);
   fork-tile[8, 0, false, false](par);
   fork-interchange[1, 2](par);
-- 
GitLab


From dfcb25fff16ef08a52c77ad01ff60e2c980cc93d Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Wed, 5 Mar 2025 22:37:33 -0600
Subject: [PATCH 4/5] cava gpu dont_fuse_gamut

---
 juno_samples/cava/src/gpu.sch | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index 0ef466c0..0981804f 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -109,18 +109,25 @@ fixpoint {
   fork-coalesce(fuse4);
 }
 simpl!(fuse4);
-fork-unroll(fuse4@channel_loop);
-simpl!(fuse4);
-fixpoint {
-  fork-fusion(fuse4@channel_loop);
+
+if !feature("dont_fuse_gamut") {
+  fork-unroll(fuse4@channel_loop);
+  simpl!(fuse4);
+  fixpoint {
+    fork-fusion(fuse4@channel_loop);
+  }
+  simpl!(fuse4);
 }
-simpl!(fuse4);
 array-slf(fuse4);
 simpl!(fuse4);
-fork-tile[2, 0, false, true](fuse4@channel_loop);
-let out = fork-split(fuse4@channel_loop);
-fork-unroll(out.cava_3.fj1);
-unforkify(fuse4@channel_loop);
+unforkify(fuse4@channel_loop | fuse4@cp_loop);
+
+fork-tile[4, 1, false, true](fuse4);
+fork-tile[8, 0, false, true](fuse4);
+fork-interchange[1, 2](fuse4);
+let split = fork-split(fuse4);
+fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
+fork-coalesce(split.cava_3.fj2);
 
 no-memset(fuse5@res1);
 no-memset(fuse5@res2);
@@ -133,13 +140,6 @@ simpl!(fuse5);
 array-slf(fuse5);
 simpl!(fuse5);
 
-fork-tile[4, 1, false, true](fuse4);
-fork-tile[8, 0, false, true](fuse4);
-fork-interchange[1, 2](fuse4);
-let split = fork-split(fuse4);
-fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
-fork-coalesce(split.cava_3.fj2);
-
 delete-uncalled(*);
 simpl!(*);
 
-- 
GitLab


From fcfb66b6dcad26c8772aa95dcc157845f88a9e3a Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Wed, 5 Mar 2025 22:45:12 -0600
Subject: [PATCH 5/5] unforkify: remove stray blank line

---
 hercules_opt/src/unforkify.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hercules_opt/src/unforkify.rs b/hercules_opt/src/unforkify.rs
index 2deb7165..cafe5ed1 100644
--- a/hercules_opt/src/unforkify.rs
+++ b/hercules_opt/src/unforkify.rs
@@ -318,7 +318,6 @@ pub fn unforkify(
         edit.sub_edit(fork, neq_id);
         edit.sub_edit(fork, add_id);
 
-
         edit = edit.delete_node(fork)?;
         edit = edit.delete_node(join)?;
         for tid in tids {
-- 
GitLab