From fd62696bdbd42381fdd8a43ca1b20b9f278616b1 Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Tue, 4 Mar 2025 13:12:28 -0600
Subject: [PATCH 1/5] "fix" backprop large bench on gpu

---
 juno_samples/rodinia/backprop/benches/backprop_bench.rs | 2 +-
 juno_samples/rodinia/backprop/src/gpu.sch               | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
index 492bce3b..80964c72 100644
--- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs
+++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
@@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) {
     };
 
     bench("backprop bench small", 65536);
-    bench("backprop bench large", 33554432);
+    bench("backprop bench large", 1048576);
 }
 
 criterion_group!(benches, backprop_bench);
diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch
index f8cc84a3..cc9cc2ac 100644
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
@@ -33,10 +33,10 @@ fixpoint {
 reduce-slf(*);
 simpl!(*);
 
-fork-extend[32](layer_forward@inner_loop);
+fork-extend[1024](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 simpl!(layer_forward);
-fork-tile[32, 0, false, true](layer_forward@inner_loop);
+fork-tile[1024, 0, false, true](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 let out = fork-split(layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
-- 
GitLab


From 7d59b63ca1517086c4fe5b4bc60ffcc64205fe3f Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Tue, 4 Mar 2025 13:29:41 -0600
Subject: [PATCH 2/5] fix backprop for real

---
 hercules_cg/src/gpu.rs                              | 13 ++++++++++---
 .../rodinia/backprop/benches/backprop_bench.rs      |  2 +-
 juno_samples/rodinia/backprop/src/gpu.sch           |  5 +++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 1e6067a3..dd87acbe 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -562,9 +562,15 @@ namespace cg = cooperative_groups;
      * and writes.
      */
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
-        write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?;
+        write!(
+            w,
+            "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n"
+        )?;
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
-        write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?;
+        write!(
+            w,
+            "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n"
+        )?;
         Ok(())
     }
 
@@ -1726,7 +1732,8 @@ namespace cg = cooperative_groups;
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
                     if parallel_factor.is_none() {
-                        write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?;
+                        write!(thread_block_tiles, "\t{};\n", self.get_fork_iter(id, true))?;
+                        write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, false))?;
                         write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?;
                     }
                 }
diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
index 80964c72..492bce3b 100644
--- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs
+++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
@@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) {
     };
 
     bench("backprop bench small", 65536);
-    bench("backprop bench large", 1048576);
+    bench("backprop bench large", 33554432);
 }
 
 criterion_group!(benches, backprop_bench);
diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch
index cc9cc2ac..1773f713 100644
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
@@ -33,10 +33,11 @@ fixpoint {
 reduce-slf(*);
 simpl!(*);
 
-fork-extend[1024](layer_forward@inner_loop);
+fork-extend[32768](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 simpl!(layer_forward);
-fork-tile[1024, 0, false, true](layer_forward@inner_loop);
+fork-tile[32768, 0, false, true](layer_forward@inner_loop);
+fork-tile[1024, 1, false, true](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 let out = fork-split(layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
-- 
GitLab


From 94950efe71d9cbf4ef1840eafd9c8799b7106966 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Tue, 4 Mar 2025 13:53:07 -0600
Subject: [PATCH 3/5] Load inputs once in benches

---
 juno_samples/rodinia/bfs/benches/bfs_bench.rs |  41 ++-----
 juno_samples/rodinia/cfd/benches/cfd_bench.rs | 112 +++++++++---------
 2 files changed, 67 insertions(+), 86 deletions(-)

diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
index f87b4e09..ea8ba0ed 100644
--- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs
+++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
@@ -13,41 +13,25 @@ fn bfs_bench(c: &mut Criterion) {
 
     let mut r = runner!(bfs);
 
-    group.bench_function("bfs bench 4096", |b| {
-        let input = "data/graph4096.txt";
-        let (nodes, source, edges) = parse_graph(input.into()).unwrap();
-        let n = nodes.len() as u64;
-        let m = edges.len() as u64;
-        let nodes = HerculesImmBox::from(&nodes as &[Node]);
-        let edges = HerculesImmBox::from(&edges as &[u32]);
-        b.iter(|| {
-            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
-        })
-    });
-
-    group.bench_function("bfs bench 65536", |b| {
-        let input = "data/graph65536.txt";
-        let (nodes, source, edges) = parse_graph(input.into()).unwrap();
-        let n = nodes.len() as u64;
-        let m = edges.len() as u64;
-        let nodes = HerculesImmBox::from(&nodes as &[Node]);
-        let edges = HerculesImmBox::from(&edges as &[u32]);
-        b.iter(|| {
-            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
-        })
-    });
-
-    group.bench_function("bfs bench 64M", |b| {
-        let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt";
-        let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines.");
+    // Registers one benchmark named `name`: the graph at `input` is parsed and
+    // boxed once, up front, so only the `r.run(...)` call sits inside `b.iter`.
+    let mut bench = |name, input: &'_ str| {
+        // This closure loads every input (not just the hard-coded 64M graph), so the panic message names the file that failed.
+        let (nodes, source, edges) = parse_graph(input.into()).expect(&format!("PANIC: Couldn't read input file {}. Note that the 64M benchmark uses a hard-coded path, so it can only be run on the lab machines.", input));
         let n = nodes.len() as u64;
         let m = edges.len() as u64;
         let nodes = HerculesImmBox::from(&nodes as &[Node]);
         let edges = HerculesImmBox::from(&edges as &[u32]);
-        b.iter(|| {
-            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
-        })
-    });
+        group.bench_function(name, |b| {
+            b.iter(|| {
+                async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
+            })
+        });
+    };
+
+    bench("bfs bench 4096", "data/graph4096.txt");
+    bench("bfs bench 65536", "data/graph65536.txt");
+    bench("bfs bench 64M", "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt");
 }
 
 criterion_group!(benches, bfs_bench);
diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
index aa6d7727..5b4db044 100644
--- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs
+++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
@@ -13,38 +13,38 @@ fn cfd_bench(c: &mut Criterion) {
     group.sample_size(10);
 
     let mut euler_bench = |name, data_file, iterations| {
-        group.bench_function(name, |b| {
-            let mut r = runner!(euler);
-            let block_size = 16;
-            let FarFieldConditions {
-                ff_variable,
-                ff_fc_momentum_x,
-                ff_fc_momentum_y,
-                ff_fc_momentum_z,
-                ff_fc_density_energy,
-            } = set_far_field_conditions();
-            let GeometryData {
-                nelr,
-                areas,
-                elements_surrounding_elements,
-                normals,
-            } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
-            let mut variables = initialize_variables(nelr, &ff_variable);
+        let mut r = runner!(euler);
+        let block_size = 16;
+        let FarFieldConditions {
+            ff_variable,
+            ff_fc_momentum_x,
+            ff_fc_momentum_y,
+            ff_fc_momentum_z,
+            ff_fc_density_energy,
+        } = set_far_field_conditions();
+        let GeometryData {
+            nelr,
+            areas,
+            elements_surrounding_elements,
+            normals,
+        } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
+        let mut variables = initialize_variables(nelr, &ff_variable);
 
-            let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
-            let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
-            let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
-            let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
-            let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+        let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+        let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+        let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+        let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+        let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
 
-            let areas = HerculesImmBox::from(areas.as_slice());
-            let elements_surrounding_elements =
-                HerculesImmBox::from(elements_surrounding_elements.as_slice());
+        let areas = HerculesImmBox::from(areas.as_slice());
+        let elements_surrounding_elements =
+            HerculesImmBox::from(elements_surrounding_elements.as_slice());
 
-            let normals_x = HerculesImmBox::from(normals.x.as_slice());
-            let normals_y = HerculesImmBox::from(normals.y.as_slice());
-            let normals_z = HerculesImmBox::from(normals.z.as_slice());
+        let normals_x = HerculesImmBox::from(normals.x.as_slice());
+        let normals_y = HerculesImmBox::from(normals.y.as_slice());
+        let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
+        group.bench_function(name, |b| {
             b.iter(|| {
                 async_std::task::block_on(async {
                     r.run(
@@ -91,38 +91,38 @@ fn cfd_bench(c: &mut Criterion) {
     );
 
     let mut pre_euler_bench = |name, data_file, iterations| {
-        group.bench_function(name, |b| {
-            let mut r = runner!(pre_euler);
-            let block_size = 16;
-            let FarFieldConditions {
-                ff_variable,
-                ff_fc_momentum_x,
-                ff_fc_momentum_y,
-                ff_fc_momentum_z,
-                ff_fc_density_energy,
-            } = set_far_field_conditions();
-            let GeometryData {
-                nelr,
-                areas,
-                elements_surrounding_elements,
-                normals,
-            } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
-            let mut variables = initialize_variables(nelr, &ff_variable);
+        let mut r = runner!(pre_euler);
+        let block_size = 16;
+        let FarFieldConditions {
+            ff_variable,
+            ff_fc_momentum_x,
+            ff_fc_momentum_y,
+            ff_fc_momentum_z,
+            ff_fc_density_energy,
+        } = set_far_field_conditions();
+        let GeometryData {
+            nelr,
+            areas,
+            elements_surrounding_elements,
+            normals,
+        } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
+        let mut variables = initialize_variables(nelr, &ff_variable);
 
-            let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
-            let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
-            let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
-            let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
-            let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+        let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+        let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+        let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+        let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+        let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
 
-            let areas = HerculesImmBox::from(areas.as_slice());
-            let elements_surrounding_elements =
-                HerculesImmBox::from(elements_surrounding_elements.as_slice());
+        let areas = HerculesImmBox::from(areas.as_slice());
+        let elements_surrounding_elements =
+            HerculesImmBox::from(elements_surrounding_elements.as_slice());
 
-            let normals_x = HerculesImmBox::from(normals.x.as_slice());
-            let normals_y = HerculesImmBox::from(normals.y.as_slice());
-            let normals_z = HerculesImmBox::from(normals.z.as_slice());
+        let normals_x = HerculesImmBox::from(normals.x.as_slice());
+        let normals_y = HerculesImmBox::from(normals.y.as_slice());
+        let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
+        group.bench_function(name, |b| {
             b.iter(|| {
                 async_std::task::block_on(async {
                     r.run(
-- 
GitLab


From 3ce50c43068f7a664d5f4d228376a65a1e995c2b Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Tue, 4 Mar 2025 14:46:30 -0600
Subject: [PATCH 4/5] Add sequential feature to benchmarks

---
 juno_samples/cava/Cargo.toml                  |   3 +-
 juno_samples/cava/src/cpu.sch                 |  24 ++--
 juno_samples/edge_detection/Cargo.toml        |   1 +
 juno_samples/edge_detection/src/cpu.sch       | 114 +++++++++++-------
 juno_samples/rodinia/backprop/Cargo.toml      |   3 +-
 juno_samples/rodinia/backprop/src/cpu.sch     |  20 +--
 juno_samples/rodinia/bfs/Cargo.toml           |   1 +
 juno_samples/rodinia/bfs/src/cpu.sch          |  31 ++---
 juno_samples/rodinia/cfd/Cargo.toml           |   1 +
 juno_samples/rodinia/cfd/src/cpu_euler.sch    |  48 ++++----
 .../rodinia/cfd/src/cpu_pre_euler.sch         |  59 +++++----
 juno_samples/rodinia/srad/Cargo.toml          |   1 +
 juno_samples/rodinia/srad/src/cpu.sch         |  28 +++--
 13 files changed, 194 insertions(+), 140 deletions(-)

diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml
index 17e9a1d3..bdf144da 100644
--- a/juno_samples/cava/Cargo.toml
+++ b/juno_samples/cava/Cargo.toml
@@ -13,6 +13,7 @@ path = "src/lib.rs"
 
 [features]
 cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+seq = []
 
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
@@ -30,4 +31,4 @@ criterion = { version = "0.5", features = ["html_reports"] }
 
 [[bench]]
 name = "cava_bench"
-harness = false
\ No newline at end of file
+harness = false
diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index 6fc8adbb..ada2f552 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -113,14 +113,18 @@ fixpoint {
 simpl!(fuse4);
 array-slf(fuse4);
 simpl!(fuse4);
-let par = fuse4@image_loop \ fuse4@channel_loop;
-fork-tile[4, 1, false, false](par);
-fork-tile[8, 0, false, false](par);
-fork-interchange[1, 2](par);
-let split = fork-split(par);
-let fuse4_body = outline(split.cava_3.fj2);
-fork-coalesce(fuse4, fuse4_body);
-simpl!(fuse4, fuse4_body);
+
+if !feature("seq") {
+  let par = fuse4@image_loop \ fuse4@channel_loop;
+  fork-tile[4, 1, false, false](par);
+  fork-tile[8, 0, false, false](par);
+  fork-interchange[1, 2](par);
+  let split = fork-split(par);
+  let fuse4_body = outline(split.cava_3.fj2);
+  fork-coalesce(fuse4, fuse4_body);
+  simpl!(fuse4, fuse4_body);
+  fuse4 = fuse4_body;
+}
 
 no-memset(fuse5@res1);
 no-memset(fuse5@res2);
@@ -136,8 +140,8 @@ simpl!(fuse5);
 delete-uncalled(*);
 simpl!(*);
 
-fork-split(fuse1, fuse2, fuse3, fuse4_body, fuse5);
-unforkify(fuse1, fuse2, fuse3, fuse4_body, fuse5);
+fork-split(fuse1, fuse2, fuse3, fuse4, fuse5);
+unforkify(fuse1, fuse2, fuse3, fuse4, fuse5);
 
 simpl!(*);
 
diff --git a/juno_samples/edge_detection/Cargo.toml b/juno_samples/edge_detection/Cargo.toml
index fa4ca1ff..8def7500 100644
--- a/juno_samples/edge_detection/Cargo.toml
+++ b/juno_samples/edge_detection/Cargo.toml
@@ -7,6 +7,7 @@ edition = "2021"
 [features]
 opencv = ["dep:opencv"]
 cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+seq = []
 
 [[bin]]
 name = "juno_edge_detection"
diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index 3e1321c5..64fee6b6 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -24,14 +24,18 @@ predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
 predication(gaussian_smoothing);
 simpl!(gaussian_smoothing);
-let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
-fork-tile[4, 1, false, false](par);
-fork-tile[8, 0, false, false](par);
-fork-interchange[1, 2](par);
-let split = fork-split(par);
-let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
-fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
-simpl!(gaussian_smoothing, gaussian_smoothing_body);
+
+if !feature("seq") {
+  let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop;
+  fork-tile[4, 1, false, false](par);
+  fork-tile[8, 0, false, false](par);
+  fork-interchange[1, 2](par);
+  let split = fork-split(par);
+  let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2);
+  fork-coalesce(gaussian_smoothing, gaussian_smoothing_body);
+  simpl!(gaussian_smoothing, gaussian_smoothing_body);
+  gaussian_smoothing = gaussian_smoothing_body;
+}
 
 no-memset(laplacian_estimate@res);
 fixpoint {
@@ -40,15 +44,19 @@ fixpoint {
   fork-coalesce(laplacian_estimate);
 }
 simpl!(laplacian_estimate);
-let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
-fork-tile[4, 1, false, false](par);
-fork-tile[8, 0, false, false](par);
-fork-interchange[1, 2](par);
-let split = fork-split(par);
-let body = split._1_laplacian_estimate.fj2;
-let laplacian_estimate_body = outline(body);
-fork-coalesce(laplacian_estimate, laplacian_estimate_body);
-simpl!(laplacian_estimate, laplacian_estimate_body);
+
+if !feature("seq") {
+  let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop;
+  fork-tile[4, 1, false, false](par);
+  fork-tile[8, 0, false, false](par);
+  fork-interchange[1, 2](par);
+  let split = fork-split(par);
+  let body = split._1_laplacian_estimate.fj2;
+  let laplacian_estimate_body = outline(body);
+  fork-coalesce(laplacian_estimate, laplacian_estimate_body);
+  simpl!(laplacian_estimate, laplacian_estimate_body);
+  laplacian_estimate = laplacian_estimate_body;
+}
 
 no-memset(zero_crossings@res);
 fixpoint {
@@ -57,15 +65,19 @@ fixpoint {
   fork-coalesce(zero_crossings);
 }
 simpl!(zero_crossings);
-let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
-fork-tile[4, 1, false, false](par);
-fork-tile[8, 0, false, false](par);
-fork-interchange[1, 2](par);
-let split = fork-split(par);
-let body = split._2_zero_crossings.fj2;
-let zero_crossings_body = outline(body);
-fork-coalesce(zero_crossings, zero_crossings_body);
-simpl!(zero_crossings, zero_crossings_body);
+
+if !feature("seq") {
+  let par = zero_crossings@image_loop \ zero_crossings@filter_loop;
+  fork-tile[4, 1, false, false](par);
+  fork-tile[8, 0, false, false](par);
+  fork-interchange[1, 2](par);
+  let split = fork-split(par);
+  let body = split._2_zero_crossings.fj2;
+  let zero_crossings_body = outline(body);
+  fork-coalesce(zero_crossings, zero_crossings_body);
+  simpl!(zero_crossings, zero_crossings_body);
+  zero_crossings = zero_crossings_body;
+}
 
 no-memset(gradient@res);
 fixpoint {
@@ -84,17 +96,23 @@ fixpoint {
   fork-coalesce(max_gradient);
 }
 simpl!(max_gradient);
-fork-dim-merge(max_gradient);
-simpl!(max_gradient);
-fork-tile[32, 0, false, false](max_gradient);
-let split = fork-split(max_gradient);
-clean-monoid-reduces(max_gradient);
-let out = outline(split._4_max_gradient.fj1);
-simpl!(max_gradient, out);
-unforkify(out);
-let out = fork-fission[split._4_max_gradient.fj0](max_gradient);
-simpl!(max_gradient);
-unforkify(out._4_max_gradient.fj_bottom);
+
+if !feature("seq") {
+  fork-dim-merge(max_gradient);
+  simpl!(max_gradient);
+  fork-tile[32, 0, false, false](max_gradient);
+  let split = fork-split(max_gradient);
+  clean-monoid-reduces(max_gradient);
+  let out = outline(split._4_max_gradient.fj1);
+  simpl!(max_gradient, out);
+  unforkify(out);
+  let out = fork-fission[split._4_max_gradient.fj0](max_gradient);
+  simpl!(max_gradient);
+  unforkify(out._4_max_gradient.fj_bottom);
+} else {
+  fork-split(max_gradient);
+  unforkify(max_gradient);
+}
 
 no-memset(reject_zero_crossings@res);
 fixpoint {
@@ -104,18 +122,22 @@ fixpoint {
 }
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);
-fork-tile[4, 1, false, false](reject_zero_crossings);
-fork-tile[8, 0, false, false](reject_zero_crossings);
-fork-interchange[1, 2](reject_zero_crossings);
-let split = fork-split(reject_zero_crossings);
-let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
-fork-coalesce(reject_zero_crossings, reject_zero_crossings_body);
-simpl!(reject_zero_crossings, reject_zero_crossings_body);
+
+if !feature("seq") {
+  fork-tile[4, 1, false, false](reject_zero_crossings);
+  fork-tile[8, 0, false, false](reject_zero_crossings);
+  fork-interchange[1, 2](reject_zero_crossings);
+  let split = fork-split(reject_zero_crossings);
+  let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
+  fork-coalesce(reject_zero_crossings, reject_zero_crossings_body);
+  simpl!(reject_zero_crossings, reject_zero_crossings_body);
+  reject_zero_crossings = reject_zero_crossings_body;
+}
 
 async-call(edge_detection@le, edge_detection@zc);
 
-fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body);
-unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body);
+fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
+unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
 
 simpl!(*);
 
diff --git a/juno_samples/rodinia/backprop/Cargo.toml b/juno_samples/rodinia/backprop/Cargo.toml
index 25185e09..68c76e92 100644
--- a/juno_samples/rodinia/backprop/Cargo.toml
+++ b/juno_samples/rodinia/backprop/Cargo.toml
@@ -13,6 +13,7 @@ path = "src/lib.rs"
 
 [features]
 cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+seq = []
 
 [build-dependencies]
 juno_build = { path = "../../../juno_build" }
@@ -31,4 +32,4 @@ criterion = { version = "0.5", features = ["html_reports"] }
 
 [[bench]]
 name = "backprop_bench"
-harness = false
\ No newline at end of file
+harness = false
diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index 3c7f7d5f..4796f427 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -37,10 +37,12 @@ inline(backprop@forward_input, backprop@forward_hidden);
 let forward_input = outline(backprop@forward_input);
 let forward_hidden = outline(backprop@forward_hidden);
 
-fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop);
-let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop);
-let forward_input = outline(inner);
-inline(backprop@forward_input);
+if !feature("seq") {
+  fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop);
+  let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop);
+  forward_input = outline(inner);
+  inline(backprop@forward_input);
+}
 
 // The first call to adjust_weights has total loop dimensions of 1 * 17, so not
 // worth parallelizing (given that the body is trivial)
@@ -50,10 +52,12 @@ inline(backprop@adjust_hidden, backprop@adjust_input);
 let adjust_hidden = outline(backprop@adjust_hidden);
 let adjust_input = outline(backprop@adjust_input);
 
-fork-tile[16, 0, false, true](adjust_input);
-let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input);
-let adjust_input = outline(inner);
-inline(backprop@adjust_input);
+if !feature("seq") {
+  fork-tile[16, 0, false, true](adjust_input);
+  let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input);
+  adjust_input = outline(inner);
+  inline(backprop@adjust_input);
+}
 
 delete-uncalled(*);
 const-inline(*);
diff --git a/juno_samples/rodinia/bfs/Cargo.toml b/juno_samples/rodinia/bfs/Cargo.toml
index 34b6f5ce..46f8ade6 100644
--- a/juno_samples/rodinia/bfs/Cargo.toml
+++ b/juno_samples/rodinia/bfs/Cargo.toml
@@ -13,6 +13,7 @@ path = "src/lib.rs"
 
 [features]
 cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+seq = []
 
 [build-dependencies]
 juno_build = { path = "../../../juno_build" }
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index e5100561..550dce31 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -40,23 +40,24 @@ simpl!(collect);
 parallel-fork(traverse, collect);
 parallel-reduce(traverse, collect);
 
-fork-tile[32, 0, false, true](traverse, collect);
-let (outer, inner) = fork-reshape[[1], [0]](traverse);
-let traverse_body = outline(inner);
-let (outer, inner) = fork-reshape[[1], [0]](collect);
-let collect_body = outline(inner);
-
-let init_body = init;
-// Following code seems to generate breaking RT code
-//fork-tile[32, 0, false, true](init);
-//let (outer, inner) = fork-reshape[[1], [0]](init);
-//let init_body = outline(inner);
-//inline(bfs@cost_init);
-
-inline(bfs@loop1, bfs@loop2);
+if !feature("seq") {
+  fork-tile[32, 0, false, true](traverse, collect);
+  let (outer, inner) = fork-reshape[[1], [0]](traverse);
+  traverse = outline(inner);
+  let (outer, inner) = fork-reshape[[1], [0]](collect);
+  collect = outline(inner);
+
+  // The following code seems to cause broken RT code to be generated
+  //fork-tile[32, 0, false, true](init);
+  //let (outer, inner) = fork-reshape[[1], [0]](init);
+  //init = outline(inner);
+  //inline(bfs@cost_init);
+
+  inline(bfs@loop1, bfs@loop2);
+}
 delete-uncalled(*);
 const-inline(*);
 
-unforkify(init_body, traverse_body, collect_body);
+unforkify(init, traverse, collect);
 simpl!(*);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/Cargo.toml b/juno_samples/rodinia/cfd/Cargo.toml
index 6720b527..172573dd 100644
--- a/juno_samples/rodinia/cfd/Cargo.toml
+++ b/juno_samples/rodinia/cfd/Cargo.toml
@@ -13,6 +13,7 @@ path = "src/lib.rs"
 
 [features]
 cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+seq = []
 
 [build-dependencies]
 juno_build = { path = "../../../juno_build" }
diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch
index 7a284a9a..13125961 100644
--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
@@ -27,29 +27,35 @@ fixpoint {
 simpl!(*);
 unforkify(compute_flux@inner_loop);
 
-fork-tile[32, 0, false, false](compute_step_factor);
-let split = fork-split(compute_step_factor);
-let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
-fork-coalesce(compute_step_factor, compute_step_factor_body);
-simpl!(compute_step_factor, compute_step_factor_body);
+if !feature("seq") {
+  fork-tile[32, 0, false, false](compute_step_factor);
+  let split = fork-split(compute_step_factor);
+  let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+  fork-coalesce(compute_step_factor, compute_step_factor_body);
+  simpl!(compute_step_factor, compute_step_factor_body);
+  compute_step_factor = compute_step_factor_body;
 
-fork-tile[32, 0, false, false](compute_flux);
-let split = fork-split(compute_flux);
-let compute_flux_body = outline(split._6_compute_flux.fj1);
-fork-coalesce(compute_flux, compute_flux_body);
-simpl!(compute_flux, compute_flux_body);
+  fork-tile[32, 0, false, false](compute_flux);
+  let split = fork-split(compute_flux);
+  let compute_flux_body = outline(split._6_compute_flux.fj1);
+  fork-coalesce(compute_flux, compute_flux_body);
+  simpl!(compute_flux, compute_flux_body);
+  compute_flux = compute_flux_body;
 
-fork-tile[32, 0, false, false](time_step);
-let split = fork-split(time_step);
-let time_step_body = outline(split._7_time_step.fj1);
-fork-coalesce(time_step, time_step_body);
-simpl!(time_step, time_step_body);
+  fork-tile[32, 0, false, false](time_step);
+  let split = fork-split(time_step);
+  let time_step_body = outline(split._7_time_step.fj1);
+  fork-coalesce(time_step, time_step_body);
+  simpl!(time_step, time_step_body);
+  time_step = time_step_body;
 
-fork-tile[32, 0, false, false](copy_vars);
-let split = fork-split(copy_vars);
-let copy_vars_body = outline(split._8_copy_vars.fj1);
-fork-coalesce(copy_vars, copy_vars_body);
-simpl!(copy_vars, copy_vars_body);
+  fork-tile[32, 0, false, false](copy_vars);
+  let split = fork-split(copy_vars);
+  let copy_vars_body = outline(split._8_copy_vars.fj1);
+  fork-coalesce(copy_vars, copy_vars_body);
+  simpl!(copy_vars, copy_vars_body);
+  copy_vars = copy_vars_body;
+}
 
-unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body);
+unforkify(compute_step_factor, compute_flux, time_step, copy_vars);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
index 518c656d..858be5ba 100644
--- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
@@ -27,35 +27,42 @@ simpl!(*);
 no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res);
 unforkify(compute_flux@inner_loop);
 
-fork-tile[32, 0, false, false](compute_step_factor);
-let split = fork-split(compute_step_factor);
-let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
-fork-coalesce(compute_step_factor, compute_step_factor_body);
-simpl!(compute_step_factor, compute_step_factor_body);
+if !feature("seq") {
+  fork-tile[32, 0, false, false](compute_step_factor);
+  let split = fork-split(compute_step_factor);
+  let compute_step_factor_body = outline(split._4_compute_step_factor.fj1);
+  fork-coalesce(compute_step_factor, compute_step_factor_body);
+  simpl!(compute_step_factor, compute_step_factor_body);
+  compute_step_factor = compute_step_factor_body;
 
-fork-tile[32, 0, false, false](compute_flux_contributions);
-let split = fork-split(compute_flux_contributions);
-let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
-fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
-simpl!(compute_flux_contributions, compute_flux_contributions_body);
+  fork-tile[32, 0, false, false](compute_flux_contributions);
+  let split = fork-split(compute_flux_contributions);
+  let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1);
+  fork-coalesce(compute_flux_contributions, compute_flux_contributions_body);
+  simpl!(compute_flux_contributions, compute_flux_contributions_body);
+  compute_flux_contributions = compute_flux_contributions_body;
 
-fork-tile[32, 0, false, false](compute_flux);
-let split = fork-split(compute_flux);
-let compute_flux_body = outline(split._7_compute_flux.fj1);
-fork-coalesce(compute_flux, compute_flux_body);
-simpl!(compute_flux, compute_flux_body);
+  fork-tile[32, 0, false, false](compute_flux);
+  let split = fork-split(compute_flux);
+  let compute_flux_body = outline(split._7_compute_flux.fj1);
+  fork-coalesce(compute_flux, compute_flux_body);
+  simpl!(compute_flux, compute_flux_body);
+  compute_flux = compute_flux_body;
 
-fork-tile[32, 0, false, false](time_step);
-let split = fork-split(time_step);
-let time_step_body = outline(split._8_time_step.fj1);
-fork-coalesce(time_step, time_step_body);
-simpl!(time_step, time_step_body);
+  fork-tile[32, 0, false, false](time_step);
+  let split = fork-split(time_step);
+  let time_step_body = outline(split._8_time_step.fj1);
+  fork-coalesce(time_step, time_step_body);
+  simpl!(time_step, time_step_body);
+  time_step = time_step_body;
 
-fork-tile[32, 0, false, false](copy_vars);
-let split = fork-split(copy_vars);
-let copy_vars_body = outline(split._9_copy_vars.fj1);
-fork-coalesce(copy_vars, copy_vars_body);
-simpl!(copy_vars, copy_vars_body);
+  fork-tile[32, 0, false, false](copy_vars);
+  let split = fork-split(copy_vars);
+  let copy_vars_body = outline(split._9_copy_vars.fj1);
+  fork-coalesce(copy_vars, copy_vars_body);
+  simpl!(copy_vars, copy_vars_body);
+  copy_vars = copy_vars_body;
+}
 
-unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body);
+unforkify(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars);
 gcm(*);
diff --git a/juno_samples/rodinia/srad/Cargo.toml b/juno_samples/rodinia/srad/Cargo.toml
index facf8c3b..783f3327 100644
--- a/juno_samples/rodinia/srad/Cargo.toml
+++ b/juno_samples/rodinia/srad/Cargo.toml
@@ -13,6 +13,7 @@ path = "src/lib.rs"
 
 [features]
 cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+seq = []
 
 [build-dependencies]
 juno_build = { path = "../../../juno_build" }
diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index 8917f03d..8fa22aaa 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -35,20 +35,24 @@ simpl!(*);
 slf(*);
 simpl!(*);
 
-fork-tile[32, 0, false, false](loop2);
-let split = fork-split(loop2);
-let loop2_body = outline(split.srad_1.fj1);
-simpl!(loop2, loop2_body);
+if !feature("seq") {
+  fork-tile[32, 0, false, false](loop2);
+  let split = fork-split(loop2);
+  let loop2_body = outline(split.srad_1.fj1);
+  simpl!(loop2, loop2_body);
+  loop2 = loop2_body;
 
-fork-tile[32, 0, false, false](loop3);
-let split = fork-split(loop3);
-let loop3_body = outline(split.srad_2.fj1);
-simpl!(loop3, loop3_body);
+  fork-tile[32, 0, false, false](loop3);
+  let split = fork-split(loop3);
+  let loop3_body = outline(split.srad_2.fj1);
+  simpl!(loop3, loop3_body);
+  loop3 = loop3_body;
 
-inline(srad@loop2, srad@loop3);
-delete-uncalled(*);
+  inline(srad@loop2, srad@loop3);
+  delete-uncalled(*);
+}
 
-fork-split(extract, compress, loop1, loop2_body, loop3_body);
-unforkify(extract, compress, loop1, loop2_body, loop3_body);
+fork-split(extract, compress, loop1, loop2, loop3);
+unforkify(extract, compress, loop1, loop2, loop3);
 
 gcm(*);
-- 
GitLab


From a0499390ad8b31f3beeaba489c5ea659f78c1285 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 21:14:47 -0600
Subject: [PATCH 5/5] fix: rebind init to init_body in bfs cpu schedule

---
 juno_samples/rodinia/bfs/src/cpu.sch | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index 63b2e4fe..ea6f0403 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -52,6 +52,7 @@ if !feature("seq") {
   let init_body = outline(inner);
 
   inline(bfs@cost_init, bfs@loop1, bfs@loop2);
+  init = init_body;
 }
 delete-uncalled(*);
 const-inline(*);
@@ -59,4 +60,4 @@ simpl!(*);
 
 unforkify(init, traverse, collect);
 simpl!(*);
-gcm(*);
+gcm(*);
\ No newline at end of file
-- 
GitLab