From fd62696bdbd42381fdd8a43ca1b20b9f278616b1 Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Tue, 4 Mar 2025 13:12:28 -0600
Subject: [PATCH 01/11] "fix" backprop large bench on gpu

---
 juno_samples/rodinia/backprop/benches/backprop_bench.rs | 2 +-
 juno_samples/rodinia/backprop/src/gpu.sch               | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
index 492bce3b..80964c72 100644
--- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs
+++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
@@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) {
     };
 
     bench("backprop bench small", 65536);
-    bench("backprop bench large", 33554432);
+    bench("backprop bench large", 1048576);
 }
 
 criterion_group!(benches, backprop_bench);
diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch
index f8cc84a3..cc9cc2ac 100644
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
@@ -33,10 +33,10 @@ fixpoint {
 reduce-slf(*);
 simpl!(*);
 
-fork-extend[32](layer_forward@inner_loop);
+fork-extend[1024](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 simpl!(layer_forward);
-fork-tile[32, 0, false, true](layer_forward@inner_loop);
+fork-tile[1024, 0, false, true](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 let out = fork-split(layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
-- 
GitLab


From 7d59b63ca1517086c4fe5b4bc60ffcc64205fe3f Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Tue, 4 Mar 2025 13:29:41 -0600
Subject: [PATCH 02/11] fix backprop for real

---
 hercules_cg/src/gpu.rs                              | 13 ++++++++++---
 .../rodinia/backprop/benches/backprop_bench.rs      |  2 +-
 juno_samples/rodinia/backprop/src/gpu.sch           |  5 +++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 1e6067a3..dd87acbe 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -562,9 +562,15 @@ namespace cg = cooperative_groups;
      * and writes.
      */
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
-        write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?;
+        write!(
+            w,
+            "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n"
+        )?;
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
-        write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?;
+        write!(
+            w,
+            "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n"
+        )?;
         Ok(())
     }
 
@@ -1726,7 +1732,8 @@ namespace cg = cooperative_groups;
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
                     if parallel_factor.is_none() {
-                        write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?;
+                        write!(thread_block_tiles, "\t{};\n", self.get_fork_iter(id, true))?;
+                        write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, false))?;
                         write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?;
                     }
                 }
diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
index 80964c72..492bce3b 100644
--- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs
+++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
@@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) {
     };
 
     bench("backprop bench small", 65536);
-    bench("backprop bench large", 1048576);
+    bench("backprop bench large", 33554432);
 }
 
 criterion_group!(benches, backprop_bench);
diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch
index cc9cc2ac..1773f713 100644
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
@@ -33,10 +33,11 @@ fixpoint {
 reduce-slf(*);
 simpl!(*);
 
-fork-extend[1024](layer_forward@inner_loop);
+fork-extend[32768](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 simpl!(layer_forward);
-fork-tile[1024, 0, false, true](layer_forward@inner_loop);
+fork-tile[32768, 0, false, true](layer_forward@inner_loop);
+fork-tile[1024, 1, false, true](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 let out = fork-split(layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
-- 
GitLab


From 94950efe71d9cbf4ef1840eafd9c8799b7106966 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Tue, 4 Mar 2025 13:53:07 -0600
Subject: [PATCH 03/11] Load inputs once in benches

---
 juno_samples/rodinia/bfs/benches/bfs_bench.rs |  41 ++-----
 juno_samples/rodinia/cfd/benches/cfd_bench.rs | 112 +++++++++---------
 2 files changed, 67 insertions(+), 86 deletions(-)

diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
index f87b4e09..ea8ba0ed 100644
--- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs
+++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs
@@ -13,41 +13,22 @@ fn bfs_bench(c: &mut Criterion) {
 
     let mut r = runner!(bfs);
 
-    group.bench_function("bfs bench 4096", |b| {
-        let input = "data/graph4096.txt";
-        let (nodes, source, edges) = parse_graph(input.into()).unwrap();
-        let n = nodes.len() as u64;
-        let m = edges.len() as u64;
-        let nodes = HerculesImmBox::from(&nodes as &[Node]);
-        let edges = HerculesImmBox::from(&edges as &[u32]);
-        b.iter(|| {
-            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
-        })
-    });
-
-    group.bench_function("bfs bench 65536", |b| {
-        let input = "data/graph65536.txt";
-        let (nodes, source, edges) = parse_graph(input.into()).unwrap();
-        let n = nodes.len() as u64;
-        let m = edges.len() as u64;
-        let nodes = HerculesImmBox::from(&nodes as &[Node]);
-        let edges = HerculesImmBox::from(&edges as &[u32]);
-        b.iter(|| {
-            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
-        })
-    });
-
-    group.bench_function("bfs bench 64M", |b| {
-        let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt";
+    let mut bench = |name, input: &'_ str| {
         let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines.");
         let n = nodes.len() as u64;
         let m = edges.len() as u64;
         let nodes = HerculesImmBox::from(&nodes as &[Node]);
         let edges = HerculesImmBox::from(&edges as &[u32]);
-        b.iter(|| {
-            async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
-        })
-    });
+        group.bench_function(name, |b| {
+            b.iter(|| {
+                async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await });
+            })
+        });
+    };
+
+    bench("bfs bench 4096", "data/graph4096.txt");
+    bench("bfs bench 65536", "data/graph65536.txt");
+    bench("bfs bench 64M", "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt");
 }
 
 criterion_group!(benches, bfs_bench);
diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
index aa6d7727..5b4db044 100644
--- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs
+++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs
@@ -13,38 +13,38 @@ fn cfd_bench(c: &mut Criterion) {
     group.sample_size(10);
 
     let mut euler_bench = |name, data_file, iterations| {
-        group.bench_function(name, |b| {
-            let mut r = runner!(euler);
-            let block_size = 16;
-            let FarFieldConditions {
-                ff_variable,
-                ff_fc_momentum_x,
-                ff_fc_momentum_y,
-                ff_fc_momentum_z,
-                ff_fc_density_energy,
-            } = set_far_field_conditions();
-            let GeometryData {
-                nelr,
-                areas,
-                elements_surrounding_elements,
-                normals,
-            } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
-            let mut variables = initialize_variables(nelr, &ff_variable);
+        let mut r = runner!(euler);
+        let block_size = 16;
+        let FarFieldConditions {
+            ff_variable,
+            ff_fc_momentum_x,
+            ff_fc_momentum_y,
+            ff_fc_momentum_z,
+            ff_fc_density_energy,
+        } = set_far_field_conditions();
+        let GeometryData {
+            nelr,
+            areas,
+            elements_surrounding_elements,
+            normals,
+        } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
+        let mut variables = initialize_variables(nelr, &ff_variable);
 
-            let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
-            let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
-            let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
-            let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
-            let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+        let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+        let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+        let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+        let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+        let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
 
-            let areas = HerculesImmBox::from(areas.as_slice());
-            let elements_surrounding_elements =
-                HerculesImmBox::from(elements_surrounding_elements.as_slice());
+        let areas = HerculesImmBox::from(areas.as_slice());
+        let elements_surrounding_elements =
+            HerculesImmBox::from(elements_surrounding_elements.as_slice());
 
-            let normals_x = HerculesImmBox::from(normals.x.as_slice());
-            let normals_y = HerculesImmBox::from(normals.y.as_slice());
-            let normals_z = HerculesImmBox::from(normals.z.as_slice());
+        let normals_x = HerculesImmBox::from(normals.x.as_slice());
+        let normals_y = HerculesImmBox::from(normals.y.as_slice());
+        let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
+        group.bench_function(name, |b| {
             b.iter(|| {
                 async_std::task::block_on(async {
                     r.run(
@@ -91,38 +91,38 @@ fn cfd_bench(c: &mut Criterion) {
     );
 
     let mut pre_euler_bench = |name, data_file, iterations| {
-        group.bench_function(name, |b| {
-            let mut r = runner!(pre_euler);
-            let block_size = 16;
-            let FarFieldConditions {
-                ff_variable,
-                ff_fc_momentum_x,
-                ff_fc_momentum_y,
-                ff_fc_momentum_z,
-                ff_fc_density_energy,
-            } = set_far_field_conditions();
-            let GeometryData {
-                nelr,
-                areas,
-                elements_surrounding_elements,
-                normals,
-            } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
-            let mut variables = initialize_variables(nelr, &ff_variable);
+        let mut r = runner!(pre_euler);
+        let block_size = 16;
+        let FarFieldConditions {
+            ff_variable,
+            ff_fc_momentum_x,
+            ff_fc_momentum_y,
+            ff_fc_momentum_z,
+            ff_fc_density_energy,
+        } = set_far_field_conditions();
+        let GeometryData {
+            nelr,
+            areas,
+            elements_surrounding_elements,
+            normals,
+        } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines.");
+        let mut variables = initialize_variables(nelr, &ff_variable);
 
-            let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
-            let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
-            let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
-            let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
-            let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
+        let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice());
+        let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice());
+        let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice());
+        let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice());
+        let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice());
 
-            let areas = HerculesImmBox::from(areas.as_slice());
-            let elements_surrounding_elements =
-                HerculesImmBox::from(elements_surrounding_elements.as_slice());
+        let areas = HerculesImmBox::from(areas.as_slice());
+        let elements_surrounding_elements =
+            HerculesImmBox::from(elements_surrounding_elements.as_slice());
 
-            let normals_x = HerculesImmBox::from(normals.x.as_slice());
-            let normals_y = HerculesImmBox::from(normals.y.as_slice());
-            let normals_z = HerculesImmBox::from(normals.z.as_slice());
+        let normals_x = HerculesImmBox::from(normals.x.as_slice());
+        let normals_y = HerculesImmBox::from(normals.y.as_slice());
+        let normals_z = HerculesImmBox::from(normals.z.as_slice());
 
+        group.bench_function(name, |b| {
             b.iter(|| {
                 async_std::task::block_on(async {
                     r.run(
-- 
GitLab


From 54acf3e288890bbe95d0bf003f7ecc1919348769 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 15:26:41 -0600
Subject: [PATCH 04/11] fix gpu backend to emit namespaces properly across cuda
 versions

---
 hercules_cg/src/gpu.rs | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index dd87acbe..4069cb02 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -354,7 +354,6 @@ impl GPUContext<'_> {
         write!(
             w,
             "
-#define _CG_ABI_EXPERIMENTAL
 #include <assert.h>
 #include <stdio.h>
 #include <stddef.h>
@@ -362,8 +361,23 @@ impl GPUContext<'_> {
 #include <cuda_runtime.h>
 #include <math_constants.h>
 #include <mma.h>
+
+#if (CUDA_VERSION >= 12000)
+#else
+#define _CG_ABI_EXPERIMENTAL
+#endif
+
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+
+#if (CUDA_VERSION >= 12000)
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups;
+#else
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups::experimental;
+#endif
+
 #include <cuda_bf16.h>
 namespace cg = cooperative_groups;
 
@@ -564,12 +578,12 @@ namespace cg = cooperative_groups;
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
         write!(
             w,
-            "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n"
+            "\t__shared__ cge::block_tile_memory<1024> block_sync_shared;\n"
         )?;
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
         write!(
             w,
-            "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n"
+            "\tcg::thread_block block = cge::this_thread_block(block_sync_shared);\n"
         )?;
         Ok(())
     }
@@ -1715,20 +1729,20 @@ namespace cg = cooperative_groups;
                     };
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         use_thread_per_id, cg_tile, use_thread_per_id
                     )?;
                     let cg_tile_use = self.get_cg_tile(id, CGType::Use);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         use_thread_quota, cg_tile_use, use_thread_quota
                     )?;
                     let available_thread_quota = available_thread_quota.unwrap();
                     let cg_tile_available = self.get_cg_tile(id, CGType::Available);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
                     if parallel_factor.is_none() {
-- 
GitLab


From 0107cc1c914a44cf925046a4d3fdaf6f160a7394 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 15:52:20 -0600
Subject: [PATCH 05/11] don't build egg for now

---
 Cargo.lock                                   | 123 -------------------
 hercules_opt/Cargo.toml                      |   2 +-
 hercules_opt/src/rewrite_math_expressions.rs |  15 ++-
 juno_samples/rodinia/bfs/src/cpu.sch         |   1 +
 4 files changed, 16 insertions(+), 125 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 61cde7f1..acc97830 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -23,12 +23,6 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1"
 
-[[package]]
-name = "allocator-api2"
-version = "0.2.21"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
-
 [[package]]
 name = "anes"
 version = "0.1.6"
@@ -684,27 +678,6 @@ version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
 
-[[package]]
-name = "egg"
-version = "0.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abb749745461743bb477fba3ef87c663d5965876155c676c9489cfe0963de5ab"
-dependencies = [
- "env_logger",
- "hashbrown",
- "indexmap",
- "log",
- "num-bigint",
- "num-traits",
- "quanta",
- "rustc-hash",
- "saturating",
- "smallvec",
- "symbol_table",
- "symbolic_expressions",
- "thiserror",
-]
-
 [[package]]
 name = "either"
 version = "1.13.0"
@@ -723,15 +696,6 @@ version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d"
 
-[[package]]
-name = "env_logger"
-version = "0.9.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7"
-dependencies = [
- "log",
-]
-
 [[package]]
 name = "equivalent"
 version = "1.0.2"
@@ -845,12 +809,6 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
-[[package]]
-name = "foldhash"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
-
 [[package]]
 name = "funty"
 version = "2.0.0"
@@ -975,11 +933,6 @@ name = "hashbrown"
 version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
-dependencies = [
- "allocator-api2",
- "equivalent",
- "foldhash",
-]
 
 [[package]]
 name = "heapless"
@@ -1047,7 +1000,6 @@ version = "0.1.0"
 dependencies = [
  "bimap",
  "bitvec",
- "egg",
  "either",
  "hercules_cg",
  "hercules_ir",
@@ -2117,21 +2069,6 @@ dependencies = [
  "bytemuck",
 ]
 
-[[package]]
-name = "quanta"
-version = "0.12.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
-dependencies = [
- "crossbeam-utils",
- "libc",
- "once_cell",
- "raw-cpuid",
- "wasi 0.11.0+wasi-snapshot-preview1",
- "web-sys",
- "winapi",
-]
-
 [[package]]
 name = "quick-error"
 version = "2.0.1"
@@ -2266,15 +2203,6 @@ dependencies = [
  "rgb",
 ]
 
-[[package]]
-name = "raw-cpuid"
-version = "11.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "529468c1335c1c03919960dfefdb1b3648858c20d7ec2d0663e728e4a717efbc"
-dependencies = [
- "bitflags 2.8.0",
-]
-
 [[package]]
 name = "rayon"
 version = "1.10.0"
@@ -2339,12 +2267,6 @@ version = "0.8.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a"
 
-[[package]]
-name = "rustc-hash"
-version = "2.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
-
 [[package]]
 name = "rustc_version"
 version = "0.4.1"
@@ -2388,12 +2310,6 @@ dependencies = [
  "winapi-util",
 ]
 
-[[package]]
-name = "saturating"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ece8e78b2f38ec51c51f5d475df0a7187ba5111b2a28bdc761ee05b075d40a71"
-
 [[package]]
 name = "scopeguard"
 version = "1.2.0"
@@ -2537,23 +2453,6 @@ version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
-[[package]]
-name = "symbol_table"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f19bffd69fb182e684d14e3c71d04c0ef33d1641ac0b9e81c712c734e83703bc"
-dependencies = [
- "crossbeam-utils",
- "foldhash",
- "hashbrown",
-]
-
-[[package]]
-name = "symbolic_expressions"
-version = "5.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c68d531d83ec6c531150584c42a4290911964d5f0d79132b193b67252a23b71"
-
 [[package]]
 name = "syn"
 version = "1.0.109"
@@ -2938,22 +2837,6 @@ version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082"
 
-[[package]]
-name = "winapi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
-dependencies = [
- "winapi-i686-pc-windows-gnu",
- "winapi-x86_64-pc-windows-gnu",
-]
-
-[[package]]
-name = "winapi-i686-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-
 [[package]]
 name = "winapi-util"
 version = "0.1.9"
@@ -2963,12 +2846,6 @@ dependencies = [
  "windows-sys",
 ]
 
-[[package]]
-name = "winapi-x86_64-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
-
 [[package]]
 name = "windows"
 version = "0.59.0"
diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml
index 92e05339..892e4c15 100644
--- a/hercules_opt/Cargo.toml
+++ b/hercules_opt/Cargo.toml
@@ -22,4 +22,4 @@ hercules_cg = { path = "../hercules_cg" }
 hercules_ir = { path = "../hercules_ir" }
 nestify = "*"
 bimap = "*"
-egg = "*"
+#egg = "*"
diff --git a/hercules_opt/src/rewrite_math_expressions.rs b/hercules_opt/src/rewrite_math_expressions.rs
index 6f52dc58..55e695ba 100644
--- a/hercules_opt/src/rewrite_math_expressions.rs
+++ b/hercules_opt/src/rewrite_math_expressions.rs
@@ -3,10 +3,22 @@ use std::fmt::{Error, Write};
 
 use hercules_ir::*;
 
-use egg::*;
+//use egg::*;
 
 use crate::*;
 
+pub fn rewrite_math_expressions(
+    editor: &mut FunctionEditor,
+    device: Device,
+    typing: &Vec<TypeID>,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>,
+    reduce_einsums: &(MathEnv, HashMap<NodeID, MathID>),
+) {
+    panic!("PANIC: The rewrite math expressions pass is currently disabled, as including egg increases compile times and we're not using it currently.");
+}
+
+/*
 define_language! {
     enum MathLanguage {
         "zero" = Zero,
@@ -164,3 +176,4 @@ fn egg_print_math_expr<W: Write>(id: MathID, env: &MathEnv, w: &mut W) -> Result
         _ => Err(Error::default()),
     }
 }
+*/
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index e5100561..1b2898bf 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -56,6 +56,7 @@ let init_body = init;
 inline(bfs@loop1, bfs@loop2);
 delete-uncalled(*);
 const-inline(*);
+simpl!(*);
 
 unforkify(init_body, traverse_body, collect_body);
 simpl!(*);
-- 
GitLab


From 23b791819e3440756624922c96400ae7c841ae7e Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 16:16:23 -0600
Subject: [PATCH 06/11] parallelize cost init loop

---
 juno_samples/rodinia/bfs/src/bfs.jn  | 2 +-
 juno_samples/rodinia/bfs/src/cpu.sch | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn
index d6ec25f2..75190300 100644
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
@@ -24,7 +24,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
   @cost_init for i in 0..n {
     cost[i] = -1;
   }
-  cost[source as u64] = 0;
+  @cost_init cost[source as u64] = 0;
 
   // Nodes that were updated in the current iteration
   let updated: bool[n];
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index 1b2898bf..34b73506 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -48,10 +48,9 @@ let collect_body = outline(inner);
 
 let init_body = init;
 // Following code seems to generate breaking RT code
-//fork-tile[32, 0, false, true](init);
-//let (outer, inner) = fork-reshape[[1], [0]](init);
-//let init_body = outline(inner);
-//inline(bfs@cost_init);
+fork-tile[32, 0, false, true](init);
+let (outer, inner) = fork-reshape[[1], [0]](init);
+let init_body = outline(inner);
 
 inline(bfs@loop1, bfs@loop2);
 delete-uncalled(*);
-- 
GitLab


From c35f968a0c332f92dc9a5296ee6cb9b7f1ddc8db Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 16:17:08 -0600
Subject: [PATCH 07/11] bfs: tile and outline traverse/collect in gpu schedule

---
 juno_samples/rodinia/bfs/src/gpu.sch | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 4e5c1f74..b734bc76 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -38,6 +38,12 @@ fixpoint {
 }
 simpl!(collect);
 
+fork-tile[32, 0, false, true](traverse, collect);
+let (outer, inner) = fork-reshape[[1], [0]](traverse);
+let traverse_body = outline(inner);
+let (outer, inner) = fork-reshape[[1], [0]](collect);
+let collect_body = outline(inner);
+
 fork-tile[1024, 0, false, true](traverse, collect);
 fork-split(traverse, collect);
 
-- 
GitLab


From e196f8044377e211d25bd09c198769925a6e26a4 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 16:17:31 -0600
Subject: [PATCH 08/11] bfs: unforkify init_body instead of init in gpu schedule

---
 juno_samples/rodinia/bfs/src/gpu.sch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index b734bc76..3e2c133e 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -47,5 +47,5 @@ let collect_body = outline(inner);
 fork-tile[1024, 0, false, true](traverse, collect);
 fork-split(traverse, collect);
 
-unforkify(init);
+unforkify(init_body);
 gcm(*);
-- 
GitLab


From 638ef760a716dbccfdaac8507c804c1e5228364f Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Tue, 4 Mar 2025 16:35:32 -0600
Subject: [PATCH 09/11] bfs: tile/outline init in gpu schedule, drop stale init_body binding in cpu schedule

---
 juno_samples/rodinia/bfs/src/cpu.sch | 2 --
 juno_samples/rodinia/bfs/src/gpu.sch | 8 +++-----
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index 34b73506..339782d6 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -46,8 +46,6 @@ let traverse_body = outline(inner);
 let (outer, inner) = fork-reshape[[1], [0]](collect);
 let collect_body = outline(inner);
 
-let init_body = init;
-// Following code seems to generate breaking RT code
 fork-tile[32, 0, false, true](init);
 let (outer, inner) = fork-reshape[[1], [0]](init);
 let init_body = outline(inner);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 3e2c133e..0253a021 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -38,11 +38,9 @@ fixpoint {
 }
 simpl!(collect);
 
-fork-tile[32, 0, false, true](traverse, collect);
-let (outer, inner) = fork-reshape[[1], [0]](traverse);
-let traverse_body = outline(inner);
-let (outer, inner) = fork-reshape[[1], [0]](collect);
-let collect_body = outline(inner);
+fork-tile[32, 0, false, true](init);
+let (outer, inner) = fork-reshape[[1], [0]](init);
+let init_body = outline(inner);
 
 fork-tile[1024, 0, false, true](traverse, collect);
 fork-split(traverse, collect);
-- 
GitLab


From fe37c4278ef1bb9daa9b632b3c330e5165b9e8ff Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 17:04:26 -0600
Subject: [PATCH 10/11] matmul: comment out rewrite(*) in cuda schedule

---
 juno_samples/matmul/src/matmul.sch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch
index 306997f5..b6667596 100644
--- a/juno_samples/matmul/src/matmul.sch
+++ b/juno_samples/matmul/src/matmul.sch
@@ -51,7 +51,7 @@ if feature("cuda") {
   fork-coalesce(*);
   infer-schedules(*);
   dce(*);
-  rewrite(*);
+  //rewrite(*);
   fixpoint {
     simplify-cfg(*);
     dce(*);
-- 
GitLab


From 1e4e9e75d367431a525fc79d819b1838d629cc0d Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 4 Mar 2025 17:11:46 -0600
Subject: [PATCH 11/11] matmul: outline matmul@outer and pass it to gpu() in cuda schedule

---
 juno_samples/matmul/src/matmul.sch | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch
index b6667596..6867576e 100644
--- a/juno_samples/matmul/src/matmul.sch
+++ b/juno_samples/matmul/src/matmul.sch
@@ -52,6 +52,8 @@ if feature("cuda") {
   infer-schedules(*);
   dce(*);
   //rewrite(*);
+  let out = outline(matmul@outer);
+  gpu(out);
   fixpoint {
     simplify-cfg(*);
     dce(*);
-- 
GitLab