From fd62696bdbd42381fdd8a43ca1b20b9f278616b1 Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Tue, 4 Mar 2025 13:12:28 -0600 Subject: [PATCH 01/11] "fix" backprop large bench on gpu --- juno_samples/rodinia/backprop/benches/backprop_bench.rs | 2 +- juno_samples/rodinia/backprop/src/gpu.sch | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs index 492bce3b..80964c72 100644 --- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs +++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs @@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) { }; bench("backprop bench small", 65536); - bench("backprop bench large", 33554432); + bench("backprop bench large", 1048576); } criterion_group!(benches, backprop_bench); diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch index f8cc84a3..cc9cc2ac 100644 --- a/juno_samples/rodinia/backprop/src/gpu.sch +++ b/juno_samples/rodinia/backprop/src/gpu.sch @@ -33,10 +33,10 @@ fixpoint { reduce-slf(*); simpl!(*); -fork-extend[32](layer_forward@inner_loop); +fork-extend[1024](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); simpl!(layer_forward); -fork-tile[32, 0, false, true](layer_forward@inner_loop); +fork-tile[1024, 0, false, true](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); let out = fork-split(layer_forward@inner_loop); clean-monoid-reduces(layer_forward); -- GitLab From 7d59b63ca1517086c4fe5b4bc60ffcc64205fe3f Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Tue, 4 Mar 2025 13:29:41 -0600 Subject: [PATCH 02/11] fix backprop for real --- hercules_cg/src/gpu.rs | 13 ++++++++++--- .../rodinia/backprop/benches/backprop_bench.rs | 2 +- juno_samples/rodinia/backprop/src/gpu.sch | 5 +++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 1e6067a3..dd87acbe 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -562,9 +562,15 @@ namespace cg = cooperative_groups; * and writes. */ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { - write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?; + write!( + w, + "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n" + )?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; - write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?; + write!( + w, + "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n" + )?; Ok(()) } @@ -1726,7 +1732,8 @@ namespace cg = cooperative_groups; available_thread_quota, cg_tile_available, available_thread_quota )?; if parallel_factor.is_none() { - write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?; + write!(thread_block_tiles, "\t{};\n", self.get_fork_iter(id, true))?; + write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, false))?; write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; } } diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs index 80964c72..492bce3b 100644 --- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs +++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs @@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) { }; bench("backprop bench small", 65536); - bench("backprop bench large", 1048576); + bench("backprop bench large", 33554432); } criterion_group!(benches, backprop_bench); diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch index cc9cc2ac..1773f713 100644 --- a/juno_samples/rodinia/backprop/src/gpu.sch +++ b/juno_samples/rodinia/backprop/src/gpu.sch @@ -33,10 +33,11 @@ fixpoint { reduce-slf(*); simpl!(*); -fork-extend[1024](layer_forward@inner_loop); +fork-extend[32768](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); simpl!(layer_forward); -fork-tile[1024, 0, false, true](layer_forward@inner_loop); +fork-tile[32768, 0, false, true](layer_forward@inner_loop); +fork-tile[1024, 1, false, true](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); let out = fork-split(layer_forward@inner_loop); clean-monoid-reduces(layer_forward); -- GitLab From 94950efe71d9cbf4ef1840eafd9c8799b7106966 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Tue, 4 Mar 2025 13:53:07 -0600 Subject: [PATCH 03/11] Load inputs once in benches --- juno_samples/rodinia/bfs/benches/bfs_bench.rs | 41 ++----- juno_samples/rodinia/cfd/benches/cfd_bench.rs | 112 +++++++++--------- 2 files changed, 67 insertions(+), 86 deletions(-) diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs index f87b4e09..ea8ba0ed 100644 --- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs +++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs @@ -13,41 +13,22 @@ fn bfs_bench(c: &mut Criterion) { let mut r = runner!(bfs); - group.bench_function("bfs bench 4096", |b| { - let input = "data/graph4096.txt"; - let (nodes, source, edges) = parse_graph(input.into()).unwrap(); - let n = nodes.len() as u64; - let m = edges.len() as u64; - let nodes = HerculesImmBox::from(&nodes as &[Node]); - let edges = HerculesImmBox::from(&edges as &[u32]); - b.iter(|| { - async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); - }) - }); - - group.bench_function("bfs bench 65536", |b| { - let input = "data/graph65536.txt"; - let (nodes, source, edges) = parse_graph(input.into()).unwrap(); - let n = nodes.len() as u64; - let m = edges.len() as u64; - let nodes = HerculesImmBox::from(&nodes as &[Node]); - let edges = HerculesImmBox::from(&edges as &[u32]); - b.iter(|| { - async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); - }) - }); - - group.bench_function("bfs bench 64M", |b| { - let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt"; + let mut bench = |name, input: &'_ str| { let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines."); let n = nodes.len() as u64; let m = edges.len() as u64; let nodes = HerculesImmBox::from(&nodes as &[Node]); let edges = HerculesImmBox::from(&edges as &[u32]); - b.iter(|| { - async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); - }) - }); + group.bench_function(name, |b| { + b.iter(|| { + async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); + }) + }); + }; + + bench("bfs bench 4096", "data/graph4096.txt"); + bench("bfs bench 65536", "data/graph65536.txt"); + bench("bfs bench 64M", "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt"); } criterion_group!(benches, bfs_bench); diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs index aa6d7727..5b4db044 100644 --- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs +++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs @@ -13,38 +13,38 @@ fn cfd_bench(c: &mut Criterion) { group.sample_size(10); let mut euler_bench = |name, data_file, iterations| { - group.bench_function(name, |b| { - let mut r = runner!(euler); - let block_size = 16; - let FarFieldConditions { - ff_variable, - ff_fc_momentum_x, - ff_fc_momentum_y, - ff_fc_momentum_z, - ff_fc_density_energy, - } = set_far_field_conditions(); - let GeometryData { - nelr, - areas, - elements_surrounding_elements, - normals, - } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); - let mut variables = initialize_variables(nelr, &ff_variable); + let mut r = runner!(euler); + let block_size = 16; + let FarFieldConditions { + ff_variable, + ff_fc_momentum_x, + ff_fc_momentum_y, + ff_fc_momentum_z, + ff_fc_density_energy, + } = set_far_field_conditions(); + let GeometryData { + nelr, + areas, + elements_surrounding_elements, + normals, + } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); + let mut variables = initialize_variables(nelr, &ff_variable); - let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); - let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); - let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); - let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); - let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); - let areas = HerculesImmBox::from(areas.as_slice()); - let elements_surrounding_elements = - HerculesImmBox::from(elements_surrounding_elements.as_slice()); + let areas = HerculesImmBox::from(areas.as_slice()); + let elements_surrounding_elements = + HerculesImmBox::from(elements_surrounding_elements.as_slice()); - let normals_x = HerculesImmBox::from(normals.x.as_slice()); - let normals_y = HerculesImmBox::from(normals.y.as_slice()); - let normals_z = HerculesImmBox::from(normals.z.as_slice()); + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); + group.bench_function(name, |b| { b.iter(|| { async_std::task::block_on(async { r.run( @@ -91,38 +91,38 @@ fn cfd_bench(c: &mut Criterion) { ); let mut pre_euler_bench = |name, data_file, iterations| { - group.bench_function(name, |b| { - let mut r = runner!(pre_euler); - let block_size = 16; - let FarFieldConditions { - ff_variable, - ff_fc_momentum_x, - ff_fc_momentum_y, - ff_fc_momentum_z, - ff_fc_density_energy, - } = set_far_field_conditions(); - let GeometryData { - nelr, - areas, - elements_surrounding_elements, - normals, - } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); - let mut variables = initialize_variables(nelr, &ff_variable); + let mut r = runner!(pre_euler); + let block_size = 16; + let FarFieldConditions { + ff_variable, + ff_fc_momentum_x, + ff_fc_momentum_y, + ff_fc_momentum_z, + ff_fc_density_energy, + } = set_far_field_conditions(); + let GeometryData { + nelr, + areas, + elements_surrounding_elements, + normals, + } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); + let mut variables = initialize_variables(nelr, &ff_variable); - let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); - let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); - let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); - let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); - let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); - let areas = HerculesImmBox::from(areas.as_slice()); - let elements_surrounding_elements = - HerculesImmBox::from(elements_surrounding_elements.as_slice()); + let areas = HerculesImmBox::from(areas.as_slice()); + let elements_surrounding_elements = + HerculesImmBox::from(elements_surrounding_elements.as_slice()); - let normals_x = HerculesImmBox::from(normals.x.as_slice()); - let normals_y = HerculesImmBox::from(normals.y.as_slice()); - let normals_z = HerculesImmBox::from(normals.z.as_slice()); + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); + group.bench_function(name, |b| { b.iter(|| { async_std::task::block_on(async { r.run( -- GitLab From 54acf3e288890bbe95d0bf003f7ecc1919348769 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 15:26:41 -0600 Subject: [PATCH 04/11] fix gpu backend to emit namespaces properly across cuda versions --- hercules_cg/src/gpu.rs | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index dd87acbe..4069cb02 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -354,7 +354,6 @@ impl GPUContext<'_> { write!( w, " -#define _CG_ABI_EXPERIMENTAL #include <assert.h> #include <stdio.h> #include <stddef.h> @@ -362,8 +361,23 @@ impl GPUContext<'_> { #include <cuda_runtime.h> #include <math_constants.h> #include <mma.h> + +#if (CUDA_VERSION >= 12000) +#else +#define _CG_ABI_EXPERIMENTAL +#endif + #include <cooperative_groups.h> #include <cooperative_groups/reduce.h> + +#if (CUDA_VERSION >= 12000) +namespace cg = cooperative_groups; +namespace cge = cooperative_groups; +#else +namespace cg = cooperative_groups; +namespace cge = cooperative_groups::experimental; +#endif + #include <cuda_bf16.h> namespace cg = cooperative_groups; @@ -564,12 +578,12 @@ namespace cg = cooperative_groups; fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!( w, - "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n" + "\t__shared__ cge::block_tile_memory<1024> block_sync_shared;\n" )?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; write!( w, - "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n" + "\tcg::thread_block block = cge::this_thread_block(block_sync_shared);\n" )?; Ok(()) } @@ -1715,20 +1729,20 @@ namespace cg = cooperative_groups; }; write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id )?; let cg_tile_use = self.get_cg_tile(id, CGType::Use); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota )?; let available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota )?; if parallel_factor.is_none() { -- GitLab From 0107cc1c914a44cf925046a4d3fdaf6f160a7394 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 15:52:20 -0600 Subject: [PATCH 05/11] don't build egg for now --- Cargo.lock | 123 ------------------- hercules_opt/Cargo.toml | 2 +- hercules_opt/src/rewrite_math_expressions.rs | 15 ++- juno_samples/rodinia/bfs/src/cpu.sch | 1 + 4 files changed, 16 insertions(+), 125 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 61cde7f1..acc97830 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,12 +23,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1" -[[package]] -name = "allocator-api2" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" - [[package]] name = "anes" version = "0.1.6" @@ -684,27 +678,6 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" -[[package]] -name = "egg" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abb749745461743bb477fba3ef87c663d5965876155c676c9489cfe0963de5ab" -dependencies = [ - "env_logger", - "hashbrown", - "indexmap", - "log", - "num-bigint", - "num-traits", - "quanta", - "rustc-hash", - "saturating", - "smallvec", - "symbol_table", - "symbolic_expressions", - "thiserror", -] - [[package]] name = "either" version = "1.13.0" @@ -723,15 +696,6 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" -[[package]] -name = "env_logger" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7" -dependencies = [ - "log", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -845,12 +809,6 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" -[[package]] -name = "foldhash" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" - [[package]] name = "funty" version = "2.0.0" @@ -975,11 +933,6 @@ name = "hashbrown" version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" -dependencies = [ - "allocator-api2", - "equivalent", - "foldhash", -] [[package]] name = "heapless" @@ -1047,7 +1000,6 @@ version = "0.1.0" dependencies = [ "bimap", "bitvec", - "egg", "either", "hercules_cg", "hercules_ir", @@ -2117,21 +2069,6 @@ dependencies = [ "bytemuck", ] -[[package]] -name = "quanta" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e" -dependencies = [ - "crossbeam-utils", - "libc", - "once_cell", - "raw-cpuid", - "wasi 0.11.0+wasi-snapshot-preview1", - "web-sys", - "winapi", -] - [[package]] name = "quick-error" version = "2.0.1" @@ -2266,15 +2203,6 @@ dependencies = [ "rgb", ] -[[package]] -name = "raw-cpuid" -version = "11.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "529468c1335c1c03919960dfefdb1b3648858c20d7ec2d0663e728e4a717efbc" -dependencies = [ - "bitflags 2.8.0", -] - [[package]] name = "rayon" version = "1.10.0" @@ -2339,12 +2267,6 @@ version = "0.8.50" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a" -[[package]] -name = "rustc-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - [[package]] name = "rustc_version" version = "0.4.1" @@ -2388,12 +2310,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "saturating" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece8e78b2f38ec51c51f5d475df0a7187ba5111b2a28bdc761ee05b075d40a71" - [[package]] name = "scopeguard" version = "1.2.0" @@ -2537,23 +2453,6 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "symbol_table" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f19bffd69fb182e684d14e3c71d04c0ef33d1641ac0b9e81c712c734e83703bc" -dependencies = [ - "crossbeam-utils", - "foldhash", - "hashbrown", -] - -[[package]] -name = "symbolic_expressions" -version = "5.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c68d531d83ec6c531150584c42a4290911964d5f0d79132b193b67252a23b71" - [[package]] name = "syn" version = "1.0.109" @@ -2938,22 +2837,6 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082" -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - [[package]] name = "winapi-util" version = "0.1.9" @@ -2963,12 +2846,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - [[package]] name = "windows" version = "0.59.0" diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml index 92e05339..892e4c15 100644 --- a/hercules_opt/Cargo.toml +++ b/hercules_opt/Cargo.toml @@ -22,4 +22,4 @@ hercules_cg = { path = "../hercules_cg" } hercules_ir = { path = "../hercules_ir" } nestify = "*" bimap = "*" -egg = "*" +#egg = "*" diff --git a/hercules_opt/src/rewrite_math_expressions.rs b/hercules_opt/src/rewrite_math_expressions.rs index 6f52dc58..55e695ba 100644 --- a/hercules_opt/src/rewrite_math_expressions.rs +++ b/hercules_opt/src/rewrite_math_expressions.rs @@ -3,10 +3,22 @@ use std::fmt::{Error, Write}; use hercules_ir::*; -use egg::*; +//use egg::*; use crate::*; +pub fn rewrite_math_expressions( + editor: &mut FunctionEditor, + device: Device, + typing: &Vec<TypeID>, + fork_join_map: &HashMap<NodeID, NodeID>, + nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>, + reduce_einsums: &(MathEnv, HashMap<NodeID, MathID>), +) { + panic!("PANIC: The rewrite math expressions pass is currently disabled, as including egg increases compile times and we're not using it currently."); +} + +/* define_language! { enum MathLanguage { "zero" = Zero, @@ -164,3 +176,4 @@ fn egg_print_math_expr<W: Write>(id: MathID, env: &MathEnv, w: &mut W) -> Result _ => Err(Error::default()), } } +*/ diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index e5100561..1b2898bf 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -56,6 +56,7 @@ let init_body = init; inline(bfs@loop1, bfs@loop2); delete-uncalled(*); const-inline(*); +simpl!(*); unforkify(init_body, traverse_body, collect_body); simpl!(*); -- GitLab From 23b791819e3440756624922c96400ae7c841ae7e Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 16:16:23 -0600 Subject: [PATCH 06/11] parallelize cost init loop --- juno_samples/rodinia/bfs/src/bfs.jn | 2 +- juno_samples/rodinia/bfs/src/cpu.sch | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index d6ec25f2..75190300 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -24,7 +24,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] @cost_init for i in 0..n { cost[i] = -1; } - cost[source as u64] = 0; + @cost_init cost[source as u64] = 0; // Nodes that were updated in the current iteration let updated: bool[n]; diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 1b2898bf..34b73506 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -48,10 +48,9 @@ let collect_body = outline(inner); let init_body = init; // Following code seems to generate breaking RT code -//fork-tile[32, 0, false, true](init); -//let (outer, inner) = fork-reshape[[1], [0]](init); -//let init_body = outline(inner); -//inline(bfs@cost_init); +fork-tile[32, 0, false, true](init); +let (outer, inner) = fork-reshape[[1], [0]](init); +let init_body = outline(inner); inline(bfs@loop1, bfs@loop2); delete-uncalled(*); -- GitLab From c35f968a0c332f92dc9a5296ee6cb9b7f1ddc8db Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 16:17:08 -0600 Subject: [PATCH 07/11] . --- juno_samples/rodinia/bfs/src/gpu.sch | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 4e5c1f74..b734bc76 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -38,6 +38,12 @@ fixpoint { } simpl!(collect); +fork-tile[32, 0, false, true](traverse, collect); +let (outer, inner) = fork-reshape[[1], [0]](traverse); +let traverse_body = outline(inner); +let (outer, inner) = fork-reshape[[1], [0]](collect); +let collect_body = outline(inner); + fork-tile[1024, 0, false, true](traverse, collect); fork-split(traverse, collect); -- GitLab From e196f8044377e211d25bd09c198769925a6e26a4 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 16:17:31 -0600 Subject: [PATCH 08/11] whoops --- juno_samples/rodinia/bfs/src/gpu.sch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index b734bc76..3e2c133e 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -47,5 +47,5 @@ let collect_body = outline(inner); fork-tile[1024, 0, false, true](traverse, collect); fork-split(traverse, collect); -unforkify(init); +unforkify(init_body); gcm(*); -- GitLab From 638ef760a716dbccfdaac8507c804c1e5228364f Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Tue, 4 Mar 2025 16:35:32 -0600 Subject: [PATCH 09/11] fix --- juno_samples/rodinia/bfs/src/cpu.sch | 2 -- juno_samples/rodinia/bfs/src/gpu.sch | 8 +++----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 34b73506..339782d6 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -46,8 +46,6 @@ let traverse_body = outline(inner); let (outer, inner) = fork-reshape[[1], [0]](collect); let collect_body = outline(inner); -let init_body = init; -// Following code seems to generate breaking RT code fork-tile[32, 0, false, true](init); let (outer, inner) = fork-reshape[[1], [0]](init); let init_body = outline(inner); diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 3e2c133e..0253a021 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -38,11 +38,9 @@ fixpoint { } simpl!(collect); -fork-tile[32, 0, false, true](traverse, collect); -let (outer, inner) = fork-reshape[[1], [0]](traverse); -let traverse_body = outline(inner); -let (outer, inner) = fork-reshape[[1], [0]](collect); -let collect_body = outline(inner); +fork-tile[32, 0, false, true](init); +let (outer, inner) = fork-reshape[[1], [0]](init); +let init_body = outline(inner); fork-tile[1024, 0, false, true](traverse, collect); fork-split(traverse, collect); -- GitLab From fe37c4278ef1bb9daa9b632b3c330e5165b9e8ff Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 17:04:26 -0600 Subject: [PATCH 10/11] fix --- juno_samples/matmul/src/matmul.sch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch index 306997f5..b6667596 100644 --- a/juno_samples/matmul/src/matmul.sch +++ b/juno_samples/matmul/src/matmul.sch @@ -51,7 +51,7 @@ if feature("cuda") { fork-coalesce(*); infer-schedules(*); dce(*); - rewrite(*); + //rewrite(*); fixpoint { simplify-cfg(*); dce(*); -- GitLab From 1e4e9e75d367431a525fc79d819b1838d629cc0d Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 17:11:46 -0600 Subject: [PATCH 11/11] fix --- juno_samples/matmul/src/matmul.sch | 2 ++ 1 file changed, 2 insertions(+) diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch index b6667596..6867576e 100644 --- a/juno_samples/matmul/src/matmul.sch +++ b/juno_samples/matmul/src/matmul.sch @@ -52,6 +52,8 @@ if feature("cuda") { infer-schedules(*); dce(*); //rewrite(*); + let out = outline(matmul@outer); + gpu(out); fixpoint { simplify-cfg(*); dce(*); -- GitLab