From fd62696bdbd42381fdd8a43ca1b20b9f278616b1 Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Tue, 4 Mar 2025 13:12:28 -0600 Subject: [PATCH 1/5] "fix" backprop large bench on gpu --- juno_samples/rodinia/backprop/benches/backprop_bench.rs | 2 +- juno_samples/rodinia/backprop/src/gpu.sch | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs index 492bce3b..80964c72 100644 --- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs +++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs @@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) { }; bench("backprop bench small", 65536); - bench("backprop bench large", 33554432); + bench("backprop bench large", 1048576); } criterion_group!(benches, backprop_bench); diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch index f8cc84a3..cc9cc2ac 100644 --- a/juno_samples/rodinia/backprop/src/gpu.sch +++ b/juno_samples/rodinia/backprop/src/gpu.sch @@ -33,10 +33,10 @@ fixpoint { reduce-slf(*); simpl!(*); -fork-extend[32](layer_forward@inner_loop); +fork-extend[1024](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); simpl!(layer_forward); -fork-tile[32, 0, false, true](layer_forward@inner_loop); +fork-tile[1024, 0, false, true](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); let out = fork-split(layer_forward@inner_loop); clean-monoid-reduces(layer_forward); -- GitLab From 7d59b63ca1517086c4fe5b4bc60ffcc64205fe3f Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Tue, 4 Mar 2025 13:29:41 -0600 Subject: [PATCH 2/5] fix backprop for real --- hercules_cg/src/gpu.rs | 13 ++++++++++--- .../rodinia/backprop/benches/backprop_bench.rs | 2 +- juno_samples/rodinia/backprop/src/gpu.sch | 5 +++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 1e6067a3..dd87acbe 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -562,9 +562,15 @@ namespace cg = cooperative_groups; * and writes. */ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { - write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?; + write!( + w, + "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n" + )?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; - write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?; + write!( + w, + "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n" + )?; Ok(()) } @@ -1726,7 +1732,8 @@ namespace cg = cooperative_groups; available_thread_quota, cg_tile_available, available_thread_quota )?; if parallel_factor.is_none() { - write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?; + write!(thread_block_tiles, "\t{};\n", self.get_fork_iter(id, true))?; + write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, false))?; write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; } } diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs index 80964c72..492bce3b 100644 --- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs +++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs @@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) { }; bench("backprop bench small", 65536); - bench("backprop bench large", 1048576); + bench("backprop bench large", 33554432); } criterion_group!(benches, backprop_bench); diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch index cc9cc2ac..1773f713 100644 --- a/juno_samples/rodinia/backprop/src/gpu.sch +++ b/juno_samples/rodinia/backprop/src/gpu.sch @@ -33,10 +33,11 @@ fixpoint { reduce-slf(*); simpl!(*); -fork-extend[1024](layer_forward@inner_loop); +fork-extend[32768](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); simpl!(layer_forward); -fork-tile[1024, 0, false, true](layer_forward@inner_loop); +fork-tile[32768, 0, false, true](layer_forward@inner_loop); +fork-tile[1024, 1, false, true](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); let out = fork-split(layer_forward@inner_loop); clean-monoid-reduces(layer_forward); -- GitLab From 94950efe71d9cbf4ef1840eafd9c8799b7106966 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Tue, 4 Mar 2025 13:53:07 -0600 Subject: [PATCH 3/5] Load inputs once in benches --- juno_samples/rodinia/bfs/benches/bfs_bench.rs | 41 ++----- juno_samples/rodinia/cfd/benches/cfd_bench.rs | 112 +++++++++--------- 2 files changed, 67 insertions(+), 86 deletions(-) diff --git a/juno_samples/rodinia/bfs/benches/bfs_bench.rs b/juno_samples/rodinia/bfs/benches/bfs_bench.rs index f87b4e09..ea8ba0ed 100644 --- a/juno_samples/rodinia/bfs/benches/bfs_bench.rs +++ b/juno_samples/rodinia/bfs/benches/bfs_bench.rs @@ -13,41 +13,22 @@ fn bfs_bench(c: &mut Criterion) { let mut r = runner!(bfs); - group.bench_function("bfs bench 4096", |b| { - let input = "data/graph4096.txt"; - let (nodes, source, edges) = parse_graph(input.into()).unwrap(); - let n = nodes.len() as u64; - let m = edges.len() as u64; - let nodes = HerculesImmBox::from(&nodes as &[Node]); - let edges = HerculesImmBox::from(&edges as &[u32]); - b.iter(|| { - async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); - }) - }); - - group.bench_function("bfs bench 65536", |b| { - let input = "data/graph65536.txt"; - let (nodes, source, edges) = parse_graph(input.into()).unwrap(); - let n = nodes.len() as u64; - let m = edges.len() as u64; - let nodes = HerculesImmBox::from(&nodes as &[Node]); - let edges = HerculesImmBox::from(&edges as &[u32]); - b.iter(|| { - async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); - }) - }); - - group.bench_function("bfs bench 64M", |b| { - let input = "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt"; + let mut bench = |name, input: &'_ str| { let (nodes, source, edges) = parse_graph(input.into()).expect("PANIC: Couldn't read input file for 64M benchmark. Currently, this benchmark uses a hard-coded path, so it can only be run on the lab machines."); let n = nodes.len() as u64; let m = edges.len() as u64; let nodes = HerculesImmBox::from(&nodes as &[Node]); let edges = HerculesImmBox::from(&edges as &[u32]); - b.iter(|| { - async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); - }) - }); + group.bench_function(name, |b| { + b.iter(|| { + async_std::task::block_on(async { r.run(n, m, nodes.to(), source, edges.to()).await }); + }) + }); + }; + + bench("bfs bench 4096", "data/graph4096.txt"); + bench("bfs bench 65536", "data/graph65536.txt"); + bench("bfs bench 64M", "/scratch/aaronjc4/rodinia_3.1/data/bfs/graph64M.txt"); } criterion_group!(benches, bfs_bench); diff --git a/juno_samples/rodinia/cfd/benches/cfd_bench.rs b/juno_samples/rodinia/cfd/benches/cfd_bench.rs index aa6d7727..5b4db044 100644 --- a/juno_samples/rodinia/cfd/benches/cfd_bench.rs +++ b/juno_samples/rodinia/cfd/benches/cfd_bench.rs @@ -13,38 +13,38 @@ fn cfd_bench(c: &mut Criterion) { group.sample_size(10); let mut euler_bench = |name, data_file, iterations| { - group.bench_function(name, |b| { - let mut r = runner!(euler); - let block_size = 16; - let FarFieldConditions { - ff_variable, - ff_fc_momentum_x, - ff_fc_momentum_y, - ff_fc_momentum_z, - ff_fc_density_energy, - } = set_far_field_conditions(); - let GeometryData { - nelr, - areas, - elements_surrounding_elements, - normals, - } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); - let mut variables = initialize_variables(nelr, &ff_variable); + let mut r = runner!(euler); + let block_size = 16; + let FarFieldConditions { + ff_variable, + ff_fc_momentum_x, + ff_fc_momentum_y, + ff_fc_momentum_z, + ff_fc_density_energy, + } = set_far_field_conditions(); + let GeometryData { + nelr, + areas, + elements_surrounding_elements, + normals, + } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); + let mut variables = initialize_variables(nelr, &ff_variable); - let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); - let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); - let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); - let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); - let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); - let areas = HerculesImmBox::from(areas.as_slice()); - let elements_surrounding_elements = - HerculesImmBox::from(elements_surrounding_elements.as_slice()); + let areas = HerculesImmBox::from(areas.as_slice()); + let elements_surrounding_elements = + HerculesImmBox::from(elements_surrounding_elements.as_slice()); - let normals_x = HerculesImmBox::from(normals.x.as_slice()); - let normals_y = HerculesImmBox::from(normals.y.as_slice()); - let normals_z = HerculesImmBox::from(normals.z.as_slice()); + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); + group.bench_function(name, |b| { b.iter(|| { async_std::task::block_on(async { r.run( @@ -91,38 +91,38 @@ fn cfd_bench(c: &mut Criterion) { ); let mut pre_euler_bench = |name, data_file, iterations| { - group.bench_function(name, |b| { - let mut r = runner!(pre_euler); - let block_size = 16; - let FarFieldConditions { - ff_variable, - ff_fc_momentum_x, - ff_fc_momentum_y, - ff_fc_momentum_z, - ff_fc_density_energy, - } = set_far_field_conditions(); - let GeometryData { - nelr, - areas, - elements_surrounding_elements, - normals, - } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); - let mut variables = initialize_variables(nelr, &ff_variable); + let mut r = runner!(pre_euler); + let block_size = 16; + let FarFieldConditions { + ff_variable, + ff_fc_momentum_x, + ff_fc_momentum_y, + ff_fc_momentum_z, + ff_fc_density_energy, + } = set_far_field_conditions(); + let GeometryData { + nelr, + areas, + elements_surrounding_elements, + normals, + } = read_domain_geometry(data_file, block_size).expect("PANIC: Couldn't read input for CFD benchmark. Currently, the path for the largest CFD benchmark is hard-coded, so it can only be run on the lab machines."); + let mut variables = initialize_variables(nelr, &ff_variable); - let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); - let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); - let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); - let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); - let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); + let mut v_density = HerculesMutBox::from(variables.density.as_mut_slice()); + let mut v_momentum_x = HerculesMutBox::from(variables.momentum.x.as_mut_slice()); + let mut v_momentum_y = HerculesMutBox::from(variables.momentum.y.as_mut_slice()); + let mut v_momentum_z = HerculesMutBox::from(variables.momentum.z.as_mut_slice()); + let mut v_energy = HerculesMutBox::from(variables.energy.as_mut_slice()); - let areas = HerculesImmBox::from(areas.as_slice()); - let elements_surrounding_elements = - HerculesImmBox::from(elements_surrounding_elements.as_slice()); + let areas = HerculesImmBox::from(areas.as_slice()); + let elements_surrounding_elements = + HerculesImmBox::from(elements_surrounding_elements.as_slice()); - let normals_x = HerculesImmBox::from(normals.x.as_slice()); - let normals_y = HerculesImmBox::from(normals.y.as_slice()); - let normals_z = HerculesImmBox::from(normals.z.as_slice()); + let normals_x = HerculesImmBox::from(normals.x.as_slice()); + let normals_y = HerculesImmBox::from(normals.y.as_slice()); + let normals_z = HerculesImmBox::from(normals.z.as_slice()); + group.bench_function(name, |b| { b.iter(|| { async_std::task::block_on(async { r.run( -- GitLab From 3ce50c43068f7a664d5f4d228376a65a1e995c2b Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Tue, 4 Mar 2025 14:46:30 -0600 Subject: [PATCH 4/5] Add sequential feature to benchmarks --- juno_samples/cava/Cargo.toml | 3 +- juno_samples/cava/src/cpu.sch | 24 ++-- juno_samples/edge_detection/Cargo.toml | 1 + juno_samples/edge_detection/src/cpu.sch | 114 +++++++++++------- juno_samples/rodinia/backprop/Cargo.toml | 3 +- juno_samples/rodinia/backprop/src/cpu.sch | 20 +-- juno_samples/rodinia/bfs/Cargo.toml | 1 + juno_samples/rodinia/bfs/src/cpu.sch | 31 ++--- juno_samples/rodinia/cfd/Cargo.toml | 1 + juno_samples/rodinia/cfd/src/cpu_euler.sch | 48 ++++---- .../rodinia/cfd/src/cpu_pre_euler.sch | 59 +++++---- juno_samples/rodinia/srad/Cargo.toml | 1 + juno_samples/rodinia/srad/src/cpu.sch | 28 +++-- 13 files changed, 194 insertions(+), 140 deletions(-) diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml index 17e9a1d3..bdf144da 100644 --- a/juno_samples/cava/Cargo.toml +++ b/juno_samples/cava/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../juno_build" } @@ -30,4 +31,4 @@ criterion = { version = "0.5", features = ["html_reports"] } [[bench]] name = "cava_bench" -harness = false \ No newline at end of file +harness = false diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch index 6fc8adbb..ada2f552 100644 --- a/juno_samples/cava/src/cpu.sch +++ b/juno_samples/cava/src/cpu.sch @@ -113,14 +113,18 @@ fixpoint { simpl!(fuse4); array-slf(fuse4); simpl!(fuse4); -let par = fuse4@image_loop \ fuse4@channel_loop; -fork-tile[4, 1, false, false](par); -fork-tile[8, 0, false, false](par); -fork-interchange[1, 2](par); -let split = fork-split(par); -let fuse4_body = outline(split.cava_3.fj2); -fork-coalesce(fuse4, fuse4_body); -simpl!(fuse4, fuse4_body); + +if !feature("seq") { + let par = fuse4@image_loop \ fuse4@channel_loop; + fork-tile[4, 1, false, false](par); + fork-tile[8, 0, false, false](par); + fork-interchange[1, 2](par); + let split = fork-split(par); + let fuse4_body = outline(split.cava_3.fj2); + fork-coalesce(fuse4, fuse4_body); + simpl!(fuse4, fuse4_body); + fuse4 = fuse4_body; +} no-memset(fuse5@res1); no-memset(fuse5@res2); @@ -136,8 +140,8 @@ simpl!(fuse5); delete-uncalled(*); simpl!(*); -fork-split(fuse1, fuse2, fuse3, fuse4_body, fuse5); -unforkify(fuse1, fuse2, fuse3, fuse4_body, fuse5); +fork-split(fuse1, fuse2, fuse3, fuse4, fuse5); +unforkify(fuse1, fuse2, fuse3, fuse4, fuse5); simpl!(*); diff --git a/juno_samples/edge_detection/Cargo.toml b/juno_samples/edge_detection/Cargo.toml index fa4ca1ff..8def7500 100644 --- a/juno_samples/edge_detection/Cargo.toml +++ b/juno_samples/edge_detection/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" [features] opencv = ["dep:opencv"] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [[bin]] name = "juno_edge_detection" diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index 3e1321c5..64fee6b6 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -24,14 +24,18 @@ predication(gaussian_smoothing); simpl!(gaussian_smoothing); predication(gaussian_smoothing); simpl!(gaussian_smoothing); -let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop; -fork-tile[4, 1, false, false](par); -fork-tile[8, 0, false, false](par); -fork-interchange[1, 2](par); -let split = fork-split(par); -let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); -fork-coalesce(gaussian_smoothing, gaussian_smoothing_body); -simpl!(gaussian_smoothing, gaussian_smoothing_body); + +if !feature("seq") { + let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop; + fork-tile[4, 1, false, false](par); + fork-tile[8, 0, false, false](par); + fork-interchange[1, 2](par); + let split = fork-split(par); + let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); + fork-coalesce(gaussian_smoothing, gaussian_smoothing_body); + simpl!(gaussian_smoothing, gaussian_smoothing_body); + gaussian_smoothing = gaussian_smoothing_body; +} no-memset(laplacian_estimate@res); fixpoint { @@ -40,15 +44,19 @@ fixpoint { fork-coalesce(laplacian_estimate); } simpl!(laplacian_estimate); -let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop; -fork-tile[4, 1, false, false](par); -fork-tile[8, 0, false, false](par); -fork-interchange[1, 2](par); -let split = fork-split(par); -let body = split._1_laplacian_estimate.fj2; -let laplacian_estimate_body = outline(body); -fork-coalesce(laplacian_estimate, laplacian_estimate_body); -simpl!(laplacian_estimate, laplacian_estimate_body); + +if !feature("seq") { + let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop; + fork-tile[4, 1, false, false](par); + fork-tile[8, 0, false, false](par); + fork-interchange[1, 2](par); + let split = fork-split(par); + let body = split._1_laplacian_estimate.fj2; + let laplacian_estimate_body = outline(body); + fork-coalesce(laplacian_estimate, laplacian_estimate_body); + simpl!(laplacian_estimate, laplacian_estimate_body); + laplacian_estimate = laplacian_estimate_body; +} no-memset(zero_crossings@res); fixpoint { @@ -57,15 +65,19 @@ fixpoint { fork-coalesce(zero_crossings); } simpl!(zero_crossings); -let par = zero_crossings@image_loop \ zero_crossings@filter_loop; -fork-tile[4, 1, false, false](par); -fork-tile[8, 0, false, false](par); -fork-interchange[1, 2](par); -let split = fork-split(par); -let body = split._2_zero_crossings.fj2; -let zero_crossings_body = outline(body); -fork-coalesce(zero_crossings, zero_crossings_body); -simpl!(zero_crossings, zero_crossings_body); + +if !feature("seq") { + let par = zero_crossings@image_loop \ zero_crossings@filter_loop; + fork-tile[4, 1, false, false](par); + fork-tile[8, 0, false, false](par); + fork-interchange[1, 2](par); + let split = fork-split(par); + let body = split._2_zero_crossings.fj2; + let zero_crossings_body = outline(body); + fork-coalesce(zero_crossings, zero_crossings_body); + simpl!(zero_crossings, zero_crossings_body); + zero_crossings = zero_crossings_body; +} no-memset(gradient@res); fixpoint { @@ -84,17 +96,23 @@ fixpoint { fork-coalesce(max_gradient); } simpl!(max_gradient); -fork-dim-merge(max_gradient); -simpl!(max_gradient); -fork-tile[32, 0, false, false](max_gradient); -let split = fork-split(max_gradient); -clean-monoid-reduces(max_gradient); -let out = outline(split._4_max_gradient.fj1); -simpl!(max_gradient, out); -unforkify(out); -let out = fork-fission[split._4_max_gradient.fj0](max_gradient); -simpl!(max_gradient); -unforkify(out._4_max_gradient.fj_bottom); + +if !feature("seq") { + fork-dim-merge(max_gradient); + simpl!(max_gradient); + fork-tile[32, 0, false, false](max_gradient); + let split = fork-split(max_gradient); + clean-monoid-reduces(max_gradient); + let out = outline(split._4_max_gradient.fj1); + simpl!(max_gradient, out); + unforkify(out); + let out = fork-fission[split._4_max_gradient.fj0](max_gradient); + simpl!(max_gradient); + unforkify(out._4_max_gradient.fj_bottom); +} else { + fork-split(max_gradient); + unforkify(max_gradient); +} no-memset(reject_zero_crossings@res); fixpoint { @@ -104,18 +122,22 @@ fixpoint { } predication(reject_zero_crossings); simpl!(reject_zero_crossings); -fork-tile[4, 1, false, false](reject_zero_crossings); -fork-tile[8, 0, false, false](reject_zero_crossings); -fork-interchange[1, 2](reject_zero_crossings); -let split = fork-split(reject_zero_crossings); -let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); -fork-coalesce(reject_zero_crossings, reject_zero_crossings_body); -simpl!(reject_zero_crossings, reject_zero_crossings_body); + +if !feature("seq") { + fork-tile[4, 1, false, false](reject_zero_crossings); + fork-tile[8, 0, false, false](reject_zero_crossings); + fork-interchange[1, 2](reject_zero_crossings); + let split = fork-split(reject_zero_crossings); + let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); + fork-coalesce(reject_zero_crossings, reject_zero_crossings_body); + simpl!(reject_zero_crossings, reject_zero_crossings_body); + reject_zero_crossings = reject_zero_crossings_body; +} async-call(edge_detection@le, edge_detection@zc); -fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body); -unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body); +fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); +unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); simpl!(*); diff --git a/juno_samples/rodinia/backprop/Cargo.toml b/juno_samples/rodinia/backprop/Cargo.toml index 25185e09..68c76e92 100644 --- a/juno_samples/rodinia/backprop/Cargo.toml +++ b/juno_samples/rodinia/backprop/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../../juno_build" } @@ -31,4 +32,4 @@ criterion = { version = "0.5", features = ["html_reports"] } [[bench]] name = "backprop_bench" -harness = false \ No newline at end of file +harness = false diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 3c7f7d5f..4796f427 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -37,10 +37,12 @@ inline(backprop@forward_input, backprop@forward_hidden); let forward_input = outline(backprop@forward_input); let forward_hidden = outline(backprop@forward_hidden); -fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop); -let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop); -let forward_input = outline(inner); -inline(backprop@forward_input); +if !feature("seq") { + fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop); + let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop); + forward_input = outline(inner); + inline(backprop@forward_input); +} // The first call to adjust_weights has total loop dimensions of 1 * 17, so not // worth parallelizing (given that the body is trivial) @@ -50,10 +52,12 @@ inline(backprop@adjust_hidden, backprop@adjust_input); let adjust_hidden = outline(backprop@adjust_hidden); let adjust_input = outline(backprop@adjust_input); -fork-tile[16, 0, false, true](adjust_input); -let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input); -let adjust_input = outline(inner); -inline(backprop@adjust_input); +if !feature("seq") { + fork-tile[16, 0, false, true](adjust_input); + let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input); + adjust_input = outline(inner); + inline(backprop@adjust_input); +} delete-uncalled(*); const-inline(*); diff --git a/juno_samples/rodinia/bfs/Cargo.toml b/juno_samples/rodinia/bfs/Cargo.toml index 34b6f5ce..46f8ade6 100644 --- a/juno_samples/rodinia/bfs/Cargo.toml +++ b/juno_samples/rodinia/bfs/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../../juno_build" } diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index e5100561..550dce31 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -40,23 +40,24 @@ simpl!(collect); parallel-fork(traverse, collect); parallel-reduce(traverse, collect); -fork-tile[32, 0, false, true](traverse, collect); -let (outer, inner) = fork-reshape[[1], [0]](traverse); -let traverse_body = outline(inner); -let (outer, inner) = fork-reshape[[1], [0]](collect); -let collect_body = outline(inner); - -let init_body = init; -// Following code seems to generate breaking RT code -//fork-tile[32, 0, false, true](init); -//let (outer, inner) = fork-reshape[[1], [0]](init); -//let init_body = outline(inner); -//inline(bfs@cost_init); - -inline(bfs@loop1, bfs@loop2); +if !feature("seq") { + fork-tile[32, 0, false, true](traverse, collect); + let (outer, inner) = fork-reshape[[1], [0]](traverse); + traverse = outline(inner); + let (outer, inner) = fork-reshape[[1], [0]](collect); + collect = outline(inner); + + // Following code seems to generate breaking RT code + //fork-tile[32, 0, false, true](init); + //let (outer, inner) = fork-reshape[[1], [0]](init); + //init = outline(inner); + //inline(bfs@cost_init); + + inline(bfs@loop1, bfs@loop2); +} delete-uncalled(*); const-inline(*); -unforkify(init_body, traverse_body, collect_body); +unforkify(init, traverse, collect); simpl!(*); gcm(*); diff --git a/juno_samples/rodinia/cfd/Cargo.toml b/juno_samples/rodinia/cfd/Cargo.toml index 6720b527..172573dd 100644 --- a/juno_samples/rodinia/cfd/Cargo.toml +++ b/juno_samples/rodinia/cfd/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../../juno_build" } diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 7a284a9a..13125961 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -27,29 +27,35 @@ fixpoint { simpl!(*); unforkify(compute_flux@inner_loop); -fork-tile[32, 0, false, false](compute_step_factor); -let split = fork-split(compute_step_factor); -let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); -fork-coalesce(compute_step_factor, compute_step_factor_body); -simpl!(compute_step_factor, compute_step_factor_body); +if !feature("seq") { + fork-tile[32, 0, false, false](compute_step_factor); + let split = fork-split(compute_step_factor); + let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); + fork-coalesce(compute_step_factor, compute_step_factor_body); + simpl!(compute_step_factor, compute_step_factor_body); + compute_step_factor = compute_step_factor_body; -fork-tile[32, 0, false, false](compute_flux); -let split = fork-split(compute_flux); -let compute_flux_body = outline(split._6_compute_flux.fj1); -fork-coalesce(compute_flux, compute_flux_body); -simpl!(compute_flux, compute_flux_body); + fork-tile[32, 0, false, false](compute_flux); + let split = fork-split(compute_flux); + let compute_flux_body = outline(split._6_compute_flux.fj1); + fork-coalesce(compute_flux, compute_flux_body); + simpl!(compute_flux, compute_flux_body); + compute_flux = compute_flux_body; -fork-tile[32, 0, false, false](time_step); -let split = fork-split(time_step); -let time_step_body = outline(split._7_time_step.fj1); -fork-coalesce(time_step, time_step_body); -simpl!(time_step, time_step_body); + fork-tile[32, 0, false, false](time_step); + let split = fork-split(time_step); + let time_step_body = outline(split._7_time_step.fj1); + fork-coalesce(time_step, time_step_body); + simpl!(time_step, time_step_body); + time_step = time_step_body; -fork-tile[32, 0, false, false](copy_vars); -let split = fork-split(copy_vars); -let copy_vars_body = outline(split._8_copy_vars.fj1); -fork-coalesce(copy_vars, copy_vars_body); -simpl!(copy_vars, copy_vars_body); + fork-tile[32, 0, false, false](copy_vars); + let split = fork-split(copy_vars); + let copy_vars_body = outline(split._8_copy_vars.fj1); + fork-coalesce(copy_vars, copy_vars_body); + simpl!(copy_vars, copy_vars_body); + copy_vars = copy_vars_body; +} -unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body); +unforkify(compute_step_factor, compute_flux, time_step, copy_vars); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch index 518c656d..858be5ba 100644 --- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch @@ -27,35 +27,42 @@ simpl!(*); no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res); unforkify(compute_flux@inner_loop); -fork-tile[32, 0, false, false](compute_step_factor); -let split = fork-split(compute_step_factor); -let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); -fork-coalesce(compute_step_factor, compute_step_factor_body); -simpl!(compute_step_factor, compute_step_factor_body); +if !feature("seq") { + fork-tile[32, 0, false, false](compute_step_factor); + let split = fork-split(compute_step_factor); + let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); + fork-coalesce(compute_step_factor, compute_step_factor_body); + simpl!(compute_step_factor, compute_step_factor_body); + compute_step_factor = compute_step_factor_body; -fork-tile[32, 0, false, false](compute_flux_contributions); -let split = fork-split(compute_flux_contributions); -let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1); -fork-coalesce(compute_flux_contributions, compute_flux_contributions_body); -simpl!(compute_flux_contributions, compute_flux_contributions_body); + fork-tile[32, 0, false, false](compute_flux_contributions); + let split = fork-split(compute_flux_contributions); + let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1); + fork-coalesce(compute_flux_contributions, compute_flux_contributions_body); + simpl!(compute_flux_contributions, compute_flux_contributions_body); + compute_flux_contributions = compute_flux_contributions_body; -fork-tile[32, 0, false, false](compute_flux); -let split = fork-split(compute_flux); -let compute_flux_body = outline(split._7_compute_flux.fj1); -fork-coalesce(compute_flux, compute_flux_body); -simpl!(compute_flux, compute_flux_body); + fork-tile[32, 0, false, false](compute_flux); + let split = fork-split(compute_flux); + let compute_flux_body = outline(split._7_compute_flux.fj1); + fork-coalesce(compute_flux, compute_flux_body); + simpl!(compute_flux, compute_flux_body); + compute_flux = compute_flux_body; -fork-tile[32, 0, false, false](time_step); -let split = fork-split(time_step); -let time_step_body = outline(split._8_time_step.fj1); -fork-coalesce(time_step, time_step_body); -simpl!(time_step, time_step_body); + fork-tile[32, 0, false, false](time_step); + let split = fork-split(time_step); + let time_step_body = outline(split._8_time_step.fj1); + fork-coalesce(time_step, time_step_body); + simpl!(time_step, time_step_body); + time_step = time_step_body; -fork-tile[32, 0, false, false](copy_vars); -let split = fork-split(copy_vars); -let copy_vars_body = outline(split._9_copy_vars.fj1); -fork-coalesce(copy_vars, copy_vars_body); -simpl!(copy_vars, copy_vars_body); + fork-tile[32, 0, false, false](copy_vars); + let split = fork-split(copy_vars); + let copy_vars_body = outline(split._9_copy_vars.fj1); + fork-coalesce(copy_vars, copy_vars_body); + simpl!(copy_vars, copy_vars_body); + copy_vars = copy_vars_body; +} -unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body); +unforkify(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars); gcm(*); diff --git a/juno_samples/rodinia/srad/Cargo.toml b/juno_samples/rodinia/srad/Cargo.toml index facf8c3b..783f3327 100644 --- a/juno_samples/rodinia/srad/Cargo.toml +++ b/juno_samples/rodinia/srad/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../../juno_build" } diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 8917f03d..8fa22aaa 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -35,20 +35,24 @@ simpl!(*); slf(*); simpl!(*); -fork-tile[32, 0, false, false](loop2); -let split = fork-split(loop2); -let loop2_body = outline(split.srad_1.fj1); -simpl!(loop2, loop2_body); +if !feature("seq") { + fork-tile[32, 0, false, false](loop2); + let split = fork-split(loop2); + let loop2_body = outline(split.srad_1.fj1); + simpl!(loop2, loop2_body); + loop2 = loop2_body; -fork-tile[32, 0, false, false](loop3); -let split = fork-split(loop3); -let loop3_body = outline(split.srad_2.fj1); -simpl!(loop3, loop3_body); + fork-tile[32, 0, false, false](loop3); + let split = fork-split(loop3); + let loop3_body = outline(split.srad_2.fj1); + simpl!(loop3, loop3_body); + loop3 = loop3_body; -inline(srad@loop2, srad@loop3); -delete-uncalled(*); + inline(srad@loop2, srad@loop3); + delete-uncalled(*); +} -fork-split(extract, compress, loop1, loop2_body, loop3_body); -unforkify(extract, compress, loop1, loop2_body, loop3_body); +fork-split(extract, compress, loop1, loop2, loop3); +unforkify(extract, compress, loop1, loop2, loop3); gcm(*); -- GitLab From a0499390ad8b31f3beeaba489c5ea659f78c1285 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 4 Mar 2025 21:14:47 -0600 Subject: [PATCH 5/5] fix --- juno_samples/rodinia/bfs/src/cpu.sch | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 63b2e4fe..ea6f0403 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -52,6 +52,7 @@ if !feature("seq") { let init_body = outline(inner); inline(bfs@cost_init, bfs@loop1, bfs@loop2); + init = init_body; } delete-uncalled(*); const-inline(*); @@ -59,4 +60,4 @@ simpl!(*); unforkify(init, traverse, collect); simpl!(*); -gcm(*); +gcm(*); \ No newline at end of file -- GitLab