From 3ce50c43068f7a664d5f4d228376a65a1e995c2b Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Tue, 4 Mar 2025 14:46:30 -0600 Subject: [PATCH] Add sequential feature to benchmarks --- juno_samples/cava/Cargo.toml | 3 +- juno_samples/cava/src/cpu.sch | 24 ++-- juno_samples/edge_detection/Cargo.toml | 1 + juno_samples/edge_detection/src/cpu.sch | 114 +++++++++++------- juno_samples/rodinia/backprop/Cargo.toml | 3 +- juno_samples/rodinia/backprop/src/cpu.sch | 20 +-- juno_samples/rodinia/bfs/Cargo.toml | 1 + juno_samples/rodinia/bfs/src/cpu.sch | 31 ++--- juno_samples/rodinia/cfd/Cargo.toml | 1 + juno_samples/rodinia/cfd/src/cpu_euler.sch | 48 ++++---- .../rodinia/cfd/src/cpu_pre_euler.sch | 59 +++++---- juno_samples/rodinia/srad/Cargo.toml | 1 + juno_samples/rodinia/srad/src/cpu.sch | 28 +++-- 13 files changed, 194 insertions(+), 140 deletions(-) diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml index 17e9a1d3..bdf144da 100644 --- a/juno_samples/cava/Cargo.toml +++ b/juno_samples/cava/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../juno_build" } @@ -30,4 +31,4 @@ criterion = { version = "0.5", features = ["html_reports"] } [[bench]] name = "cava_bench" -harness = false \ No newline at end of file +harness = false diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch index 6fc8adbb..ada2f552 100644 --- a/juno_samples/cava/src/cpu.sch +++ b/juno_samples/cava/src/cpu.sch @@ -113,14 +113,18 @@ fixpoint { simpl!(fuse4); array-slf(fuse4); simpl!(fuse4); -let par = fuse4@image_loop \ fuse4@channel_loop; -fork-tile[4, 1, false, false](par); -fork-tile[8, 0, false, false](par); -fork-interchange[1, 2](par); -let split = fork-split(par); -let fuse4_body = outline(split.cava_3.fj2); -fork-coalesce(fuse4, fuse4_body); -simpl!(fuse4, fuse4_body); + +if !feature("seq") { + let par = fuse4@image_loop \ fuse4@channel_loop; + fork-tile[4, 1, false, false](par); + fork-tile[8, 0, false, false](par); + fork-interchange[1, 2](par); + let split = fork-split(par); + let fuse4_body = outline(split.cava_3.fj2); + fork-coalesce(fuse4, fuse4_body); + simpl!(fuse4, fuse4_body); + fuse4 = fuse4_body; +} no-memset(fuse5@res1); no-memset(fuse5@res2); @@ -136,8 +140,8 @@ simpl!(fuse5); delete-uncalled(*); simpl!(*); -fork-split(fuse1, fuse2, fuse3, fuse4_body, fuse5); -unforkify(fuse1, fuse2, fuse3, fuse4_body, fuse5); +fork-split(fuse1, fuse2, fuse3, fuse4, fuse5); +unforkify(fuse1, fuse2, fuse3, fuse4, fuse5); simpl!(*); diff --git a/juno_samples/edge_detection/Cargo.toml b/juno_samples/edge_detection/Cargo.toml index fa4ca1ff..8def7500 100644 --- a/juno_samples/edge_detection/Cargo.toml +++ b/juno_samples/edge_detection/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" [features] opencv = ["dep:opencv"] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [[bin]] name = "juno_edge_detection" diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index 3e1321c5..64fee6b6 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -24,14 +24,18 @@ predication(gaussian_smoothing); simpl!(gaussian_smoothing); predication(gaussian_smoothing); simpl!(gaussian_smoothing); -let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop; -fork-tile[4, 1, false, false](par); -fork-tile[8, 0, false, false](par); -fork-interchange[1, 2](par); -let split = fork-split(par); -let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); -fork-coalesce(gaussian_smoothing, gaussian_smoothing_body); -simpl!(gaussian_smoothing, gaussian_smoothing_body); + +if !feature("seq") { + let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop; + fork-tile[4, 1, false, false](par); + fork-tile[8, 0, false, false](par); + fork-interchange[1, 2](par); + let split = fork-split(par); + let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); + fork-coalesce(gaussian_smoothing, gaussian_smoothing_body); + simpl!(gaussian_smoothing, gaussian_smoothing_body); + gaussian_smoothing = gaussian_smoothing_body; +} no-memset(laplacian_estimate@res); fixpoint { @@ -40,15 +44,19 @@ fixpoint { fork-coalesce(laplacian_estimate); } simpl!(laplacian_estimate); -let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop; -fork-tile[4, 1, false, false](par); -fork-tile[8, 0, false, false](par); -fork-interchange[1, 2](par); -let split = fork-split(par); -let body = split._1_laplacian_estimate.fj2; -let laplacian_estimate_body = outline(body); -fork-coalesce(laplacian_estimate, laplacian_estimate_body); -simpl!(laplacian_estimate, laplacian_estimate_body); + +if !feature("seq") { + let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop; + fork-tile[4, 1, false, false](par); + fork-tile[8, 0, false, false](par); + fork-interchange[1, 2](par); + let split = fork-split(par); + let body = split._1_laplacian_estimate.fj2; + let laplacian_estimate_body = outline(body); + fork-coalesce(laplacian_estimate, laplacian_estimate_body); + simpl!(laplacian_estimate, laplacian_estimate_body); + laplacian_estimate = laplacian_estimate_body; +} no-memset(zero_crossings@res); fixpoint { @@ -57,15 +65,19 @@ fixpoint { fork-coalesce(zero_crossings); } simpl!(zero_crossings); -let par = zero_crossings@image_loop \ zero_crossings@filter_loop; -fork-tile[4, 1, false, false](par); -fork-tile[8, 0, false, false](par); -fork-interchange[1, 2](par); -let split = fork-split(par); -let body = split._2_zero_crossings.fj2; -let zero_crossings_body = outline(body); -fork-coalesce(zero_crossings, zero_crossings_body); -simpl!(zero_crossings, zero_crossings_body); + +if !feature("seq") { + let par = zero_crossings@image_loop \ zero_crossings@filter_loop; + fork-tile[4, 1, false, false](par); + fork-tile[8, 0, false, false](par); + fork-interchange[1, 2](par); + let split = fork-split(par); + let body = split._2_zero_crossings.fj2; + let zero_crossings_body = outline(body); + fork-coalesce(zero_crossings, zero_crossings_body); + simpl!(zero_crossings, zero_crossings_body); + zero_crossings = zero_crossings_body; +} no-memset(gradient@res); fixpoint { @@ -84,17 +96,23 @@ fixpoint { fork-coalesce(max_gradient); } simpl!(max_gradient); -fork-dim-merge(max_gradient); -simpl!(max_gradient); -fork-tile[32, 0, false, false](max_gradient); -let split = fork-split(max_gradient); -clean-monoid-reduces(max_gradient); -let out = outline(split._4_max_gradient.fj1); -simpl!(max_gradient, out); -unforkify(out); -let out = fork-fission[split._4_max_gradient.fj0](max_gradient); -simpl!(max_gradient); -unforkify(out._4_max_gradient.fj_bottom); + +if !feature("seq") { + fork-dim-merge(max_gradient); + simpl!(max_gradient); + fork-tile[32, 0, false, false](max_gradient); + let split = fork-split(max_gradient); + clean-monoid-reduces(max_gradient); + let out = outline(split._4_max_gradient.fj1); + simpl!(max_gradient, out); + unforkify(out); + let out = fork-fission[split._4_max_gradient.fj0](max_gradient); + simpl!(max_gradient); + unforkify(out._4_max_gradient.fj_bottom); +} else { + fork-split(max_gradient); + unforkify(max_gradient); +} no-memset(reject_zero_crossings@res); fixpoint { @@ -104,18 +122,22 @@ fixpoint { } predication(reject_zero_crossings); simpl!(reject_zero_crossings); -fork-tile[4, 1, false, false](reject_zero_crossings); -fork-tile[8, 0, false, false](reject_zero_crossings); -fork-interchange[1, 2](reject_zero_crossings); -let split = fork-split(reject_zero_crossings); -let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); -fork-coalesce(reject_zero_crossings, reject_zero_crossings_body); -simpl!(reject_zero_crossings, reject_zero_crossings_body); + +if !feature("seq") { + fork-tile[4, 1, false, false](reject_zero_crossings); + fork-tile[8, 0, false, false](reject_zero_crossings); + fork-interchange[1, 2](reject_zero_crossings); + let split = fork-split(reject_zero_crossings); + let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); + fork-coalesce(reject_zero_crossings, reject_zero_crossings_body); + simpl!(reject_zero_crossings, reject_zero_crossings_body); + reject_zero_crossings = reject_zero_crossings_body; +} async-call(edge_detection@le, edge_detection@zc); -fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body); -unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body); +fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); +unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); simpl!(*); diff --git a/juno_samples/rodinia/backprop/Cargo.toml b/juno_samples/rodinia/backprop/Cargo.toml index 25185e09..68c76e92 100644 --- a/juno_samples/rodinia/backprop/Cargo.toml +++ b/juno_samples/rodinia/backprop/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../../juno_build" } @@ -31,4 +32,4 @@ criterion = { version = "0.5", features = ["html_reports"] } [[bench]] name = "backprop_bench" -harness = false \ No newline at end of file +harness = false diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 3c7f7d5f..4796f427 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -37,10 +37,12 @@ inline(backprop@forward_input, backprop@forward_hidden); let forward_input = outline(backprop@forward_input); let forward_hidden = outline(backprop@forward_hidden); -fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop); -let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop); -let forward_input = outline(inner); -inline(backprop@forward_input); +if !feature("seq") { + fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop); + let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop); + forward_input = outline(inner); + inline(backprop@forward_input); +} // The first call to adjust_weights has total loop dimensions of 1 * 17, so not // worth parallelizing (given that the body is trivial) @@ -50,10 +52,12 @@ inline(backprop@adjust_hidden, backprop@adjust_input); let adjust_hidden = outline(backprop@adjust_hidden); let adjust_input = outline(backprop@adjust_input); -fork-tile[16, 0, false, true](adjust_input); -let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input); -let adjust_input = outline(inner); -inline(backprop@adjust_input); +if !feature("seq") { + fork-tile[16, 0, false, true](adjust_input); + let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input); + adjust_input = outline(inner); + inline(backprop@adjust_input); +} delete-uncalled(*); const-inline(*); diff --git a/juno_samples/rodinia/bfs/Cargo.toml b/juno_samples/rodinia/bfs/Cargo.toml index 34b6f5ce..46f8ade6 100644 --- a/juno_samples/rodinia/bfs/Cargo.toml +++ b/juno_samples/rodinia/bfs/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../../juno_build" } diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index e5100561..550dce31 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -40,23 +40,24 @@ simpl!(collect); parallel-fork(traverse, collect); parallel-reduce(traverse, collect); -fork-tile[32, 0, false, true](traverse, collect); -let (outer, inner) = fork-reshape[[1], [0]](traverse); -let traverse_body = outline(inner); -let (outer, inner) = fork-reshape[[1], [0]](collect); -let collect_body = outline(inner); - -let init_body = init; -// Following code seems to generate breaking RT code -//fork-tile[32, 0, false, true](init); -//let (outer, inner) = fork-reshape[[1], [0]](init); -//let init_body = outline(inner); -//inline(bfs@cost_init); - -inline(bfs@loop1, bfs@loop2); +if !feature("seq") { + fork-tile[32, 0, false, true](traverse, collect); + let (outer, inner) = fork-reshape[[1], [0]](traverse); + traverse = outline(inner); + let (outer, inner) = fork-reshape[[1], [0]](collect); + collect = outline(inner); + + // Following code seems to generate breaking RT code + //fork-tile[32, 0, false, true](init); + //let (outer, inner) = fork-reshape[[1], [0]](init); + //init = outline(inner); + //inline(bfs@cost_init); + + inline(bfs@loop1, bfs@loop2); +} delete-uncalled(*); const-inline(*); -unforkify(init_body, traverse_body, collect_body); +unforkify(init, traverse, collect); simpl!(*); gcm(*); diff --git a/juno_samples/rodinia/cfd/Cargo.toml b/juno_samples/rodinia/cfd/Cargo.toml index 6720b527..172573dd 100644 --- a/juno_samples/rodinia/cfd/Cargo.toml +++ b/juno_samples/rodinia/cfd/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../../juno_build" } diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 7a284a9a..13125961 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -27,29 +27,35 @@ fixpoint { simpl!(*); unforkify(compute_flux@inner_loop); -fork-tile[32, 0, false, false](compute_step_factor); -let split = fork-split(compute_step_factor); -let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); -fork-coalesce(compute_step_factor, compute_step_factor_body); -simpl!(compute_step_factor, compute_step_factor_body); +if !feature("seq") { + fork-tile[32, 0, false, false](compute_step_factor); + let split = fork-split(compute_step_factor); + let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); + fork-coalesce(compute_step_factor, compute_step_factor_body); + simpl!(compute_step_factor, compute_step_factor_body); + compute_step_factor = compute_step_factor_body; -fork-tile[32, 0, false, false](compute_flux); -let split = fork-split(compute_flux); -let compute_flux_body = outline(split._6_compute_flux.fj1); -fork-coalesce(compute_flux, compute_flux_body); -simpl!(compute_flux, compute_flux_body); + fork-tile[32, 0, false, false](compute_flux); + let split = fork-split(compute_flux); + let compute_flux_body = outline(split._6_compute_flux.fj1); + fork-coalesce(compute_flux, compute_flux_body); + simpl!(compute_flux, compute_flux_body); + compute_flux = compute_flux_body; -fork-tile[32, 0, false, false](time_step); -let split = fork-split(time_step); -let time_step_body = outline(split._7_time_step.fj1); -fork-coalesce(time_step, time_step_body); -simpl!(time_step, time_step_body); + fork-tile[32, 0, false, false](time_step); + let split = fork-split(time_step); + let time_step_body = outline(split._7_time_step.fj1); + fork-coalesce(time_step, time_step_body); + simpl!(time_step, time_step_body); + time_step = time_step_body; -fork-tile[32, 0, false, false](copy_vars); -let split = fork-split(copy_vars); -let copy_vars_body = outline(split._8_copy_vars.fj1); -fork-coalesce(copy_vars, copy_vars_body); -simpl!(copy_vars, copy_vars_body); + fork-tile[32, 0, false, false](copy_vars); + let split = fork-split(copy_vars); + let copy_vars_body = outline(split._8_copy_vars.fj1); + fork-coalesce(copy_vars, copy_vars_body); + simpl!(copy_vars, copy_vars_body); + copy_vars = copy_vars_body; +} -unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body); +unforkify(compute_step_factor, compute_flux, time_step, copy_vars); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch index 518c656d..858be5ba 100644 --- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch @@ -27,35 +27,42 @@ simpl!(*); no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res); unforkify(compute_flux@inner_loop); -fork-tile[32, 0, false, false](compute_step_factor); -let split = fork-split(compute_step_factor); -let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); -fork-coalesce(compute_step_factor, compute_step_factor_body); -simpl!(compute_step_factor, compute_step_factor_body); +if !feature("seq") { + fork-tile[32, 0, false, false](compute_step_factor); + let split = fork-split(compute_step_factor); + let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); + fork-coalesce(compute_step_factor, compute_step_factor_body); + simpl!(compute_step_factor, compute_step_factor_body); + compute_step_factor = compute_step_factor_body; -fork-tile[32, 0, false, false](compute_flux_contributions); -let split = fork-split(compute_flux_contributions); -let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1); -fork-coalesce(compute_flux_contributions, compute_flux_contributions_body); -simpl!(compute_flux_contributions, compute_flux_contributions_body); + fork-tile[32, 0, false, false](compute_flux_contributions); + let split = fork-split(compute_flux_contributions); + let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1); + fork-coalesce(compute_flux_contributions, compute_flux_contributions_body); + simpl!(compute_flux_contributions, compute_flux_contributions_body); + compute_flux_contributions = compute_flux_contributions_body; -fork-tile[32, 0, false, false](compute_flux); -let split = fork-split(compute_flux); -let compute_flux_body = outline(split._7_compute_flux.fj1); -fork-coalesce(compute_flux, compute_flux_body); -simpl!(compute_flux, compute_flux_body); + fork-tile[32, 0, false, false](compute_flux); + let split = fork-split(compute_flux); + let compute_flux_body = outline(split._7_compute_flux.fj1); + fork-coalesce(compute_flux, compute_flux_body); + simpl!(compute_flux, compute_flux_body); + compute_flux = compute_flux_body; -fork-tile[32, 0, false, false](time_step); -let split = fork-split(time_step); -let time_step_body = outline(split._8_time_step.fj1); -fork-coalesce(time_step, time_step_body); -simpl!(time_step, time_step_body); + fork-tile[32, 0, false, false](time_step); + let split = fork-split(time_step); + let time_step_body = outline(split._8_time_step.fj1); + fork-coalesce(time_step, time_step_body); + simpl!(time_step, time_step_body); + time_step = time_step_body; -fork-tile[32, 0, false, false](copy_vars); -let split = fork-split(copy_vars); -let copy_vars_body = outline(split._9_copy_vars.fj1); -fork-coalesce(copy_vars, copy_vars_body); -simpl!(copy_vars, copy_vars_body); + fork-tile[32, 0, false, false](copy_vars); + let split = fork-split(copy_vars); + let copy_vars_body = outline(split._9_copy_vars.fj1); + fork-coalesce(copy_vars, copy_vars_body); + simpl!(copy_vars, copy_vars_body); + copy_vars = copy_vars_body; +} -unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body); +unforkify(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars); gcm(*); diff --git a/juno_samples/rodinia/srad/Cargo.toml b/juno_samples/rodinia/srad/Cargo.toml index facf8c3b..783f3327 100644 --- a/juno_samples/rodinia/srad/Cargo.toml +++ b/juno_samples/rodinia/srad/Cargo.toml @@ -13,6 +13,7 @@ path = "src/lib.rs" [features] cuda = ["juno_build/cuda", "hercules_rt/cuda"] +seq = [] [build-dependencies] juno_build = { path = "../../../juno_build" } diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 8917f03d..8fa22aaa 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -35,20 +35,24 @@ simpl!(*); slf(*); simpl!(*); -fork-tile[32, 0, false, false](loop2); -let split = fork-split(loop2); -let loop2_body = outline(split.srad_1.fj1); -simpl!(loop2, loop2_body); +if !feature("seq") { + fork-tile[32, 0, false, false](loop2); + let split = fork-split(loop2); + let loop2_body = outline(split.srad_1.fj1); + simpl!(loop2, loop2_body); + loop2 = loop2_body; -fork-tile[32, 0, false, false](loop3); -let split = fork-split(loop3); -let loop3_body = outline(split.srad_2.fj1); -simpl!(loop3, loop3_body); + fork-tile[32, 0, false, false](loop3); + let split = fork-split(loop3); + let loop3_body = outline(split.srad_2.fj1); + simpl!(loop3, loop3_body); + loop3 = loop3_body; -inline(srad@loop2, srad@loop3); -delete-uncalled(*); + inline(srad@loop2, srad@loop3); + delete-uncalled(*); +} -fork-split(extract, compress, loop1, loop2_body, loop3_body); -unforkify(extract, compress, loop1, loop2_body, loop3_body); +fork-split(extract, compress, loop1, loop2, loop3); +unforkify(extract, compress, loop1, loop2, loop3); gcm(*); -- GitLab