From e37ad2b482dfe3fd23723b5b0f8015754e447bea Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Sat, 1 Mar 2025 10:42:33 -0600 Subject: [PATCH 01/14] some tuning --- juno_samples/cava/src/cava.jn | 4 ++-- juno_samples/cava/src/cpu.sch | 2 +- juno_samples/cava/src/gpu.sch | 6 +++--- juno_samples/edge_detection/src/cpu.sch | 10 +++++----- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn index 4d02b2cd..931e78f8 100644 --- a/juno_samples/cava/src/cava.jn +++ b/juno_samples/cava/src/cava.jn @@ -145,7 +145,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( @image_loop for r = 0 to row { for c = 0 to col { @l2 let l2_dist : f32[num_ctrl_pts]; - for cp = 0 to num_ctrl_pts { + @cp_loop for cp = 0 to num_ctrl_pts { let v1 = input[0, r, c] - ctrl_pts[cp, 0]; let v2 = input[1, r, c] - ctrl_pts[cp, 1]; let v3 = input[2, r, c] - ctrl_pts[cp, 2]; @@ -155,7 +155,7 @@ fn gamut<row : usize, col : usize, num_ctrl_pts : usize>( @channel_loop for chan = 0 to CHAN { let chan_val : f32 = 0.0; - for cp = 0 to num_ctrl_pts { + @cp_loop for cp = 0 to num_ctrl_pts { chan_val += l2_dist[cp] * weights[cp, chan]; } diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch index 8f22b37d..6fc8adbb 100644 --- a/juno_samples/cava/src/cpu.sch +++ b/juno_samples/cava/src/cpu.sch @@ -115,7 +115,7 @@ array-slf(fuse4); simpl!(fuse4); let par = fuse4@image_loop \ fuse4@channel_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); let fuse4_body = outline(split.cava_3.fj2); diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index bacfd3ab..aa1df390 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -117,9 +117,9 @@ fixpoint { simpl!(fuse4); array-slf(fuse4); simpl!(fuse4); -//fork-tile[2, 0, false, true](fuse4@channel_loop); -//fork-split(fuse4@channel_loop); -//clean-monoid-reduces(fuse4); +fork-tile[2, 0, false, true](fuse4@channel_loop); +let out = fork-split(fuse4@channel_loop); +fork-unroll(out.cava_3.fj1); unforkify(fuse4@channel_loop); no-memset(fuse5@res1); diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index ec9e423d..b46523f3 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -26,7 +26,7 @@ predication(gaussian_smoothing); simpl!(gaussian_smoothing); let par = gaussian_smoothing@image_loop \ gaussian_smoothing@filter_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); @@ -42,7 +42,7 @@ fixpoint { simpl!(laplacian_estimate); let par = laplacian_estimate@image_loop \ laplacian_estimate@filter_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2; @@ -59,7 +59,7 @@ fixpoint { simpl!(zero_crossings); let par = zero_crossings@image_loop \ zero_crossings@filter_loop; fork-tile[4, 1, false, false](par); -fork-tile[4, 0, false, false](par); +fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2; @@ -86,7 +86,7 @@ fixpoint { simpl!(max_gradient); fork-dim-merge(max_gradient); simpl!(max_gradient); -fork-tile[16, 0, false, false](max_gradient); +fork-tile[32, 0, false, false](max_gradient); let split = fork-split(max_gradient); clean-monoid-reduces(max_gradient); let out = outline(split._4_max_gradient.fj1); @@ -105,7 +105,7 @@ fixpoint { predication(reject_zero_crossings); simpl!(reject_zero_crossings); fork-tile[4, 1, false, false](reject_zero_crossings); -fork-tile[4, 0, false, false](reject_zero_crossings); +fork-tile[8, 0, false, false](reject_zero_crossings); fork-interchange[1, 2](reject_zero_crossings); let split = fork-split(reject_zero_crossings); let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2); -- GitLab From d0fd55bba80835ecdd77ccb224a550f4b4b4a8fe Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sat, 1 Mar 2025 10:49:16 -0600 Subject: [PATCH 02/14] optimize ptx --- juno_scheduler/src/pm.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 456df2ed..62bdaf73 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -1090,7 +1090,9 @@ impl PassManager { let mut nvcc_process = Command::new("nvcc") .arg("-c") + .arg("-Xptxas") .arg("-O3") + .arg("-use_fast_math") .arg("-diag-suppress") .arg("177") .arg("-o") -- GitLab From 59410bb3d8e491845c9d19533cbccc7326f5329d Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sat, 1 Mar 2025 11:33:40 -0600 Subject: [PATCH 03/14] whoops --- juno_samples/cava/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/juno_samples/cava/src/lib.rs b/juno_samples/cava/src/lib.rs index 1810a246..47e3b1b3 100644 --- a/juno_samples/cava/src/lib.rs +++ b/juno_samples/cava/src/lib.rs @@ -124,9 +124,9 @@ pub struct CavaInputs { #[clap(long = "output-verify", value_name = "PATH")] pub output_verify: Option<String>, pub cam_model: String, - #[clap(short, long)] + #[clap(long)] pub crop_rows: Option<usize>, - #[clap(short, long)] + #[clap(long)] pub crop_cols: Option<usize>, } -- GitLab From a8af6c9bcec6102e327f0d76fc554892458a9aec Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sat, 1 Mar 2025 16:00:59 -0600 Subject: [PATCH 04/14] Fix gpu backend, tile on gpu is good actually :shocked: --- hercules_cg/src/gpu.rs | 6 ++++-- juno_samples/cava/src/gpu.sch | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index c9720273..07dd3ebf 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups; // because Fork basic block's init section already does gating write!( w, - "{}{} = (threadIdx.x % {}) / {};\n", + "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n", tabs, define_variable, use_thread_quota.unwrap(), - use_thread_quota.unwrap() / parallel_factor.unwrap() + use_thread_quota.unwrap() / parallel_factor.unwrap(), + divide, + modulo, )?; } } diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index aa1df390..92a1835d 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -121,6 +121,12 @@ fork-tile[2, 0, false, true](fuse4@channel_loop); let out = fork-split(fuse4@channel_loop); fork-unroll(out.cava_3.fj1); unforkify(fuse4@channel_loop); +fork-tile[4, 1, false, true](fuse4); +fork-tile[8, 0, false, true](fuse4); +fork-interchange[1, 2](fuse4); +let split = fork-split(fuse4); +fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2); +fork-coalesce(split.cava_3.fj2); no-memset(fuse5@res1); no-memset(fuse5@res2); -- GitLab From 776a69d6007c1e68b0f7aeb884a60c09548cd120 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sat, 1 Mar 2025 16:12:51 -0600 Subject: [PATCH 05/14] reorg --- juno_samples/cava/src/gpu.sch | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index 92a1835d..0ef466c0 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -121,12 +121,6 @@ fork-tile[2, 0, false, true](fuse4@channel_loop); let out = fork-split(fuse4@channel_loop); fork-unroll(out.cava_3.fj1); unforkify(fuse4@channel_loop); -fork-tile[4, 1, false, true](fuse4); -fork-tile[8, 0, false, true](fuse4); -fork-interchange[1, 2](fuse4); -let split = fork-split(fuse4); -fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2); -fork-coalesce(split.cava_3.fj2); no-memset(fuse5@res1); no-memset(fuse5@res2); @@ -139,6 +133,13 @@ simpl!(fuse5); array-slf(fuse5); simpl!(fuse5); +fork-tile[4, 1, false, true](fuse4); +fork-tile[8, 0, false, true](fuse4); +fork-interchange[1, 2](fuse4); +let split = fork-split(fuse4); +fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2); +fork-coalesce(split.cava_3.fj2); + delete-uncalled(*); simpl!(*); -- GitLab From f10b9cc746326fb781e0c0e08582796b640c17ed Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sat, 1 Mar 2025 16:33:50 -0600 Subject: [PATCH 06/14] simplify edge code --- .../edge_detection/src/edge_detection.jn | 80 ++++++------------- juno_samples/edge_detection/src/gpu.sch | 4 +- 2 files changed, 28 insertions(+), 56 deletions(-) diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn index 58f364dc..ebd58206 100644 --- a/juno_samples/edge_detection/src/edge_detection.jn +++ b/juno_samples/edge_detection/src/edge_detection.jn @@ -43,35 +43,16 @@ fn laplacian_estimate<n, m, sz: usize>( @image_loop for row = 0 to n { for col = 0 to m { - // Copy data for dilation filter - @shr1 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MIN_BR - else input[row + i - r, col + j - r]; - } - } - // Compute pixel of dilated image let dilated = MIN_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - dilated = max!(dilated, imageArea[i, j] * structure[i, j]); - } - } - - // Data copy for erotion filter - @shr2 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MAX_BR - else input[row + i - r, col + j - r]; + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MIN_BR + else input[row + i - r, col + j - r]; + dilated = max!(dilated, filter * structure[i, j]); } } @@ -79,7 +60,12 @@ fn laplacian_estimate<n, m, sz: usize>( let eroded = MAX_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - eroded = min!(eroded, imageArea[i, j] * structure[i, j]); + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MAX_BR + else input[row + i - r, col + j - r]; + eroded = min!(eroded, filter * structure[i, j]); } } @@ -101,37 +87,17 @@ fn zero_crossings<n, m, sz: usize>( @image_loop for row = 0 to n { for col = 0 to m { - // Data copy for dilation filter - @shr1 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MIN_BR - else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR - else MIN_BR; - } - } - // Compute the pixel of dilated image let dilated = MIN_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - dilated = max!(dilated, imageArea[i, j] * structure[i, j]); - } - } - - // Data copy for erotion filter - @shr2 let imageArea : f32[sz, sz]; - @filter_loop for i = 0 to sz { - for j = 0 to sz { - imageArea[i, j] = if row + i < r - || row + i - r > n - 1 - || col + j < r - || col + j - r > m - 1 then MAX_BR - else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR - else MIN_BR; + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MIN_BR + else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR + else MIN_BR; + dilated = max!(dilated, filter * structure[i, j]); } } @@ -139,7 +105,13 @@ fn zero_crossings<n, m, sz: usize>( let eroded = MAX_BR; @filter_loop for i = 0 to sz { for j = 0 to sz { - eroded = min!(eroded, imageArea[i, j] * structure[i, j]); + let filter = if row + i < r + || row + i - r > n - 1 + || col + j < r + || col + j - r > m - 1 then MAX_BR + else if input[row + i - r, col + j - r] > MIN_BR then MAX_BR + else MIN_BR; + eroded = min!(eroded, filter * structure[i, j]); } } diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index 7ee2904f..065a78f2 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -27,7 +27,7 @@ simpl!(gaussian_smoothing); predication(gaussian_smoothing); simpl!(gaussian_smoothing); -no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); +no-memset(laplacian_estimate@res); fixpoint { forkify(laplacian_estimate); fork-guard-elim(laplacian_estimate); @@ -35,7 +35,7 @@ fixpoint { } simpl!(laplacian_estimate); -no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); +no-memset(zero_crossings@res); fixpoint { forkify(zero_crossings); fork-guard-elim(zero_crossings); -- GitLab From 0c7afae76803f306266fc489e0cb8d9c6edf75fb Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sat, 1 Mar 2025 17:04:11 -0600 Subject: [PATCH 07/14] tile part of edge --- hercules_opt/src/fork_transforms.rs | 7 +++++++ juno_samples/edge_detection/src/gpu.sch | 27 +++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index e6db0345..8bd3f735 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -1169,6 +1169,7 @@ pub fn fork_dim_merge( op: BinaryOperator::Rem, }); edit.sub_edit(tid, rem); + edit.sub_edit(tid, outer_tid); edit = edit.replace_all_uses(tid, rem)?; } else if tid_dim == inner_idx { let outer_tid = Node::ThreadID { @@ -1185,6 +1186,7 @@ pub fn fork_dim_merge( op: BinaryOperator::Div, }); edit.sub_edit(tid, div); + edit.sub_edit(tid, outer_tid); edit = edit.replace_all_uses(tid, div)?; } } @@ -1479,7 +1481,12 @@ fn fork_fusion( } // Perform the fusion. + let bottom_tids: Vec<_> = editor + .get_users(bottom_fork) + .filter(|id| nodes[id.idx()].is_thread_id()) + .collect(); editor.edit(|mut edit| { + edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?; if bottom_join_pred != bottom_fork { // If there is control flow in the bottom fork-join, stitch it into // the top fork-join. diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index 065a78f2..f8da90d0 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -26,6 +26,15 @@ predication(gaussian_smoothing); simpl!(gaussian_smoothing); predication(gaussian_smoothing); simpl!(gaussian_smoothing); +fork-dim-merge(gaussian_smoothing@filter_loop); +unforkify(gaussian_smoothing@filter_loop); +simpl!(gaussian_smoothing); + +fork-dim-merge(gaussian_smoothing); +fork-tile[32, 0, false, true](gaussian_smoothing); +simpl!(gaussian_smoothing); +fork-split(gaussian_smoothing); +simpl!(gaussian_smoothing); no-memset(laplacian_estimate@res); fixpoint { @@ -34,6 +43,15 @@ fixpoint { fork-coalesce(laplacian_estimate); } simpl!(laplacian_estimate); +fork-dim-merge(laplacian_estimate@filter_loop); +unforkify(laplacian_estimate@filter_loop); +simpl!(laplacian_estimate); + +fork-dim-merge(laplacian_estimate); +fork-tile[32, 0, false, true](laplacian_estimate); +simpl!(laplacian_estimate); +fork-split(laplacian_estimate); +simpl!(laplacian_estimate); no-memset(zero_crossings@res); fixpoint { @@ -42,6 +60,15 @@ fixpoint { fork-coalesce(zero_crossings); } simpl!(zero_crossings); +fork-dim-merge(zero_crossings@filter_loop); +unforkify(zero_crossings@filter_loop); +simpl!(zero_crossings); + +fork-dim-merge(zero_crossings); +fork-tile[32, 0, false, true](zero_crossings); +simpl!(zero_crossings); +fork-split(zero_crossings); +simpl!(zero_crossings); no-memset(gradient@res); fixpoint { -- GitLab From 79f6f3ae99a7e4c3baddc5618ece21830ee9bd2c Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sat, 1 Mar 2025 17:08:23 -0600 Subject: [PATCH 08/14] fix cpu edge --- juno_samples/edge_detection/src/cpu.sch | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index b46523f3..3e1321c5 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -33,7 +33,7 @@ let gaussian_smoothing_body = outline(split._0_gaussian_smoothing.fj2); fork-coalesce(gaussian_smoothing, gaussian_smoothing_body); simpl!(gaussian_smoothing, gaussian_smoothing_body); -no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); +no-memset(laplacian_estimate@res); fixpoint { forkify(laplacian_estimate); fork-guard-elim(laplacian_estimate); @@ -45,12 +45,12 @@ fork-tile[4, 1, false, false](par); fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); -let body = split._1_laplacian_estimate.fj2 | laplacian_estimate.shr1 | laplacian_estimate.shr2; +let body = split._1_laplacian_estimate.fj2; let laplacian_estimate_body = outline(body); fork-coalesce(laplacian_estimate, laplacian_estimate_body); simpl!(laplacian_estimate, laplacian_estimate_body); -no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); +no-memset(zero_crossings@res); fixpoint { forkify(zero_crossings); fork-guard-elim(zero_crossings); @@ -62,7 +62,7 @@ fork-tile[4, 1, false, false](par); fork-tile[8, 0, false, false](par); fork-interchange[1, 2](par); let split = fork-split(par); -let body = split._2_zero_crossings.fj2 | zero_crossings.shr1 | zero_crossings.shr2; +let body = split._2_zero_crossings.fj2; let zero_crossings_body = outline(body); fork-coalesce(zero_crossings, zero_crossings_body); simpl!(zero_crossings, zero_crossings_body); -- GitLab From 30964185a4bd1567d3e57595c4b88f17ad7d1a67 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sat, 1 Mar 2025 17:20:58 -0600 Subject: [PATCH 09/14] update gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 516108dd..6298d7bb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -/target +**/target *.dot !paper_resources/*.dot *.bc @@ -14,3 +14,4 @@ .vscode *_env *.txt +*ncu-rep \ No newline at end of file -- GitLab From f3a018e80c5e4cf1a0a4d56720c3bf4f33e049b2 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sat, 1 Mar 2025 22:34:42 -0600 Subject: [PATCH 10/14] . --- hercules_opt/src/interprocedural_sroa.rs | 1 - juno_samples/rodinia/cfd/src/lib.rs | 31 +++++++++++++++++------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/hercules_opt/src/interprocedural_sroa.rs b/hercules_opt/src/interprocedural_sroa.rs index c7ea5836..a01b0f55 100644 --- a/hercules_opt/src/interprocedural_sroa.rs +++ b/hercules_opt/src/interprocedural_sroa.rs @@ -85,7 +85,6 @@ pub fn interprocedural_sroa( param_nodes[idx].push(id); } } - println!("{}", editor.func().name); let success = editor.edit(|mut edit| { for (idx, ids) in param_nodes.into_iter().enumerate() { let new_indices = &old_param_type_map[idx]; diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs index 62ee59f4..f9a5dd76 100644 --- a/juno_samples/rodinia/cfd/src/lib.rs +++ b/juno_samples/rodinia/cfd/src/lib.rs @@ -48,8 +48,7 @@ fn run_euler( let normals_z = HerculesImmBox::from(normals.z.as_slice()); let mut runner = runner!(euler); - let (density, momentum_x, momentum_y, momentum_z, energy) = - async_std::task::block_on(async { + let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async { runner .run( nelr as u64, @@ -123,8 +122,7 @@ fn run_pre_euler( let normals_z = HerculesImmBox::from(normals.z.as_slice()); let mut runner = runner!(pre_euler); - let (density, momentum_x, momentum_y, momentum_z, energy) = - async_std::task::block_on(async { + let (density, momentum_x, momentum_y, momentum_z, energy) = async_std::task::block_on(async { runner .run( nelr as u64, @@ -189,15 +187,30 @@ fn compare_floats(xs: &Variables, ys: &Variables) -> bool { let ys_energy = ys.energy.as_slice(); xs_density.len() == ys_density.len() - && xs_density.iter().zip(ys_density.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_density + .iter() + .zip(ys_density.iter()) + .all(|(x, y)| compare_float(*x, *y)) && xs_momentum_x.len() == ys_momentum_x.len() - && xs_momentum_x.iter().zip(ys_momentum_x.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_momentum_x + .iter() + .zip(ys_momentum_x.iter()) + .all(|(x, y)| compare_float(*x, *y)) && xs_momentum_y.len() == ys_momentum_y.len() - && xs_momentum_y.iter().zip(ys_momentum_y.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_momentum_y + .iter() + .zip(ys_momentum_y.iter()) + .all(|(x, y)| compare_float(*x, *y)) && xs_momentum_z.len() == ys_momentum_z.len() - && xs_momentum_z.iter().zip(ys_momentum_z.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_momentum_z + .iter() + .zip(ys_momentum_z.iter()) + .all(|(x, y)| compare_float(*x, *y)) && xs_energy.len() == ys_energy.len() - && xs_energy.iter().zip(ys_energy.iter()).all(|(x, y)| compare_float(*x, *y)) + && xs_energy + .iter() + .zip(ys_energy.iter()) + .all(|(x, y)| compare_float(*x, *y)) } pub fn cfd_harness(args: CFDInputs) { -- GitLab From a3483234d1a5b7607cea444d8be67180bbc36816 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 09:59:23 -0600 Subject: [PATCH 11/14] Multi-core cfd --- juno_samples/rodinia/cfd/src/cpu_euler.sch | 28 ++++++++++++++-- .../rodinia/cfd/src/cpu_pre_euler.sch | 33 ++++++++++++++++++- juno_samples/rodinia/cfd/src/lib.rs | 1 + juno_samples/rodinia/cfd/src/pre_euler.jn | 18 +++++----- 4 files changed, 68 insertions(+), 12 deletions(-) diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 4cf320a6..4e7ee3cf 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -25,7 +25,31 @@ fixpoint { } simpl!(*); no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); -parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop); +unforkify(compute_flux@inner_loop); -unforkify(*); +fork-tile[32, 0, false, false](compute_step_factor); +let split = fork-split(compute_step_factor); +let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); +fork-coalesce(compute_step_factor, compute_step_factor_body); +simpl!(compute_step_factor, compute_step_factor_body); + +fork-tile[32, 0, false, false](compute_flux); +let split = fork-split(compute_flux); +let compute_flux_body = outline(split._6_compute_flux.fj1); +fork-coalesce(compute_flux, compute_flux_body); +simpl!(compute_flux, compute_flux_body); + +fork-tile[32, 0, false, false](time_step); +let split = fork-split(time_step); +let time_step_body = outline(split._7_time_step.fj1); +fork-coalesce(time_step, time_step_body); +simpl!(time_step, time_step_body); + +fork-tile[32, 0, false, false](copy_vars); +let split = fork-split(copy_vars); +let copy_vars_body = outline(split._8_copy_vars.fj1); +fork-coalesce(copy_vars, copy_vars_body); +simpl!(copy_vars, copy_vars_body); + +unforkify(compute_step_factor_body, compute_flux_body, time_step_body, copy_vars_body); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch index 14eb6906..518c656d 100644 --- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch @@ -24,7 +24,38 @@ fixpoint { fork-guard-elim(*); } simpl!(*); +no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res); +unforkify(compute_flux@inner_loop); -unforkify(*); +fork-tile[32, 0, false, false](compute_step_factor); +let split = fork-split(compute_step_factor); +let compute_step_factor_body = outline(split._4_compute_step_factor.fj1); +fork-coalesce(compute_step_factor, compute_step_factor_body); +simpl!(compute_step_factor, compute_step_factor_body); +fork-tile[32, 0, false, false](compute_flux_contributions); +let split = fork-split(compute_flux_contributions); +let compute_flux_contributions_body = outline(split._6_compute_flux_contributions.fj1); +fork-coalesce(compute_flux_contributions, compute_flux_contributions_body); +simpl!(compute_flux_contributions, compute_flux_contributions_body); + +fork-tile[32, 0, false, false](compute_flux); +let split = fork-split(compute_flux); +let compute_flux_body = outline(split._7_compute_flux.fj1); +fork-coalesce(compute_flux, compute_flux_body); +simpl!(compute_flux, compute_flux_body); + +fork-tile[32, 0, false, false](time_step); +let split = fork-split(time_step); +let time_step_body = outline(split._8_time_step.fj1); +fork-coalesce(time_step, time_step_body); +simpl!(time_step, time_step_body); + +fork-tile[32, 0, false, false](copy_vars); +let split = fork-split(copy_vars); +let copy_vars_body = outline(split._9_copy_vars.fj1); +fork-coalesce(copy_vars, copy_vars_body); +simpl!(copy_vars, copy_vars_body); + +unforkify(compute_step_factor_body, compute_flux_contributions_body, compute_flux_body, time_step_body, copy_vars_body); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/lib.rs b/juno_samples/rodinia/cfd/src/lib.rs index f9a5dd76..d61df4c5 100644 --- a/juno_samples/rodinia/cfd/src/lib.rs +++ b/juno_samples/rodinia/cfd/src/lib.rs @@ -237,6 +237,7 @@ pub fn cfd_harness(args: CFDInputs) { } = read_domain_geometry(data_file, block_size); let variables = initialize_variables(nelr, &ff_variable); + println!("Running CFD with nelr = {}.", nelr); let res_juno = if pre_euler { run_pre_euler( diff --git a/juno_samples/rodinia/cfd/src/pre_euler.jn b/juno_samples/rodinia/cfd/src/pre_euler.jn index c200f2db..979c2e9a 100644 --- a/juno_samples/rodinia/cfd/src/pre_euler.jn +++ b/juno_samples/rodinia/cfd/src/pre_euler.jn @@ -58,7 +58,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 { } fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] { - let step_factors : f32[nelr]; + @res let step_factors : f32[nelr]; for i in 0..nelr { let density = variables.density[i]; @@ -109,10 +109,10 @@ fn compute_flux_contribution( fn compute_flux_contributions<nelr: usize>( variables: Variables::<nelr>, ) -> (Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>, Momentum::<nelr>) { - let fc_momentum_x: Momentum::<nelr>; - let fc_momentum_y: Momentum::<nelr>; - let fc_momentum_z: Momentum::<nelr>; - let fc_density_energy: Momentum::<nelr>; + @res let fc_momentum_x: Momentum::<nelr>; + @res let fc_momentum_y: Momentum::<nelr>; + @res let fc_momentum_z: Momentum::<nelr>; + @res let fc_density_energy: Momentum::<nelr>; for i in 0..nelr { let density_i = variables.density[i]; @@ -167,9 +167,9 @@ fn compute_flux<nelr: usize>( ff_fc_momentum_z: float3, ) -> Variables::<nelr> { const smoothing_coefficient : f32 = 0.2; - let fluxes: Variables::<nelr>; + @res let fluxes: Variables::<nelr>; - for i in 0..nelr { + @outer_loop for i in 0..nelr { let density_i = variables.density[i]; let momentum_i = float3 { x: variables.momentum.x[i], @@ -201,7 +201,7 @@ fn compute_flux<nelr: usize>( let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 }; let flux_i_density_energy : f32 = 0.0; - for j in 0..NNB { + @inner_loop for j in 0..NNB { let nb = elements_surrounding_elements[j, i]; let normal = float3 { x: normals.x[j, i], @@ -328,7 +328,7 @@ fn time_step<nelr: usize>( } fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> { - let result : Variables::<nelr>; + @res let result : Variables::<nelr>; for i in 0..nelr { result.density[i] = variables.density[i]; -- GitLab From badf7e6a5cf86ebf02f3e1462a59a121ec78d87f Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 10:21:11 -0600 Subject: [PATCH 12/14] Tile srad gpu --- juno_samples/rodinia/srad/src/gpu.sch | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch index 289548f9..f736c0b7 100644 --- a/juno_samples/rodinia/srad/src/gpu.sch +++ b/juno_samples/rodinia/srad/src/gpu.sch @@ -54,4 +54,10 @@ ip-sroa(*); sroa(*); simpl!(*); +fork-dim-merge(main_loops); +fork-tile[32, 0, false, true](main_loops); +dce(main_loops); +fork-split(main_loops); +simpl!(main_loops); + gcm(*); -- GitLab From d5f4aacedf2ee39ce4fe2e254cad5197df092877 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 10:45:57 -0600 Subject: [PATCH 13/14] CFD opt --- hercules_opt/src/sroa.rs | 11 ++++++++--- juno_samples/rodinia/cfd/src/cpu_euler.sch | 2 +- juno_samples/rodinia/cfd/src/gpu_euler.sch | 18 ++++++++++++++---- .../rodinia/cfd/src/gpu_pre_euler.sch | 19 +++++++++++++++++-- 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/hercules_opt/src/sroa.rs b/hercules_opt/src/sroa.rs index e658ff88..2718f99d 100644 --- a/hercules_opt/src/sroa.rs +++ b/hercules_opt/src/sroa.rs @@ -447,7 +447,7 @@ pub fn sroa( field_map.insert(node, generate_reads(editor, types[&node], node)); } Node::Constant { id } => { - field_map.insert(node, generate_constant_fields(editor, id)); + field_map.insert(node, generate_constant_fields(editor, id, node)); to_delete.push(node); } _ => { @@ -1079,7 +1079,11 @@ pub fn generate_constant(editor: &mut FunctionEditor, typ: TypeID) -> ConstantID // Given a constant cnst adds node to the function which are the constant values of each field and // returns a list of pairs of indices and the node that holds that index -fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> IndexTree<NodeID> { +fn generate_constant_fields( + editor: &mut FunctionEditor, + cnst: ConstantID, + old_node: NodeID, +) -> IndexTree<NodeID> { let cs: Option<Vec<ConstantID>> = if let Some(cs) = editor.get_constant(cnst).try_product_fields() { Some(cs.into()) @@ -1090,13 +1094,14 @@ fn generate_constant_fields(editor: &mut FunctionEditor, cnst: ConstantID) -> In if let Some(cs) = cs { let mut fields = vec![]; for c in cs { - fields.push(generate_constant_fields(editor, c)); + fields.push(generate_constant_fields(editor, c, old_node)); } IndexTree::Node(fields) } else { let mut node = None; editor.edit(|mut edit| { node = Some(edit.add_node(Node::Constant { id: cnst })); + edit.sub_edit(old_node, node.unwrap()); Ok(edit) }); IndexTree::Leaf(node.expect("Add node cannot fail")) diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 4e7ee3cf..7a284a9a 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -12,6 +12,7 @@ macro simpl!(X) { simpl!(*); inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); +no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); delete-uncalled(*); simpl!(*); ip-sroa[true](*); @@ -24,7 +25,6 @@ fixpoint { fork-guard-elim(*); } simpl!(*); -no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); unforkify(compute_flux@inner_loop); fork-tile[32, 0, false, false](compute_step_factor); diff --git a/juno_samples/rodinia/cfd/src/gpu_euler.sch b/juno_samples/rodinia/cfd/src/gpu_euler.sch index aed6115e..3700f79d 100644 --- a/juno_samples/rodinia/cfd/src/gpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/gpu_euler.sch @@ -12,6 +12,7 @@ macro simpl!(X) { simpl!(*); inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); +no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); delete-uncalled(*); gpu(copy_vars, compute_step_factor, compute_flux, time_step); @@ -26,9 +27,18 @@ fixpoint { fork-guard-elim(*); } simpl!(*); -no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); -parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop); +unforkify(compute_flux@inner_loop); + +fork-tile[32, 0, false, true](compute_step_factor); +fork-split(compute_step_factor); + +fork-tile[32, 0, false, true](compute_flux); +fork-split(compute_flux); + +fork-tile[32, 0, false, true](time_step); +fork-split(time_step); + +fork-tile[32, 0, false, true](copy_vars); +fork-split(copy_vars); -unforkify(*); -float-collections(*); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch index d91f1b00..d6db675b 100644 --- a/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/gpu_pre_euler.sch @@ -12,6 +12,7 @@ macro simpl!(X) { simpl!(*); inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step); +no-memset(compute_step_factor@res, compute_flux_contributions@res, compute_flux@res, copy_vars@res); delete-uncalled(*); gpu(copy_vars, compute_step_factor, compute_flux_contributions, compute_flux, time_step); @@ -26,7 +27,21 @@ fixpoint { fork-guard-elim(*); } simpl!(*); +unforkify(compute_flux@inner_loop); + +fork-tile[32, 0, false, true](compute_step_factor); +fork-split(compute_step_factor); + +fork-tile[32, 0, false, true](compute_flux_contributions); +fork-split(compute_flux_contributions); + +fork-tile[32, 0, false, true](compute_flux); +fork-split(compute_flux); + +fork-tile[32, 0, false, true](time_step); +fork-split(time_step); + +fork-tile[32, 0, false, true](copy_vars); +fork-split(copy_vars); -unforkify(*); -float-collections(*); gcm(*); -- GitLab From 78a7c974850eac226b541126265aaebfdebbfe02 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 11:18:17 -0600 Subject: [PATCH 14/14] optimize edge --- juno_samples/edge_detection/src/edge_detection.jn | 2 +- juno_samples/edge_detection/src/gpu.sch | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn index ebd58206..3e49cb36 100644 --- a/juno_samples/edge_detection/src/edge_detection.jn +++ b/juno_samples/edge_detection/src/edge_detection.jn @@ -138,7 +138,7 @@ fn gradient<n, m, sb: usize>( let gx = 0; let gy = 0; - for i = 0 to sb { + @filter_loop for i = 0 to sb { for j = 0 to sb { let val = input[if row + i < sbr then 0 else if row + i - sbr > n - 1 then n - 1 diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index f8da90d0..666f6cef 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -80,6 +80,15 @@ predication(gradient); simpl!(gradient); predication(gradient); simpl!(gradient); +fork-dim-merge(gradient@filter_loop); +unforkify(gradient@filter_loop); +simpl!(gradient); + +fork-dim-merge(gradient); +fork-tile[32, 0, false, true](gradient); +simpl!(gradient); +fork-split(gradient); +simpl!(gradient); fixpoint { forkify(max_gradient); @@ -115,6 +124,12 @@ fixpoint { predication(reject_zero_crossings); simpl!(reject_zero_crossings); +fork-dim-merge(reject_zero_crossings); +fork-tile[32, 0, false, true](reject_zero_crossings); +simpl!(reject_zero_crossings); +fork-split(reject_zero_crossings); +simpl!(reject_zero_crossings); + async-call(edge_detection@le, edge_detection@zc); simpl!(*); -- GitLab