From 72a61f40f290d9b5ced10b2c2caa406998b7c861 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 13:15:40 -0600 Subject: [PATCH 1/9] Optimize srad --- juno_samples/rodinia/srad/src/gpu.sch | 35 ++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch index f736c0b7..f89b7ab8 100644 --- a/juno_samples/rodinia/srad/src/gpu.sch +++ b/juno_samples/rodinia/srad/src/gpu.sch @@ -41,15 +41,26 @@ fork-tile[32, 0, false, true](sum_loop); let out = fork-split(sum_loop); clean-monoid-reduces(sum_loop); simpl!(sum_loop); -let fission = fork-fission[out.srad_0.fj0](sum_loop); + +let fission1 = fork-fission[out.srad_0.fj0](sum_loop); +simpl!(sum_loop); +fork-tile[32, 0, false, true](fission1.srad_0.fj_bottom); +let out = fork-split(fission1.srad_0.fj_bottom); +clean-monoid-reduces(sum_loop); +simpl!(sum_loop); + +let fission2 = fork-fission[out.srad_0.fj0](sum_loop); simpl!(sum_loop); -fork-tile[32, 0, false, true](fission.srad_0.fj_bottom); -let out = fork-split(fission.srad_0.fj_bottom); +fork-tile[32, 0, false, true](fission2.srad_0.fj_bottom); +let out = fork-split(fission2.srad_0.fj_bottom); clean-monoid-reduces(sum_loop); simpl!(sum_loop); -let top = outline(fission.srad_0.fj_top); -let bottom = outline(out.srad_0.fj0); -gpu(top, bottom); + +let first = outline(fission1.srad_0.fj_top); +let second = outline(fission2.srad_0.fj_top); +let third = outline(out.srad_0.fj0); +gpu(first, second, third); +const-inline[false](*); ip-sroa(*); sroa(*); simpl!(*); @@ -60,4 +71,16 @@ dce(main_loops); fork-split(main_loops); simpl!(main_loops); +fork-dim-merge(extract); +fork-tile[32, 0, false, true](extract); +dce(extract); +fork-split(extract); +simpl!(extract); + +fork-dim-merge(compress); +fork-tile[32, 0, false, true](compress); +dce(compress); +fork-split(compress); +simpl!(compress); + gcm(*); -- GitLab From 82c77fe26f79846416cbf29553b9893ac176e936 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 13:50:00 -0600 Subject: [PATCH 2/9] . --- juno_scheduler/src/pm.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 62bdaf73..5c1dd477 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -1339,7 +1339,10 @@ fn interp_expr( } } Value::Record { fields } => match fields.get(field) { - None => Err(SchedulerError::UndefinedField(field.clone())), + None => Err(SchedulerError::UndefinedField(format!( + "{} not in {:?}", + field, fields + ))), Some(v) => Ok((v.clone(), changed)), }, } -- GitLab From c3e539f9c0743647f2bff0d66c35e1b56d3cb563 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 14:23:45 -0600 Subject: [PATCH 3/9] . --- hercules_ir/src/einsum.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hercules_ir/src/einsum.rs b/hercules_ir/src/einsum.rs index 6c2ca31b..3922506b 100644 --- a/hercules_ir/src/einsum.rs +++ b/hercules_ir/src/einsum.rs @@ -435,10 +435,9 @@ pub fn debug_print_math_expr(id: MathID, env: &MathEnv) { } MathExpr::IntrinsicFunc(intrinsic, ref args) => { print!("{}(", intrinsic.lower_case_name()); - debug_print_math_expr(id, env); for arg in args { - print!(", "); debug_print_math_expr(*arg, env); + print!(", "); } print!(")"); } -- GitLab From 620f17d3dede3e401f4b3d3f0156457725663e30 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 14:34:22 -0600 Subject: [PATCH 4/9] fork join test for array slf --- juno_samples/fork_join_tests/src/cpu.sch | 10 ++++++++-- .../fork_join_tests/src/fork_join_tests.jn | 13 +++++++++++++ juno_samples/fork_join_tests/src/gpu.sch | 4 +++- juno_samples/fork_join_tests/src/main.rs | 14 ++++++++++++++ 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/juno_samples/fork_join_tests/src/cpu.sch b/juno_samples/fork_join_tests/src/cpu.sch index f46c91d6..5f3ff94e 100644 --- a/juno_samples/fork_join_tests/src/cpu.sch +++ b/juno_samples/fork_join_tests/src/cpu.sch @@ -3,7 +3,7 @@ gvn(*); phi-elim(*); dce(*); -let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9); +let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10); cpu(auto.test1); cpu(auto.test2); cpu(auto.test3); @@ -12,6 +12,7 @@ cpu(auto.test5); cpu(auto.test7); cpu(auto.test8); cpu(auto.test9); +cpu(auto.test10); let test1_cpu = auto.test1; rename["test1_cpu"](test1_cpu); @@ -94,6 +95,11 @@ dce(auto.test8); simplify-cfg(auto.test8); dce(auto.test8); -no-memset(test9@const); +array-slf(auto.test10); +ccp(auto.test10); +dce(auto.test10); +simplify-cfg(auto.test10); +dce(auto.test10); +unforkify(auto.test10); gcm(*); diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn index 334fc2bf..2eab56b9 100644 --- a/juno_samples/fork_join_tests/src/fork_join_tests.jn +++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn @@ -147,3 +147,16 @@ fn test9<r, c : usize>(input : i32[r, c]) -> i32[r, c] { return out; } + +#[entry] +fn test10(k1 : i32[8], k2 : i32[8], v : i32[8]) -> i32 { + @const let s : i32[8]; + for i = 0 to 8 { + s[i] = v[k1[i] as u64]; + } + let sum = 0; + for i = 0 to 8 { + sum += s[k2[i] as u64]; + } + return sum; +} \ No newline at end of file diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch index 81dc8d98..43b28e34 100644 --- a/juno_samples/fork_join_tests/src/gpu.sch +++ b/juno_samples/fork_join_tests/src/gpu.sch @@ -8,12 +8,13 @@ no-memset(test6@const); no-memset(test8@const1); no-memset(test8@const2); no-memset(test9@const); +no-memset(test10@const); gvn(*); phi-elim(*); dce(*); -let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9); +let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10); gpu(auto.test1); gpu(auto.test2); gpu(auto.test3); @@ -22,6 +23,7 @@ gpu(auto.test5); gpu(auto.test7); gpu(auto.test8); gpu(auto.test9); +gpu(auto.test10); ip-sroa(*); sroa(*); diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs index e66309b2..0b37a99d 100644 --- a/juno_samples/fork_join_tests/src/main.rs +++ b/juno_samples/fork_join_tests/src/main.rs @@ -74,6 +74,20 @@ fn main() { 5 + 6 + 8 + 9, ]; assert(&correct, output); + + let mut r = runner!(test10); + let k1 = vec![0, 4, 3, 7, 3, 4, 2, 1]; + let k2 = vec![6, 4, 3, 2, 4, 1, 0, 5]; + let v = vec![3, -499, 4, 32, -2, 55, -74, 10]; + let mut correct = 0; + for i in 0..8 { + correct += v[k1[k2[i] as usize] as usize]; + } + let k1 = HerculesImmBox::from(&k1 as &[i32]); + let k2 = HerculesImmBox::from(&k2 as &[i32]); + let v = HerculesImmBox::from(&v as &[i32]); + let output = r.run(k1.to(), k2.to(), v.to()).await; + assert_eq!(output, correct); }); } -- GitLab From a8e41c3e7fb395e2f2d4a6b39832b08447f43d6c Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 14:58:35 -0600 Subject: [PATCH 5/9] tweaks --- juno_samples/rodinia/srad/src/cpu.sch | 1 + juno_samples/rodinia/srad/src/gpu.sch | 1 + juno_samples/rodinia/srad/src/srad.jn | 8 ++++---- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index a4cd4956..44007fca 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -8,6 +8,7 @@ macro simpl!(X) { infer-schedules(X); } +no-memset(srad@scratch); phi-elim(*); let loop1 = outline(srad@loop1); let loop2 = outline(srad@loop2); diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch index f89b7ab8..c5a30527 100644 --- a/juno_samples/rodinia/srad/src/gpu.sch +++ b/juno_samples/rodinia/srad/src/gpu.sch @@ -8,6 +8,7 @@ macro simpl!(X) { infer-schedules(X); } +no-memset(srad@scratch); phi-elim(*); let sum_loop = outline(srad@loop1); let main_loops = outline(srad@loop2 | srad@loop3); diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn index 6074bf8c..176778be 100644 --- a/juno_samples/rodinia/srad/src/srad.jn +++ b/juno_samples/rodinia/srad/src/srad.jn @@ -50,10 +50,10 @@ fn srad<nrows, ncols: usize>( let varROI = (sum2 / nelems as f32) - meanROI * meanROI; let q0sqr = varROI / (meanROI * meanROI); - @dirs let dN : f32[ncols, nrows]; - @dirs let dS : f32[ncols, nrows]; - @dirs let dE : f32[ncols, nrows]; - @dirs let dW : f32[ncols, nrows]; + @scratch let dN : f32[ncols, nrows]; + @scratch let dS : f32[ncols, nrows]; + @scratch let dE : f32[ncols, nrows]; + @scratch let dW : f32[ncols, nrows]; let c : f32[ncols, nrows]; -- GitLab From b1f50f47a8ddaf6081065312e63cd3f8d9d78965 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 15:01:03 -0600 Subject: [PATCH 6/9] tweak --- juno_samples/rodinia/srad/src/cpu.sch | 2 ++ 1 file changed, 2 insertions(+) diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 44007fca..b3188b60 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -32,6 +32,8 @@ simpl!(*); fork-interchange[0, 1](loop1); reduce-slf(*); simpl!(*); +slf!(*); +simpl!(*); fork-split(*); unforkify(*); -- GitLab From 5f96fddc185d0b5a39a2063af815984a606cd68b Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 15:09:24 -0600 Subject: [PATCH 7/9] tweak --- hercules_ir/src/typecheck.rs | 4 +-- .../rodinia/srad/benches/srad_bench.rs | 17 ---------- juno_samples/rodinia/srad/src/cpu.sch | 2 +- juno_samples/rodinia/srad/src/lib.rs | 33 +------------------ juno_samples/rodinia/srad/src/rust_srad.rs | 31 ++++++++--------- juno_samples/rodinia/srad/src/srad.jn | 24 ++++++++------ 6 files changed, 30 insertions(+), 81 deletions(-) diff --git a/hercules_ir/src/typecheck.rs b/hercules_ir/src/typecheck.rs index b2567b8f..1ecebf11 100644 --- a/hercules_ir/src/typecheck.rs +++ b/hercules_ir/src/typecheck.rs @@ -822,7 +822,7 @@ fn typeflow( // We also return the return type from here match intrinsic { // Intrinsics that take any numeric type and return the same - Intrinsic::Abs => { + Intrinsic::Abs | Intrinsic::Max | Intrinsic::Min => { if let Concrete(id) = inputs[0] { if types[id.idx()].is_arithmetic() { Concrete(*id) @@ -856,8 +856,6 @@ fn typeflow( | Intrinsic::Ln1P | Intrinsic::Log10 | Intrinsic::Log2 - | Intrinsic::Max - | Intrinsic::Min | Intrinsic::Round | Intrinsic::Sin | Intrinsic::Sinh diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs index 728702d9..6af13aae 100644 --- a/juno_samples/rodinia/srad/benches/srad_bench.rs +++ b/juno_samples/rodinia/srad/benches/srad_bench.rs @@ -24,19 +24,6 @@ fn srad_bench(c: &mut Criterion) { } = read_graphics(image); let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); let mut image_h = HerculesMutBox::from(image.clone()); - let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>(); - let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>(); - let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>(); - let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>(); - // Fix boundary conditions - iN[0] = 0; - iS[nrows - 1] = (nrows - 1) as i32; - jW[0] = 0; - jE[ncols - 1] = (ncols - 1) as i32; - let iN_h = HerculesImmBox::from(iN.as_slice()); - let iS_h = HerculesImmBox::from(iS.as_slice()); - let jW_h = HerculesImmBox::from(jW.as_slice()); - let jE_h = HerculesImmBox::from(jE.as_slice()); group.bench_function("srad bench", |b| { b.iter(|| { async_std::task::block_on(async { @@ -45,10 +32,6 @@ fn srad_bench(c: &mut Criterion) { ncols as u64, niter as u64, image_h.to(), - iN_h.to(), - iS_h.to(), - jW_h.to(), - jE_h.to(), max, lambda, ) diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index b3188b60..5a8c180e 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -32,7 +32,7 @@ simpl!(*); fork-interchange[0, 1](loop1); reduce-slf(*); simpl!(*); -slf!(*); +slf(*); simpl!(*); fork-split(*); diff --git a/juno_samples/rodinia/srad/src/lib.rs b/juno_samples/rodinia/srad/src/lib.rs index a647b94a..cb156d9d 100644 --- a/juno_samples/rodinia/srad/src/lib.rs +++ b/juno_samples/rodinia/srad/src/lib.rs @@ -48,22 +48,6 @@ pub fn srad_harness(args: SRADInputs) { let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); let mut image_h = HerculesMutBox::from(image.clone()); - let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>(); - let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>(); - let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>(); - let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>(); - - // Fix boundary conditions - iN[0] = 0; - iS[nrows - 1] = (nrows - 1) as i32; - jW[0] = 0; - jE[ncols - 1] = (ncols - 1) as i32; - - let iN_h = HerculesImmBox::from(iN.as_slice()); - let iS_h = HerculesImmBox::from(iS.as_slice()); - let jW_h = HerculesImmBox::from(jW.as_slice()); - let jE_h = HerculesImmBox::from(jE.as_slice()); - let mut runner = runner!(srad); let result: Vec<f32> = HerculesMutBox::from( runner @@ -72,10 +56,6 @@ pub fn srad_harness(args: SRADInputs) { ncols as u64, niter as u64, image_h.to(), - iN_h.to(), - iS_h.to(), - jW_h.to(), - jE_h.to(), max, lambda, ) @@ -90,18 +70,7 @@ pub fn srad_harness(args: SRADInputs) { if verify { let mut rust_result = image; - rust_srad::srad( - nrows, - ncols, - niter, - &mut rust_result, - &iN, - &iS, - &jW, - &jE, - max, - lambda, - ); + rust_srad::srad(nrows, ncols, niter, &mut rust_result, max, lambda); if let Some(output) = output_verify { write_graphics(output, &rust_result, nrows, ncols, max); diff --git a/juno_samples/rodinia/srad/src/rust_srad.rs b/juno_samples/rodinia/srad/src/rust_srad.rs index 3226e35f..f25d382a 100644 --- a/juno_samples/rodinia/srad/src/rust_srad.rs +++ b/juno_samples/rodinia/srad/src/rust_srad.rs @@ -1,15 +1,4 @@ -pub fn srad( - nrows: usize, - ncols: usize, - niter: usize, - image: &mut Vec<f32>, - iN: &[i32], - iS: &[i32], - jW: &[i32], - jE: &[i32], - max: f32, - lambda: f32, -) { +pub fn srad(nrows: usize, ncols: usize, niter: usize, image: &mut Vec<f32>, max: f32, lambda: f32) { let nelems = nrows * ncols; // EXTRACT @@ -44,11 +33,15 @@ pub fn srad( for i in 0..nrows { let k = i + nrows * j; let Jc = image[k]; + let iN = std::cmp::max(i, 1) - 1; + let iS = std::cmp::min(i, nrows - 2) + 1; + let jW = std::cmp::max(j, 1) - 1; + let jE = std::cmp::min(j, ncols - 2) + 1; - dN[k] = image[iN[i] as usize + nrows * j] - Jc; - dS[k] = image[iS[i] as usize + nrows * j] - Jc; - dW[k] = image[i + nrows * jW[j] as usize] - Jc; - dE[k] = image[i + nrows * jE[j] as usize] - Jc; + dN[k] = image[iN as usize + nrows * j] - Jc; + dS[k] = image[iS as usize + nrows * j] - Jc; + dW[k] = image[i + nrows * jW as usize] - Jc; + dE[k] = image[i + nrows * jE as usize] - Jc; let G2 = (dN[k] * dN[k] + dS[k] * dS[k] + dW[k] * dW[k] + dE[k] * dE[k]) / (Jc * Jc); @@ -72,11 +65,13 @@ pub fn srad( for j in 0..ncols { for i in 0..nrows { let k = i + nrows * j; + let iS = std::cmp::min(i, nrows - 2) + 1; + let jE = std::cmp::min(j, ncols - 2) + 1; let cN = c[k]; - let cS = c[iS[i] as usize + nrows * j]; + let cS = c[iS as usize + nrows * j]; let cW = c[k]; - let cE = c[i + nrows * jE[j] as usize]; + let cE = c[i + nrows * jE as usize]; let D = cN * dN[k] + cS * dS[k] + cW * dW[k] + cE * dE[k]; diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn index 176778be..b055b296 100644 --- a/juno_samples/rodinia/srad/src/srad.jn +++ b/juno_samples/rodinia/srad/src/srad.jn @@ -21,10 +21,6 @@ fn compress<nrows, ncols: usize>(inout image: f32[ncols, nrows], max: f32) { fn srad<nrows, ncols: usize>( niter: usize, inout image: f32[ncols, nrows], - iN: i32[nrows], - iS: i32[nrows], - jW: i32[ncols], - jE: i32[ncols], max: f32, lambda: f32, ) { @@ -60,10 +56,15 @@ fn srad<nrows, ncols: usize>( @loop2 for j in 0..ncols { for i in 0..nrows { let Jc = image[j, i]; - dN[j, i] = image[j, iN[i] as u64] - Jc; - dS[j, i] = image[j, iS[i] as u64] - Jc; - dW[j, i] = image[jW[j] as u64, i] - Jc; - dE[j, i] = image[jE[j] as u64, i] - Jc; + let iN = max!(i, 1) - 1; + let iS = min!(i, nrows - 2) + 1; + let jW = max!(j, 1) - 1; + let jE = min!(j, ncols - 2) + 1; + + dN[j, i] = image[j, iN as u64] - Jc; + dS[j, i] = image[j, iS as u64] - Jc; + dW[j, i] = image[jW as u64, i] - Jc; + dE[j, i] = image[jE as u64, i] - Jc; let G2 = (dN[j, i] * dN[j, i] + dS[j, i] * dS[j, i] + dW[j, i] * dW[j, i] + dE[j, i] * dE[j, i]) / (Jc * Jc); @@ -85,10 +86,13 @@ fn srad<nrows, ncols: usize>( @loop3 for j in 0..ncols { for i in 0..nrows { + let iS = min!(i, nrows - 2) + 1; + let jE = min!(j, ncols - 2) + 1; + let cN = c[j, i]; - let cS = c[j, iS[i] as u64]; + let cS = c[j, iS as u64]; let cW = c[j, i]; - let cE = c[jE[j] as u64, i]; + let cE = c[jE as u64, i]; let D = cN * dN[j, i] + cS * dS[j, i] + cW * dW[j, i] + cE * dE[j, i]; image[j, i] = image[j, i] + 0.25 * lambda * D; -- GitLab From 7c445a64d396f3e924659392deaf0854b5c255e9 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 15:29:54 -0600 Subject: [PATCH 8/9] parallelize loop2 in srad --- juno_samples/rodinia/srad/src/cpu.sch | 9 +++++++-- juno_samples/rodinia/srad/src/srad.jn | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 5a8c180e..43d8ceac 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -35,7 +35,12 @@ simpl!(*); slf(*); simpl!(*); -fork-split(*); -unforkify(*); +fork-tile[32, 0, false, false](loop2); +let split = fork-split(loop2); +let loop2_body = outline(split.srad_1.fj1); +simpl!(loop2, loop2_body); + +fork-split(extract, compress, loop1, loop2_body, loop3); +unforkify(extract, compress, loop1, loop2_body, loop3); gcm(*); diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn index b055b296..024be598 100644 --- a/juno_samples/rodinia/srad/src/srad.jn +++ b/juno_samples/rodinia/srad/src/srad.jn @@ -51,7 +51,7 @@ fn srad<nrows, ncols: usize>( @scratch let dE : f32[ncols, nrows]; @scratch let dW : f32[ncols, nrows]; - let c : f32[ncols, nrows]; + @scratch let c : f32[ncols, nrows]; @loop2 for j in 0..ncols { for i in 0..nrows { -- GitLab From 98c8cee0ca70828873c13b5a2426c00425c0edcd Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 2 Mar 2025 16:35:26 -0600 Subject: [PATCH 9/9] Inline to not re-allocate vec too much --- juno_samples/rodinia/srad/src/cpu.sch | 3 +++ 1 file changed, 3 insertions(+) diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 43d8ceac..7b7a6c9e 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -40,6 +40,9 @@ let split = fork-split(loop2); let loop2_body = outline(split.srad_1.fj1); simpl!(loop2, loop2_body); +inline(srad@loop2); +delete-uncalled(*); + fork-split(extract, compress, loop1, loop2_body, loop3); unforkify(extract, compress, loop1, loop2_body, loop3); -- GitLab