diff --git a/hercules_ir/src/einsum.rs b/hercules_ir/src/einsum.rs index 6c2ca31b48dc7f362d60404718ee3d2e00bd27fb..3922506b5815ee82655898ad0bc294e2282daf66 100644 --- a/hercules_ir/src/einsum.rs +++ b/hercules_ir/src/einsum.rs @@ -435,10 +435,9 @@ pub fn debug_print_math_expr(id: MathID, env: &MathEnv) { } MathExpr::IntrinsicFunc(intrinsic, ref args) => { print!("{}(", intrinsic.lower_case_name()); - debug_print_math_expr(id, env); for arg in args { - print!(", "); debug_print_math_expr(*arg, env); + print!(", "); } print!(")"); } diff --git a/hercules_ir/src/typecheck.rs b/hercules_ir/src/typecheck.rs index b2567b8f1313e97aeb70268b11847b6d70b1a2f5..1ecebf117548b0ceeecc2ca2883a28129a8a5184 100644 --- a/hercules_ir/src/typecheck.rs +++ b/hercules_ir/src/typecheck.rs @@ -822,7 +822,7 @@ fn typeflow( // We also return the return type from here match intrinsic { // Intrinsics that take any numeric type and return the same - Intrinsic::Abs => { + Intrinsic::Abs | Intrinsic::Max | Intrinsic::Min => { if let Concrete(id) = inputs[0] { if types[id.idx()].is_arithmetic() { Concrete(*id) @@ -856,8 +856,6 @@ fn typeflow( | Intrinsic::Ln1P | Intrinsic::Log10 | Intrinsic::Log2 - | Intrinsic::Max - | Intrinsic::Min | Intrinsic::Round | Intrinsic::Sin | Intrinsic::Sinh diff --git a/hercules_opt/src/forkify.rs b/hercules_opt/src/forkify.rs index a6308d66eae3bd47b3d607fb8970679843599168..189670deed876b5fd7d9d9991e2d7a4aba9cad76 100644 --- a/hercules_opt/src/forkify.rs +++ b/hercules_opt/src/forkify.rs @@ -106,7 +106,7 @@ pub fn forkify_loop( else { return false; }; - + // Compute loop variance let loop_variance = compute_loop_variance(editor, l); let ivs = compute_induction_vars(editor.func(), l, &loop_variance); @@ -530,10 +530,11 @@ pub fn analyze_phis<'a>( let intersection: HashSet<_> = set1.intersection(&set2).cloned().collect(); // If this phi uses any other phis the node is loop dependant, - // we use `phis` because this phi can actually contain the loop iv and its fine. - if uses_for_dependance.any(|node| phis.contains(&node) && node != *phi) { - LoopPHI::LoopDependant(*phi) - } else if intersection.clone().iter().next().is_some() { + // // we use `phis` because this phi can actually contain the loop iv and its fine. + // if uses_for_dependance.any(|node| phis.contains(&node) && node != *phi) { + // LoopPHI::LoopDependant(*phi) + // } else + if intersection.clone().iter().next().is_some() { // PHIs on the frontier of the uses by the candidate phi, i.e in uses_for_dependance need // to have headers that postdominate the loop continue latch. The value of the PHI used needs to be defined // by the time the reduce is triggered (at the end of the loop's internal control). diff --git a/hercules_opt/src/loop_bound_canon.rs b/hercules_opt/src/loop_bound_canon.rs index f1ce872c9f8b31e8e1c1f3ba72a0df21d5858b26..3e194e2dda3319dd6c2c4ece4aea1af41e7e701b 100644 --- a/hercules_opt/src/loop_bound_canon.rs +++ b/hercules_opt/src/loop_bound_canon.rs @@ -275,7 +275,7 @@ pub fn canonicalize_single_loop_bounds( let new_binop_node = edit.add_node(Node::Binary { left, right: blah, op: BinaryOperator::LT }); edit = edit.replace_all_uses_where(binop_node, new_binop_node, |usee| *usee == if_node)?; - Some((init_id, bound_id, new_binop_node, if_node)) + Some((init_id, blah, new_binop_node, if_node)) } else {guard_info}; edit = edit.replace_all_uses_where(dc_bound_node, new_dc_bound_node, |usee| *usee == new_bop)?; @@ -289,7 +289,6 @@ pub fn canonicalize_single_loop_bounds( }; Ok(edit) }); - let update_expr_users: Vec<_> = editor .get_users(*update_expression) diff --git a/juno_samples/fork_join_tests/src/cpu.sch b/juno_samples/fork_join_tests/src/cpu.sch index a3bb146b3f8d456083daeb4ba62b8bb5369b7a71..ff30b277499d5d672e45a59a21317807fba64a8a 100644 --- a/juno_samples/fork_join_tests/src/cpu.sch +++ b/juno_samples/fork_join_tests/src/cpu.sch @@ -3,7 +3,7 @@ gvn(*); phi-elim(*); dce(*); -let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10); +let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10, test11); cpu(auto.test1); cpu(auto.test2); cpu(auto.test3); @@ -13,6 +13,7 @@ cpu(auto.test7); cpu(auto.test8); cpu(auto.test9); cpu(auto.test10); +cpu(auto.test11); let test1_cpu = auto.test1; rename["test1_cpu"](test1_cpu); @@ -95,8 +96,6 @@ dce(auto.test8); simplify-cfg(auto.test8); dce(auto.test8); -no-memset(test9@const); - fork-split(auto.test10); let fission_out = fork-fission-reduces[test10@loop3](auto.test10); fork-fusion(fission_out.test10_8.fj0, fission_out.test10_8.fj1); @@ -104,8 +103,13 @@ dce(auto.test10); simplify-cfg(auto.test10); dce(auto.test10); fork-split(auto.test10); - unforkify(auto.test10); +array-slf(auto.test11); +ccp(auto.test11); +dce(auto.test11); +simplify-cfg(auto.test11); +dce(auto.test11); +unforkify(auto.test11); gcm(*); diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn index 51284c29b159559a4c27dbe77521bf7bb725eaae..dfc81cd2f3e9b169424e3619118de7426e0782ac 100644 --- a/juno_samples/fork_join_tests/src/fork_join_tests.jn +++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn @@ -163,3 +163,16 @@ fn test10(i0 : u64, i1 : u64, input : i32) -> i32 { return arr0[i0, i1] + arr1[i0, i1] + arr2[i0, i1]; } + +#[entry] +fn test11(k1 : i32[8], k2 : i32[8], v : i32[8]) -> i32 { + @const let s : i32[8]; + for i = 0 to 8 { + s[i] = v[k1[i] as u64]; + } + let sum = 0; + for i = 0 to 8 { + sum += s[k2[i] as u64]; + } + return sum; +} diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch index 18caa8e528e2562498252c51bb5ddea888298f67..aa8861d5eda51dc95783d6affbc757c398204e5c 100644 --- a/juno_samples/fork_join_tests/src/gpu.sch +++ b/juno_samples/fork_join_tests/src/gpu.sch @@ -8,12 +8,14 @@ no-memset(test6@const); no-memset(test8@const1); no-memset(test8@const2); no-memset(test9@const); +no-memset(test10@const); +no-memset(test11@const); gvn(*); phi-elim(*); dce(*); -let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10); +let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10, test11); gpu(auto.test1); gpu(auto.test2); gpu(auto.test3); @@ -23,6 +25,7 @@ gpu(auto.test7); gpu(auto.test8); gpu(auto.test9); gpu(auto.test10); +gpu(auto.test11); ip-sroa(*); sroa(*); diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs index 277484f6bc094dbee05155fb7fee63fb3dcf3319..29b62beaf24645d68b95271438685073e01d63a6 100644 --- a/juno_samples/fork_join_tests/src/main.rs +++ b/juno_samples/fork_join_tests/src/main.rs @@ -79,6 +79,20 @@ fn main() { let output = r.run(1, 2, 5).await; let correct = 17i32; assert_eq!(correct, output); + + let mut r = runner!(test11); + let k1 = vec![0, 4, 3, 7, 3, 4, 2, 1]; + let k2 = vec![6, 4, 3, 2, 4, 1, 0, 5]; + let v = vec![3, -499, 4, 32, -2, 55, -74, 10]; + let mut correct = 0; + for i in 0..8 { + correct += v[k1[k2[i] as usize] as usize]; + } + let k1 = HerculesImmBox::from(&k1 as &[i32]); + let k2 = HerculesImmBox::from(&k2 as &[i32]); + let v = HerculesImmBox::from(&v as &[i32]); + let output = r.run(k1.to(), k2.to(), v.to()).await; + assert_eq!(output, correct); }); } diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index d2813388e0e7a1d7bd1696ffbb641e629096e2c2..7bc2083cf07c063eccda5855fa4fed3bfca91f87 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -1,24 +1,11 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(not(feature = "cuda"))] - { - JunoCompiler::new() - .file_in_src("matmul.jn") - .unwrap() - .schedule_in_src("cpu.sch") - .unwrap() - .build() - .unwrap(); - } - #[cfg(feature = "cuda")] - { - JunoCompiler::new() - .file_in_src("matmul.jn") - .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - } + JunoCompiler::new() + .file_in_src("matmul.jn") + .unwrap() + .schedule_in_src("matmul.sch") + .unwrap() + .build() + .unwrap(); } diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch deleted file mode 100644 index 69f1811d08a5691de6dc10fa86f0720e416fb85a..0000000000000000000000000000000000000000 --- a/juno_samples/matmul/src/cpu.sch +++ /dev/null @@ -1,61 +0,0 @@ -macro optimize!(X) { - gvn(X); - phi-elim(X); - dce(X); - ip-sroa(X); - sroa(X); - dce(X); - gvn(X); - phi-elim(X); - dce(X); -} - -macro codegen-prep!(X) { - optimize!(X); - gcm(X); - float-collections(X); - dce(X); - gcm(X); -} - -macro forkify!(X) { - fixpoint { - forkify(X); - fork-guard-elim(X); - } -} - -macro fork-tile { - fork-tile[n, 0, false, true](X); -} - -macro parallelize!(X) { - parallel-fork(X); - parallel-reduce(X); -} - -macro unforkify!(X) { - fork-split(X); - unforkify(X); -} - -optimize!(*); -forkify!(*); -associative(matmul@outer); - -// Parallelize by computing output array as 16 chunks -let par = matmul@outer \ matmul@inner; -fork-tile; -let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par); -parallelize!(outer \ inner); - -let body = outline(inner); -cpu(body); - -// Tile for cache, assuming 64B cache lines -fork-tile; -let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body); - -reduce-slf(inner); -unforkify!(body); -codegen-prep!(*); diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch deleted file mode 100644 index 76808149e7b99dce97e43f0e936536d3d13b7417..0000000000000000000000000000000000000000 --- a/juno_samples/matmul/src/gpu.sch +++ /dev/null @@ -1,26 +0,0 @@ -phi-elim(*); - -forkify(*); -fork-guard-elim(*); -dce(*); - -fixpoint { - reduce-slf(*); - slf(*); - infer-schedules(*); -} -fork-coalesce(*); -infer-schedules(*); -dce(*); -rewrite(*); -fixpoint { - simplify-cfg(*); - dce(*); -} - -ip-sroa(*); -sroa(*); -dce(*); - -float-collections(*); -gcm(*); diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch new file mode 100644 index 0000000000000000000000000000000000000000..306997f58eb217f9ce301dc18e418c412e6df621 --- /dev/null +++ b/juno_samples/matmul/src/matmul.sch @@ -0,0 +1,81 @@ +macro optimize!(X) { + gvn(X); + phi-elim(X); + dce(X); + ip-sroa(X); + sroa(X); + dce(X); + gvn(X); + phi-elim(X); + dce(X); +} + +macro codegen-prep!(X) { + optimize!(X); + gcm(X); + float-collections(X); + dce(X); + gcm(X); +} + +macro forkify!(X) { + fixpoint { + forkify(X); + fork-guard-elim(X); + } +} + +macro fork-tile { + fork-tile[n, 0, false, true](X); +} + +macro parallelize!(X) { + parallel-fork(X); + parallel-reduce(X); +} + +macro unforkify!(X) { + fork-split(X); + unforkify(X); +} + +optimize!(*); +forkify!(*); + +if feature("cuda") { + fixpoint { + reduce-slf(*); + slf(*); + infer-schedules(*); + } + fork-coalesce(*); + infer-schedules(*); + dce(*); + rewrite(*); + fixpoint { + simplify-cfg(*); + dce(*); + } + + optimize!(*); + codegen-prep!(*); +} else { + associative(matmul@outer); + + // Parallelize by computing output array as 16 chunks + let par = matmul@outer \ matmul@inner; + fork-tile; + let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par); + parallelize!(outer \ inner); + + let body = outline(inner); + cpu(body); + + // Tile for cache, assuming 64B cache lines + fork-tile; + let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body); + + reduce-slf(inner); + unforkify!(body); + codegen-prep!(*); +} diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs index 728702d9bcc18405ef291945f81413f49f5715af..6af13aae5d9093bd59cf6299dfa64ac3e73209a6 100644 --- a/juno_samples/rodinia/srad/benches/srad_bench.rs +++ b/juno_samples/rodinia/srad/benches/srad_bench.rs @@ -24,19 +24,6 @@ fn srad_bench(c: &mut Criterion) { } = read_graphics(image); let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); let mut image_h = HerculesMutBox::from(image.clone()); - let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>(); - let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>(); - let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>(); - let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>(); - // Fix boundary conditions - iN[0] = 0; - iS[nrows - 1] = (nrows - 1) as i32; - jW[0] = 0; - jE[ncols - 1] = (ncols - 1) as i32; - let iN_h = HerculesImmBox::from(iN.as_slice()); - let iS_h = HerculesImmBox::from(iS.as_slice()); - let jW_h = HerculesImmBox::from(jW.as_slice()); - let jE_h = HerculesImmBox::from(jE.as_slice()); group.bench_function("srad bench", |b| { b.iter(|| { async_std::task::block_on(async { @@ -45,10 +32,6 @@ fn srad_bench(c: &mut Criterion) { ncols as u64, niter as u64, image_h.to(), - iN_h.to(), - iS_h.to(), - jW_h.to(), - jE_h.to(), max, lambda, ) diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index a4cd49569aedadf42d3965bee040391f4d56cef9..7b7a6c9e2203b8e280f323ed9e6589ab149537c0 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -8,6 +8,7 @@ macro simpl!(X) { infer-schedules(X); } +no-memset(srad@scratch); phi-elim(*); let loop1 = outline(srad@loop1); let loop2 = outline(srad@loop2); @@ -31,8 +32,18 @@ simpl!(*); fork-interchange[0, 1](loop1); reduce-slf(*); simpl!(*); +slf(*); +simpl!(*); + +fork-tile[32, 0, false, false](loop2); +let split = fork-split(loop2); +let loop2_body = outline(split.srad_1.fj1); +simpl!(loop2, loop2_body); + +inline(srad@loop2); +delete-uncalled(*); -fork-split(*); -unforkify(*); +fork-split(extract, compress, loop1, loop2_body, loop3); +unforkify(extract, compress, loop1, loop2_body, loop3); gcm(*); diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch index f736c0b776759596cfc93a5025fb4f280c3fc932..c5a305272de767b0fb894e57e9f5896f12f0a21f 100644 --- a/juno_samples/rodinia/srad/src/gpu.sch +++ b/juno_samples/rodinia/srad/src/gpu.sch @@ -8,6 +8,7 @@ macro simpl!(X) { infer-schedules(X); } +no-memset(srad@scratch); phi-elim(*); let sum_loop = outline(srad@loop1); let main_loops = outline(srad@loop2 | srad@loop3); @@ -41,15 +42,26 @@ fork-tile[32, 0, false, true](sum_loop); let out = fork-split(sum_loop); clean-monoid-reduces(sum_loop); simpl!(sum_loop); -let fission = fork-fission[out.srad_0.fj0](sum_loop); + +let fission1 = fork-fission[out.srad_0.fj0](sum_loop); +simpl!(sum_loop); +fork-tile[32, 0, false, true](fission1.srad_0.fj_bottom); +let out = fork-split(fission1.srad_0.fj_bottom); +clean-monoid-reduces(sum_loop); +simpl!(sum_loop); + +let fission2 = fork-fission[out.srad_0.fj0](sum_loop); simpl!(sum_loop); -fork-tile[32, 0, false, true](fission.srad_0.fj_bottom); -let out = fork-split(fission.srad_0.fj_bottom); +fork-tile[32, 0, false, true](fission2.srad_0.fj_bottom); +let out = fork-split(fission2.srad_0.fj_bottom); clean-monoid-reduces(sum_loop); simpl!(sum_loop); -let top = outline(fission.srad_0.fj_top); -let bottom = outline(out.srad_0.fj0); -gpu(top, bottom); + +let first = outline(fission1.srad_0.fj_top); +let second = outline(fission2.srad_0.fj_top); +let third = outline(out.srad_0.fj0); +gpu(first, second, third); +const-inline[false](*); ip-sroa(*); sroa(*); simpl!(*); @@ -60,4 +72,16 @@ dce(main_loops); fork-split(main_loops); simpl!(main_loops); +fork-dim-merge(extract); +fork-tile[32, 0, false, true](extract); +dce(extract); +fork-split(extract); +simpl!(extract); + +fork-dim-merge(compress); +fork-tile[32, 0, false, true](compress); +dce(compress); +fork-split(compress); +simpl!(compress); + gcm(*); diff --git a/juno_samples/rodinia/srad/src/lib.rs b/juno_samples/rodinia/srad/src/lib.rs index a647b94a5ffc8aad3bab91badc1bd58a305e7e75..cb156d9db712155ad9e5b3f9775e3c6c29dbf22a 100644 --- a/juno_samples/rodinia/srad/src/lib.rs +++ b/juno_samples/rodinia/srad/src/lib.rs @@ -48,22 +48,6 @@ pub fn srad_harness(args: SRADInputs) { let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols); let mut image_h = HerculesMutBox::from(image.clone()); - let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>(); - let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>(); - let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>(); - let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>(); - - // Fix boundary conditions - iN[0] = 0; - iS[nrows - 1] = (nrows - 1) as i32; - jW[0] = 0; - jE[ncols - 1] = (ncols - 1) as i32; - - let iN_h = HerculesImmBox::from(iN.as_slice()); - let iS_h = HerculesImmBox::from(iS.as_slice()); - let jW_h = HerculesImmBox::from(jW.as_slice()); - let jE_h = HerculesImmBox::from(jE.as_slice()); - let mut runner = runner!(srad); let result: Vec<f32> = HerculesMutBox::from( runner @@ -72,10 +56,6 @@ pub fn srad_harness(args: SRADInputs) { ncols as u64, niter as u64, image_h.to(), - iN_h.to(), - iS_h.to(), - jW_h.to(), - jE_h.to(), max, lambda, ) @@ -90,18 +70,7 @@ pub fn srad_harness(args: SRADInputs) { if verify { let mut rust_result = image; - rust_srad::srad( - nrows, - ncols, - niter, - &mut rust_result, - &iN, - &iS, - &jW, - &jE, - max, - lambda, - ); + rust_srad::srad(nrows, ncols, niter, &mut rust_result, max, lambda); if let Some(output) = output_verify { write_graphics(output, &rust_result, nrows, ncols, max); diff --git a/juno_samples/rodinia/srad/src/rust_srad.rs b/juno_samples/rodinia/srad/src/rust_srad.rs index 3226e35faf2410f94ab887019d54abafe0219454..f25d382aff3c4f42c4478d911532a5850b509a26 100644 --- a/juno_samples/rodinia/srad/src/rust_srad.rs +++ b/juno_samples/rodinia/srad/src/rust_srad.rs @@ -1,15 +1,4 @@ -pub fn srad( - nrows: usize, - ncols: usize, - niter: usize, - image: &mut Vec<f32>, - iN: &[i32], - iS: &[i32], - jW: &[i32], - jE: &[i32], - max: f32, - lambda: f32, -) { +pub fn srad(nrows: usize, ncols: usize, niter: usize, image: &mut Vec<f32>, max: f32, lambda: f32) { let nelems = nrows * ncols; // EXTRACT @@ -44,11 +33,15 @@ pub fn srad( for i in 0..nrows { let k = i + nrows * j; let Jc = image[k]; + let iN = std::cmp::max(i, 1) - 1; + let iS = std::cmp::min(i, nrows - 2) + 1; + let jW = std::cmp::max(j, 1) - 1; + let jE = std::cmp::min(j, ncols - 2) + 1; - dN[k] = image[iN[i] as usize + nrows * j] - Jc; - dS[k] = image[iS[i] as usize + nrows * j] - Jc; - dW[k] = image[i + nrows * jW[j] as usize] - Jc; - dE[k] = image[i + nrows * jE[j] as usize] - Jc; + dN[k] = image[iN as usize + nrows * j] - Jc; + dS[k] = image[iS as usize + nrows * j] - Jc; + dW[k] = image[i + nrows * jW as usize] - Jc; + dE[k] = image[i + nrows * jE as usize] - Jc; let G2 = (dN[k] * dN[k] + dS[k] * dS[k] + dW[k] * dW[k] + dE[k] * dE[k]) / (Jc * Jc); @@ -72,11 +65,13 @@ pub fn srad( for j in 0..ncols { for i in 0..nrows { let k = i + nrows * j; + let iS = std::cmp::min(i, nrows - 2) + 1; + let jE = std::cmp::min(j, ncols - 2) + 1; let cN = c[k]; - let cS = c[iS[i] as usize + nrows * j]; + let cS = c[iS as usize + nrows * j]; let cW = c[k]; - let cE = c[i + nrows * jE[j] as usize]; + let cE = c[i + nrows * jE as usize]; let D = cN * dN[k] + cS * dS[k] + cW * dW[k] + cE * dE[k]; diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn index 6074bf8cb12ccc2ad29c1086d7620b3ef98bcf59..024be598becdabb9ae9c32380a2b584871240cf7 100644 --- a/juno_samples/rodinia/srad/src/srad.jn +++ b/juno_samples/rodinia/srad/src/srad.jn @@ -21,10 +21,6 @@ fn compress<nrows, ncols: usize>(inout image: f32[ncols, nrows], max: f32) { fn srad<nrows, ncols: usize>( niter: usize, inout image: f32[ncols, nrows], - iN: i32[nrows], - iS: i32[nrows], - jW: i32[ncols], - jE: i32[ncols], max: f32, lambda: f32, ) { @@ -50,20 +46,25 @@ fn srad<nrows, ncols: usize>( let varROI = (sum2 / nelems as f32) - meanROI * meanROI; let q0sqr = varROI / (meanROI * meanROI); - @dirs let dN : f32[ncols, nrows]; - @dirs let dS : f32[ncols, nrows]; - @dirs let dE : f32[ncols, nrows]; - @dirs let dW : f32[ncols, nrows]; + @scratch let dN : f32[ncols, nrows]; + @scratch let dS : f32[ncols, nrows]; + @scratch let dE : f32[ncols, nrows]; + @scratch let dW : f32[ncols, nrows]; - let c : f32[ncols, nrows]; + @scratch let c : f32[ncols, nrows]; @loop2 for j in 0..ncols { for i in 0..nrows { let Jc = image[j, i]; - dN[j, i] = image[j, iN[i] as u64] - Jc; - dS[j, i] = image[j, iS[i] as u64] - Jc; - dW[j, i] = image[jW[j] as u64, i] - Jc; - dE[j, i] = image[jE[j] as u64, i] - Jc; + let iN = max!(i, 1) - 1; + let iS = min!(i, nrows - 2) + 1; + let jW = max!(j, 1) - 1; + let jE = min!(j, ncols - 2) + 1; + + dN[j, i] = image[j, iN as u64] - Jc; + dS[j, i] = image[j, iS as u64] - Jc; + dW[j, i] = image[jW as u64, i] - Jc; + dE[j, i] = image[jE as u64, i] - Jc; let G2 = (dN[j, i] * dN[j, i] + dS[j, i] * dS[j, i] + dW[j, i] * dW[j, i] + dE[j, i] * dE[j, i]) / (Jc * Jc); @@ -85,10 +86,13 @@ fn srad<nrows, ncols: usize>( @loop3 for j in 0..ncols { for i in 0..nrows { + let iS = min!(i, nrows - 2) + 1; + let jE = min!(j, ncols - 2) + 1; + let cN = c[j, i]; - let cS = c[j, iS[i] as u64]; + let cS = c[j, iS as u64]; let cW = c[j, i]; - let cE = c[jE[j] as u64, i]; + let cE = c[jE as u64, i]; let D = cN * dN[j, i] + cS * dS[j, i] + cW * dW[j, i] + cE * dE[j, i]; image[j, i] = image[j, i] + 0.25 * lambda * D; diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs index fbe6d8f084a5dc97e0d15a4063274c025845a23b..81fc82cd8f72d35ebe5a8f3276fe5690b2e0af88 100644 --- a/juno_scheduler/src/compile.rs +++ b/juno_scheduler/src/compile.rs @@ -22,6 +22,7 @@ pub enum ScheduleCompilerError { actual: usize, loc: Location, }, + SemanticError(String, Location), } impl fmt::Display for ScheduleCompilerError { @@ -46,6 +47,11 @@ impl fmt::Display for ScheduleCompilerError { "({}, {}) -- ({}, {}): Expected {} arguments, found {}", loc.0 .0, loc.0 .1, loc.1 .0, loc.1 .1, expected, actual ), + ScheduleCompilerError::SemanticError(msg, loc) => write!( + f, + "({}, {}) -- ({}, {}): {}", + loc.0 .0, loc.0 .1, loc.1 .0, loc.1 .1, msg, + ), } } } @@ -76,6 +82,8 @@ enum Appliable { // DeleteUncalled requires special handling because it changes FunctionIDs, so it is not // treated like a pass DeleteUncalled, + // Test whether a feature is enabled + Feature, Schedule(Schedule), Device(Device), } @@ -85,6 +93,8 @@ impl Appliable { fn is_valid_num_args(&self, num: usize) -> bool { match self { Appliable::Pass(pass) => pass.is_valid_num_args(num), + // Testing whether a feature is enabled takes the feature instead of a selection, so it + // has 0 arguments // Delete uncalled, Schedules, and devices do not take arguments _ => num == 0, } @@ -159,6 +169,8 @@ impl FromStr for Appliable { "serialize" => Ok(Appliable::Pass(ir::Pass::Serialize)), "write-predication" => Ok(Appliable::Pass(ir::Pass::WritePredication)), + "feature" => Ok(Appliable::Feature), + "print" => Ok(Appliable::Pass(ir::Pass::Print)), "cpu" | "llvm" => Ok(Appliable::Device(Device::LLVM)), @@ -276,6 +288,35 @@ fn compile_stmt( limit, }]) } + parser::Stmt::IfThenElse { + span: _, + cond, + thn, + els, + } => { + let cond = compile_exp_as_expr(cond, lexer, macrostab, macros)?; + + macros.open_scope(); + let thn = ir::ScheduleStmt::Block { + body: compile_ops_as_block(*thn, lexer, macrostab, macros)?, + }; + macros.close_scope(); + + macros.open_scope(); + let els = match els { + Some(els) => ir::ScheduleStmt::Block { + body: compile_ops_as_block(*els, lexer, macrostab, macros)?, + }, + None => ir::ScheduleStmt::Block { body: vec![] }, + }; + macros.close_scope(); + + Ok(vec![ir::ScheduleStmt::IfThenElse { + cond, + thn: Box::new(thn), + els: Box::new(els), + }]) + } parser::Stmt::MacroDecl { span: _, def } => { let parser::MacroDecl { name, @@ -381,6 +422,17 @@ fn compile_expr( on: selection, })) } + Appliable::Feature => match selection { + ir::Selector::Selection(mut args) if args.len() == 1 => { + Ok(ExprResult::Expr(ir::ScheduleExp::Feature { + feature: Box::new(args.pop().unwrap()), + })) + } + _ => Err(ScheduleCompilerError::SemanticError( + "feature requires exactly one argument as its selection".to_string(), + lexer.line_col(span), + )), + }, Appliable::Schedule(sched) => Ok(ExprResult::Stmt(ir::ScheduleStmt::AddSchedule { sched, on: selection, diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs index dd7a76a7da6b58bb5bc723116e84318b4b6346dc..287cd21a1ff64838942f8f2117937fd52020e75c 100644 --- a/juno_scheduler/src/ir.rs +++ b/juno_scheduler/src/ir.rs @@ -124,6 +124,9 @@ pub enum ScheduleExp { DeleteUncalled { on: Selector, }, + Feature { + feature: Box<ScheduleExp>, + }, Record { fields: Vec<(String, ScheduleExp)>, }, @@ -183,4 +186,9 @@ pub enum ScheduleStmt { device: Device, on: Selector, }, + IfThenElse { + cond: ScheduleExp, + thn: Box<ScheduleStmt>, + els: Box<ScheduleStmt>, + }, } diff --git a/juno_scheduler/src/lang.l b/juno_scheduler/src/lang.l index af154fce3d489b7b078607e52888d76d08f8e5fe..1f4f8723de4d950920d503a11c9145ad1d15d24a 100644 --- a/juno_scheduler/src/lang.l +++ b/juno_scheduler/src/lang.l @@ -20,12 +20,15 @@ \. "." apply "apply" +else "else" fixpoint "fixpoint" +if "if" let "let" macro "macro_keyword" on "on" set "set" target "target" +then "then" true "true" false "false" diff --git a/juno_scheduler/src/lang.y b/juno_scheduler/src/lang.y index 451f035b8122ac1a694567327a44776c9326fff2..55c82b9dc368e7cade9500fb852c9d48320be7d2 100644 --- a/juno_scheduler/src/lang.y +++ b/juno_scheduler/src/lang.y @@ -27,6 +27,10 @@ Stmt -> Stmt { Stmt::ExprStmt { span: $span, exp: $1 } } | 'fixpoint' FixpointLimit '{' Schedule '}' { Stmt::Fixpoint { span: $span, limit: $2, body: Box::new($4) } } + | 'if' Expr '{' Schedule '}' + { Stmt::IfThenElse { span: $span, cond: $2, thn: Box::new($4), els: None } } + | 'if' Expr '{' Schedule '}' 'else' '{' Schedule '}' + { Stmt::IfThenElse { span: $span, cond: $2, thn: Box::new($4), els: Some(Box::new($8)) } } | MacroDecl { Stmt::MacroDecl { span: $span, def: $1 } } ; @@ -163,6 +167,7 @@ pub enum Stmt { AssignStmt { span: Span, var: Span, rhs: Expr }, ExprStmt { span: Span, exp: Expr }, Fixpoint { span: Span, limit: FixpointLimit, body: Box<OperationList> }, + IfThenElse { span: Span, cond: Expr, thn: Box<OperationList>, els: Option<Box<OperationList>> }, MacroDecl { span: Span, def: MacroDecl }, } diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 2e31192a5e971d5eb2c32e07e966eb4cfe687da0..97bec5445eeeb9e9f849ef129b8ce9cf8c242798 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -16,6 +16,7 @@ use juno_utils::stringtab::StringTable; use std::cell::RefCell; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::env; use std::fmt; use std::fs::File; use std::io::Write; @@ -1199,6 +1200,23 @@ fn schedule_interpret( // were made Ok(i > 1) } + ScheduleStmt::IfThenElse { cond, thn, els } => { + let (cond, modified) = interp_expr(pm, cond, stringtab, env, functions)?; + let Value::Boolean { val: cond } = cond else { + return Err(SchedulerError::SemanticError( + "Condition must be a boolean value".to_string(), + )); + }; + let changed = schedule_interpret( + pm, + if cond { &*thn } else { &*els }, + stringtab, + env, + functions, + )?; + + Ok(modified || changed) + } ScheduleStmt::Block { body } => { let mut modified = false; env.open_scope(); @@ -1339,7 +1357,10 @@ fn interp_expr( } } Value::Record { fields } => match fields.get(field) { - None => Err(SchedulerError::UndefinedField(field.clone())), + None => Err(SchedulerError::UndefinedField(format!( + "{} not in {:?}", + field, fields + ))), Some(v) => Ok((v.clone(), changed)), }, } @@ -1443,6 +1464,23 @@ fn interp_expr( changed, )) } + ScheduleExp::Feature { feature } => { + let (feature, modified) = interp_expr(pm, &*feature, stringtab, env, functions)?; + let Value::String { val } = feature else { + return Err(SchedulerError::SemanticError( + "Feature expects a single string argument (instead of a selection)".to_string(), + )); + }; + // To test for features, the scheduler needs to be invoked from a build script so that + // Cargo provides the enabled features via environment variables + let key = "CARGO_FEATURE_".to_string() + &val.to_uppercase().replace("-", "_"); + Ok(( + Value::Boolean { + val: env::var(key).is_ok(), + }, + modified, + )) + } ScheduleExp::Record { fields } => { let mut result = HashMap::new(); let mut changed = false;