diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index 6998f8794c029a0c8d66ec8b557410557483d2ad..bebb8c6cd0867f8018a65d421b30938f60da49db 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -1515,6 +1515,10 @@ fn fork_fusion( * Looks for would-be monoid reduces, if not for a gate on the reduction. * Partially predicate the gated reduction to allow for a proper monoid * reduction. + * + * Looks for monoid reduces that occur through a gated write to a single + * location, and lift them to a proper monoid reduction with a single gated + * write afterwards. */ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { for id in editor.node_ids() { @@ -1676,6 +1680,121 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { }); } } + + for id in editor.node_ids() { + // Identify reduce/write/phi cycle through which a sparse AND reduction + // is occurring. + let nodes = &editor.func().nodes; + let Some((join, init, reduct)) = nodes[id.idx()].try_reduce() else { + continue; + }; + let join_pred = nodes[join.idx()].try_join().unwrap(); + let join_succ = editor + .get_users(join) + .filter(|id| nodes[id.idx()].is_control()) + .next() + .unwrap(); + let Some((_, phi_data)) = nodes[reduct.idx()].try_phi() else { + continue; + }; + if phi_data.len() != 2 { + continue; + } + let phi_other_use = if phi_data[0] == id { + phi_data[1] + } else if phi_data[1] == id { + phi_data[0] + } else { + continue; + }; + let Some((collect, data, indices)) = nodes[phi_other_use.idx()].try_write() else { + continue; + }; + if collect != id { + continue; + } + if indices.into_iter().any(|idx| idx.is_position()) { + continue; + } + if !is_false(editor, data) { + continue; + } + let Some(preds) = nodes[join_pred.idx()].try_region() else { + continue; + }; + if preds.len() != 2 { + continue; + } + let Some((if1, _)) = nodes[preds[0].idx()].try_control_proj() else { + continue; + }; + let Some((if2, sel)) = nodes[preds[1].idx()].try_control_proj() else { + continue; + }; + if if1 != if2 { + continue; + } + let Some((_, mut cond)) = nodes[if1.idx()].try_if() else { + continue; + }; + + // Transform to a monoid reduction and a single gated write. + let negated = phi_other_use == phi_data[sel]; + let indices = indices.to_vec().into_boxed_slice(); + editor.edit(|mut edit| { + let t = edit.add_constant(Constant::Boolean(true)); + let t = edit.add_node(Node::Constant { id: t }); + let f = edit.add_constant(Constant::Boolean(false)); + let f = edit.add_node(Node::Constant { id: f }); + if negated { + cond = edit.add_node(Node::Unary { + op: UnaryOperator::Not, + input: cond, + }); + } + let reduce_id = NodeID::new(edit.num_node_ids()); + let and_id = NodeID::new(edit.num_node_ids() + 1); + edit.add_node(Node::Reduce { + control: join, + init: t, + reduct: and_id, + }); + edit.add_node(Node::Binary { + op: BinaryOperator::And, + left: cond, + right: reduce_id, + }); + + let new_if = edit.add_node(Node::If { + control: join, + cond: reduce_id, + }); + let cpj1 = edit.add_node(Node::ControlProjection { + control: new_if, + selection: 0, + }); + let cpj2 = edit.add_node(Node::ControlProjection { + control: new_if, + selection: 1, + }); + let region = edit.add_node(Node::Region { + preds: Box::new([cpj1, cpj2]), + }); + let write = edit.add_node(Node::Write { + collect: init, + data: f, + indices, + }); + let phi = edit.add_node(Node::Phi { + control: region, + data: Box::new([write, init]), + }); + edit = edit.replace_all_uses_where(id, phi, |other_id| { + *other_id != phi_other_use && *other_id != reduct + })?; + edit.replace_all_uses_where(join, region, |id| *id == join_succ) + }); + } } /* @@ -1741,6 +1860,7 @@ fn extend_fork(editor: &mut FunctionEditor, fork: NodeID, join: NodeID, multiple control: new_fork, dimension: idx, }); + edit.sub_edit(fork, tid); let old_bound = edit.add_node(Node::DynamicConstant { id: *old_factor }); edit.add_node(Node::Binary { op: BinaryOperator::LT, diff --git a/juno_samples/edge_detection/benches/edge_detection_bench.rs b/juno_samples/edge_detection/benches/edge_detection_bench.rs index 1fa08506a5840b252e293ddee36b72b7f359d78a..bedd5b267fa8df7e658527b491b4d53d264cea64 100644 --- a/juno_samples/edge_detection/benches/edge_detection_bench.rs +++ b/juno_samples/edge_detection/benches/edge_detection_bench.rs @@ -88,9 +88,6 @@ fn edge_detection_bench(c: &mut Criterion) { r.run( height as u64, width as u64, - gs as u64, - sz as u64, - sb as u64, input_h.to(), gaussian_filter_h, structure_h, diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index 64fee6b648c6fc0e17b39677353e442b4b37596e..34431711516635f2a0cfd2e2f24213fdd9f7d25c 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -134,7 +134,9 @@ if !feature("seq") { reject_zero_crossings = reject_zero_crossings_body; } -async-call(edge_detection@le, edge_detection@zc); +if !feature("seq") { + async-call(edge_detection@le, edge_detection@zc); +} fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn index 3e49cb365b186037cf6f380c3aa6a4d3b483fb5b..be98781210791bddb7aa11ac353457e791c1d4b0 100644 --- a/juno_samples/edge_detection/src/edge_detection.jn +++ b/juno_samples/edge_detection/src/edge_detection.jn @@ -1,6 +1,10 @@ -fn gaussian_smoothing<n, m, gs : usize>( +const gs : usize = 7; +const sz : usize = 3; +const sb : usize = 3; + +fn gaussian_smoothing<n, m : usize>( input: f32[n, m], - filter: f32[gs, gs], + filter: f32[7, 7], ) -> f32[n, m] { @res let result : f32[n, m]; @@ -13,12 +17,9 @@ fn gaussian_smoothing<n, m, gs : usize>( @filter_loop for i = 0 to gs { for j = 0 to gs { - let val = input[if row + i < gr then 0 - else if row + i - gr > n - 1 then n - 1 - else row + i - gr, - if col + j < gr then 0 - else if col + j - gr > m - 1 then m - 1 - else col + j - gr]; + let br = min!(max!(row + i, gr) - gr, n - 1); + let bc = min!(max!(col + j, gr) - gr, m - 1); + let val = input[br, bc]; smoothed += val * filter[i, j]; } } @@ -33,9 +34,9 @@ fn gaussian_smoothing<n, m, gs : usize>( const MIN_BR : f32 = 0; const MAX_BR : f32 = 1; -fn laplacian_estimate<n, m, sz: usize>( +fn laplacian_estimate<n, m : usize>( input: f32[n, m], - structure: f32[sz, sz], + structure: f32[3, 3], ) -> f32[n, m] { const r = sz / 2; @@ -77,9 +78,9 @@ fn laplacian_estimate<n, m, sz: usize>( return result; } -fn zero_crossings<n, m, sz: usize>( +fn zero_crossings<n, m : usize>( input: f32[n, m], - structure: f32[sz, sz], + structure: f32[3, 3], ) -> f32[n, m] { const r = sz / 2; @@ -123,10 +124,10 @@ fn zero_crossings<n, m, sz: usize>( return result; } -fn gradient<n, m, sb: usize>( +fn gradient<n, m : usize>( input: f32[n, m], - sx: f32[sb, sb], - sy: f32[sb, sb], + sx: f32[3, 3], + sy: f32[3, 3], ) -> f32[n, m] { const sbr = sb / 2; @@ -191,18 +192,18 @@ fn reject_zero_crossings<n, m: usize>( } #[entry] -fn edge_detection<n, m, gs, sz, sb: usize>( +fn edge_detection<n, m : usize>( input: f32[n, m], gaussian_filter: f32[gs, gs], - structure: f32[sz, sz], - sx: f32[sb, sb], - sy: f32[sb, sb], + structure: f32[3, 3], + sx: f32[3, 3], + sy: f32[3, 3], theta: f32, ) -> f32[n, m] { - let smoothed = gaussian_smoothing::<n, m, gs>(input, gaussian_filter); - @le let laplacian = laplacian_estimate::<n, m, sz>(smoothed, structure); - @zc let zcs = zero_crossings::<n, m, sz>(laplacian, structure); - let gradient = gradient::<n, m, sb>(smoothed, sx, sy); + let smoothed = gaussian_smoothing::<n, m>(input, gaussian_filter); + @le let laplacian = laplacian_estimate::<n, m>(smoothed, structure); + @zc let zcs = zero_crossings::<n, m>(laplacian, structure); + let gradient = gradient::<n, m>(smoothed, sx, sy); let maxgrad = max_gradient::<n, m>(gradient); return reject_zero_crossings::<n, m>(zcs, gradient, maxgrad, theta); } diff --git a/juno_samples/edge_detection/src/lib.rs b/juno_samples/edge_detection/src/lib.rs index aa44e2e7e7215ee21560fefefc8c748610e183ce..c1d04b0f9a7b4950f9b8bf6ff6438b46291cb2f3 100644 --- a/juno_samples/edge_detection/src/lib.rs +++ b/juno_samples/edge_detection/src/lib.rs @@ -194,9 +194,6 @@ pub fn edge_detection_harness(args: EdgeDetectionInputs) { r.run( height as u64, width as u64, - gs as u64, - sz as u64, - sb as u64, input_h.to(), gaussian_filter_h.to(), structure_h.to(), diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch index 6867576e4a3a81f327d23eca6dccbcd6a6183eb4..e6fe7bc84c83b39ae63ca918c206762e88ee4f8d 100644 --- a/juno_samples/matmul/src/matmul.sch +++ b/juno_samples/matmul/src/matmul.sch @@ -25,6 +25,10 @@ macro forkify!(X) { } } +macro fork-chunk { + fork-tile[n, 0, false, false](X); +} + macro fork-tile { fork-tile[n, 0, false, true](X); } @@ -66,8 +70,8 @@ if feature("cuda") { // Parallelize by computing output array as 16 chunks let par = matmul@outer \ matmul@inner; - fork-tile; - let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par); + fork-chunk; + let (outer, inner, _) = fork-reshape[[0, 2], [1], [3]](par); parallelize!(outer \ inner); let body = outline(inner); diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 4796f427aae2746f853e65e3f3d7589f63c2e7f2..cbdd90fdbc0a2a9762ec0736a2fc5fa1fe1c870b 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -38,8 +38,8 @@ let forward_input = outline(backprop@forward_input); let forward_hidden = outline(backprop@forward_hidden); if !feature("seq") { - fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop); - let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop); + fork-tile[16, 0, false, false](forward_input@outer_loop \ forward_input@inner_loop); + let (outer, inner) = fork-reshape[[0], [1]](forward_input@outer_loop \ forward_input@inner_loop); forward_input = outline(inner); inline(backprop@forward_input); } @@ -53,8 +53,8 @@ let adjust_hidden = outline(backprop@adjust_hidden); let adjust_input = outline(backprop@adjust_input); if !feature("seq") { - fork-tile[16, 0, false, true](adjust_input); - let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input); + fork-tile[16, 0, false, false](adjust_input); + let (outer, inner) = fork-reshape[[0], [1, 2]](adjust_input); adjust_input = outline(inner); inline(backprop@adjust_input); } diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index ea6f0403c8f0824c0bcf27dc6dcd15649bcdb2ec..07006edbd791810fc90e5e756315449687d13d7f 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -41,14 +41,14 @@ parallel-fork(traverse, collect); parallel-reduce(traverse, collect); if !feature("seq") { - fork-tile[32, 0, false, true](traverse, collect); - let (outer, inner) = fork-reshape[[1], [0]](traverse); + fork-tile[32, 0, false, false](traverse, collect); + let (outer, inner) = fork-reshape[[0], [1]](traverse); traverse = outline(inner); - let (outer, inner) = fork-reshape[[1], [0]](collect); + let (outer, inner) = fork-reshape[[0], [1]](collect); collect = outline(inner); - fork-tile[32, 0, false, true](init); - let (outer, inner) = fork-reshape[[1], [0]](init); + fork-tile[32, 0, false, false](init); + let (outer, inner) = fork-reshape[[0], [1]](init); let init_body = outline(inner); inline(bfs@cost_init, bfs@loop1, bfs@loop2); @@ -56,8 +56,14 @@ if !feature("seq") { } delete-uncalled(*); const-inline(*); +clean-monoid-reduces(collect); simpl!(*); +fork-tile[8, 0, false, true](init, traverse, collect); +clean-monoid-reduces(collect); +simpl!(*); + +fork-split(init, traverse, collect); unforkify(init, traverse, collect); simpl!(*); -gcm(*); \ No newline at end of file +gcm(*); diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 0253a0210f6cc2451d38399601ad39ff3ab9465a..ea81f330072e891df1250a88f54e87e7d73610d0 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -15,7 +15,7 @@ let traverse = outline(bfs@loop1); let collect = outline(bfs@loop2); parallel-reduce(traverse, collect); no-memset(make_stop_prod); -gpu(traverse, make_stop_prod, collect); +gpu(init, traverse, make_stop_prod, collect); simpl!(*); predication(*); @@ -38,12 +38,8 @@ fixpoint { } simpl!(collect); -fork-tile[32, 0, false, true](init); -let (outer, inner) = fork-reshape[[1], [0]](init); -let init_body = outline(inner); - -fork-tile[1024, 0, false, true](traverse, collect); -fork-split(traverse, collect); +fork-tile[1024, 0, false, true](init, traverse, collect); +let out = fork-split(init, traverse, collect); +simpl!(*); -unforkify(init_body); gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 13125961275b8982b7fc7c589198bc83bd36c420..d9d3eb8c0ebe133acd9daf3e9505eeb06b1be062 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -57,5 +57,10 @@ if !feature("seq") { copy_vars = copy_vars_body; } +const-inline[false](*); +simpl!(*); +fork-split(compute_step_factor, compute_flux, time_step, copy_vars); unforkify(compute_step_factor, compute_flux, time_step, copy_vars); +simpl!(*); + gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch index 858be5baec53ea93e5658cfed6b9290cc4010c1f..3acee55bfee6d413a479edb771bb0c36a7036205 100644 --- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch @@ -64,5 +64,10 @@ if !feature("seq") { copy_vars = copy_vars_body; } +const-inline[false](*); +simpl!(*); +fork-split(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars); unforkify(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars); +simpl!(*); + gcm(*); diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 8fa22aaa9f1213777439d153cef25cbf5b88ab8b..5487355d3011502316415f8ef5b17f1ab62906f4 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -28,14 +28,28 @@ fixpoint { fork-guard-elim(*); fork-coalesce(*); } +fork-dim-merge(loop1); simpl!(*); -fork-interchange[0, 1](loop1); reduce-slf(*); simpl!(*); slf(*); simpl!(*); if !feature("seq") { + fork-tile[32, 0, false, false](loop1); + simpl!(loop1); + let split = fork-split(loop1); + simpl!(loop1); + clean-monoid-reduces(loop1); + let loop1_body = outline(split.srad_0.fj1); + simpl!(loop1, loop1_body); + unforkify(loop1_body); + let fission = fork-fission[split.srad_0.fj0](loop1); + simpl!(loop1, loop1_body); + unforkify(fission.srad_0.fj_bottom); + simpl!(loop1, loop1_body); + loop1 = loop1_body; + fork-tile[32, 0, false, false](loop2); let split = fork-split(loop2); let loop2_body = outline(split.srad_1.fj1);