diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index e6db0345def31324243cdee2bdcb6b5cca5d9a7b..8bd3f735fa60cf4e0404edde28eb858e0ed7e581 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -1169,6 +1169,7 @@ pub fn fork_dim_merge( op: BinaryOperator::Rem, }); edit.sub_edit(tid, rem); + edit.sub_edit(tid, outer_tid); edit = edit.replace_all_uses(tid, rem)?; } else if tid_dim == inner_idx { let outer_tid = Node::ThreadID { @@ -1185,6 +1186,7 @@ pub fn fork_dim_merge( op: BinaryOperator::Div, }); edit.sub_edit(tid, div); + edit.sub_edit(tid, outer_tid); edit = edit.replace_all_uses(tid, div)?; } } @@ -1479,7 +1481,12 @@ fn fork_fusion( } // Perform the fusion. + let bottom_tids: Vec<_> = editor + .get_users(bottom_fork) + .filter(|id| nodes[id.idx()].is_thread_id()) + .collect(); editor.edit(|mut edit| { + edit = edit.replace_all_uses_where(bottom_fork, top_fork, |id| bottom_tids.contains(id))?; if bottom_join_pred != bottom_fork { // If there is control flow in the bottom fork-join, stitch it into // the top fork-join. diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index 065a78f273cc0c3d4bdacf1f77428f6f0bbf3622..f8da90d06e8ec82c200048b04dedfe19281ac0dd 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -26,6 +26,15 @@ predication(gaussian_smoothing); simpl!(gaussian_smoothing); predication(gaussian_smoothing); simpl!(gaussian_smoothing); +fork-dim-merge(gaussian_smoothing@filter_loop); +unforkify(gaussian_smoothing@filter_loop); +simpl!(gaussian_smoothing); + +fork-dim-merge(gaussian_smoothing); +fork-tile[32, 0, false, true](gaussian_smoothing); +simpl!(gaussian_smoothing); +fork-split(gaussian_smoothing); +simpl!(gaussian_smoothing); no-memset(laplacian_estimate@res); fixpoint { @@ -34,6 +43,15 @@ fixpoint { fork-coalesce(laplacian_estimate); } simpl!(laplacian_estimate); +fork-dim-merge(laplacian_estimate@filter_loop); +unforkify(laplacian_estimate@filter_loop); +simpl!(laplacian_estimate); + +fork-dim-merge(laplacian_estimate); +fork-tile[32, 0, false, true](laplacian_estimate); +simpl!(laplacian_estimate); +fork-split(laplacian_estimate); +simpl!(laplacian_estimate); no-memset(zero_crossings@res); fixpoint { @@ -42,6 +60,15 @@ fixpoint { fork-coalesce(zero_crossings); } simpl!(zero_crossings); +fork-dim-merge(zero_crossings@filter_loop); +unforkify(zero_crossings@filter_loop); +simpl!(zero_crossings); + +fork-dim-merge(zero_crossings); +fork-tile[32, 0, false, true](zero_crossings); +simpl!(zero_crossings); +fork-split(zero_crossings); +simpl!(zero_crossings); no-memset(gradient@res); fixpoint {