diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index 6998f8794c029a0c8d66ec8b557410557483d2ad..bebb8c6cd0867f8018a65d421b30938f60da49db 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1515,6 +1515,10 @@ fn fork_fusion(
  * Looks for would-be monoid reduces, if not for a gate on the reduction.
  * Partially predicate the gated reduction to allow for a proper monoid
  * reduction.
+ *
+ * Looks for monoid reduces that occur through a gated write to a single
+ * location, and lift them to a proper monoid reduction with a single gated
+ * write afterwards.
  */
 pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
     for id in editor.node_ids() {
@@ -1676,6 +1680,121 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
             });
         }
     }
+
+    for id in editor.node_ids() {
+        // Identify reduce/write/phi cycle through which a sparse AND reduction
+        // is occurring.
+        let nodes = &editor.func().nodes;
+        let Some((join, init, reduct)) = nodes[id.idx()].try_reduce() else {
+            continue;
+        };
+        let join_pred = nodes[join.idx()].try_join().unwrap();
+        let join_succ = editor
+            .get_users(join)
+            .filter(|id| nodes[id.idx()].is_control())
+            .next()
+            .unwrap();
+        let Some((_, phi_data)) = nodes[reduct.idx()].try_phi() else {
+            continue;
+        };
+        if phi_data.len() != 2 {
+            continue;
+        }
+        let phi_other_use = if phi_data[0] == id {
+            phi_data[1]
+        } else if phi_data[1] == id {
+            phi_data[0]
+        } else {
+            continue;
+        };
+        let Some((collect, data, indices)) = nodes[phi_other_use.idx()].try_write() else {
+            continue;
+        };
+        if collect != id {
+            continue;
+        }
+        if indices.into_iter().any(|idx| idx.is_position()) {
+            continue;
+        }
+        if !is_false(editor, data) {
+            continue;
+        }
+        let Some(preds) = nodes[join_pred.idx()].try_region() else {
+            continue;
+        };
+        if preds.len() != 2 {
+            continue;
+        }
+        let Some((if1, _)) = nodes[preds[0].idx()].try_control_proj() else {
+            continue;
+        };
+        let Some((if2, sel)) = nodes[preds[1].idx()].try_control_proj() else {
+            continue;
+        };
+        if if1 != if2 {
+            continue;
+        }
+        let Some((_, mut cond)) = nodes[if1.idx()].try_if() else {
+            continue;
+        };
+
+        // Transform to a monoid reduction and a single gated write.
+        let negated = phi_other_use == phi_data[sel];
+        let indices = indices.to_vec().into_boxed_slice();
+        editor.edit(|mut edit| {
+            let t = edit.add_constant(Constant::Boolean(true));
+            let t = edit.add_node(Node::Constant { id: t });
+            let f = edit.add_constant(Constant::Boolean(false));
+            let f = edit.add_node(Node::Constant { id: f });
+            if negated {
+                cond = edit.add_node(Node::Unary {
+                    op: UnaryOperator::Not,
+                    input: cond,
+                });
+            }
+            let reduce_id = NodeID::new(edit.num_node_ids());
+            let and_id = NodeID::new(edit.num_node_ids() + 1);
+            edit.add_node(Node::Reduce {
+                control: join,
+                init: t,
+                reduct: and_id,
+            });
+            edit.add_node(Node::Binary {
+                op: BinaryOperator::And,
+                left: cond,
+                right: reduce_id,
+            });
+
+            let new_if = edit.add_node(Node::If {
+                control: join,
+                cond: reduce_id,
+            });
+            let cpj1 = edit.add_node(Node::ControlProjection {
+                control: new_if,
+                selection: 0,
+            });
+            let cpj2 = edit.add_node(Node::ControlProjection {
+                control: new_if,
+                selection: 1,
+            });
+            let region = edit.add_node(Node::Region {
+                preds: Box::new([cpj1, cpj2]),
+            });
+            let write = edit.add_node(Node::Write {
+                collect: init,
+                data: f,
+                indices,
+            });
+            let phi = edit.add_node(Node::Phi {
+                control: region,
+                data: Box::new([write, init]),
+            });
+            edit = edit.replace_all_uses_where(id, phi, |other_id| {
+                *other_id != phi_other_use && *other_id != reduct
+            })?;
+            edit.replace_all_uses_where(join, region, |id| *id == join_succ)
+        });
+    }
 }
 
 /*
@@ -1741,6 +1860,7 @@ fn extend_fork(editor: &mut FunctionEditor, fork: NodeID, join: NodeID, multiple
                     control: new_fork,
                     dimension: idx,
                 });
+                edit.sub_edit(fork, tid);
                 let old_bound = edit.add_node(Node::DynamicConstant { id: *old_factor });
                 edit.add_node(Node::Binary {
                     op: BinaryOperator::LT,
diff --git a/juno_samples/edge_detection/benches/edge_detection_bench.rs b/juno_samples/edge_detection/benches/edge_detection_bench.rs
index 1fa08506a5840b252e293ddee36b72b7f359d78a..bedd5b267fa8df7e658527b491b4d53d264cea64 100644
--- a/juno_samples/edge_detection/benches/edge_detection_bench.rs
+++ b/juno_samples/edge_detection/benches/edge_detection_bench.rs
@@ -88,9 +88,6 @@ fn edge_detection_bench(c: &mut Criterion) {
                     r.run(
                         height as u64,
                         width as u64,
-                        gs as u64,
-                        sz as u64,
-                        sb as u64,
                         input_h.to(),
                         gaussian_filter_h,
                         structure_h,
diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index 64fee6b648c6fc0e17b39677353e442b4b37596e..34431711516635f2a0cfd2e2f24213fdd9f7d25c 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -134,7 +134,9 @@ if !feature("seq") {
   reject_zero_crossings = reject_zero_crossings_body;
 }
 
-async-call(edge_detection@le, edge_detection@zc);
+if !feature("seq") {
+  async-call(edge_detection@le, edge_detection@zc);
+}
 
 fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
 unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn
index 3e49cb365b186037cf6f380c3aa6a4d3b483fb5b..be98781210791bddb7aa11ac353457e791c1d4b0 100644
--- a/juno_samples/edge_detection/src/edge_detection.jn
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -1,6 +1,10 @@
-fn gaussian_smoothing<n, m, gs : usize>(
+const gs : usize = 7;
+const sz : usize = 3;
+const sb : usize = 3;
+
+fn gaussian_smoothing<n, m : usize>(
   input: f32[n, m],
-  filter: f32[gs, gs],
+  filter: f32[7, 7],
 ) -> f32[n, m] {
   @res let result : f32[n, m];
 
@@ -13,12 +17,9 @@ fn gaussian_smoothing<n, m, gs : usize>(
 
       @filter_loop for i = 0 to gs {
         for j = 0 to gs {
-          let val = input[if row + i < gr               then 0
-                          else if row + i - gr > n - 1  then n - 1
-                                                        else row + i - gr,
-                          if col + j < gr               then 0
-                          else if col + j - gr > m - 1  then m - 1
-                                                        else col + j - gr];
+          let br = min!(max!(row + i, gr) - gr, n - 1);
+          let bc = min!(max!(col + j, gr) - gr, m - 1);
+          let val = input[br, bc];
           smoothed += val * filter[i, j];
         }
       }
@@ -33,9 +34,9 @@ fn gaussian_smoothing<n, m, gs : usize>(
 const MIN_BR : f32 = 0;
 const MAX_BR : f32 = 1;
 
-fn laplacian_estimate<n, m, sz: usize>(
+fn laplacian_estimate<n, m : usize>(
   input: f32[n, m],
-  structure: f32[sz, sz],
+  structure: f32[3, 3],
 ) -> f32[n, m] {
   const r = sz / 2;
 
@@ -77,9 +78,9 @@ fn laplacian_estimate<n, m, sz: usize>(
   return result;
 }
 
-fn zero_crossings<n, m, sz: usize>(
+fn zero_crossings<n, m : usize>(
   input: f32[n, m],
-  structure: f32[sz, sz],
+  structure: f32[3, 3],
 ) -> f32[n, m] {
   const r = sz / 2;
 
@@ -123,10 +124,10 @@ fn zero_crossings<n, m, sz: usize>(
   return result;
 }
 
-fn gradient<n, m, sb: usize>(
+fn gradient<n, m : usize>(
   input: f32[n, m],
-  sx: f32[sb, sb],
-  sy: f32[sb, sb],
+  sx: f32[3, 3],
+  sy: f32[3, 3],
 ) -> f32[n, m] {
   const sbr = sb / 2;
 
@@ -191,18 +192,18 @@ fn reject_zero_crossings<n, m: usize>(
 }
 
 #[entry]
-fn edge_detection<n, m, gs, sz, sb: usize>(
+fn edge_detection<n, m : usize>(
   input: f32[n, m],
   gaussian_filter: f32[gs, gs],
-  structure: f32[sz, sz],
-  sx: f32[sb, sb],
-  sy: f32[sb, sb],
+  structure: f32[3, 3],
+  sx: f32[3, 3],
+  sy: f32[3, 3],
   theta: f32,
 ) -> f32[n, m] {
-  let smoothed = gaussian_smoothing::<n, m, gs>(input, gaussian_filter);
-  @le let laplacian = laplacian_estimate::<n, m, sz>(smoothed, structure);
-  @zc let zcs = zero_crossings::<n, m, sz>(laplacian, structure);
-  let gradient = gradient::<n, m, sb>(smoothed, sx, sy);
+  let smoothed = gaussian_smoothing::<n, m>(input, gaussian_filter);
+  @le let laplacian = laplacian_estimate::<n, m>(smoothed, structure);
+  @zc let zcs = zero_crossings::<n, m>(laplacian, structure);
+  let gradient = gradient::<n, m>(smoothed, sx, sy);
   let maxgrad = max_gradient::<n, m>(gradient);
   return reject_zero_crossings::<n, m>(zcs, gradient, maxgrad, theta);
 }
diff --git a/juno_samples/edge_detection/src/lib.rs b/juno_samples/edge_detection/src/lib.rs
index aa44e2e7e7215ee21560fefefc8c748610e183ce..c1d04b0f9a7b4950f9b8bf6ff6438b46291cb2f3 100644
--- a/juno_samples/edge_detection/src/lib.rs
+++ b/juno_samples/edge_detection/src/lib.rs
@@ -194,9 +194,6 @@ pub fn edge_detection_harness(args: EdgeDetectionInputs) {
                 r.run(
                     height as u64,
                     width as u64,
-                    gs as u64,
-                    sz as u64,
-                    sb as u64,
                     input_h.to(),
                     gaussian_filter_h.to(),
                     structure_h.to(),
diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch
index 6867576e4a3a81f327d23eca6dccbcd6a6183eb4..e6fe7bc84c83b39ae63ca918c206762e88ee4f8d 100644
--- a/juno_samples/matmul/src/matmul.sch
+++ b/juno_samples/matmul/src/matmul.sch
@@ -25,6 +25,10 @@ macro forkify!(X) {
   }
 }
 
+macro fork-chunk![n](X) {
+  fork-tile[n, 0, false, false](X);
+}
+
 macro fork-tile![n](X) {
   fork-tile[n, 0, false, true](X);
 }
@@ -66,8 +70,8 @@ if feature("cuda") {
 
   // Parallelize by computing output array as 16 chunks
   let par = matmul@outer \ matmul@inner;
-  fork-tile![4](par);
-  let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
+  fork-chunk![4](par);
+  let (outer, inner, _) = fork-reshape[[0, 2], [1], [3]](par);
   parallelize!(outer \ inner);
 
   let body = outline(inner);
diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index 4796f427aae2746f853e65e3f3d7589f63c2e7f2..cbdd90fdbc0a2a9762ec0736a2fc5fa1fe1c870b 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -38,8 +38,8 @@ let forward_input = outline(backprop@forward_input);
 let forward_hidden = outline(backprop@forward_hidden);
 
 if !feature("seq") {
-  fork-tile[16, 0, false, true](forward_input@outer_loop \ forward_input@inner_loop);
-  let (outer, inner) = fork-reshape[[1], [0]](forward_input@outer_loop \ forward_input@inner_loop);
+  fork-tile[16, 0, false, false](forward_input@outer_loop \ forward_input@inner_loop);
+  let (outer, inner) = fork-reshape[[0], [1]](forward_input@outer_loop \ forward_input@inner_loop);
   forward_input = outline(inner);
   inline(backprop@forward_input);
 }
@@ -53,8 +53,8 @@ let adjust_hidden = outline(backprop@adjust_hidden);
 let adjust_input = outline(backprop@adjust_input);
 
 if !feature("seq") {
-  fork-tile[16, 0, false, true](adjust_input);
-  let (outer, inner) = fork-reshape[[1], [0, 2]](adjust_input);
+  fork-tile[16, 0, false, false](adjust_input);
+  let (outer, inner) = fork-reshape[[0], [1, 2]](adjust_input);
   adjust_input = outline(inner);
   inline(backprop@adjust_input);
 }
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index ea6f0403c8f0824c0bcf27dc6dcd15649bcdb2ec..07006edbd791810fc90e5e756315449687d13d7f 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -41,14 +41,14 @@ parallel-fork(traverse, collect);
 parallel-reduce(traverse, collect);
 
 if !feature("seq") {
-  fork-tile[32, 0, false, true](traverse, collect);
-  let (outer, inner) = fork-reshape[[1], [0]](traverse);
+  fork-tile[32, 0, false, false](traverse, collect);
+  let (outer, inner) = fork-reshape[[0], [1]](traverse);
   traverse = outline(inner);
-  let (outer, inner) = fork-reshape[[1], [0]](collect);
+  let (outer, inner) = fork-reshape[[0], [1]](collect);
   collect = outline(inner);
 
-  fork-tile[32, 0, false, true](init);
-  let (outer, inner) = fork-reshape[[1], [0]](init);
+  fork-tile[32, 0, false, false](init);
+  let (outer, inner) = fork-reshape[[0], [1]](init);
   let init_body = outline(inner);
 
   inline(bfs@cost_init, bfs@loop1, bfs@loop2);
@@ -56,8 +56,14 @@ if !feature("seq") {
 }
 delete-uncalled(*);
 const-inline(*);
+clean-monoid-reduces(collect);
 simpl!(*);
 
+fork-tile[8, 0, false, true](init, traverse, collect);
+clean-monoid-reduces(collect);
+simpl!(*);
+
+fork-split(init, traverse, collect);
 unforkify(init, traverse, collect);
 simpl!(*);
-gcm(*);
\ No newline at end of file
+gcm(*);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 0253a0210f6cc2451d38399601ad39ff3ab9465a..ea81f330072e891df1250a88f54e87e7d73610d0 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -15,7 +15,7 @@ let traverse = outline(bfs@loop1);
 let collect = outline(bfs@loop2);
 parallel-reduce(traverse, collect);
 no-memset(make_stop_prod);
-gpu(traverse, make_stop_prod, collect);
+gpu(init, traverse, make_stop_prod, collect);
 
 simpl!(*);
 predication(*);
@@ -38,12 +38,8 @@ fixpoint {
 }
 simpl!(collect);
 
-fork-tile[32, 0, false, true](init);
-let (outer, inner) = fork-reshape[[1], [0]](init);
-let init_body = outline(inner);
-
-fork-tile[1024, 0, false, true](traverse, collect);
-fork-split(traverse, collect);
+fork-tile[1024, 0, false, true](init, traverse, collect);
+let out = fork-split(init, traverse, collect);
+simpl!(*);
 
-unforkify(init_body);
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch
index 13125961275b8982b7fc7c589198bc83bd36c420..d9d3eb8c0ebe133acd9daf3e9505eeb06b1be062 100644
--- a/juno_samples/rodinia/cfd/src/cpu_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch
@@ -57,5 +57,10 @@ if !feature("seq") {
   copy_vars = copy_vars_body;
 }
 
+const-inline[false](*);
+simpl!(*);
+fork-split(compute_step_factor, compute_flux, time_step, copy_vars);
 unforkify(compute_step_factor, compute_flux, time_step, copy_vars);
+simpl!(*);
+
 gcm(*);
diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
index 858be5baec53ea93e5658cfed6b9290cc4010c1f..3acee55bfee6d413a479edb771bb0c36a7036205 100644
--- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
+++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch
@@ -64,5 +64,10 @@ if !feature("seq") {
   copy_vars = copy_vars_body;
 }
 
+const-inline[false](*);
+simpl!(*);
+fork-split(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars);
 unforkify(compute_step_factor, compute_flux_contributions, compute_flux, time_step, copy_vars);
+simpl!(*);
+
 gcm(*);
diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index 8fa22aaa9f1213777439d153cef25cbf5b88ab8b..5487355d3011502316415f8ef5b17f1ab62906f4 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -28,14 +28,28 @@ fixpoint {
   fork-guard-elim(*);
   fork-coalesce(*);
 }
+fork-dim-merge(loop1);
 simpl!(*);
-fork-interchange[0, 1](loop1);
 reduce-slf(*);
 simpl!(*);
 slf(*);
 simpl!(*);
 
 if !feature("seq") {
+  fork-tile[32, 0, false, false](loop1);
+  simpl!(loop1);
+  let split = fork-split(loop1);
+  simpl!(loop1);
+  clean-monoid-reduces(loop1);
+  let loop1_body = outline(split.srad_0.fj1);
+  simpl!(loop1, loop1_body);
+  unforkify(loop1_body);
+  let fission = fork-fission[split.srad_0.fj0](loop1);
+  simpl!(loop1, loop1_body);
+  unforkify(fission.srad_0.fj_bottom);
+  simpl!(loop1, loop1_body);
+  loop1 = loop1_body;
+
   fork-tile[32, 0, false, false](loop2);
   let split = fork-split(loop2);
   let loop2_body = outline(split.srad_1.fj1);