diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 07dd3ebfc551b84b20cea797421bf5e6846f30c8..1e6067a360c6a57674efd43444abd300d74075b4 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -354,6 +354,7 @@ impl GPUContext<'_> {
         write!(
             w,
             "
+#define _CG_ABI_EXPERIMENTAL
 #include <assert.h>
 #include <stdio.h>
 #include <stddef.h>
@@ -561,8 +562,9 @@ namespace cg = cooperative_groups;
      * and writes.
      */
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
+        write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?;
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
-        write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?;
+        write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?;
         Ok(())
     }
 
@@ -1294,7 +1296,7 @@ namespace cg = cooperative_groups;
                 }
                 if !is_primitive && state != KernelState::OutBlock {
                     write!(w, "{}}}\n", tabs)?;
-                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
                     *num_tabs -= 1;
                 }
                 if !is_primitive && state == KernelState::OutBlock {
@@ -1311,6 +1313,7 @@ namespace cg = cooperative_groups;
                 }
                 if !is_primitive
                     && (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false))
+                    && !self.function.schedules[id.idx()].contains(&Schedule::NoResetConstant)
                 {
                     let data_size = self.get_size(self.typing[id.idx()], None);
                     write!(
@@ -1321,6 +1324,7 @@ namespace cg = cooperative_groups;
                     write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
                     write!(w, "{}}}\n", tabs)?;
                     write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    //write!(w, "__syncthreads\n")?;
                 }
             }
             // Dynamic constants emitted at top
@@ -1595,7 +1599,7 @@ namespace cg = cooperative_groups;
                     write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?;
                     write!(w, "{}}}\n", tabs)?;
                 }
-                write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
                 let collect_variable = self.get_value(*collect, false, false);
                 write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?;
             }
@@ -1705,20 +1709,20 @@ namespace cg = cooperative_groups;
                     };
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
                         use_thread_per_id, cg_tile, use_thread_per_id
                     )?;
                     let cg_tile_use = self.get_cg_tile(id, CGType::Use);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
                         use_thread_quota, cg_tile_use, use_thread_quota
                     )?;
                     let available_thread_quota = available_thread_quota.unwrap();
                     let cg_tile_available = self.get_cg_tile(id, CGType::Available);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
                     if parallel_factor.is_none() {
@@ -1781,6 +1785,7 @@ namespace cg = cooperative_groups;
                     let fork = self.join_fork_map.get(&id).unwrap();
                     let cg_tile_available = self.get_cg_tile(*fork, CGType::Available);
                     write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
+                    //write!(w_term, "\t__syncthreads;\n")?;
                 }
                 // If the Fork was parallelized, each thread or UsedPerId tile of
                 // threads only runs one ThreadID, so we can jump straight to the
diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index e1598463cc49719b0f8c7365122ebe40f8cfb734..6998f8794c029a0c8d66ec8b557410557483d2ad 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1533,7 +1533,8 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
                 left: _,
                 right: _,
             } if (op == BinaryOperator::Add || op == BinaryOperator::Or)
-                && !is_zero(editor, init) =>
+                && !is_zero(editor, init)
+                && !is_false(editor, init) =>
             {
                 editor.edit(|mut edit| {
                     let zero = edit.add_zero_constant(typing[init.idx()]);
@@ -1556,7 +1557,8 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
                 left: _,
                 right: _,
             } if (op == BinaryOperator::Mul || op == BinaryOperator::And)
-                && !is_one(editor, init) =>
+                && !is_one(editor, init)
+                && !is_true(editor, init) =>
             {
                 editor.edit(|mut edit| {
                     let one = edit.add_one_constant(typing[init.idx()]);
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index d950941a4acba886f47cdd0e99cb3d9a48459636..4a6365c8c68bd7714b529744be1cc2bd0259071a 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -212,7 +212,8 @@ fn preliminary_fixups(
             let (_, init, _) = nodes[reduce.idx()].try_reduce().unwrap();
 
             // Replace uses of the reduce in its cycle with the init.
-            let success = editor.edit(|edit| {
+            let success = editor.edit(|mut edit| {
+                edit = edit.add_schedule(init, Schedule::ParallelReduce)?;
                 edit.replace_all_uses_where(reduce, init, |id| reduce_cycles[&reduce].contains(id))
             });
             assert!(success);
@@ -870,7 +871,7 @@ fn spill_clones(
     // Step 2: filter edges (A, B) to just see edges where A uses B and A
     // mutates B. These are the edges that may require a spill.
     let mut spill_edges = edges.into_iter().filter(|(a, b)| {
-        mutating_writes(editor.func(), *a, objects).any(|id| id == *b)
+        (mutating_writes(editor.func(), *a, objects).any(|id| id == *b)
             || (get_uses(&editor.func().nodes[a.idx()])
                 .as_ref()
                 .into_iter()
@@ -890,7 +891,14 @@ fn spill_clones(
                         data.contains(b)
                             && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce)
                     })
-                    .unwrap_or(false))
+                    .unwrap_or(false)))
+            && !editor.func().nodes[a.idx()]
+                .try_write()
+                .map(|(collect, _, _)| {
+                    collect == *b
+                        && editor.func().schedules[b.idx()].contains(&Schedule::ParallelReduce)
+                })
+                .unwrap_or(false)
     });
 
     // Step 3: if there is a spill edge, spill it and return true. Otherwise,
diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs
index b910a128116fb8fb39de29475b93ffa70a12dfcd..351abc2b263b0ad3f405c7221e916d8477e02680 100644
--- a/hercules_opt/src/utils.rs
+++ b/hercules_opt/src/utils.rs
@@ -598,6 +598,24 @@ pub fn is_one(editor: &FunctionEditor, id: NodeID) -> bool {
         || nodes[id.idx()].is_undef()
 }
 
+pub fn is_false(editor: &FunctionEditor, id: NodeID) -> bool {
+    let nodes = &editor.func().nodes;
+    nodes[id.idx()]
+        .try_constant()
+        .map(|id| editor.get_constant(id).is_false())
+        .unwrap_or(false)
+        || nodes[id.idx()].is_undef()
+}
+
+pub fn is_true(editor: &FunctionEditor, id: NodeID) -> bool {
+    let nodes = &editor.func().nodes;
+    nodes[id.idx()]
+        .try_constant()
+        .map(|id| editor.get_constant(id).is_true())
+        .unwrap_or(false)
+        || nodes[id.idx()].is_undef()
+}
+
 pub fn is_largest(editor: &FunctionEditor, id: NodeID) -> bool {
     let nodes = &editor.func().nodes;
     nodes[id.idx()]
diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn
index 3d0280f1535b35bfd19bb4c1032eb3a224ac5a0d..f82d9d80cf5aa2275e4dcad941bb8128cdf6ee43 100644
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
@@ -1,4 +1,11 @@
 type Node = struct { edge_start: u32; num_edges: u32; };
+type StopProd = struct { stop: bool; };
+
+fn make_stop_prod() -> StopProd {
+  let ret : StopProd;
+  ret.stop = true;
+  return ret;
+}
 
 #[entry]
 fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] {
@@ -23,8 +30,6 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
   let updated: bool[n];
 
   while !stop {
-    stop = true;
-
     @loop1 for i in 0..n {
       if mask[i] {
         mask[i] = false;
@@ -42,15 +47,16 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
       }
     }
 
+    @make let stop_prod = make_stop_prod();
     @loop2 for i in 0..n {
-      stop = stop && !updated[i];
-    }
-
-    @loop3 for i in 0..n {
-      mask[i] = mask[i] || updated[i];
-      visited[i] = visited[i] || updated[i];
-      updated[i] = false;
+      if updated[i] {
+        mask[i] = true;
+        visited[i] = true;
+        updated[i] = false;
+	stop_prod.stop = updated[i];
+      }
     }
+    stop = stop_prod.stop;
   }
 
   return cost;
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index a33e361db3d4b9634f669226cf5f7198f010869e..589b93b1109b09e9146af593c8649987d6298635 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -12,8 +12,7 @@ phi-elim(bfs);
 no-memset(bfs@cost);
 let init = outline(bfs@cost_init);
 let traverse = outline(bfs@loop1);
-let collect = outline(bfs@loop2 | bfs@loop3);
-parallel-reduce(traverse);
+let collect = outline(bfs@loop2);
 
 simpl!(*);
 predication(*);
@@ -29,12 +28,13 @@ predication(*);
 simpl!(*);
 reduce-slf(*);
 simpl!(*);
+slf(*);
+simpl!(*);
 
 fixpoint {
   forkify(collect);
   fork-guard-elim(collect);
 }
-fork-fusion(collect);
 simpl!(collect);
 
 unforkify(init, traverse, collect);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 56489a23ed693014512de67f80c399e1031be7b8..4e5c1f74e34cedacd48ddf6c33b631a08079b995 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -11,10 +11,11 @@ macro simpl!(X) {
 phi-elim(bfs);
 no-memset(bfs@cost);
 let init = outline(bfs@cost_init);
-let loop1 = outline(bfs@loop1);
-let loop2 = outline(bfs@loop2);
-let loop3 = outline(bfs@loop3);
-parallel-reduce(loop1);
+let traverse = outline(bfs@loop1);
+let collect = outline(bfs@loop2);
+parallel-reduce(traverse, collect);
+no-memset(make_stop_prod);
+gpu(traverse, make_stop_prod, collect);
 
 simpl!(*);
 predication(*);
@@ -31,41 +32,14 @@ simpl!(*);
 reduce-slf(*);
 simpl!(*);
 
-fork-tile[32, 0, false, true](loop1);
-fork-split(loop1);
-gpu(loop1);
-
 fixpoint {
-  forkify(loop2, loop3);
-  fork-guard-elim(loop2, loop3);
+  forkify(collect);
+  fork-guard-elim(collect);
 }
+simpl!(collect);
 
-simpl!(loop2, loop3);
-fork-tile[32, 0, false, true](loop2, loop3);
-let out = fork-split(loop2, loop3);
-clean-monoid-reduces(loop2, loop3);
-simpl!(loop2, loop3);
-gpu(loop3);
-
-let fission1 = fork-fission[out.bfs_2.fj0](loop2);
-simpl!(loop2);
-fork-tile[32, 0, false, true](fission1.bfs_2.fj_bottom);
-let out = fork-split(fission1.bfs_2.fj_bottom);
-clean-monoid-reduces(loop2);
-simpl!(loop2);
-let fission2 = fork-fission[out.bfs_2.fj0](loop2);
-simpl!(loop2);
-fork-tile[32, 0, false, true](fission2.bfs_2.fj_bottom);
-let out = fork-split(fission2.bfs_2.fj_bottom);
-clean-monoid-reduces(loop2);
-simpl!(loop2);
-let top = outline(fission1.bfs_2.fj_top);
-let middle = outline(fission2.bfs_2.fj_top);
-let bottom = outline(out.bfs_2.fj0);
-const-inline(loop2, top, middle, bottom);
-no-memset(top, middle);
-gpu(top, middle, bottom);
-simpl!(loop2, top, middle, bottom);
+fork-tile[1024, 0, false, true](traverse, collect);
+fork-split(traverse, collect);
 
 unforkify(init);
 gcm(*);