diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn
index 3d0280f1535b35bfd19bb4c1032eb3a224ac5a0d..f82d9d80cf5aa2275e4dcad941bb8128cdf6ee43 100644
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
@@ -1,4 +1,11 @@
 type Node = struct { edge_start: u32; num_edges: u32; };
+type StopProd = struct { stop: bool; }; // Single-field product wrapping the boolean `stop` flag.
+
+fn make_stop_prod() -> StopProd { // Builds a StopProd whose `stop` field is true.
+  let prod : StopProd;
+  prod.stop = true; // Initial value of the flag; nothing else in the struct to set.
+  return prod;
+}
 
 #[entry]
 fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] {
@@ -23,8 +30,6 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
   let updated: bool[n];
 
   while !stop {
-    stop = true;
-
     @loop1 for i in 0..n {
       if mask[i] {
         mask[i] = false;
@@ -42,15 +47,16 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
       }
     }
 
+    @make let stop_prod = make_stop_prod();
     @loop2 for i in 0..n {
-      stop = stop && !updated[i];
-    }
-
-    @loop3 for i in 0..n {
-      mask[i] = mask[i] || updated[i];
-      visited[i] = visited[i] || updated[i];
-      updated[i] = false;
+      if updated[i] { // Only elements flagged in `updated` are touched.
+        mask[i] = true;
+        visited[i] = true;
+        updated[i] = false;
+        stop_prod.stop = updated[i]; // updated[i] was just cleared above, so this stores false.
+      }
     }
+    stop = stop_prod.stop;
   }
 
   return cost;
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index a33e361db3d4b9634f669226cf5f7198f010869e..589b93b1109b09e9146af593c8649987d6298635 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -12,8 +12,7 @@ phi-elim(bfs);
 no-memset(bfs@cost);
 let init = outline(bfs@cost_init);
 let traverse = outline(bfs@loop1);
-let collect = outline(bfs@loop2 | bfs@loop3);
-parallel-reduce(traverse);
+let collect = outline(bfs@loop2);
 
 simpl!(*);
 predication(*);
@@ -29,12 +28,13 @@ predication(*);
 simpl!(*);
 reduce-slf(*);
 simpl!(*);
+slf(*);
+simpl!(*);
 
 fixpoint {
   forkify(collect);
   fork-guard-elim(collect);
 }
-fork-fusion(collect);
 simpl!(collect);
 
 unforkify(init, traverse, collect);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 56489a23ed693014512de67f80c399e1031be7b8..d5c8dee62a5c55f8d8b17a6cedfe58ac16da7dbf 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -11,10 +11,11 @@ macro simpl!(X) {
 phi-elim(bfs);
 no-memset(bfs@cost);
 let init = outline(bfs@cost_init);
-let loop1 = outline(bfs@loop1);
-let loop2 = outline(bfs@loop2);
-let loop3 = outline(bfs@loop3);
-parallel-reduce(loop1);
+let traverse = outline(bfs@loop1);
+let collect = outline(bfs@loop2);
+parallel-reduce(traverse, collect);
+no-memset(make_stop_prod);
+gpu(traverse, make_stop_prod, collect);
 
 simpl!(*);
 predication(*);
@@ -31,41 +32,14 @@ simpl!(*);
 reduce-slf(*);
 simpl!(*);
 
-fork-tile[32, 0, false, true](loop1);
-fork-split(loop1);
-gpu(loop1);
-
 fixpoint {
-  forkify(loop2, loop3);
-  fork-guard-elim(loop2, loop3);
+  forkify(collect);
+  fork-guard-elim(collect);
 }
+simpl!(collect);
 
-simpl!(loop2, loop3);
-fork-tile[32, 0, false, true](loop2, loop3);
-let out = fork-split(loop2, loop3);
-clean-monoid-reduces(loop2, loop3);
-simpl!(loop2, loop3);
-gpu(loop3);
-
-let fission1 = fork-fission[out.bfs_2.fj0](loop2);
-simpl!(loop2);
-fork-tile[32, 0, false, true](fission1.bfs_2.fj_bottom);
-let out = fork-split(fission1.bfs_2.fj_bottom);
-clean-monoid-reduces(loop2);
-simpl!(loop2);
-let fission2 = fork-fission[out.bfs_2.fj0](loop2);
-simpl!(loop2);
-fork-tile[32, 0, false, true](fission2.bfs_2.fj_bottom);
-let out = fork-split(fission2.bfs_2.fj_bottom);
-clean-monoid-reduces(loop2);
-simpl!(loop2);
-let top = outline(fission1.bfs_2.fj_top);
-let middle = outline(fission2.bfs_2.fj_top);
-let bottom = outline(out.bfs_2.fj0);
-const-inline(loop2, top, middle, bottom);
-no-memset(top, middle);
-gpu(top, middle, bottom);
-simpl!(loop2, top, middle, bottom);
+fork-tile[32, 0, false, true](traverse, collect);
+fork-split(traverse, collect);
 
 unforkify(init);
 gcm(*);