diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 6c4d027b77df936c5840237c211950d6c0430082..56489a23ed693014512de67f80c399e1031be7b8 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -10,14 +10,16 @@ macro simpl!(X) {
 
 phi-elim(bfs);
 no-memset(bfs@cost);
-let cost_init = outline(bfs@cost_init);
+let init = outline(bfs@cost_init);
 let loop1 = outline(bfs@loop1);
 let loop2 = outline(bfs@loop2);
-gpu(loop1, loop2);
+let loop3 = outline(bfs@loop3);
+parallel-reduce(loop1);
 
 simpl!(*);
 predication(*);
 const-inline(*);
+loop-bound-canon(*);
 simpl!(*);
 fixpoint {
   forkify(*);
@@ -26,14 +28,44 @@ fixpoint {
 simpl!(*);
 predication(*);
 simpl!(*);
-
-unforkify(cost_init);
-parallel-reduce(loop1);
-forkify(*);
-fork-guard-elim(*);
-simpl!(*);
-predication(*);
 reduce-slf(*);
 simpl!(*);
 
+fork-tile[32, 0, false, true](loop1);
+fork-split(loop1);
+gpu(loop1);
+
+fixpoint {
+  forkify(loop2, loop3);
+  fork-guard-elim(loop2, loop3);
+}
+
+simpl!(loop2, loop3);
+fork-tile[32, 0, false, true](loop2, loop3);
+let out = fork-split(loop2, loop3);
+clean-monoid-reduces(loop2, loop3);
+simpl!(loop2, loop3);
+gpu(loop3);
+
+let fission1 = fork-fission[out.bfs_2.fj0](loop2);
+simpl!(loop2);
+fork-tile[32, 0, false, true](fission1.bfs_2.fj_bottom);
+let out = fork-split(fission1.bfs_2.fj_bottom);
+clean-monoid-reduces(loop2);
+simpl!(loop2);
+let fission2 = fork-fission[out.bfs_2.fj0](loop2);
+simpl!(loop2);
+fork-tile[32, 0, false, true](fission2.bfs_2.fj_bottom);
+let out = fork-split(fission2.bfs_2.fj_bottom);
+clean-monoid-reduces(loop2);
+simpl!(loop2);
+let top = outline(fission1.bfs_2.fj_top);
+let middle = outline(fission2.bfs_2.fj_top);
+let bottom = outline(out.bfs_2.fj0);
+const-inline(loop2, top, middle, bottom);
+no-memset(top, middle);
+gpu(top, middle, bottom);
+simpl!(loop2, top, middle, bottom);
+
+unforkify(init);
 gcm(*);