diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index 6998f8794c029a0c8d66ec8b557410557483d2ad..c46e4e985449a3fae8aa3041782b02ab8213c7cb 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1741,6 +1741,7 @@ fn extend_fork(editor: &mut FunctionEditor, fork: NodeID, join: NodeID, multiple
                     control: new_fork,
                     dimension: idx,
                 });
+                edit.sub_edit(fork, tid);
                 let old_bound = edit.add_node(Node::DynamicConstant { id: *old_factor });
                 edit.add_node(Node::Binary {
                     op: BinaryOperator::LT,
diff --git a/juno_samples/cava/src/cpu.sch b/juno_samples/cava/src/cpu.sch
index ada2f552fb1f31412d3f9a0bfe5c27884d4d86e6..32e2b63b55c69e6e2c05cabc3221f3c8feff674b 100644
--- a/juno_samples/cava/src/cpu.sch
+++ b/juno_samples/cava/src/cpu.sch
@@ -124,6 +124,8 @@ if !feature("seq") {
   fork-coalesce(fuse4, fuse4_body);
   simpl!(fuse4, fuse4_body);
   fuse4 = fuse4_body;
+} else {
+  fork-tile[6, 0, false, true](fuse4@channel_loop);
 }
 
 no-memset(fuse5@res1);
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index ea6f0403c8f0824c0bcf27dc6dcd15649bcdb2ec..f564cd36571564dfc352315c848c14c36ad5970f 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -41,23 +41,25 @@ parallel-fork(traverse, collect);
 parallel-reduce(traverse, collect);
 
 if !feature("seq") {
-  fork-tile[32, 0, false, true](traverse, collect);
-  let (outer, inner) = fork-reshape[[1], [0]](traverse);
+  fork-tile[32, 0, false, false](traverse, collect);
+  let (outer, inner) = fork-reshape[[0], [1]](traverse);
   traverse = outline(inner);
-  let (outer, inner) = fork-reshape[[1], [0]](collect);
+  let (outer, inner) = fork-reshape[[0], [1]](collect);
   collect = outline(inner);
 
-  fork-tile[32, 0, false, true](init);
-  let (outer, inner) = fork-reshape[[1], [0]](init);
+  fork-tile[32, 0, false, false](init);
+  let (outer, inner) = fork-reshape[[0], [1]](init);
   let init_body = outline(inner);
 
   inline(bfs@cost_init, bfs@loop1, bfs@loop2);
   init = init_body;
 }
+fork-tile[8, 0, false, true](init, traverse, collect);
 delete-uncalled(*);
 const-inline(*);
 simpl!(*);
 
+fork-split(init, traverse, collect);
 unforkify(init, traverse, collect);
 simpl!(*);
-gcm(*);
\ No newline at end of file
+gcm(*);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 0253a0210f6cc2451d38399601ad39ff3ab9465a..541d15d7a5b90b17a484c98c2ed216c5912bd666 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -15,7 +15,7 @@ let traverse = outline(bfs@loop1);
 let collect = outline(bfs@loop2);
 parallel-reduce(traverse, collect);
 no-memset(make_stop_prod);
-gpu(traverse, make_stop_prod, collect);
+gpu(init, traverse, make_stop_prod, collect);
 
 simpl!(*);
 predication(*);
@@ -38,12 +38,7 @@ fixpoint {
 }
 simpl!(collect);
 
-fork-tile[32, 0, false, true](init);
-let (outer, inner) = fork-reshape[[1], [0]](init);
-let init_body = outline(inner);
+fork-tile[1024, 0, false, true](init, traverse, collect);
+fork-split(init, traverse, collect);
 
-fork-tile[1024, 0, false, true](traverse, collect);
-fork-split(traverse, collect);
-
-unforkify(init_body);
 gcm(*);