diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 1e6067a360c6a57674efd43444abd300d74075b4..dd87acbe18de297557fc8b97696928946e90c2e2 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -562,9 +562,15 @@ namespace cg = cooperative_groups;
      * and writes.
      */
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
-        write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?;
+        write!(
+            w,
+            "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n"
+        )?; // Emits the block_sync_shared declaration that the thread_block line below passes in.
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
-        write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?;
+        write!(
+            w,
+            "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n"
+        )?; // Emits the `block` handle, built from block_sync_shared emitted above.
         Ok(())
     }
 
@@ -1726,7 +1732,8 @@ namespace cg = cooperative_groups;
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
                     if parallel_factor.is_none() {
-                        write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?;
+                        write!(thread_block_tiles, "\t{};\n", self.get_fork_iter(id, true))?;
+                        write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, false))?;
                         write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?;
                     }
                 }
diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
index 80964c7228212e2c64f13b565b8f81200001221e..492bce3bd7117598009993f040d567c71c78db99 100644
--- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs
+++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs
@@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) {
     };
 
     bench("backprop bench small", 65536);
-    bench("backprop bench large", 1048576);
+    bench("backprop bench large", 33554432);
 }
 
 criterion_group!(benches, backprop_bench);
diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch
index cc9cc2acaffcff28a761a3f30b21884104eedffa..1773f713ee3cdd223f5e2fa5c88ef2d4a837f2dc 100644
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
@@ -33,10 +33,11 @@ fixpoint {
 reduce-slf(*);
 simpl!(*);
 
-fork-extend[1024](layer_forward@inner_loop);
+fork-extend[32768](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 simpl!(layer_forward);
-fork-tile[1024, 0, false, true](layer_forward@inner_loop);
+fork-tile[32768, 0, false, true](layer_forward@inner_loop);
+fork-tile[1024, 1, false, true](layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);
 let out = fork-split(layer_forward@inner_loop);
 clean-monoid-reduces(layer_forward);