diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index c9720273c03243d4874b27fc6c74f20fd21a6c33..07dd3ebfc551b84b20cea797421bf5e6846f30c8 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups;
                             // because Fork basic block's init section already does gating
                             write!(
                                 w,
-                                "{}{} = (threadIdx.x % {}) / {};\n",
+                                "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n",
                                 tabs,
                                 define_variable,
                                 use_thread_quota.unwrap(),
-                                use_thread_quota.unwrap() / parallel_factor.unwrap()
+                                use_thread_quota.unwrap() / parallel_factor.unwrap(),
+                                divide,
+                                modulo,
                             )?;
                         }
                     }
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index aa1df3904fd8fa700ba5da50b1d1217ee43a576e..92a1835d7d49d38f533036f162a5fe2d6113e979 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -121,6 +121,12 @@ fork-tile[2, 0, false, true](fuse4@channel_loop);
 let out = fork-split(fuse4@channel_loop);
 fork-unroll(out.cava_3.fj1);
 unforkify(fuse4@channel_loop);
+fork-tile[4, 1, false, true](fuse4);
+fork-tile[8, 0, false, true](fuse4);
+fork-interchange[1, 2](fuse4);
+let split = fork-split(fuse4);
+fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2);
+fork-coalesce(split.cava_3.fj2);
 
 no-memset(fuse5@res1);
 no-memset(fuse5@res2);