diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index c9720273c03243d4874b27fc6c74f20fd21a6c33..07dd3ebfc551b84b20cea797421bf5e6846f30c8 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1225,11 +1225,13 @@ namespace cg = cooperative_groups; // because Fork basic block's init section already does gating write!( w, - "{}{} = (threadIdx.x % {}) / {};\n", + "{}{} = (((threadIdx.x % {}) / {}) / ({})) % ({});\n", tabs, define_variable, use_thread_quota.unwrap(), - use_thread_quota.unwrap() / parallel_factor.unwrap() + use_thread_quota.unwrap() / parallel_factor.unwrap(), + divide, + modulo, )?; } } diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index aa1df3904fd8fa700ba5da50b1d1217ee43a576e..92a1835d7d49d38f533036f162a5fe2d6113e979 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -121,6 +121,12 @@ fork-tile[2, 0, false, true](fuse4@channel_loop); let out = fork-split(fuse4@channel_loop); fork-unroll(out.cava_3.fj1); unforkify(fuse4@channel_loop); +fork-tile[4, 1, false, true](fuse4); +fork-tile[8, 0, false, true](fuse4); +fork-interchange[1, 2](fuse4); +let split = fork-split(fuse4); +fork-coalesce(split.cava_3.fj0 \ split.cava_3.fj2); +fork-coalesce(split.cava_3.fj2); no-memset(fuse5@res1); no-memset(fuse5@res2);