diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 1e6067a360c6a57674efd43444abd300d74075b4..dd87acbe18de297557fc8b97696928946e90c2e2 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -562,9 +562,15 @@ namespace cg = cooperative_groups; * and writes. */ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { - write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?; + write!( + w, + "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n" + )?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; - write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?; + write!( + w, + "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n" + )?; Ok(()) } @@ -1726,7 +1732,8 @@ namespace cg = cooperative_groups; available_thread_quota, cg_tile_available, available_thread_quota )?; if parallel_factor.is_none() { - write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?; + write!(thread_block_tiles, "\t{};\n", self.get_fork_iter(id, true))?; + write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, false))?; write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; } } diff --git a/juno_samples/rodinia/backprop/benches/backprop_bench.rs b/juno_samples/rodinia/backprop/benches/backprop_bench.rs index 80964c7228212e2c64f13b565b8f81200001221e..492bce3bd7117598009993f040d567c71c78db99 100644 --- a/juno_samples/rodinia/backprop/benches/backprop_bench.rs +++ b/juno_samples/rodinia/backprop/benches/backprop_bench.rs @@ -68,7 +68,7 @@ fn backprop_bench(c: &mut Criterion) { }; bench("backprop bench small", 65536); - bench("backprop bench large", 1048576); + bench("backprop bench large", 33554432); } criterion_group!(benches, backprop_bench); diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch index cc9cc2acaffcff28a761a3f30b21884104eedffa..1773f713ee3cdd223f5e2fa5c88ef2d4a837f2dc 100644 --- a/juno_samples/rodinia/backprop/src/gpu.sch +++ b/juno_samples/rodinia/backprop/src/gpu.sch @@ -33,10 +33,11 @@ fixpoint { reduce-slf(*); simpl!(*); -fork-extend[1024](layer_forward@inner_loop); +fork-extend[32768](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); simpl!(layer_forward); -fork-tile[1024, 0, false, true](layer_forward@inner_loop); +fork-tile[32768, 0, false, true](layer_forward@inner_loop); +fork-tile[1024, 1, false, true](layer_forward@inner_loop); clean-monoid-reduces(layer_forward); let out = fork-split(layer_forward@inner_loop); clean-monoid-reduces(layer_forward);