NOTE(review): this patch was recovered from a copy in which every newline had
been collapsed to a single space. Hunk boundaries and line counts below agree
with the @@ headers, but the leading indentation of each line is reconstructed
from rustfmt conventions and is unverified. Apply it with
`git apply --ignore-whitespace`, or regenerate it from the original commit.
NOTE(review): the superseded `__syncthreads` emit lines are kept as
commented-out code by this change; consider deleting them in a follow-up.
NOTE(review): the 1024 in block_tile_memory<1024> is hard-coded into the
generated kernel; confirm it matches the largest block size ever launched.

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 3a00e547f88d66cfc7ca9ff4c20f0c411520404c..1e6067a360c6a57674efd43444abd300d74075b4 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -562,8 +562,9 @@ namespace cg = cooperative_groups;
      * and writes.
      */
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
+        write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?;
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
-        write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?;
+        write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?;
         Ok(())
     }
 
@@ -1322,8 +1323,8 @@ namespace cg = cooperative_groups;
                     )?;
                     write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
                     write!(w, "{}}}\n", tabs)?;
-                    //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
-                    write!(w, "__syncthreads\n")?;
+                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    //write!(w, "__syncthreads\n")?;
                 }
             }
             // Dynamic constants emitted at top
@@ -1783,8 +1784,8 @@ namespace cg = cooperative_groups;
                     }
                     let fork = self.join_fork_map.get(&id).unwrap();
                     let cg_tile_available = self.get_cg_tile(*fork, CGType::Available);
-                    //write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
-                    write!(w_term, "\t__syncthreads;\n")?;
+                    write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
+                    //write!(w_term, "\t__syncthreads;\n")?;
                 }
                 // If the Fork was parallelized, each thread or UsedPerId tile of
                 // threads only runs one ThreadID, so we can jump straight to the