diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 07dd3ebfc551b84b20cea797421bf5e6846f30c8..1e6067a360c6a57674efd43444abd300d74075b4 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -354,6 +354,7 @@ impl GPUContext<'_> { write!( w, " +#define _CG_ABI_EXPERIMENTAL #include <assert.h> #include <stdio.h> #include <stddef.h> @@ -561,8 +562,9 @@ namespace cg = cooperative_groups; * and writes. */ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { + write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; - write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?; + write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?; Ok(()) } @@ -1294,7 +1296,7 @@ namespace cg = cooperative_groups; } if !is_primitive && state != KernelState::OutBlock { write!(w, "{}}}\n", tabs)?; - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "{}{}.sync();\n", tabs, cg_tile)?; *num_tabs -= 1; } if !is_primitive && state == KernelState::OutBlock { @@ -1311,6 +1313,7 @@ namespace cg = cooperative_groups; } if !is_primitive && (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false)) + && !self.function.schedules[id.idx()].contains(&Schedule::NoResetConstant) { let data_size = self.get_size(self.typing[id.idx()], None); write!( @@ -1321,6 +1324,7 @@ namespace cg = cooperative_groups; write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; write!(w, "{}}}\n", tabs)?; write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "__syncthreads\n")?; } } // Dynamic constants emitted at top @@ -1595,7 +1599,7 @@ namespace cg = cooperative_groups; write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?; write!(w, "{}}}\n", tabs)?; } - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "{}{}.sync();\n", tabs, cg_tile)?; let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } @@ -1705,20 +1709,20 @@ namespace cg = cooperative_groups; }; write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id )?; let cg_tile_use = self.get_cg_tile(id, CGType::Use); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota )?; let available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota )?; if parallel_factor.is_none() { @@ -1781,6 +1785,7 @@ namespace cg = cooperative_groups; let fork = self.join_fork_map.get(&id).unwrap(); let cg_tile_available = self.get_cg_tile(*fork, CGType::Available); write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + //write!(w_term, "\t__syncthreads;\n")?; } // If the Fork was parallelized, each thread or UsedPerId tile of // threads only runs one ThreadID, so we can jump straight to the diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index e1598463cc49719b0f8c7365122ebe40f8cfb734..6998f8794c029a0c8d66ec8b557410557483d2ad 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -1533,7 +1533,8 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { left: _, right: _, } if (op == BinaryOperator::Add || op == BinaryOperator::Or) - && !is_zero(editor, init) => + && !is_zero(editor, init) + && !is_false(editor, init) => { editor.edit(|mut edit| { let zero = edit.add_zero_constant(typing[init.idx()]); @@ -1556,7 +1557,8 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { left: _, right: _, } if (op == BinaryOperator::Mul || op == BinaryOperator::And) - && !is_one(editor, init) => + && !is_one(editor, init) + && !is_true(editor, init) => { editor.edit(|mut edit| { let one = edit.add_one_constant(typing[init.idx()]); diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs index d950941a4acba886f47cdd0e99cb3d9a48459636..4a6365c8c68bd7714b529744be1cc2bd0259071a 100644 --- a/hercules_opt/src/gcm.rs +++ b/hercules_opt/src/gcm.rs @@ -212,7 +212,8 @@ fn preliminary_fixups( let (_, init, _) = nodes[reduce.idx()].try_reduce().unwrap(); // Replace uses of the reduce in its cycle with the init. - let success = editor.edit(|edit| { + let success = editor.edit(|mut edit| { + edit = edit.add_schedule(init, Schedule::ParallelReduce)?; edit.replace_all_uses_where(reduce, init, |id| reduce_cycles[&reduce].contains(id)) }); assert!(success); @@ -870,7 +871,7 @@ fn spill_clones( // Step 2: filter edges (A, B) to just see edges where A uses B and A // mutates B. These are the edges that may require a spill. let mut spill_edges = edges.into_iter().filter(|(a, b)| { - mutating_writes(editor.func(), *a, objects).any(|id| id == *b) + (mutating_writes(editor.func(), *a, objects).any(|id| id == *b) || (get_uses(&editor.func().nodes[a.idx()]) .as_ref() .into_iter() @@ -890,7 +891,14 @@ fn spill_clones( data.contains(b) && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce) }) - .unwrap_or(false)) + .unwrap_or(false))) + && !editor.func().nodes[a.idx()] + .try_write() + .map(|(collect, _, _)| { + collect == *b + && editor.func().schedules[b.idx()].contains(&Schedule::ParallelReduce) + }) + .unwrap_or(false) }); // Step 3: if there is a spill edge, spill it and return true. Otherwise, diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs index b910a128116fb8fb39de29475b93ffa70a12dfcd..351abc2b263b0ad3f405c7221e916d8477e02680 100644 --- a/hercules_opt/src/utils.rs +++ b/hercules_opt/src/utils.rs @@ -598,6 +598,24 @@ pub fn is_one(editor: &FunctionEditor, id: NodeID) -> bool { || nodes[id.idx()].is_undef() } +pub fn is_false(editor: &FunctionEditor, id: NodeID) -> bool { + let nodes = &editor.func().nodes; + nodes[id.idx()] + .try_constant() + .map(|id| editor.get_constant(id).is_false()) + .unwrap_or(false) + || nodes[id.idx()].is_undef() +} + +pub fn is_true(editor: &FunctionEditor, id: NodeID) -> bool { + let nodes = &editor.func().nodes; + nodes[id.idx()] + .try_constant() + .map(|id| editor.get_constant(id).is_true()) + .unwrap_or(false) + || nodes[id.idx()].is_undef() +} + pub fn is_largest(editor: &FunctionEditor, id: NodeID) -> bool { let nodes = &editor.func().nodes; nodes[id.idx()] diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index 3d0280f1535b35bfd19bb4c1032eb3a224ac5a0d..f82d9d80cf5aa2275e4dcad941bb8128cdf6ee43 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -1,4 +1,11 @@ type Node = struct { edge_start: u32; num_edges: u32; }; +type StopProd = struct { stop: bool; }; + +fn make_stop_prod() -> StopProd { + let ret : StopProd; + ret.stop = true; + return ret; +} #[entry] fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] { @@ -23,8 +30,6 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] let updated: bool[n]; while !stop { - stop = true; - @loop1 for i in 0..n { if mask[i] { mask[i] = false; @@ -42,15 +47,16 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] } } + @make let stop_prod = make_stop_prod(); @loop2 for i in 0..n { - stop = stop && !updated[i]; - } - - @loop3 for i in 0..n { - mask[i] = mask[i] || updated[i]; - visited[i] = visited[i] || updated[i]; - updated[i] = false; + if updated[i] { + mask[i] = true; + visited[i] = true; + updated[i] = false; + stop_prod.stop = updated[i]; + } } + stop = stop_prod.stop; } return cost; diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index a33e361db3d4b9634f669226cf5f7198f010869e..589b93b1109b09e9146af593c8649987d6298635 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -12,8 +12,7 @@ phi-elim(bfs); no-memset(bfs@cost); let init = outline(bfs@cost_init); let traverse = outline(bfs@loop1); -let collect = outline(bfs@loop2 | bfs@loop3); -parallel-reduce(traverse); +let collect = outline(bfs@loop2); simpl!(*); predication(*); @@ -29,12 +28,13 @@ predication(*); simpl!(*); reduce-slf(*); simpl!(*); +slf(*); +simpl!(*); fixpoint { forkify(collect); fork-guard-elim(collect); } -fork-fusion(collect); simpl!(collect); unforkify(init, traverse, collect); diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index 56489a23ed693014512de67f80c399e1031be7b8..4e5c1f74e34cedacd48ddf6c33b631a08079b995 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -11,10 +11,11 @@ macro simpl!(X) { phi-elim(bfs); no-memset(bfs@cost); let init = outline(bfs@cost_init); -let loop1 = outline(bfs@loop1); -let loop2 = outline(bfs@loop2); -let loop3 = outline(bfs@loop3); -parallel-reduce(loop1); +let traverse = outline(bfs@loop1); +let collect = outline(bfs@loop2); +parallel-reduce(traverse, collect); +no-memset(make_stop_prod); +gpu(traverse, make_stop_prod, collect); simpl!(*); predication(*); @@ -31,41 +32,14 @@ simpl!(*); reduce-slf(*); simpl!(*); -fork-tile[32, 0, false, true](loop1); -fork-split(loop1); -gpu(loop1); - fixpoint { - forkify(loop2, loop3); - fork-guard-elim(loop2, loop3); + forkify(collect); + fork-guard-elim(collect); } +simpl!(collect); -simpl!(loop2, loop3); -fork-tile[32, 0, false, true](loop2, loop3); -let out = fork-split(loop2, loop3); -clean-monoid-reduces(loop2, loop3); -simpl!(loop2, loop3); -gpu(loop3); - -let fission1 = fork-fission[out.bfs_2.fj0](loop2); -simpl!(loop2); -fork-tile[32, 0, false, true](fission1.bfs_2.fj_bottom); -let out = fork-split(fission1.bfs_2.fj_bottom); -clean-monoid-reduces(loop2); -simpl!(loop2); -let fission2 = fork-fission[out.bfs_2.fj0](loop2); -simpl!(loop2); -fork-tile[32, 0, false, true](fission2.bfs_2.fj_bottom); -let out = fork-split(fission2.bfs_2.fj_bottom); -clean-monoid-reduces(loop2); -simpl!(loop2); -let top = outline(fission1.bfs_2.fj_top); -let middle = outline(fission2.bfs_2.fj_top); -let bottom = outline(out.bfs_2.fj0); -const-inline(loop2, top, middle, bottom); -no-memset(top, middle); -gpu(top, middle, bottom); -simpl!(loop2, top, middle, bottom); +fork-tile[1024, 0, false, true](traverse, collect); +fork-split(traverse, collect); unforkify(init); gcm(*);