diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 1086d1aa3f3c662e12da9f28102d4fc89f321bec..f20bfeb727dc6233dbb212a72c3a25b0c720cf36 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1521,6 +1521,8 @@ extern \"C\" {} {}(",
                 let collect_variable = self.get_value(*collect, false, false);
                 write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?;
             }
+            // Undef nodes never need to be assigned to.
+            Node::Undef { ty: _ } => {}
             _ => {
                 panic!(
                     "Unsupported data node type: {:?}",
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index d9505fde5e5f7ec19be86ded7cbd4af6e467869a..b2f9767c9cd2eb56b698f05d7b93dbba59c8681e 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -181,8 +181,13 @@ fn preliminary_fixups(
     reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>,
 ) -> bool {
     let nodes = &editor.func().nodes;
+    let schedules = &editor.func().schedules;
+
+    // Sequentialize non-parallel forks that contain problematic reduce cycles.
     for (reduce, cycle) in reduce_cycles {
-        if cycle.into_iter().any(|id| nodes[id.idx()].is_reduce()) {
+        if !schedules[reduce.idx()].contains(&Schedule::ParallelReduce)
+            && cycle.into_iter().any(|id| nodes[id.idx()].is_reduce())
+        {
             let join = nodes[reduce.idx()].try_reduce().unwrap().0;
             let fork = fork_join_map
                 .into_iter()
@@ -198,6 +203,31 @@ fn preliminary_fixups(
             return true;
         }
     }
+
+    // Get rid of the backward edge on parallel reduces in fork-joins.
+    for (_, join) in fork_join_map {
+        let parallel_reduces: Vec<_> = editor
+            .get_users(*join)
+            .filter(|id| {
+                nodes[id.idx()].is_reduce()
+                    && schedules[id.idx()].contains(&Schedule::ParallelReduce)
+            })
+            .collect();
+        for reduce in parallel_reduces {
+            if reduce_cycles[&reduce].is_empty() {
+                continue;
+            }
+            let (_, init, _) = nodes[reduce.idx()].try_reduce().unwrap();
+
+            // Replace uses of the reduce in its cycle with the init.
+            let success = editor.edit(|edit| {
+                edit.replace_all_uses_where(reduce, init, |id| reduce_cycles[&reduce].contains(id))
+            });
+            assert!(success);
+            return true;
+        }
+    }
+
     false
 }
 
@@ -511,7 +541,8 @@ fn basic_blocks(
             // outside of reduce loops. Nodes that do need to be in a reduce
             // loop use the reduce node forming the loop, so the dominator chain
             // will consist of one block, and this loop won't ever iterate.
-            let currently_at_join = function.nodes[location.idx()].is_join();
+            let currently_at_join = function.nodes[location.idx()].is_join()
+                && !function.nodes[control_node.idx()].is_join();
 
             if (!is_constant_or_undef || is_gpu_returned)
                 && (shallower_nest || currently_at_join)
@@ -811,7 +842,14 @@ fn spill_clones(
                 .into_iter()
                 .any(|u| *u == *b)
                 && (editor.func().nodes[a.idx()].is_phi()
-                    || editor.func().nodes[a.idx()].is_reduce()))
+                    || editor.func().nodes[a.idx()].is_reduce())
+                && !editor.func().nodes[a.idx()]
+                    .try_reduce()
+                    .map(|(_, init, _)| {
+                        init == *b
+                            && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce)
+                    })
+                    .unwrap_or(false))
     });
 
     // Step 3: if there is a spill edge, spill it and return true. Otherwise,
@@ -989,15 +1027,16 @@ fn liveness_dataflow(
     }
     let mut num_phis_reduces = vec![0; function.nodes.len()];
     let mut has_phi = vec![false; function.nodes.len()];
-    let mut has_reduce = vec![false; function.nodes.len()];
+    let mut has_seq_reduce = vec![false; function.nodes.len()];
     for (node_idx, bb) in bbs.0.iter().enumerate() {
         let node = &function.nodes[node_idx];
         if node.is_phi() || node.is_reduce() {
             num_phis_reduces[bb.idx()] += 1;
         }
         has_phi[bb.idx()] = node.is_phi();
-        has_reduce[bb.idx()] = node.is_reduce();
-        assert!(!has_phi[bb.idx()] || !has_reduce[bb.idx()]);
+        has_seq_reduce[bb.idx()] =
+            node.is_reduce() && !function.schedules[node_idx].contains(&Schedule::ParallelReduce);
+        assert!(!node.is_phi() || !node.is_reduce());
     }
     let is_obj = |id: NodeID| !objects[&func_id].objects(id).is_empty();
 
@@ -1009,11 +1048,14 @@ fn liveness_dataflow(
             let last_pt = bbs.1[bb.idx()].len();
             let old_value = &liveness[&bb][last_pt];
             let mut new_value = BTreeSet::new();
-            for succ in control_subgraph.succs(*bb).chain(if has_reduce[bb.idx()] {
-                Either::Left(once(*bb))
-            } else {
-                Either::Right(empty())
-            }) {
+            for succ in control_subgraph
+                .succs(*bb)
+                .chain(if has_seq_reduce[bb.idx()] {
+                    Either::Left(once(*bb))
+                } else {
+                    Either::Right(empty())
+                })
+            {
                 // The liveness at the bottom of a basic block is the union of:
                 // 1. The liveness of each succecessor right after its phis and
                 //    reduces.
@@ -1041,7 +1083,9 @@ fn liveness_dataflow(
                     assert_eq!(control, succ);
                     if succ == *bb {
                         new_value.insert(reduct);
-                    } else {
+                    } else if !function.schedules[id.idx()]
+                        .contains(&Schedule::ParallelReduce)
+                    {
                         new_value.insert(init);
                     }
                 }
@@ -1058,6 +1102,7 @@ fn liveness_dataflow(
                 let mut new_value = liveness[&bb][pt + 1].clone();
                 let id = bbs.1[bb.idx()][pt];
                 let uses = get_uses(&function.nodes[id.idx()]);
+                let is_obj = |id: &NodeID| is_obj(*id);
                 new_value.remove(&id);
                 new_value.extend(
                     if let Node::Write {
@@ -1070,14 +1115,19 @@ fn liveness_dataflow(
                         // If this write is a cloning write, the `collect` input
                         // isn't actually live, because its value doesn't
                         // matter.
-                        Either::Left(once(data).filter(|id| is_obj(*id)))
+                        Either::Left(once(data).filter(is_obj))
+                    } else if let Node::Reduce {
+                        control: _,
+                        init: _,
+                        reduct,
+                    } = function.nodes[id.idx()]
+                        && function.schedules[id.idx()].contains(&Schedule::ParallelReduce)
+                    {
+                        // If this reduce is a parallel reduce, the `init` input
+                        // isn't actually live.
+                        Either::Left(once(reduct).filter(is_obj))
                     } else {
-                        Either::Right(
-                            uses.as_ref()
-                                .into_iter()
-                                .map(|id| *id)
-                                .filter(|id| is_obj(*id)),
-                        )
+                        Either::Right(uses.as_ref().into_iter().map(|id| *id).filter(is_obj))
                     },
                 );
                 changed |= *old_value != new_value;
diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn
index bcfa1d25d0281b5c842d62142976c7518fe194e6..fdfa51a87a36f0def2c3b24701469a56fe498440 100644
--- a/juno_samples/fork_join_tests/src/fork_join_tests.jn
+++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn
@@ -62,7 +62,7 @@ fn test4(input : i32) -> i32[4, 4] {
 
 #[entry]
 fn test5(input : i32) -> i32[4] {
-  let arr1 : i32[4];
+  @cons let arr1 : i32[4];
   for i = 0 to 4 {
     let red = arr1[i];
     for k = 0 to 3 {
diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch
index 6e2d6845423d13e0c2208b0169e3437a4735ba8f..0987083ec8c40565d8c079fbc5535177d68353b7 100644
--- a/juno_samples/fork_join_tests/src/gpu.sch
+++ b/juno_samples/fork_join_tests/src/gpu.sch
@@ -1,3 +1,4 @@
+no-memset(test5@cons);
 parallel-reduce(test5@reduce);
 
 gvn(*);
diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs
index 4b6aba655a45a2fe2f844efe75f4e8a738a17b96..5e848adeeb3ddde17a5794afd783559b8f0f8c16 100644
--- a/juno_samples/fork_join_tests/src/main.rs
+++ b/juno_samples/fork_join_tests/src/main.rs
@@ -46,6 +46,6 @@ fn main() {
 }
 
 #[test]
-fn implicit_clone_test() {
+fn fork_join_test() {
     main();
 }