diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index 7c67af538b81946f67786a8133b9f670b7138f9e..415244043c304fd94e7b95cd4c3ace990bfdcbcb 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -272,7 +272,7 @@ impl<'a> CPUContext<'a> {
                         write!(body, "double {} to double", val)?
                     }
                 }
-                _ => panic!("PANIC: Can't dynamically allocate memory for an aggregate type within a CPU function."),
+                _ => panic!("PANIC: Can't dynamically allocate memory for an aggregate type within a CPU function ({:?} in {}).", id, self.function.name),
             }
         }
         Node::DynamicConstant { id: dc_id } => {
diff --git a/hercules_opt/src/clone_elim.rs b/hercules_opt/src/clone_elim.rs
deleted file mode 100644
index 59507baca5731c94e8b63963e644792a50bf2ea7..0000000000000000000000000000000000000000
--- a/hercules_opt/src/clone_elim.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-extern crate hercules_ir;
-
-use std::collections::BTreeSet;
-
-use self::hercules_ir::ir::*;
-
-use crate::*;
-
-/*
- * Top level function to run clone elimination.
- */
-pub fn clone_elim(editor: &mut FunctionEditor) {
-    // Create workset (starts as all nodes).
-    let mut workset: BTreeSet<NodeID> = (0..editor.func().nodes.len()).map(NodeID::new).collect();
-
-    while let Some(work) = workset.pop_first() {
-        // Look for Write nodes with identical `collect` and `data` inputs.
-        let nodes = &editor.func().nodes;
-        if let Node::Write {
-            collect,
-            data,
-            ref indices,
-        } = nodes[work.idx()]
-            && nodes[collect.idx()] == nodes[data.idx()]
-        {
-            assert!(indices.is_empty());
-            editor.edit(|edit| edit.replace_all_uses(work, collect)?.delete_node(work));
-
-            // Removing this write may affect downstream writes.
-            workset.extend(editor.get_users(work));
-        }
-    }
-}
diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs
index 4ff08e6927103ed39d8025ab1fac7bc52d52984a..313596514216257797ea3aa0cd17af332f37cbf2 100644
--- a/hercules_opt/src/editor.rs
+++ b/hercules_opt/src/editor.rs
@@ -62,6 +62,7 @@ pub struct FunctionEdit<'a: 'b, 'b> {
     added_types: Vec<Type>,
     // Compute a def-use map entries iteratively.
     updated_def_use: BTreeMap<NodeID, HashSet<NodeID>>,
+    updated_param_types: Option<Vec<TypeID>>,
     updated_return_type: Option<TypeID>,
     // Keep track of which deleted and added node IDs directly correspond.
     sub_edits: Vec<(NodeID, NodeID)>,
@@ -113,6 +114,7 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
             added_dynamic_constants: Vec::new().into(),
             added_types: Vec::new().into(),
             updated_def_use: BTreeMap::new(),
+            updated_param_types: None,
             updated_return_type: None,
             sub_edits: vec![],
         };
@@ -130,6 +132,7 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
             added_dynamic_constants,
             added_types,
             updated_def_use,
+            updated_param_types,
             updated_return_type,
             sub_edits,
         } = populated_edit;
@@ -206,7 +209,12 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
         editor_dynamic_constants.extend(added_dynamic_constants);
         editor_types.extend(added_types);
 
-        // Step 8: update return type if necessary.
+        // Step 8: update parameter types if necessary.
+        if let Some(param_types) = updated_param_types {
+            editor.function.param_types = param_types;
+        }
+
+        // Step 9: update return type if necessary.
         if let Some(return_type) = updated_return_type {
             editor.function.return_type = return_type;
         }
@@ -580,6 +588,10 @@ impl<'a, 'b> FunctionEdit<'a, 'b> {
         }
     }
 
+    pub fn set_param_types(&mut self, tys: Vec<TypeID>) {
+        self.updated_param_types = Some(tys);
+    }
+
     pub fn set_return_type(&mut self, ty: TypeID) {
         self.updated_return_type = Some(ty);
     }
diff --git a/hercules_opt/src/float_collections.rs b/hercules_opt/src/float_collections.rs
new file mode 100644
index 0000000000000000000000000000000000000000..30df387598395f9182fd123024d14240510ce73a
--- /dev/null
+++ b/hercules_opt/src/float_collections.rs
@@ -0,0 +1,107 @@
+extern crate hercules_ir;
+
+use self::hercules_ir::*;
+
+use crate::*;
+
+/*
+ * Float collection constants out of device functions, where allocation isn't
+ * allowed.
+ */
+pub fn float_collections(
+    editors: &mut [FunctionEditor],
+    typing: &ModuleTyping,
+    callgraph: &CallGraph,
+    devices: &Vec<Device>,
+) {
+    let topo = callgraph.topo();
+    for to_float_id in topo {
+        // Collection constants float until reaching an AsyncRust function.
+        if devices[to_float_id.idx()] == Device::AsyncRust {
+            continue;
+        }
+
+        // Find the target constant nodes in the function.
+        let cons: Vec<(NodeID, Node)> = editors[to_float_id.idx()]
+            .func()
+            .nodes
+            .iter()
+            .enumerate()
+            .filter(|(_, node)| {
+                node.try_constant()
+                    .map(|cons_id| !editors[to_float_id.idx()].get_constant(cons_id).is_scalar())
+                    .unwrap_or(false)
+            })
+            .map(|(idx, node)| (NodeID::new(idx), node.clone()))
+            .collect();
+        if cons.is_empty() {
+            continue;
+        }
+
+        // Each constant node becomes a new parameter.
+        let mut new_param_types = editors[to_float_id.idx()].func().param_types.clone();
+        let old_num_params = new_param_types.len();
+        for (id, _) in cons.iter() {
+            new_param_types.push(typing[to_float_id.idx()][id.idx()]);
+        }
+        let success = editors[to_float_id.idx()].edit(|mut edit| {
+            for (idx, (id, _)) in cons.iter().enumerate() {
+                let param = edit.add_node(Node::Parameter {
+                    index: idx + old_num_params,
+                });
+                edit = edit.replace_all_uses(*id, param)?;
+                edit = edit.delete_node(*id)?;
+            }
+            edit.set_param_types(new_param_types);
+            Ok(edit)
+        });
+        if !success {
+            continue;
+        }
+
+        // Add constants in callers and pass them into calls.
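+        // Illustrative sketch (hypothetical IR): if a callee `f` floated a
+        // collection constant `c` into a new trailing parameter, then each
+        // call site `f(x)` below is rebuilt as `f(x, c')`, where `c'` is a
+        // fresh copy of the constant node materialized in the caller.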
+        for caller in callgraph.get_callers(to_float_id) {
+            let calls: Vec<(NodeID, Node)> = editors[caller.idx()]
+                .func()
+                .nodes
+                .iter()
+                .enumerate()
+                .filter(|(_, node)| {
+                    node.try_call()
+                        .map(|(_, callee, _, _)| callee == to_float_id)
+                        .unwrap_or(false)
+                })
+                .map(|(idx, node)| (NodeID::new(idx), node.clone()))
+                .collect();
+            let success = editors[caller.idx()].edit(|mut edit| {
+                let cons_ids: Vec<_> = cons
+                    .iter()
+                    .map(|(_, node)| edit.add_node(node.clone()))
+                    .collect();
+                for (id, node) in calls {
+                    let Node::Call {
+                        control,
+                        function,
+                        dynamic_constants,
+                        args,
+                    } = node
+                    else {
+                        panic!()
+                    };
+                    let mut args = Vec::from(args);
+                    args.extend(cons_ids.iter());
+                    let new_call = edit.add_node(Node::Call {
+                        control,
+                        function,
+                        dynamic_constants,
+                        args: args.into_boxed_slice(),
+                    });
+                    edit = edit.replace_all_uses(id, new_call)?;
+                    edit = edit.delete_node(id)?;
+                }
+                Ok(edit)
+            });
+            assert!(success);
+        }
+    }
+}
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
new file mode 100644
index 0000000000000000000000000000000000000000..76ce3fdfeced77b5663dc99cd76049ba13e1172c
--- /dev/null
+++ b/hercules_opt/src/gcm.rs
@@ -0,0 +1,888 @@
+extern crate bitvec;
+extern crate either;
+extern crate hercules_cg;
+extern crate hercules_ir;
+
+use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
+use std::iter::{empty, once, zip, FromIterator};
+
+use self::bitvec::prelude::*;
+use self::either::Either;
+
+use self::hercules_cg::*;
+use self::hercules_ir::*;
+
+use crate::*;
+
+/*
+ * Top level function to legalize the reference semantics of a Hercules IR
+ * function. Hercules IR is a value semantics representation, meaning that all
+ * program state is in the form of copyable values, and mutation takes place by
+ * making a new value that is a copy of the old value with some modification.
+ * This representation is extremely convenient for optimization, but is not good
+ * for code generation, where we need to generate code with references to get
+ * good performance. Hercules IR can alternatively be interpreted using
+ * reference semantics, where pointers to collection objects are passed around,
+ * read from, and written to. However, the value semantics and reference
+ * semantics interpretations of a Hercules IR function may not be equal - this
+ * pass transforms a Hercules IR function such that its new value semantics is
+ * the same as its old value semantics and that its new reference semantics is
+ * the same as its new value semantics. This pass returns a placement of nodes
+ * into ordered basic blocks, since the reference semantics of a function
+ * depends on the order of execution with respect to anti-dependencies. This
+ * is analogous to global code motion from the original sea of nodes paper.
+ *
+ * Our strategy for handling multiple mutating users of a collection is to treat
+ * the problem similarly to register allocation; we perform a liveness analysis,
+ * spill constants into newly allocated constants, and read back the spilled
+ * contents when they are used after the first mutation. It's not obvious how
+ * many spills are needed upfront, and newly spilled constants may affect the
+ * liveness analysis result, so every spill restarts the process of checking for
+ * spills. Once no more spills are found, the process terminates. When a spill
+ * is found, the basic block assignments, and all the other analyses, are not
+ * necessarily valid anymore, so this function is called in a loop in pass.rs
+ * until no more spills are found.
+ */
+pub fn gcm(
+    editor: &mut FunctionEditor,
+    def_use: &ImmutableDefUseMap,
+    reverse_postorder: &Vec<NodeID>,
+    typing: &Vec<TypeID>,
+    control_subgraph: &Subgraph,
+    dom: &DomTree,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    loops: &LoopTree,
+    objects: &CollectionObjects,
+) -> Option<BasicBlocks> {
+    let bbs = basic_blocks(
+        editor.func(),
+        editor.func_id(),
+        def_use,
+        reverse_postorder,
+        control_subgraph,
+        dom,
+        loops,
+        fork_join_map,
+        objects,
+    );
+    if spill_clones(editor, typing, control_subgraph, objects, &bbs) {
+        None
+    } else {
+        Some(bbs)
+    }
+}
+
+/*
+ * Top level global code motion function. Assigns each data node to one of its
+ * immediate control use / user nodes, forming (unordered) basic blocks. Returns
+ * the control node / basic block each node is in. Based on the schedule-early-
+ * schedule-late method from Cliff Click's PhD thesis. Anti-dependency edges are
+ * treated as optional scheduling constraints: if they can't all be satisfied,
+ * they are progressively ignored, and spill_clones later induces the clones
+ * needed to keep the reference semantics correct.
+ */
+fn basic_blocks(
+    function: &Function,
+    func_id: FunctionID,
+    def_use: &ImmutableDefUseMap,
+    reverse_postorder: &Vec<NodeID>,
+    control_subgraph: &Subgraph,
+    dom: &DomTree,
+    loops: &LoopTree,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    objects: &CollectionObjects,
+) -> BasicBlocks {
+    let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()];
+
+    // Step 1: assign the basic block locations of all nodes that must be in a
+    // specific block. This includes control nodes as well as some special data
+    // nodes, such as phis.
+    for idx in 0..function.nodes.len() {
+        match function.nodes[idx] {
+            Node::Phi { control, data: _ } => bbs[idx] = Some(control),
+            Node::ThreadID {
+                control,
+                dimension: _,
+            } => bbs[idx] = Some(control),
+            Node::Reduce {
+                control,
+                init: _,
+                reduct: _,
+            } => bbs[idx] = Some(control),
+            Node::Call {
+                control,
+                function: _,
+                dynamic_constants: _,
+                args: _,
+            } => bbs[idx] = Some(control),
+            Node::Parameter { index: _ } => bbs[idx] = Some(NodeID::new(0)),
+            Node::Constant { id: _ } => bbs[idx] = Some(NodeID::new(0)),
+            Node::DynamicConstant { id: _ } => bbs[idx] = Some(NodeID::new(0)),
+            _ if function.nodes[idx].is_control() => bbs[idx] = Some(NodeID::new(idx)),
+            _ => {}
+        }
+    }
+
+    // Step 2: schedule early. Place nodes in the earliest position they could
+    // go - use worklist to iterate nodes.
+    let mut schedule_early = bbs.clone();
+    let mut worklist = VecDeque::from(reverse_postorder.clone());
+    while let Some(id) = worklist.pop_front() {
+        if schedule_early[id.idx()].is_some() {
+            continue;
+        }
+
+        // For every use, check what block is its "schedule early" block. This
+        // node goes in the lowest block amongst those blocks.
+        let use_places: Option<Vec<NodeID>> = get_uses(&function.nodes[id.idx()])
+            .as_ref()
+            .into_iter()
+            .map(|id| *id)
+            .map(|id| schedule_early[id.idx()])
+            .collect();
+        if let Some(use_places) = use_places {
+            // If every use has been placed, we can place this node as the
+            // lowest place in the domtree that dominates all of the use places.
+            let lowest = dom.lowest_amongst(use_places.into_iter());
+            schedule_early[id.idx()] = Some(lowest);
+        } else {
+            // If not, then just push this node back on the worklist.
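+            // Since phis, reduces, and control nodes were pre-placed in step
+            // 1, every use-def cycle already contains a placed node, so this
+            // worklist makes progress and terminates.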
+            worklist.push_back(id);
+        }
+    }
+
+    // Step 3: find anti-dependence edges. An anti-dependence edge needs to be
+    // drawn between a collection reading node and a collection mutating node
+    // when the following conditions are true:
+    //
+    // 1: The reading and mutating nodes may involve the same collection.
+    // 2: The node producing the collection used by the reading node is in a
+    // schedule early block that dominates the schedule early block of the
+    // mutating node. The node producing the collection used by the reading
+    // node may be an originator of a collection, phi or reduce, or mutator,
+    // but not a forwarding read - forwarding reads are collapsed, and the
+    // bottom read is treated as reading from the transitive parent of the
+    // forwarding read(s).
+    let mut antideps = BTreeSet::new();
+    for id in reverse_postorder.iter() {
+        // Find a terminating read node and the collections it reads.
+        let terminating_reads: BTreeSet<_> =
+            terminating_reads(function, func_id, *id, objects).collect();
+        if !terminating_reads.is_empty() {
+            // Walk forwarding reads to find anti-dependency roots.
+            let mut workset = terminating_reads.clone();
+            let mut roots = BTreeSet::new();
+            while let Some(pop) = workset.pop_first() {
+                let forwarded: BTreeSet<_> =
+                    forwarding_reads(function, func_id, pop, objects).collect();
+                if forwarded.is_empty() {
+                    roots.insert(pop);
+                } else {
+                    workset.extend(forwarded);
+                }
+            }
+
+            // For each root, find mutating nodes dominated by the root that
+            // modify an object read on any input of the current node (the
+            // terminating read).
+            // TODO: make this less outrageously inefficient.
+            let func_objects = &objects[&func_id];
+            for root in roots.iter() {
+                let root_early = schedule_early[root.idx()].unwrap();
+                let mut root_block_iterated_users: BTreeSet<NodeID> = BTreeSet::new();
+                let mut workset = BTreeSet::new();
+                workset.insert(*root);
+                while let Some(pop) = workset.pop_first() {
+                    let users = def_use.get_users(pop).into_iter().filter(|user| {
+                        !function.nodes[user.idx()].is_phi()
+                            && !function.nodes[user.idx()].is_reduce()
+                            && schedule_early[user.idx()].unwrap() == root_early
+                    });
+                    workset.extend(users.clone());
+                    root_block_iterated_users.extend(users);
+                }
+                let read_objs: BTreeSet<_> = terminating_reads
+                    .iter()
+                    .map(|read_use| func_objects.objects(*read_use).into_iter())
+                    .flatten()
+                    .map(|id| *id)
+                    .collect();
+                for mutator in reverse_postorder.iter() {
+                    let mutator_early = schedule_early[mutator.idx()].unwrap();
+                    if dom.does_dom(root_early, mutator_early)
+                        && (root_early != mutator_early
+                            || root_block_iterated_users.contains(&mutator))
+                        && mutating_objects(function, func_id, *mutator, objects)
+                            .any(|mutated| read_objs.contains(&mutated))
+                        && id != mutator
+                    {
+                        antideps.insert((*id, *mutator));
+                    }
+                }
+            }
+        }
+    }
+    let mut antideps_uses = vec![vec![]; function.nodes.len()];
+    let mut antideps_users = vec![vec![]; function.nodes.len()];
+    for (reader, mutator) in antideps.iter() {
+        antideps_uses[mutator.idx()].push(*reader);
+        antideps_users[reader.idx()].push(*mutator);
+    }
+
+    // Step 4: schedule late and pick each node's final position. Since the late
+    // schedule of each node depends on the final positions of its users, these
+    // two steps must be fused. Compute their latest position, then use the
+    // control dependent + shallow loop heuristic to actually place them. A
+    // placement might not necessarily be found due to anti-dependency edges.
+    // These are optional and not necessary to consider, but we do since obeying
+    // them can reduce the number of clones. If the worklist stops making
+    // progress, stop considering the anti-dependency edges.
+    let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map
+        .into_iter()
+        .map(|(fork, join)| (*join, *fork))
+        .collect();
+    let mut worklist = VecDeque::from_iter(reverse_postorder.into_iter().map(|id| *id).rev());
+    let mut num_skip_iters = 0;
+    let mut consider_antidependencies = true;
+    while let Some(id) = worklist.pop_front() {
+        if num_skip_iters >= worklist.len() {
+            consider_antidependencies = false;
+        }
+
+        if bbs[id.idx()].is_some() {
+            num_skip_iters = 0;
+            continue;
+        }
+
+        // Calculate the least common ancestor of user blocks, a.k.a. the "late"
+        // schedule.
+        let calculate_lca = || -> Option<_> {
+            let mut lca = None;
+            // Helper to incrementally update the LCA.
+            let mut update_lca = |a| {
+                if let Some(acc) = lca {
+                    lca = Some(dom.least_common_ancestor(acc, a));
+                } else {
+                    lca = Some(a);
+                }
+            };
+
+            // For every user, consider where we need to be to directly dominate the
+            // user.
+            for user in def_use
+                .get_users(id)
+                .as_ref()
+                .into_iter()
+                .chain(if consider_antidependencies {
+                    Either::Left(antideps_users[id.idx()].iter())
+                } else {
+                    Either::Right(empty())
+                })
+                .map(|id| *id)
+            {
+                if let Node::Phi { control, data } = &function.nodes[user.idx()] {
+                    // For phis, we need to dominate the block jumping to the phi in
+                    // the slot that corresponds to our use.
+                    for (control, data) in
+                        zip(get_uses(&function.nodes[control.idx()]).as_ref(), data)
+                    {
+                        if id == *data {
+                            update_lca(*control);
+                        }
+                    }
+                } else if let Node::Reduce {
+                    control,
+                    init,
+                    reduct,
+                } = &function.nodes[user.idx()]
+                {
+                    // For reduces, we need to either dominate the block right
+                    // before the fork if we're the init input, or we need to
+                    // dominate the join if we're the reduct input.
+                    if id == *init {
+                        let before_fork = function.nodes[join_fork_map[control].idx()]
+                            .try_fork()
+                            .unwrap()
+                            .0;
+                        update_lca(before_fork);
+                    } else {
+                        assert_eq!(id, *reduct);
+                        update_lca(*control);
+                    }
+                } else {
+                    // For everything else, we just need to dominate the user.
+                    update_lca(bbs[user.idx()]?);
+                }
+            }
+
+            Some(lca)
+        };
+
+        // Check if all users have been placed. If one of them hasn't, then add
+        // this node back on to the worklist.
+        let Some(lca) = calculate_lca() else {
+            worklist.push_back(id);
+            num_skip_iters += 1;
+            continue;
+        };
+
+        // Look between the LCA and the schedule early location to place the
+        // node.
+        let schedule_early = schedule_early[id.idx()].unwrap();
+        let schedule_late = lca.unwrap_or(schedule_early);
+        let mut chain = dom
+            // If the node has no users, then it doesn't really matter where we
+            // place it - just place it at the early placement.
+            .chain(schedule_late, schedule_early);
+
+        if let Some(mut location) = chain.next() {
+            while let Some(control_node) = chain.next() {
+                // If the next node further up the dominator tree is in a shallower
+                // loop nest or if we can get out of a reduce loop when we don't
+                // need to be in one, place this data node in a higher-up location.
+                let old_nest = loops
+                    .header_of(location)
+                    .map(|header| loops.nesting(header).unwrap());
+                let new_nest = loops
+                    .header_of(control_node)
+                    .map(|header| loops.nesting(header).unwrap());
+                let shallower_nest = if let (Some(old_nest), Some(new_nest)) = (old_nest, new_nest)
+                {
+                    old_nest > new_nest
+                } else {
+                    // If the new location isn't a loop, its nesting level should
+                    // be considered "shallower" if the current location is in a
+                    // loop.
+                    old_nest.is_some()
+                };
+                // This will move all nodes that don't need to be in reduce loops
+                // outside of reduce loops. Nodes that do need to be in a reduce
+                // loop use the reduce node forming the loop, so the dominator chain
+                // will consist of one block, and this loop won't ever iterate.
+                let currently_at_join = function.nodes[location.idx()].is_join();
+                if shallower_nest || currently_at_join {
+                    location = control_node;
+                }
+            }
+
+            bbs[id.idx()] = Some(location);
+            num_skip_iters = 0;
+        } else {
+            // If there is no valid location for this node, then it's a reading
+            // node of a collection that can't be placed above a mutation that
+            // anti-depend uses it. Push the node back on the list, and we'll
+            // stop considering anti-dependencies soon. Don't immediately stop
+            // considering anti-dependencies, as we may be able to eke out some
+            // more use of them.
+            worklist.push_back(id);
+            num_skip_iters += 1;
+            continue;
+        }
+    }
+    let bbs: Vec<_> = bbs.into_iter().map(Option::unwrap).collect();
+    // Calculate the number of phis and reduces per basic block. We use this to
+    // emit phis and reduces at the top of basic blocks. We want to emit phis
+    // and reduces first into ordered basic blocks for two reasons:
+    // 1. This is useful for liveness analysis.
+    // 2. This is needed for some backends - LLVM expects phis to be at the top
+    // of basic blocks.
+    let mut num_phis_reduces = vec![0; function.nodes.len()];
+    for (node_idx, bb) in bbs.iter().enumerate() {
+        let node = &function.nodes[node_idx];
+        if node.is_phi() || node.is_reduce() {
+            num_phis_reduces[bb.idx()] += 1;
+        }
+    }
+
+    // Step 5: determine the order of nodes inside each block. Use worklist to
+    // add nodes to blocks in order that obeys dependencies.
+    let mut order: Vec<Vec<NodeID>> = vec![vec![]; function.nodes.len()];
+    let mut worklist = VecDeque::from_iter(
+        reverse_postorder
+            .into_iter()
+            .filter(|id| !function.nodes[id.idx()].is_control()),
+    );
+    let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()];
+    let mut num_skip_iters = 0;
+    let mut consider_antidependencies = true;
+    while let Some(id) = worklist.pop_front() {
+        // If the worklist isn't making progress, then there's at least one
+        // reading node of a collection that is in an anti-depend + normal depend
+        // use cycle with a mutating node. See above comment about anti-
+        // dependencies being optional; we just stop considering them here.
+        if num_skip_iters >= worklist.len() {
+            consider_antidependencies = false;
+        }
+
+        // Phis and reduces always get emitted. Other nodes need to obey
+        // dependency relationships and need to come after phis and reduces.
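+        // Restated concretely: a non-phi/non-reduce node is emitted only once
+        // every phi and reduce in its block has been emitted and all of its
+        // same-block uses (including anti-dependency uses, while considered)
+        // have been visited.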
+        let node = &function.nodes[id.idx()];
+        let bb = bbs[id.idx()];
+        if node.is_phi()
+            || node.is_reduce()
+            || (num_phis_reduces[bb.idx()] == 0
+                && get_uses(node)
+                    .as_ref()
+                    .into_iter()
+                    .chain(if consider_antidependencies {
+                        Either::Left(antideps_uses[id.idx()].iter())
+                    } else {
+                        Either::Right(empty())
+                    })
+                    .all(|u| {
+                        function.nodes[u.idx()].is_control()
+                            || bbs[u.idx()] != bbs[id.idx()]
+                            || visited[u.idx()]
+                    }))
+        {
+            order[bb.idx()].push(*id);
+            visited.set(id.idx(), true);
+            num_skip_iters = 0;
+            if node.is_phi() || node.is_reduce() {
+                num_phis_reduces[bb.idx()] -= 1;
+            }
+        } else {
+            worklist.push_back(id);
+            num_skip_iters += 1;
+        }
+    }
+
+    (bbs, order)
+}
+
+fn terminating_reads<'a>(
+    function: &'a Function,
+    func_id: FunctionID,
+    reader: NodeID,
+    objects: &'a CollectionObjects,
+) -> Box<dyn Iterator<Item = NodeID> + 'a> {
+    match function.nodes[reader.idx()] {
+        Node::Read {
+            collect,
+            indices: _,
+        } if objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)),
+        Node::Write {
+            collect: _,
+            data,
+            indices: _,
+        } if !objects[&func_id].objects(data).is_empty() => Box::new(once(data)),
+        Node::Call {
+            control: _,
+            function: callee,
+            dynamic_constants: _,
+            ref args,
+        } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| {
+            let objects = &objects[&callee];
+            let returns = objects.returned_objects();
+            let param_obj = objects.param_to_object(idx)?;
+            if !objects.is_mutated(param_obj) && !returns.contains(&param_obj) {
+                Some(*arg)
+            } else {
+                None
+            }
+        })),
+        _ => Box::new(empty()),
+    }
+}
+
+fn forwarding_reads<'a>(
+    function: &'a Function,
+    func_id: FunctionID,
+    reader: NodeID,
+    objects: &'a CollectionObjects,
+) -> Box<dyn Iterator<Item = NodeID> + 'a> {
+    match function.nodes[reader.idx()] {
+        Node::Read {
+            collect,
+            indices: _,
+        } if !objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)),
+        Node::Ternary {
+            op: TernaryOperator::Select,
+            first: _,
+            second,
+            third,
+        } if !objects[&func_id].objects(reader).is_empty() => {
+            Box::new(once(second).chain(once(third)))
+        }
+        Node::Call {
+            control: _,
+            function: callee,
+            dynamic_constants: _,
+            ref args,
+        } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| {
+            let objects = &objects[&callee];
+            let returns = objects.returned_objects();
+            let param_obj = objects.param_to_object(idx)?;
+            if !objects.is_mutated(param_obj) && returns.contains(&param_obj) {
+                Some(*arg)
+            } else {
+                None
+            }
+        })),
+        _ => Box::new(empty()),
+    }
+}
+
+fn mutating_objects<'a>(
+    function: &'a Function,
+    func_id: FunctionID,
+    mutator: NodeID,
+    objects: &'a CollectionObjects,
+) -> Box<dyn Iterator<Item = CollectionObjectID> + 'a> {
+    match function.nodes[mutator.idx()] {
+        Node::Write {
+            collect,
+            data: _,
+            indices: _,
+        } => Box::new(objects[&func_id].objects(collect).into_iter().map(|id| *id)),
+        Node::Call {
+            control: _,
+            function: callee,
+            dynamic_constants: _,
+            ref args,
+        } => Box::new(
+            args.into_iter()
+                .enumerate()
+                .filter_map(move |(idx, arg)| {
+                    let callee_objects = &objects[&callee];
+                    let param_obj = callee_objects.param_to_object(idx)?;
+                    if callee_objects.is_mutated(param_obj) {
+                        Some(objects[&func_id].objects(*arg).into_iter().map(|id| *id))
+                    } else {
+                        None
+                    }
+                })
+                .flatten(),
+        ),
+        _ => Box::new(empty()),
+    }
+}
+
+type Liveness = BTreeMap<NodeID, Vec<BTreeSet<NodeID>>>;
+
+/*
+ * Top level function to find implicit clones that need to be spilled. Returns
+ * whether a clone was spilled, in which case the whole scheduling process must
+ * be restarted.
+ */
+fn spill_clones(
+    editor: &mut FunctionEditor,
+    typing: &Vec<TypeID>,
+    control_subgraph: &Subgraph,
+    objects: &CollectionObjects,
+    bbs: &BasicBlocks,
+) -> bool {
+    // Step 1: compute a liveness analysis of collection values in the IR. This
+    // requires a dataflow analysis over the scheduled IR, which is not a common
+    // need in Hercules, so just hardcode the analysis.
+    let liveness = liveness_dataflow(
+        editor.func(),
+        editor.func_id(),
+        control_subgraph,
+        objects,
+        bbs,
+    );
+
+    // Step 2: compute an interference graph from the liveness result. This
+    // graph contains a vertex per node ID producing a collection value and an
+    // edge per pair of node IDs that interfere. Nodes A and B interfere if node
+    // A is defined right above a point where node B is live.
+    let mut edges = vec![];
+    for (bb, liveness) in liveness {
+        let insts = &bbs.1[bb.idx()];
+        for (node, live) in zip(insts, liveness.into_iter().skip(1)) {
+            for live_node in live {
+                if *node != live_node {
+                    edges.push((*node, live_node));
+                }
+            }
+        }
+    }
+
+    // Step 3: filter edges (A, B) to just see edges where A uses B and A isn't
+    // a terminating read. These are the edges that may require a spill.
+    let mut spill_edges = edges.into_iter().filter(|(a, b)| {
+        get_uses(&editor.func().nodes[a.idx()])
+            .as_ref()
+            .into_iter()
+            .any(|u| *u == *b)
+            && !terminating_reads(editor.func(), editor.func_id(), *a, objects).any(|id| id == *b)
+    });
+
+    // Step 4: if there is a spill edge, spill it and return true. Otherwise,
+    // return false.
+    if let Some((user, obj)) = spill_edges.next() {
+        // Figure out the most immediate dominating region for every basic
+        // block. These are the points where spill slot phis get placed.
+        let nodes = &editor.func().nodes;
+        let mut imm_dom_reg = vec![NodeID::new(0); editor.func().nodes.len()];
+        for (idx, node) in nodes.into_iter().enumerate() {
+            if node.is_region() {
+                imm_dom_reg[idx] = NodeID::new(idx);
+            }
+        }
+        let rev_po = control_subgraph.rev_po(NodeID::new(0));
+        for bb in rev_po.iter() {
+            if !nodes[bb.idx()].is_region() && !nodes[bb.idx()].is_start() {
+                imm_dom_reg[bb.idx()] =
+                    imm_dom_reg[control_subgraph.preds(*bb).next().unwrap().idx()];
+            }
+        }
+
+        let other_obj_users: Vec<_> = editor.get_users(obj).filter(|id| *id != user).collect();
+        let mut dummy_phis = vec![NodeID::new(0); imm_dom_reg.len()];
+        let mut success = editor.edit(|mut edit| {
+            // Construct the spill slot. This is just a constant that gets phi-
+            // ed throughout the entire function.
+            let cons_id = edit.add_zero_constant(typing[obj.idx()]);
+            let slot_id = edit.add_node(Node::Constant { id: cons_id });
+
+            // Allocate IDs for phis that move the spill slot throughout the
+            // function without implicit clones. These are dummy phis, since
+            // there are potentially cycles between them. We will replace them
+            // later.
+            for (idx, reg) in imm_dom_reg.iter().enumerate().skip(1) {
+                if idx == reg.idx() {
+                    dummy_phis[idx] = edit.add_node(Node::Phi {
+                        control: *reg,
+                        data: empty().collect(),
+                    });
+                }
+            }
+
+            // Spill `obj` before `user` potentially modifies it.
+            let spill_region = imm_dom_reg[bbs.0[obj.idx()].idx()];
+            let spill_id = edit.add_node(Node::Write {
+                collect: if spill_region == NodeID::new(0) {
+                    slot_id
+                } else {
+                    dummy_phis[spill_region.idx()]
+                },
+                data: obj,
+                indices: empty().collect(),
+            });
+
+            // Before each other user, unspill `obj`.
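+            // Note that both the spill and the unspills below are index-less
+            // Write nodes, i.e. whole-object copies: the spill saves `obj`
+            // into the slot, and each unspill copies the saved contents back
+            // over `obj`'s object, so later users observe the value from the
+            // spill point rather than `user`'s mutation.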
+            for other_user in other_obj_users {
+                let other_region = imm_dom_reg[bbs.0[other_user.idx()].idx()];
+                // If this assert fails, then `obj` is not in the first basic
+                // block, but it has a user that is in the first basic block,
+                // which violates SSA.
+                assert!(other_region == spill_region || other_region != NodeID::new(0));
+
+                // If an other user is a phi, we need to be a little careful
+                // about how we insert unspilling code for `obj`. Instead of
+                // inserting an unspill in the same block as the user, we need
+                // to insert one in each predecessor of the phi that corresponds
+                // to a use of `obj`. Since this requires modifying individual
+                // uses in a phi, just rebuild the node entirely.
+                if let Node::Phi { control, data } = edit.get_node(other_user).clone() {
+                    assert_eq!(control, other_region);
+                    let mut new_data = vec![];
+                    for (pred, data) in zip(control_subgraph.preds(control), data) {
+                        let pred = imm_dom_reg[pred.idx()];
+                        if data == obj {
+                            let unspill_id = edit.add_node(Node::Write {
+                                collect: obj,
+                                data: if pred == spill_region {
+                                    spill_id
+                                } else {
+                                    dummy_phis[pred.idx()]
+                                },
+                                indices: empty().collect(),
+                            });
+                            new_data.push(unspill_id);
+                        } else {
+                            new_data.push(data);
+                        }
+                    }
+                    let new_phi = edit.add_node(Node::Phi {
+                        control,
+                        data: new_data.into_boxed_slice(),
+                    });
+                    edit = edit.replace_all_uses(other_user, new_phi)?;
+                    edit = edit.delete_node(other_user)?;
+                } else {
+                    let unspill_id = edit.add_node(Node::Write {
+                        collect: obj,
+                        data: if other_region == spill_region {
+                            spill_id
+                        } else {
+                            dummy_phis[other_region.idx()]
+                        },
+                        indices: empty().collect(),
+                    });
+                    edit = edit.replace_all_uses_where(obj, unspill_id, |id| *id == other_user)?;
+                }
+            }
+
+            // Create and hook up all the real phis. Phi elimination will clean
+            // this up.
+            let mut real_phis = vec![NodeID::new(0); imm_dom_reg.len()];
+            for (idx, reg) in imm_dom_reg.iter().enumerate().skip(1) {
+                if idx == reg.idx() {
+                    real_phis[idx] = edit.add_node(Node::Phi {
+                        control: *reg,
+                        data: control_subgraph
+                            .preds(*reg)
+                            .map(|pred| {
+                                let pred = imm_dom_reg[pred.idx()];
+                                if pred == spill_region {
+                                    spill_id
+                                } else if pred == NodeID::new(0) {
+                                    slot_id
+                                } else {
+                                    dummy_phis[pred.idx()]
+                                }
+                            })
+                            .collect(),
+                    });
+                }
+            }
+            for (dummy, real) in zip(dummy_phis.iter(), real_phis) {
+                if *dummy != real {
+                    edit = edit.replace_all_uses(*dummy, real)?;
+                }
+            }
+
+            Ok(edit)
+        });
+        success = success
+            && editor.edit(|mut edit| {
+                for dummy in dummy_phis {
+                    if dummy != NodeID::new(0) {
+                        edit = edit.delete_node(dummy)?;
+                    }
+                }
+                Ok(edit)
+            });
+        assert!(success, "PANIC: GCM cannot fail to edit a function, as it needs to legalize the reference semantics of every function before code generation.");
+        true
+    } else {
+        false
+    }
+}
+
+/*
+ * Liveness dataflow analysis on scheduled Hercules IR. Just look at nodes that
+ * involve collections.
+ */
+fn liveness_dataflow(
+    function: &Function,
+    func_id: FunctionID,
+    control_subgraph: &Subgraph,
+    objects: &CollectionObjects,
+    bbs: &BasicBlocks,
+) -> Liveness {
+    let mut po = control_subgraph.rev_po(NodeID::new(0));
+    po.reverse();
+    let mut liveness = Liveness::default();
+    for (bb_idx, insts) in bbs.1.iter().enumerate() {
+        liveness.insert(NodeID::new(bb_idx), vec![BTreeSet::new(); insts.len() + 1]);
+    }
+    let mut num_phis_reduces = vec![0; function.nodes.len()];
+    let mut reducing = vec![false; function.nodes.len()];
+    for (node_idx, bb) in bbs.0.iter().enumerate() {
+        let node = &function.nodes[node_idx];
+        if node.is_phi() || node.is_reduce() {
+            num_phis_reduces[bb.idx()] += 1;
+            // Phis and reduces can't be in the same basic block.
+            if node.is_reduce() {
+                assert!(num_phis_reduces[bb.idx()] == 0 || reducing[bb.idx()]);
+                reducing[bb.idx()] = true;
+            } else {
+                assert!(!reducing[bb.idx()]);
+            }
+        }
+    }
+    let is_obj = |id: NodeID| !objects[&func_id].objects(id).is_empty();
+
+    loop {
+        let mut changed = false;
+
+        for bb in po.iter() {
+            // First, calculate the liveness set for the bottom of this block.
+            let last_pt = bbs.1[bb.idx()].len();
+            let old_value = &liveness[&bb][last_pt];
+            let mut new_value = BTreeSet::new();
+            for succ in control_subgraph.succs(*bb).chain(if reducing[bb.idx()] {
+                Either::Left(once(*bb))
+            } else {
+                Either::Right(empty())
+            }) {
+                // The liveness at the bottom of a basic block is the union of:
+                // 1. The liveness of each successor right after its phis and
+                // reduces.
+                // 2. Every data use in a phi or reduce that corresponds to this
+                // block as the predecessor.
+                let after_phis_reduces_pt = num_phis_reduces[succ.idx()];
+                new_value.extend(&liveness[&succ][after_phis_reduces_pt]);
+                for inst_idx in 0..after_phis_reduces_pt {
+                    let id = bbs.1[succ.idx()][inst_idx];
+                    new_value.remove(&id);
+                    match function.nodes[id.idx()] {
+                        Node::Phi { control, ref data } if is_obj(data[0]) => {
+                            assert_eq!(control, succ);
+                            new_value.extend(
+                                zip(control_subgraph.preds(succ), data)
+                                    .filter(|(pred, _)| *pred == *bb)
+                                    .map(|(_, data)| *data),
+                            );
+                        }
+                        Node::Reduce {
+                            control,
+                            init,
+                            reduct,
+                        } if is_obj(init) => {
+                            assert_eq!(control, succ);
+                            if succ == *bb {
+                                new_value.insert(reduct);
+                            } else {
+                                new_value.insert(init);
+                            }
+                        }
+                        _ => {}
+                    }
+                }
+            }
+            changed |= *old_value != new_value;
+            liveness.get_mut(&bb).unwrap()[last_pt] = new_value;
+
+            // Second, calculate the liveness set above each instruction in this block.
+            for pt in (0..last_pt).rev() {
+                let old_value = &liveness[&bb][pt];
+                let mut new_value = liveness[&bb][pt + 1].clone();
+                let id = bbs.1[bb.idx()][pt];
+                let uses = get_uses(&function.nodes[id.idx()]);
+                new_value.remove(&id);
+                new_value.extend(
+                    if let Node::Write {
+                        collect: _,
+                        data,
+                        ref indices,
+                    } = function.nodes[id.idx()]
+                        && indices.is_empty()
+                    {
+                        // If this write is a cloning write, the `collect` input
+                        // isn't actually live, because its value doesn't
+                        // matter.
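+                        // Only the `data` operand is made live here, and only
+                        // when it is itself a collection.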
+                        Either::Left(once(data).filter(|id| is_obj(*id)))
+                    } else {
+                        Either::Right(
+                            uses.as_ref()
+                                .into_iter()
+                                .map(|id| *id)
+                                .filter(|id| is_obj(*id)),
+                        )
+                    },
+                );
+                changed |= *old_value != new_value;
+                liveness.get_mut(&bb).unwrap()[pt] = new_value;
+            }
+        }
+
+        if !changed {
+            return liveness;
+        }
+    }
+}
diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
deleted file mode 100644
index 5db49ec467977076ca1d82f933e23020f18ce2f4..0000000000000000000000000000000000000000
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ /dev/null
@@ -1,1236 +0,0 @@
-extern crate bitvec;
-extern crate hercules_cg;
-extern crate hercules_ir;
-
-use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
-use std::iter::{empty, once, zip, FromIterator};
-
-use self::bitvec::prelude::*;
-
-use self::hercules_cg::*;
-use self::hercules_ir::*;
-
-use crate::*;
-
-/*
- * Top level function to legalize the reference semantics of a Hercules IR
- * function. Hercules IR is a value semantics representation, meaning that all
- * program state is in the form of copyable values, and mutation takes place by
- * making a new value that is a copy of the old value with some modification.
- * This representation is extremely convenient for optimization, but is not good
- * for code generation, where we need to generate code with references to get
- * good performance. Hercules IR can alternatively be interpreted using
- * reference semantics, where pointers to collection objects are passed around,
- * read from, and written to. However, the value semantics and reference
- * semantics interpretation of a Hercules IR function may not be equal - this
- * pass transforms a Hercules IR function such that its new value semantics is
- * the same as its old value semantics and that its new reference semantics is
- * the same as its new value semantics. This pass returns a placement of nodes
- * into ordered basic blocks, since the reference semantics of a function
- * depends on the order of execution with respect to anti-dependencies. Clones
- * are inserted sparingly when there are two write users of a single collection
- * or if a read user cannot be scheduled before a write user.
- */
-pub fn legalize_reference_semantics(
-    editor: &mut FunctionEditor,
-    def_use: &ImmutableDefUseMap,
-    reverse_postorder: &Vec<NodeID>,
-    typing: &Vec<TypeID>,
-    control_subgraph: &Subgraph,
-    dom: &DomTree,
-    fork_join_map: &HashMap<NodeID, NodeID>,
-    loops: &LoopTree,
-    objects: &CollectionObjects,
-) -> Option<BasicBlocks> {
-    // Repeatedly try to place nodes into basic blocks. If clones are induced,
-    // re-try. Specifically, repeat the following procedure until no new clones:
-    //
-    // 1. Attempt to place nodes in basic blocks. If a node can't be placed due
-    //    to anti-dependency edges, induce a clone on the read and go back to
-    //    step 1.
-    // 2. Check for any write-induced clones. If there are any, go back to step
-    //    1.
-    //
-    // Since each analysis needs to be re-calculated in each iteration, this
-    // function just implements the body of the described loop. The re-try logic
-    // is found in pass.rs. When a re-try is needed, no basic block assignment
-    // is returned. When a re-try isn't needed (no new clones were found), a
-    // basic block assignment is returned.
- let bbs = match basic_blocks( - editor.func(), - editor.func_id(), - def_use, - reverse_postorder, - control_subgraph, - dom, - loops, - fork_join_map, - objects, - ) { - Ok(bbs) => bbs, - Err((obj, reader)) => { - todo!(); - return None; - } - }; - if materialize_clones(editor, typing, control_subgraph, dom, loops, objects, &bbs) { - None - } else { - Some(bbs) - } -} - -/* - * Top level global code motion function. Assigns each data node to one of its - * immediate control use / user nodes, forming (unordered) basic blocks. Returns - * the control node / basic block each node is in. Takes in a partial - * partitioning that must be respected. Based on the schedule-early-schedule- - * late method from Cliff Click's PhD thesis. May fail if an anti-dependency - * edge can't be satisfied - in this case, a clone that has to be induced is - * returned instead. - */ -fn basic_blocks( - function: &Function, - func_id: FunctionID, - def_use: &ImmutableDefUseMap, - reverse_postorder: &Vec<NodeID>, - control_subgraph: &Subgraph, - dom: &DomTree, - loops: &LoopTree, - fork_join_map: &HashMap<NodeID, NodeID>, - objects: &CollectionObjects, -) -> Result<BasicBlocks, (NodeID, NodeID)> { - let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()]; - - // Step 1: assign the basic block locations of all nodes that must be in a - // specific block. This includes control nodes as well as some special data - // nodes, such as phis. - for idx in 0..function.nodes.len() { - match function.nodes[idx] { - Node::Phi { control, data: _ } => bbs[idx] = Some(control), - Node::ThreadID { - control, - dimension: _, - } => bbs[idx] = Some(control), - Node::Reduce { - control, - init: _, - reduct: _, - } => bbs[idx] = Some(control), - Node::Call { - control, - function: _, - dynamic_constants: _, - args: _, - } => bbs[idx] = Some(control), - Node::Parameter { index: _ } => bbs[idx] = Some(NodeID::new(0)), - Node::Constant { id: _ } => bbs[idx] = Some(NodeID::new(0)), - Node::DynamicConstant { id: _ } => bbs[idx] = Some(NodeID::new(0)), - _ if function.nodes[idx].is_control() => bbs[idx] = Some(NodeID::new(idx)), - _ => {} - } - } - - // Step 2: schedule early. Place nodes in the earliest position they could - // go - use worklist to iterate nodes. - let mut schedule_early = bbs.clone(); - let mut worklist = VecDeque::from(reverse_postorder.clone()); - while let Some(id) = worklist.pop_front() { - if schedule_early[id.idx()].is_some() { - continue; - } - - // For every use, check what block is its "schedule early" block. This - // node goes in the lowest block amongst those blocks. - let use_places: Option<Vec<NodeID>> = get_uses(&function.nodes[id.idx()]) - .as_ref() - .into_iter() - .map(|id| *id) - .map(|id| schedule_early[id.idx()]) - .collect(); - if let Some(use_places) = use_places { - // If every use has been placed, we can place this node as the - // lowest place in the domtree that dominates all of the use places. - let lowest = dom.lowest_amongst(use_places.into_iter()); - schedule_early[id.idx()] = Some(lowest); - } else { - // If not, then just push this node back on the worklist. - worklist.push_back(id); - } - } - - // Step 3: find anti-dependence edges. An anti-dependence edge needs to be - // drawn between a collection reading node and a collection mutating node - // when the following conditions are true: - // - // 1: The reading and mutating nodes may involve the same collection. 
- // 2: The node producing the collection used by the reading node is in a - // schedule early block that dominates the schedule early block of the - // mutating node. The node producing the collection used by the reading - // node may be an originator of a collection, phi or reduce, or mutator, - // but not forwarding read - forwarding reads are collapsed, and the - // bottom read is treated as reading from the transitive parent of the - // forwarding read(s). - let mut antideps = BTreeSet::new(); - for id in reverse_postorder.iter() { - // Find a terminating read node and the collections it reads. - let terminating_reads: BTreeSet<_> = - terminating_reads(function, func_id, *id, objects).collect(); - if !terminating_reads.is_empty() { - // Walk forwarding reads to find anti-dependency roots. - let mut workset = terminating_reads.clone(); - let mut roots = BTreeSet::new(); - while let Some(pop) = workset.pop_first() { - let forwarded: BTreeSet<_> = - forwarding_reads(function, func_id, pop, objects).collect(); - if forwarded.is_empty() { - roots.insert(pop); - } else { - workset.extend(forwarded); - } - } - - // For each root, find mutating nodes dominated by the root that - // modify an object read on any input of the current node (the - // terminating read). - // TODO: make this less outrageously inefficient. - let func_objects = &objects[&func_id]; - for root in roots.iter() { - let root_early = schedule_early[root.idx()].unwrap(); - let mut root_block_iterated_users: BTreeSet<NodeID> = BTreeSet::new(); - let mut workset = BTreeSet::new(); - workset.insert(*root); - while let Some(pop) = workset.pop_first() { - let users = def_use.get_users(pop).into_iter().filter(|user| { - !function.nodes[user.idx()].is_phi() - && !function.nodes[user.idx()].is_reduce() - && schedule_early[user.idx()].unwrap() == root_early - }); - workset.extend(users.clone()); - root_block_iterated_users.extend(users); - } - let read_objs: BTreeSet<_> = terminating_reads - .iter() - .map(|read_use| func_objects.objects(*read_use).into_iter()) - .flatten() - .map(|id| *id) - .collect(); - for mutator in reverse_postorder.iter() { - let mutator_early = schedule_early[mutator.idx()].unwrap(); - if dom.does_dom(root_early, mutator_early) - && (root_early != mutator_early - || root_block_iterated_users.contains(&mutator)) - && mutating_objects(function, func_id, *mutator, objects) - .any(|mutated| read_objs.contains(&mutated)) - && id != mutator - { - antideps.insert((*id, *mutator)); - } - } - } - } - } - let mut antideps_uses = vec![vec![]; function.nodes.len()]; - let mut antideps_users = vec![vec![]; function.nodes.len()]; - for (reader, mutator) in antideps.iter() { - antideps_uses[mutator.idx()].push(*reader); - antideps_users[reader.idx()].push(*mutator); - } - - // Step 4: schedule late and pick each nodes final position. Since the late - // schedule of each node depends on the final positions of its users, these - // two steps must be fused. Compute their latest position, then use the - // control dependent + shallow loop heuristic to actually place them. - let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map - .into_iter() - .map(|(fork, join)| (*join, *fork)) - .collect(); - let mut worklist = VecDeque::from_iter(reverse_postorder.into_iter().map(|id| *id).rev()); - while let Some(id) = worklist.pop_front() { - if bbs[id.idx()].is_some() { - continue; - } - - // Calculate the least common ancestor of user blocks, a.k.a. the "late" - // schedule. 
- let calculate_lca = || -> Option<_> { - let mut lca = None; - // Helper to incrementally update the LCA. - let mut update_lca = |a| { - if let Some(acc) = lca { - lca = Some(dom.least_common_ancestor(acc, a)); - } else { - lca = Some(a); - } - }; - - // For every user, consider where we need to be to directly dominate the - // user. - for user in def_use - .get_users(id) - .as_ref() - .into_iter() - .chain(antideps_users[id.idx()].iter()) - .map(|id| *id) - { - if let Node::Phi { control, data } = &function.nodes[user.idx()] { - // For phis, we need to dominate the block jumping to the phi in - // the slot that corresponds to our use. - for (control, data) in - zip(get_uses(&function.nodes[control.idx()]).as_ref(), data) - { - if id == *data { - update_lca(*control); - } - } - } else if let Node::Reduce { - control, - init, - reduct, - } = &function.nodes[user.idx()] - { - // For reduces, we need to either dominate the block right - // before the fork if we're the init input, or we need to - // dominate the join if we're the reduct input. - if id == *init { - let before_fork = function.nodes[join_fork_map[control].idx()] - .try_fork() - .unwrap() - .0; - update_lca(before_fork); - } else { - assert_eq!(id, *reduct); - update_lca(*control); - } - } else { - // For everything else, we just need to dominate the user. - update_lca(bbs[user.idx()]?); - } - } - - Some(lca) - }; - - // Check if all users have been placed. If one of them hasn't, then add - // this node back on to the worklist. - let Some(lca) = calculate_lca() else { - worklist.push_back(id); - continue; - }; - - // Look between the LCA and the schedule early location to place the - // node. - let schedule_early = schedule_early[id.idx()].unwrap(); - let schedule_late = lca.unwrap_or(schedule_early); - let mut chain = dom - // If the node has no users, then it doesn't really matter where we - // place it - just place it at the early placement. - .chain(schedule_late, schedule_early); - - if let Some(mut location) = chain.next() { - /* - while let Some(control_node) = chain.next() { - // If the next node further up the dominator tree is in a shallower - // loop nest or if we can get out of a reduce loop when we don't - // need to be in one, place this data node in a higher-up location. - let old_nest = loops - .header_of(location) - .map(|header| loops.nesting(header).unwrap()); - let new_nest = loops - .header_of(control_node) - .map(|header| loops.nesting(header).unwrap()); - let shallower_nest = if let (Some(old_nest), Some(new_nest)) = (old_nest, new_nest) - { - old_nest > new_nest - } else { - // If the new location isn't a loop, it's nesting level should - // be considered "shallower" if the current location is in a - // loop. - old_nest.is_some() - }; - // This will move all nodes that don't need to be in reduce loops - // outside of reduce loops. Nodes that do need to be in a reduce - // loop use the reduce node forming the loop, so the dominator chain - // will consist of one block, and this loop won't ever iterate. - let currently_at_join = function.nodes[location.idx()].is_join(); - if shallower_nest || currently_at_join { - location = control_node; - } - } - */ - - bbs[id.idx()] = Some(location); - } else { - // If there is no valid location for this node, then it's a reading - // node of a collection that can't be placed above a mutation that - // anti-depend uses it. Thus, a clone needs to be induced. 
- todo!() - } - } - let bbs: Vec<_> = bbs.into_iter().map(Option::unwrap).collect(); - - // Step 5: determine the order of nodes inside each block. Use worklist to - // add nodes to blocks in order that obeys dependencies. - let mut order: Vec<Vec<NodeID>> = vec![vec![]; function.nodes.len()]; - let mut worklist = VecDeque::from_iter( - reverse_postorder - .into_iter() - .filter(|id| !function.nodes[id.idx()].is_control()), - ); - let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()]; - let mut no_change_iters = 0; - while no_change_iters <= worklist.len() - && let Some(id) = worklist.pop_front() - { - let node = &function.nodes[id.idx()]; - if node.is_phi() - || node.is_reduce() - || get_uses(node) - .as_ref() - .into_iter() - .chain(antideps_uses[id.idx()].iter()) - .all(|u| { - function.nodes[u.idx()].is_control() - || bbs[u.idx()] != bbs[id.idx()] - || visited[u.idx()] - }) - { - order[bbs[id.idx()].idx()].push(*id); - visited.set(id.idx(), true); - no_change_iters = 0; - } else { - worklist.push_back(id); - no_change_iters += 1; - } - } - - if no_change_iters == 0 { - Ok((bbs, order)) - } else { - // If the worklist exited without finishing, then there's at least one - // reading node of a collection that is in a anti-depend + normal depend - // use cycle with a mutating node. This cycle must be broken by inducing - // a clone. - todo!() - } -} - -fn terminating_reads<'a>( - function: &'a Function, - func_id: FunctionID, - reader: NodeID, - objects: &'a CollectionObjects, -) -> Box<dyn Iterator<Item = NodeID> + 'a> { - match function.nodes[reader.idx()] { - Node::Read { - collect, - indices: _, - } if objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)), - Node::Write { - collect: _, - data, - indices: _, - } if !objects[&func_id].objects(data).is_empty() => Box::new(once(data)), - Node::Call { - control: _, - function: callee, - dynamic_constants: _, - ref args, - } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| { - let objects = &objects[&callee]; - let returns = objects.returned_objects(); - let param_obj = objects.param_to_object(idx)?; - if !objects.is_mutated(param_obj) && !returns.contains(¶m_obj) { - Some(*arg) - } else { - None - } - })), - _ => Box::new(empty()), - } -} - -fn forwarding_reads<'a>( - function: &'a Function, - func_id: FunctionID, - reader: NodeID, - objects: &'a CollectionObjects, -) -> Box<dyn Iterator<Item = NodeID> + 'a> { - match function.nodes[reader.idx()] { - Node::Read { - collect, - indices: _, - } if !objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)), - Node::Ternary { - op: TernaryOperator::Select, - first: _, - second, - third, - } if !objects[&func_id].objects(reader).is_empty() => { - Box::new(once(second).chain(once(third))) - } - Node::Call { - control: _, - function: callee, - dynamic_constants: _, - ref args, - } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| { - let objects = &objects[&callee]; - let returns = objects.returned_objects(); - let param_obj = objects.param_to_object(idx)?; - if !objects.is_mutated(param_obj) && returns.contains(¶m_obj) { - Some(*arg) - } else { - None - } - })), - _ => Box::new(empty()), - } -} - -fn mutating_objects<'a>( - function: &'a Function, - func_id: FunctionID, - mutator: NodeID, - objects: &'a CollectionObjects, -) -> Box<dyn Iterator<Item = CollectionObjectID> + 'a> { - match function.nodes[mutator.idx()] { - Node::Write { - collect, - data: _, - indices: _, - } => 
Box::new(objects[&func_id].objects(collect).into_iter().map(|id| *id)), - Node::Call { - control: _, - function: callee, - dynamic_constants: _, - ref args, - } => Box::new( - args.into_iter() - .enumerate() - .filter_map(move |(idx, arg)| { - let callee_objects = &objects[&callee]; - let param_obj = callee_objects.param_to_object(idx)?; - if callee_objects.is_mutated(param_obj) { - Some(objects[&func_id].objects(*arg).into_iter().map(|id| *id)) - } else { - None - } - }) - .flatten(), - ), - _ => Box::new(empty()), - } -} - -/* - * Top level function to materialize clones of collections. This transformation - * eliminates the possibility of multiple independent writes (including dynamic - * writes) to a single collection by introducing extra collection constants and - * inserting explicit clones. This allows us to make the simplifying assumption - * in the backend that collections have reference, rather than value, semantics. - * The pass calling this function is mandatory for correctness. - */ -fn materialize_clones( - editor: &mut FunctionEditor, - typing: &Vec<TypeID>, - control_subgraph: &Subgraph, - dom: &DomTree, - loops: &LoopTree, - objects: &CollectionObjects, - bbs: &BasicBlocks, -) -> bool { - let rev_po = control_subgraph.rev_po(NodeID::new(0)); - let mut total_num_pts = 0; - let mut bb_to_prefix_sum = vec![0; bbs.0.len()]; - for ((idx, bb), insts) in zip(bbs.0.iter().enumerate(), bbs.1.iter()) { - if idx == bb.idx() { - bb_to_prefix_sum[idx] = total_num_pts; - total_num_pts += insts.len() + 1; - } - } - - // Calculate two lattices - one that includes back edges, and one that - // doesn't. We want to handle simple clones before loop induced clones, so - // we first materialize clones based on the no-back-edges lattice, and hten - // based on the full lattice. - let mut no_back_edge_lattice: Vec<BTreeMap<NodeID, BTreeSet<NodeID>>> = - vec![BTreeMap::new(); total_num_pts]; - used_collections_dataflow( - editor, - &mut no_back_edge_lattice, - &rev_po, - &bb_to_prefix_sum, - control_subgraph, - objects, - bbs, - ); - let mut super_value = BTreeMap::new(); - if find_clones( - editor, - &super_value, - &no_back_edge_lattice, - &rev_po, - &typing, - control_subgraph, - dom, - loops, - objects, - &bb_to_prefix_sum, - bbs, - ) { - return true; - } - - // After inducing simple clones, calculate the full lattice and materialize - // any loop induced clones. - let mut lattice: Vec<BTreeMap<NodeID, BTreeSet<NodeID>>> = vec![BTreeMap::new(); total_num_pts]; - loop { - let changed = used_collections_dataflow( - editor, - &mut lattice, - &rev_po, - &bb_to_prefix_sum, - control_subgraph, - objects, - bbs, - ); - if !changed { - break; - } - } - for value in lattice.iter() { - meet(&mut super_value, value); - } - find_clones( - editor, - &super_value, - &lattice, - &rev_po, - &typing, - control_subgraph, - dom, - loops, - objects, - &bb_to_prefix_sum, - bbs, - ) -} - -fn meet(left: &mut BTreeMap<NodeID, BTreeSet<NodeID>>, right: &BTreeMap<NodeID, BTreeSet<NodeID>>) { - for (used, users) in right.into_iter() { - left.entry(*used).or_default().extend(users.into_iter()); - } -} - -/* - * Helper function to run a single iteration of the used collections dataflow - * analysis. Returns whether the lattice was changed. The lattice maps each - * program point to a set of used values and their possible users. Top is that - * no nodes are used yet. 
- */ -fn used_collections_dataflow( - editor: &FunctionEditor, - lattice: &mut Vec<BTreeMap<NodeID, BTreeSet<NodeID>>>, - rev_po: &Vec<NodeID>, - bb_to_prefix_sum: &Vec<usize>, - control_subgraph: &Subgraph, - objects: &CollectionObjects, - bbs: &BasicBlocks, -) -> bool { - // Run dataflow analysis to figure out which accesses to collections induce - // clones. This dataflow analysis depends on basic block assignments and is - // more analogous to standard dataflow analysis in CFG + SSA IRs. This is - // the only place this form is used, so just hardcode it here. - // - // This forward dataflow analysis tracks which collections are used at each - // program point, and by what user nodes. Collections are referred to using - // node IDs. Specifically: - // - // - Phi - a phi node adds its inputs to the used set and removes itself - // from the used set. If a phi uses an ID that is used along the edge of - // the corresponding predecessor, a clone is induced. - // - Select - a select node adds its inputs to the used set and removes - // itself from the used set. If either use is already used, a clone is - // induced. - // - Reduce - a reduce node adds its inputs to the used set and removes - // itself from the used set. If the `init` input is already used, a clone - // is induced. If the `reduct` input is used at the end of the basic block - // containing the reduce, then a clone is induced. At the end of the basic - // block, the reduce removes itself from the used set. - // - Read - a read node that reads a sub-collections from a collection, - // rather than reading a primitive type, adds its input to the used set - // and removes itself from the used set. If the `collect` input is already - // used, a clone is induced. - // - Write - a write node adds its `collect` input to the used set and - // removes itself from the used set. If the `collect` input is already - // used, a clone is induced. - // - Call - a call node adds any mutated input or input that may be returned - // to the used set and removes itself from the used set. If any mutated - // input is already used, a clone is induced. - // - // Reads of sub-collections (select, some read, and some call nodes) use a - // collection because they may have downstream writes that depend on the new - // "sub-view" of the same collection. This does not include reads that "end" - // (most reads, some calls, the `data` input of a write). This analysis does - // not consider parallel mutations in fork-joins. - let nodes = &editor.func().nodes; - let func_id = editor.func_id(); - let mut changed = false; - - for bb in rev_po.iter() { - // The lattice value of the first point is the meet of the - // predecessor terminating lattice values. - let old_top_value = &lattice[bb_to_prefix_sum[bb.idx()]]; - let mut new_top_value = BTreeMap::new(); - // Clearing `top_value` is not necessary since used nodes are never - // removed from lattice values, only added. - for pred in control_subgraph.preds(*bb) { - let last_pt = bbs.1[pred.idx()].len(); - meet( - &mut new_top_value, - &lattice[bb_to_prefix_sum[pred.idx()] + last_pt], - ); - } - changed |= *old_top_value != new_top_value; - lattice[bb_to_prefix_sum[bb.idx()]] = new_top_value; - - // The lattice value of following points are determined by their - // immediate preceding instructions. 
-        let insts = &bbs.1[bb.idx()];
-        for (prev_pt, inst) in insts.iter().enumerate() {
-            let old_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1];
-            let prev_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
-            let mut new_value = prev_value.clone();
-            match nodes[inst.idx()] {
-                Node::Phi {
-                    control: _,
-                    ref data,
-                } if !objects[&func_id].objects(*inst).is_empty() => {
-                    for elem in data {
-                        new_value.entry(*elem).or_default().insert(*inst);
-                    }
-                    new_value.remove(inst);
-                }
-                Node::Ternary {
-                    op: TernaryOperator::Select,
-                    first: _,
-                    second,
-                    third,
-                }
-                | Node::Reduce {
-                    control: _,
-                    init: second,
-                    reduct: third,
-                } => {
-                    if !objects[&func_id].objects(*inst).is_empty() {
-                        new_value.entry(second).or_default().insert(*inst);
-                        new_value.entry(third).or_default().insert(*inst);
-                        new_value.remove(inst);
-                    }
-                }
-                Node::Read {
-                    collect,
-                    indices: _,
-                } if !objects[&func_id].objects(*inst).is_empty() => {
-                    new_value.entry(collect).or_default().insert(*inst);
-                    new_value.remove(inst);
-                }
-                Node::Write {
-                    collect,
-                    data: _,
-                    indices: _,
-                } => {
-                    new_value.entry(collect).or_default().insert(*inst);
-                    new_value.remove(inst);
-                }
-                Node::Call {
-                    control: _,
-                    function: callee,
-                    dynamic_constants: _,
-                    ref args,
-                } => {
-                    let callee_objects = &objects[&callee];
-                    for (param_idx, arg) in args.into_iter().enumerate() {
-                        if callee_objects
-                            .param_to_object(param_idx)
-                            .map(|object| {
-                                callee_objects.is_mutated(object)
-                                    || callee_objects.returned_objects().contains(&object)
-                            })
-                            .unwrap_or(false)
-                        {
-                            new_value.entry(*arg).or_default().insert(*inst);
-                        }
-                    }
-                    new_value.remove(inst);
-                }
-                _ => {}
-            }
-            changed |= *old_value != new_value;
-            lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1] = new_value;
-        }
-
-        // Handle reduces in this block specially at the very end.
-        let last_pt = insts.len();
-        let old_bottom_value = &lattice[bb_to_prefix_sum[bb.idx()] + last_pt];
-        let mut new_bottom_value = old_bottom_value.clone();
-        for inst in insts.iter() {
-            if let Node::Reduce {
-                control: _,
-                init: _,
-                reduct,
-            } = nodes[inst.idx()]
-            {
-                assert!(
-                    new_bottom_value.contains_key(&reduct),
-                    "PANIC: Can't handle clones inside a reduction cycle currently."
-                );
-                new_bottom_value.remove(inst);
-            }
-        }
-        changed |= *old_bottom_value != new_bottom_value;
-        lattice[bb_to_prefix_sum[bb.idx()] + last_pt] = new_bottom_value;
-    }
-
-    changed
-}
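
The per-instruction transfer functions above are easiest to see in isolation. Here is a standalone sketch of the Write rule from the comment's list (plain `u32` IDs, hypothetical helper name; not part of the patch): the written collection becomes used by the write, and the write's own stale entry is cleared because it defines a fresh value from that point on.

```rust
use std::collections::{BTreeMap, BTreeSet};

type Lattice = BTreeMap<u32, BTreeSet<u32>>;

// Transfer for a Write node `inst` whose input collection is `collect`:
// mark the collection as used by `inst`, and forget any entry for `inst`
// itself.
fn write_transfer(prev: &Lattice, inst: u32, collect: u32) -> Lattice {
    let mut next = prev.clone();
    next.entry(collect).or_default().insert(inst);
    next.remove(&inst);
    next
}

fn main() {
    let prev = Lattice::from([(10, BTreeSet::from([20]))]);
    let next = write_transfer(&prev, 30, 10);
    // Collection 10 is now used by both the old user 20 and the write 30;
    // a second dynamic user is exactly what later induces a clone.
    assert_eq!(next[&10], BTreeSet::from([20, 30]));
}
```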
-
-/*
- * Helper function to induce a clone once an object with multiple users has been
- * found.
- */
-fn induce_clone(
-    editor: &mut FunctionEditor,
-    object: NodeID,
-    user: NodeID,
-    value: &BTreeMap<NodeID, BTreeSet<NodeID>>,
-    super_value: &BTreeMap<NodeID, BTreeSet<NodeID>>,
-    lattice: &Vec<BTreeMap<NodeID, BTreeSet<NodeID>>>,
-    rev_po: &Vec<NodeID>,
-    typing: &Vec<TypeID>,
-    control_subgraph: &Subgraph,
-    dom: &DomTree,
-    loops: &LoopTree,
-    bb_to_prefix_sum: &Vec<usize>,
-    bbs: &BasicBlocks,
-) {
-    // If `user` already used `object` and tries to use it again, then the
-    // clone is a "loop-induced" clone. Otherwise, it's a simple clone.
-    if !value[&object].contains(&user) {
-        let success = editor.edit(|mut edit| {
-            // Create the constant collection object for allocation.
-            let object_ty = typing[object.idx()];
-            let object_cons = edit.add_zero_constant(object_ty);
-            let cons_node = edit.add_node(Node::Constant { id: object_cons });
-
-            // Create the clone into the new constant collection.
-            let clone_node = edit.add_node(Node::Write {
-                collect: cons_node,
-                data: object,
-                indices: vec![].into_boxed_slice(),
-            });
-
-            // Make user use the cloned object.
-            edit.replace_all_uses_where(object, clone_node, |id| *id == user)
-        });
-        assert!(success);
-    } else {
-        // Figure out where to place the phi. This is the deepest loop header
-        // where `user` is responsible for making `object` used at the top of
-        // the block, and the block dominates the block containing `user`. If
-        // `user` is a phi, then the region it's attached to is excluded from
-        // eligibility.
-        let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
-            lattice[bb_to_prefix_sum[bb.idx()]]
-                .get(&object)
-                .unwrap_or(&BTreeSet::new())
-                .contains(&user)
-                && dom.does_dom(*bb, bbs.0[user.idx()])
-                && loops.contains(*bb)
-                && loops.is_in_loop(*bb, bbs.0[user.idx()])
-                && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
-        });
-        let top_block = eligible_blocks
-            .max_by_key(|bb| loops.nesting(*bb).unwrap())
-            .unwrap();
-        assert!(editor.func().nodes[top_block.idx()].is_region());
-
-        // Figure out the users of `object` that we need to phi back
-        // upwards. Assign each user a number indicating how far down the
-        // user chain it is; higher is farther down. This is used for
-        // picking the most downstream user later.
-        let mut users: BTreeMap<NodeID, usize> = BTreeMap::new();
-        let mut workset: BTreeSet<NodeID> = BTreeSet::new();
-        workset.insert(object);
-        let mut chain_ordering = 1;
-        assert!(!super_value.is_empty());
-        while let Some(pop) = workset.pop_first() {
-            let iterated_users: BTreeSet<_> = super_value
-                .get(&pop)
-                .map(|users| users.into_iter())
-                .into_iter()
-                .flatten()
-                .map(|id| *id)
-                .filter(|iterated_user| loops.is_in_loop(top_block, bbs.0[iterated_user.idx()]))
-                .collect();
-            workset.extend(iterated_users.iter().filter(|id| !users.contains_key(id)));
-            for user in iterated_users {
-                *users.entry(user).or_default() = chain_ordering;
-                chain_ordering += 1;
-            }
-        }
-
-        // The fringe users may not dominate any predecessors of the loop
-        // header. The following is some Juno code that exposes this:
-        //
-        // fn problematic(a : size) -> i32 {
-        //   for i = 0 to a {
-        //     let arr : i32[1];
-        //     for j = 0 to a {
-        //       arr[0] = 1;
-        //     }
-        //   }
-        //   return 0;
-        // }
-        //
-        // Note that `arr` induces a clone each iteration, since its value
-        // needs to be reset to all zeros. However, it should also be noted
-        // that the most fringe user of `arr`, the write inside the inner
-        // loop, does not dominate the bottom of the outer loop. Thus, we
-        // need to insert a phi in the bottom block of the outer loop to
-        // retrieve either the write, or `arr` before the inner loop. The
-        // general version of this problem requires the following solution.
-        // Our goal is to figure out which downstream user represents
-        // `object` at each block in the loop. We first assign each block
-        // containing a user the most downstream user it contains. Then, we
-        // create a dummy phi for every region (including the header) in the
-        // loop, which is the downstream user for that block. Then, every
-        // other block is assigned the downstream user of its single
-        // predecessor. This basically amounts to recreating SSA for
-        // `object` inside the loop.
-        let mut user_per_loop_bb = BTreeMap::new();
-        let mut added_phis = BTreeMap::new();
-        let mut top_phi = NodeID::new(0);
-        // Assign existing users.
-        for (user, ordering) in users.iter() {
-            let bb = bbs.0[user.idx()];
-            if let Some(old_user) = user_per_loop_bb.get(&bb)
-                && users[old_user] > *ordering
-            {
-            } else {
-                user_per_loop_bb.insert(bb, *user);
-            }
-        }
-        // Assign dummy phis.
-        for bb in loops.nodes_in_loop(top_block) {
-            if (!user_per_loop_bb.contains_key(&bb) || bb == top_block)
-                && editor.func().nodes[bb.idx()].is_region()
-            {
-                let success = editor.edit(|mut edit| {
-                    let phi_node = edit.add_node(Node::Phi {
-                        control: bb,
-                        data: empty().collect(),
-                    });
-                    if bb != top_block || !user_per_loop_bb.contains_key(&bb) {
-                        user_per_loop_bb.insert(bb, phi_node);
-                    }
-                    if bb == top_block {
-                        top_phi = phi_node;
-                    }
-                    added_phis.insert(phi_node, bb);
-                    Ok(edit)
-                });
-                assert!(success);
-            }
-        }
-        // Assign users for the rest of the blocks.
-        for bb in rev_po.iter().filter(|bb| loops.is_in_loop(top_block, **bb)) {
-            if !user_per_loop_bb.contains_key(&bb) {
-                assert!(control_subgraph.preds(*bb).count() == 1);
-                user_per_loop_bb.insert(
-                    *bb,
-                    user_per_loop_bb[&control_subgraph.preds(*bb).next().unwrap()],
-                );
-            }
-        }
-
-        // Induce the clone.
-        let success = editor.edit(|mut edit| {
-            // Create the constant collection object for allocation.
-            let object_ty = typing[object.idx()];
-            let object_cons = edit.add_zero_constant(object_ty);
-            let cons_node = edit.add_node(Node::Constant { id: object_cons });
-
-            // Create the phis.
-            let mut phi_map = BTreeMap::new();
-            let mut real_phis = BTreeSet::new();
-            for (dummy, bb) in added_phis {
-                let real = edit.add_node(Node::Phi {
-                    control: bb,
-                    data: control_subgraph
-                        .preds(bb)
-                        .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
-                        .collect(),
-                });
-                phi_map.insert(dummy, real);
-                real_phis.insert(real);
-            }
-
-            // Create the clone into the phi.
-            let real_top_phi = phi_map[&top_phi];
-            let clone_node = edit.add_node(Node::Write {
-                collect: real_top_phi,
-                data: object,
-                indices: vec![].into_boxed_slice(),
-            });
-
-            // Make users use the cloned object.
-            edit = edit.replace_all_uses_where(object, clone_node, |id| {
-                id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
-            })?;
-
-            // Get rid of the dummy phis.
-            for (dummy, real) in phi_map {
-                edit = edit.replace_all_uses(dummy, real)?;
-                edit = edit.delete_node(dummy)?;
-            }
-
-            // Make phis use the clone instead of the top phi.
-            edit.replace_all_uses_where(real_top_phi, clone_node, |id| *id != clone_node)
-        });
-        assert!(success);
-
-        // De-duplicate phis.
-        gvn(editor, false);
-
-        // Get rid of unused phis.
-        dce(editor);
-
-        // Simplify phis.
-        phi_elim(editor);
-    }
-}
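
The `problematic` Juno function quoted in the comment above has a direct Rust analogue that is handy for checking intuition about the value semantics this pass preserves (a sketch, not part of the patch): the inner array must come back as all zeros on every outer iteration, which is exactly the per-iteration clone that gets materialized. The `tricky3_loop_implicit_clone` change later in this patch adds an observing read of just this kind.

```rust
// Rust analogue of the `problematic` Juno function from the comment
// above. Rust's value semantics for arrays gives the behavior Hercules
// must reproduce: `arr` is a fresh all-zeros collection each iteration.
fn problematic(a: usize) -> i32 {
    for _i in 0..a {
        let mut arr = [0i32; 1];
        assert_eq!(arr[0], 0); // re-zeroed every outer iteration
        for _j in 0..a {
            arr[0] = 1;
        }
        let _ = arr;
    }
    0
}

fn main() {
    assert_eq!(problematic(4), 0);
}
```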
-
-/*
- * Helper function to analyze lattice values at each program point and find
- * multiple dynamic users of a single write. Returns as soon as any clone is
- * found.
- */
-fn find_clones(
-    editor: &mut FunctionEditor,
-    super_value: &BTreeMap<NodeID, BTreeSet<NodeID>>,
-    lattice: &Vec<BTreeMap<NodeID, BTreeSet<NodeID>>>,
-    rev_po: &Vec<NodeID>,
-    typing: &Vec<TypeID>,
-    control_subgraph: &Subgraph,
-    dom: &DomTree,
-    loops: &LoopTree,
-    objects: &CollectionObjects,
-    bb_to_prefix_sum: &Vec<usize>,
-    bbs: &BasicBlocks,
-) -> bool {
-    let nodes = &editor.func().nodes;
-    let func_id = editor.func_id();
-    for bb in rev_po.iter().rev() {
-        let insts = &bbs.1[bb.idx()];
-        // Accumulate predecessor bottom used sets for phis. Phis are special in
-        // that they need to be path-sensitive, but multiple phis in a single
-        // block may use a single collection, and that needs to induce a clone.
-        let mut phi_acc_bottoms = BTreeMap::new();
-        for (prev_pt, inst) in insts.iter().enumerate() {
-            let value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
-            match nodes[inst.idx()] {
-                Node::Phi {
-                    control: _,
-                    ref data,
-                } => {
-                    // In phis, check if an argument is already used in the
-                    // predecessor's bottom lattice value (phis need to be
-                    // path-sensitive).
-                    for (pred, arg) in zip(control_subgraph.preds(*bb), data) {
-                        let bottom = phi_acc_bottoms.entry(pred).or_insert_with(|| {
-                            let last_pt = bbs.1[pred.idx()].len();
-                            let bottom = &lattice[bb_to_prefix_sum[pred.idx()] + last_pt];
-                            bottom.clone()
-                        });
-                        if bottom.contains_key(&arg) {
-                            induce_clone(
-                                editor,
-                                *arg,
-                                *inst,
-                                bottom,
-                                super_value,
-                                lattice,
-                                rev_po,
-                                typing,
-                                control_subgraph,
-                                dom,
-                                loops,
-                                bb_to_prefix_sum,
-                                bbs,
-                            );
-                            return true;
-                        } else {
-                            // Subsequent phis using `arg` along the same
-                            // predecessor induce a clone.
-                            bottom.insert(*arg, once(*inst).collect());
-                        }
-                    }
-                }
-                Node::Ternary {
-                    op: TernaryOperator::Select,
-                    first: _,
-                    second,
-                    third,
-                } => {
-                    if value.contains_key(&second) {
-                        induce_clone(
-                            editor,
-                            second,
-                            *inst,
-                            &value,
-                            super_value,
-                            lattice,
-                            rev_po,
-                            typing,
-                            control_subgraph,
-                            dom,
-                            loops,
-                            bb_to_prefix_sum,
-                            bbs,
-                        );
-                        return true;
-                    }
-                    if value.contains_key(&third) {
-                        induce_clone(
-                            editor,
-                            third,
-                            *inst,
-                            &value,
-                            super_value,
-                            lattice,
-                            rev_po,
-                            typing,
-                            control_subgraph,
-                            dom,
-                            loops,
-                            bb_to_prefix_sum,
-                            bbs,
-                        );
-                        return true;
-                    }
-                }
-                Node::Reduce {
-                    control: _,
-                    init,
-                    reduct: _,
-                } => {
-                    if value.contains_key(&init) {
-                        induce_clone(
-                            editor,
-                            init,
-                            *inst,
-                            &value,
-                            super_value,
-                            lattice,
-                            rev_po,
-                            typing,
-                            control_subgraph,
-                            dom,
-                            loops,
-                            bb_to_prefix_sum,
-                            bbs,
-                        );
-                        return true;
-                    }
-                }
-                Node::Read {
-                    collect,
-                    indices: _,
-                } if !objects[&func_id].objects(*inst).is_empty() => {
-                    if value.contains_key(&collect) {
-                        induce_clone(
-                            editor,
-                            collect,
-                            *inst,
-                            &value,
-                            super_value,
-                            lattice,
-                            rev_po,
-                            typing,
-                            control_subgraph,
-                            dom,
-                            loops,
-                            bb_to_prefix_sum,
-                            bbs,
-                        );
-                        return true;
-                    }
-                }
-                Node::Write {
-                    collect,
-                    data: _,
-                    indices: _,
-                } => {
-                    if value.contains_key(&collect) {
-                        induce_clone(
-                            editor,
-                            collect,
-                            *inst,
-                            &value,
-                            super_value,
-                            lattice,
-                            rev_po,
-                            typing,
-                            control_subgraph,
-                            dom,
-                            loops,
-                            bb_to_prefix_sum,
-                            bbs,
-                        );
-                        return true;
-                    }
-                }
-                Node::Call {
-                    control: _,
-                    function: callee,
-                    dynamic_constants: _,
-                    ref args,
-                } => {
-                    let callee_objects = &objects[&callee];
-                    for (param_idx, arg) in args.into_iter().enumerate() {
-                        if callee_objects
-                            .param_to_object(param_idx)
-                            .map(|object| {
-                                callee_objects.is_mutated(object)
-                                    || callee_objects.returned_objects().contains(&object)
-                            })
-                            .unwrap_or(false)
-                            && value.contains_key(arg)
-                        {
-                            induce_clone(
-                                editor,
-                                *arg,
-                                *inst,
-                                value,
-                                super_value,
-                                lattice,
-                                rev_po,
-                                typing,
-                                control_subgraph,
-                                dom,
-                                loops,
-                                bb_to_prefix_sum,
-                                bbs,
-                            );
-                            return true;
-                        }
-                    }
-                }
-                _ => {}
-            }
-        }
-    }
-    false
-}
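
One detail worth keeping in mind before the pass-manager changes below: `find_clones` returns as soon as it induces a single clone, so the caller must recompute analyses and call it again until it reports no change. A minimal sketch of that driver contract, with hypothetical stand-in names (the real loop is the `Pass::GCM` arm in `pass.rs`):

```rust
// One-edit-at-a-time transforms compose with a driver loop like this.
// `recompute_analyses` and `materialize_once` are hypothetical stand-ins
// for the pass manager's analysis rebuilding and materialize_clones.
fn drive<A>(
    mut recompute_analyses: impl FnMut() -> A,
    mut materialize_once: impl FnMut(&A) -> bool,
) -> usize {
    let mut edits = 0;
    loop {
        let analyses = recompute_analyses();
        if !materialize_once(&analyses) {
            return edits;
        }
        edits += 1;
    }
}

fn main() {
    // A toy transform that "succeeds" twice, then reaches a fixed point.
    let mut budget = 2;
    let total = drive(
        || (),
        |_| {
            if budget > 0 {
                budget -= 1;
                true
            } else {
                false
            }
        },
    );
    assert_eq!(total, 2);
}
```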
diff --git a/hercules_opt/src/lib.rs b/hercules_opt/src/lib.rs
index ed658b22481aa1678a5fe9b47c8ccaad84d47e57..08d183a7f8c2ddfc650bb1ef3ce385761a137def 100644
--- a/hercules_opt/src/lib.rs
+++ b/hercules_opt/src/lib.rs
@@ -1,17 +1,17 @@
 #![feature(let_chains)]
 
 pub mod ccp;
-pub mod clone_elim;
 pub mod dce;
 pub mod delete_uncalled;
 pub mod editor;
+pub mod float_collections;
 pub mod fork_concat_split;
 pub mod fork_guard_elim;
 pub mod forkify;
+pub mod gcm;
 pub mod gvn;
 pub mod inline;
 pub mod interprocedural_sroa;
-pub mod legalize_reference_semantics;
 pub mod outline;
 pub mod pass;
 pub mod phi_elim;
@@ -22,17 +22,17 @@ pub mod unforkify;
 pub mod utils;
 
 pub use crate::ccp::*;
-pub use crate::clone_elim::*;
 pub use crate::dce::*;
 pub use crate::delete_uncalled::*;
 pub use crate::editor::*;
+pub use crate::float_collections::*;
 pub use crate::fork_concat_split::*;
 pub use crate::fork_guard_elim::*;
 pub use crate::forkify::*;
+pub use crate::gcm::*;
 pub use crate::gvn::*;
 pub use crate::inline::*;
 pub use crate::interprocedural_sroa::*;
-pub use crate::legalize_reference_semantics::*;
 pub use crate::outline::*;
 pub use crate::pass::*;
 pub use crate::phi_elim::*;
diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs
index 84cedb7629e45abb3abad5eac841b24f224a2698..239838334801c7048a3bcaaad0cd626f2c9d0dea 100644
--- a/hercules_opt/src/outline.rs
+++ b/hercules_opt/src/outline.rs
@@ -567,9 +567,7 @@
 }
 
 /*
- * Just outlines all of a function except the entry, return, and aggregate
- * constants. This is the minimum work needed to cause runtime Rust code to be
- * generated as necessary.
+ * Just outlines all of a function except the start, parameter, and return nodes.
  */
 pub fn dumb_outline(
     editor: &mut FunctionEditor,
@@ -585,11 +583,7 @@
         .node_ids()
         .filter(|id| {
             let node = &editor.func().nodes[id.idx()];
-            if let Node::Constant { id } = editor.func().nodes[id.idx()] {
-                editor.get_constant(id).is_scalar()
-            } else {
-                !(node.is_start() || node.is_parameter() || node.is_return())
-            }
+            !(node.is_start() || node.is_parameter() || node.is_return())
        })
        .collect();
    outline(
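
The new `dumb_outline` filter no longer special-cases scalar constants: everything except Start, Parameter, and Return nodes is outlined, and the collection constants this leaves inside device functions are what `FloatCollections` later hoists out. A standalone sketch of the predicate's shape, using a hypothetical simplified node-kind enum rather than the real `hercules_ir` nodes:

```rust
// Hypothetical, simplified stand-in for the IR node kinds involved; the
// real filter matches on hercules_ir nodes via is_start() and friends.
enum Kind {
    Start,
    Parameter,
    Return,
    Constant { is_scalar: bool },
    Other,
}

// New behavior: outline everything except Start, Parameter, and Return.
// Constants stay eligible whether scalar or collection.
fn should_outline(kind: &Kind) -> bool {
    !matches!(kind, Kind::Start | Kind::Parameter | Kind::Return)
}

fn main() {
    assert!(should_outline(&Kind::Constant { is_scalar: false }));
    assert!(should_outline(&Kind::Other));
    assert!(!should_outline(&Kind::Parameter));
}
```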
diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs
index b136fef9c704b82c46877ed0c1d9c39bd7df793a..57fd464de82a36cab4c2d7c3758bd43b3d13eae3 100644
--- a/hercules_opt/src/pass.rs
+++ b/hercules_opt/src/pass.rs
@@ -39,8 +39,8 @@
     ForkSplit,
     Unforkify,
     InferSchedules,
-    LegalizeReferenceSemantics,
-    CloneElim,
+    GCM,
+    FloatCollections,
     Verify,
     // Parameterized over whether analyses that aid visualization are necessary.
     // Useful to set to false if displaying a potentially broken module.
@@ -750,7 +750,7 @@ impl PassManager {
                     }
                     self.clear_analyses();
                 }
-                Pass::LegalizeReferenceSemantics => loop {
+                Pass::GCM => loop {
                     self.make_def_uses();
                     self.make_reverse_postorders();
                     self.make_typing();
@@ -782,7 +782,7 @@ impl PassManager {
                         &types_ref,
                         &def_uses[idx],
                     );
-                    if let Some(bb) = legalize_reference_semantics(
+                    if let Some(bb) = gcm(
                         &mut editor,
                         &def_uses[idx],
                         &reverse_postorders[idx],
@@ -808,30 +808,41 @@ impl PassManager {
                         break;
                     }
                 },
-                Pass::CloneElim => {
+                Pass::FloatCollections => {
                     self.make_def_uses();
+                    self.make_typing();
+                    self.make_callgraph();
                     let def_uses = self.def_uses.as_ref().unwrap();
-                    for idx in 0..self.module.functions.len() {
-                        let constants_ref =
-                            RefCell::new(std::mem::take(&mut self.module.constants));
-                        let dynamic_constants_ref =
-                            RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
-                        let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
-                        let mut editor = FunctionEditor::new(
-                            &mut self.module.functions[idx],
+                    let typing = self.typing.as_ref().unwrap();
+                    let callgraph = self.callgraph.as_ref().unwrap();
+                    let devices = device_placement(&self.module.functions, &callgraph);
+                    let constants_ref = RefCell::new(std::mem::take(&mut self.module.constants));
+                    let dynamic_constants_ref =
+                        RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
+                    let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
+                    let mut editors: Vec<_> = zip(
+                        self.module.functions.iter_mut().enumerate(),
+                        def_uses.iter(),
+                    )
+                    .map(|((idx, func), def_use)| {
+                        FunctionEditor::new(
+                            func,
                             FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
-                            &def_uses[idx],
-                        );
-                        clone_elim(&mut editor);
+                            def_use,
+                        )
+                    })
+                    .collect();
+                    float_collections(&mut editors, typing, callgraph, &devices);
 
-                        self.module.constants = constants_ref.take();
-                        self.module.dynamic_constants = dynamic_constants_ref.take();
-                        self.module.types = types_ref.take();
+                    self.module.constants = constants_ref.take();
+                    self.module.dynamic_constants = dynamic_constants_ref.take();
+                    self.module.types = types_ref.take();
 
-                        self.module.functions[idx].delete_gravestones();
+                    for func in self.module.functions.iter_mut() {
+                        func.delete_gravestones();
                     }
                     self.clear_analyses();
                 }
diff --git a/juno_frontend/src/lib.rs b/juno_frontend/src/lib.rs
index ae6a74211272d9a557fd42c348fe2d0a94cf1a1e..9297173d4deededf8b541dfc821e1b88dd68841d 100644
--- a/juno_frontend/src/lib.rs
+++ b/juno_frontend/src/lib.rs
@@ -191,8 +191,6 @@
     add_pass!(pm, verify, Unforkify);
     add_pass!(pm, verify, GVN);
     add_verified_pass!(pm, verify, DCE);
-    add_pass!(pm, verify, LegalizeReferenceSemantics);
-    add_pass!(pm, verify, CloneElim);
     add_pass!(pm, verify, DCE);
     add_pass!(pm, verify, Outline);
     add_pass!(pm, verify, InterproceduralSROA);
@@ -203,9 +201,10 @@
         pm.add_pass(hercules_opt::pass::Pass::Xdot(true));
     }
 
-    add_pass!(pm, verify, LegalizeReferenceSemantics);
+    add_pass!(pm, verify, GCM);
     add_verified_pass!(pm, verify, DCE);
-    add_pass!(pm, verify, LegalizeReferenceSemantics);
+    add_pass!(pm, verify, FloatCollections);
+    add_pass!(pm, verify, GCM);
 
     pm.add_pass(hercules_opt::pass::Pass::Codegen(output_dir, module_name));
     pm.run_passes();
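
The reason for two GCM runs in the Juno pipeline: the first legalizes reference semantics and fixes placement so FloatCollections can see where collections live; FloatCollections then rewrites device functions to take their collection constants as parameters; the second GCM run places the code those edits introduced. A hypothetical driver outside the Juno frontend could schedule the same tail without the `add_pass!` macro (a sketch; it assumes `PassManager` and a `Pass::DCE` variant are exposed as the surrounding code suggests):

```rust
// Hypothetical: scheduling the pipeline tail directly, using the Pass
// variants introduced by this change. Pass::DCE is assumed to exist,
// matching the add_verified_pass!(pm, verify, DCE) invocations above.
use hercules_opt::pass::{Pass, PassManager};

fn schedule_tail(pm: &mut PassManager) {
    for pass in [Pass::GCM, Pass::DCE, Pass::FloatCollections, Pass::GCM] {
        pm.add_pass(pass);
    }
}
```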
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index 67bdd44a68de51a9acd1a242c0b9bb40ad983fcd..882e5abc51bb596be63512eaff96434a2c45d43a 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -90,6 +90,7 @@ fn tricky3_loop_implicit_clone(a : usize, b : usize) -> usize {
     for kk = 0 to 10 {
       arr2[kk] += arr1[kk];
     }
+    x += arr2[1];
   }
   return x;
 }
@@ -112,7 +113,7 @@ fn no_implicit_clone(input : i32) -> i32 {
 }
 
 #[entry]
-fn complex_implicit_clone(input : i32) -> i32 {
+fn mirage_implicit_clone(input : i32) -> i32 {
   let arr1 : i32[2];
   let arr2 : i32[2];
   let arr3 : i32[2];
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index a46a67280c96aade3a056fe594443122972394e2..a92e4e2d7558733b28db0bba203415a73631ab6d 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -29,13 +29,13 @@ fn main() {
 
         let output = tricky3_loop_implicit_clone(5, 7).await;
         println!("{}", output);
-        assert_eq!(output, 0);
+        assert_eq!(output, 7);
 
         let output = no_implicit_clone(4).await;
         println!("{}", output);
        assert_eq!(output, 13);
 
-        let output = complex_implicit_clone(73).await;
+        let output = mirage_implicit_clone(73).await;
         println!("{}", output);
         assert_eq!(output, 843);
     });