diff --git a/Cargo.lock b/Cargo.lock index 13cadc9592c8e1d95e21c013f465919ffcf6c5ec..7b70c0b028a2ebcd5ca1f99ac68b192fbec83b98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -741,6 +741,15 @@ dependencies = [ "phf", ] +[[package]] +name = "juno_implicit_clone" +version = "0.1.0" +dependencies = [ + "async-std", + "juno_build", + "with_builtin_macros", +] + [[package]] name = "juno_matmul" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 1db806f41befe6b20d89b8484e069a4674276d6d..dc0c64789c6ce2abdc8ec53757c05f64010738bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,5 +25,5 @@ members = [ "juno_samples/casts_and_intrinsics", "juno_samples/nested_ccp", "juno_samples/antideps", - #"juno_samples/implicit_clone", + "juno_samples/implicit_clone", ] diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs index a09eacfa621876a570d37bd96e1d1f534e94fa4e..7c67af538b81946f67786a8133b9f670b7138f9e 100644 --- a/hercules_cg/src/cpu.rs +++ b/hercules_cg/src/cpu.rs @@ -24,7 +24,6 @@ pub fn cpu_codegen<W: Write>( reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, - antideps: &Vec<(NodeID, NodeID)>, bbs: &BasicBlocks, w: &mut W, ) -> Result<(), Error> { @@ -36,7 +35,6 @@ pub fn cpu_codegen<W: Write>( reverse_postorder, typing, control_subgraph, - antideps, bbs, }; ctx.codegen_function(w) @@ -50,7 +48,6 @@ struct CPUContext<'a> { reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, - antideps: &'a Vec<(NodeID, NodeID)>, bbs: &'a BasicBlocks, } @@ -524,12 +521,6 @@ impl<'a> CPUContext<'a> { self.get_value(data, true), index_ptr_name )?; - write!( - body, - " {} = bitcast {} to ptr\n", - self.get_value(id, false), - self.get_value(collect, true) - )?; } else { // If the data item being written is not a primitive type, // then perform a memcpy from the data collection to the @@ -543,6 +534,12 @@ impl<'a> CPUContext<'a> { data_size )?; } + write!( + body, + " {} = bitcast {} to ptr\n", + self.get_value(id, false), + self.get_value(collect, true) + )?; } _ => panic!("PANIC: Can't lower {:?}.", self.function.nodes[id.idx()]), } @@ -732,7 +729,7 @@ impl<'a> CPUContext<'a> { // the dynamic constant bounds. let mut acc_size = self.codegen_type_size(elem, body)?; for dc in bounds { - acc_size = Self::multiply(&acc_size, &format!("dc{}", dc.idx()), body)?; + acc_size = Self::multiply(&acc_size, &format!("%dc{}", dc.idx()), body)?; } Ok(acc_size) } diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs index c579b7e9af641d0ee8912b52e92a9f0328b8ffe1..77cfa5404ddbb368ea7e30506bba1b9890663ec6 100644 --- a/hercules_cg/src/lib.rs +++ b/hercules_cg/src/lib.rs @@ -12,6 +12,18 @@ extern crate hercules_ir; use self::hercules_ir::*; +/* + * Basic block info consists of two things: + * + * 1. A map from node to block (named by control nodes). + * 2. For each node, which nodes are in its own block. + * + * Note that for #2, the structure is Vec<NodeID>, meaning the nodes are ordered + * inside the block. This order corresponds to the traversal order of the nodes + * in the block needed by the backend code generators. + */ +pub type BasicBlocks = (Vec<NodeID>, Vec<Vec<NodeID>>); + /* * The alignment of a type does not depend on dynamic constants. 
*/ diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 890c898d75841c85756f3b6e2dd0011678e7789b..e484729d78e4f4c077f70145ab74d25041d785e8 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -19,7 +19,6 @@ pub fn rt_codegen<W: Write>( reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, - antideps: &Vec<(NodeID, NodeID)>, bbs: &BasicBlocks, collection_objects: &CollectionObjects, callgraph: &CallGraph, @@ -32,7 +31,6 @@ pub fn rt_codegen<W: Write>( reverse_postorder, typing, control_subgraph, - antideps, bbs, collection_objects, callgraph, @@ -47,7 +45,6 @@ struct RTContext<'a> { reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, - antideps: &'a Vec<(NodeID, NodeID)>, bbs: &'a BasicBlocks, collection_objects: &'a CollectionObjects, callgraph: &'a CallGraph, diff --git a/hercules_ir/src/antideps.rs b/hercules_ir/src/antideps.rs deleted file mode 100644 index af949708dd2bc278410ecedd632a04313fa30415..0000000000000000000000000000000000000000 --- a/hercules_ir/src/antideps.rs +++ /dev/null @@ -1,297 +0,0 @@ -use std::collections::{BTreeMap, BTreeSet}; -use std::iter::zip; - -use crate::*; - -/* - * In addition to collections, we need to figure out which "generation" of a - * collection a node may take as input. - */ -#[derive(PartialEq, Eq, Clone, Debug)] -struct GenerationLattice { - objs: BTreeSet<(CollectionObjectID, NodeID)>, -} - -impl Semilattice for GenerationLattice { - fn meet(a: &Self, b: &Self) -> Self { - GenerationLattice { - objs: a.objs.union(&b.objs).map(|x| *x).collect(), - } - } - - fn top() -> Self { - GenerationLattice { - objs: BTreeSet::new(), - } - } - - fn bottom() -> Self { - // Bottom is not representable for this lattice with our Semilattice - // interface, but we never need to construct it. - panic!() - } -} - -/* - * Function to assemble anti-dependence edges. Returns a list of pairs of nodes. - * The first item in the pair is the reading node, and the second item is the - * mutating node. - */ -pub fn antideps( - function: &Function, - reverse_postorder: &Vec<NodeID>, - objects: &FunctionCollectionObjects, -) -> Vec<(NodeID, NodeID)> { - // First, we analyze "generations" of collections as they are mutated. - // Originating, mutating, phi, and reduce nodes start a new generation of a - // collection. Generations are not ordered due to loops, but are rather just - // node IDs of node (parameter, constant, call, undef, write, phi, reduce). - // Other nodes operating on collections mean reads / writes can operate on - // potentially different generations of multiple collections (select). - let lattice = forward_dataflow(function, reverse_postorder, |inputs, id| { - match function.nodes[id.idx()] { - Node::Ternary { - op: TernaryOperator::Select, - first: _, - second: _, - third: _, - } => inputs - .into_iter() - .fold(GenerationLattice::top(), |acc, input| { - GenerationLattice::meet(&acc, input) - }), - Node::Parameter { index: _ } | Node::Constant { id: _ } | Node::Undef { ty: _ } => { - let objs = objects.objects(id); - GenerationLattice { - objs: objs.into_iter().map(|obj| (*obj, id)).collect(), - } - } - Node::Call { - control: _, - function: _, - dynamic_constants: _, - ref args, - } => { - let mut objs = BTreeSet::new(); - let call_objs = objects.objects(id); - - // If this call node might originate an object, add that to the - // lattice output - its generation is this call node. 
- for obj in call_objs { - if objects.origin(*obj) == CollectionObjectOrigin::Call(id) { - assert!(objs.len() <= 1); - objs.insert((*obj, id)); - } - } - - // For every argument... - for (arg, arg_gens) in zip(args, inputs.into_iter().skip(1)) { - // Look at its objects... - for arg_obj in objects.objects(*arg) { - // For each object that might be returned... - if call_objs.contains(&arg_obj) { - let mutable = objects.mutators(*arg_obj).contains(&id); - for (obj, gen) in arg_gens.objs.iter() { - // Add that object to the output lattice. - if obj == arg_obj && mutable { - // Set the generation to this node if the - // object might be mutated. - objs.insert((*obj, id)); - } else if obj == arg_obj { - // Otherwise, keep the old generation. - objs.insert((*obj, *gen)); - } - } - } - } - } - GenerationLattice { objs } - } - Node::Read { - collect: _, - indices: _, - } => inputs[0].clone(), - Node::Phi { - control: _, - data: _, - } - | Node::Reduce { - control: _, - init: _, - reduct: _, - } - | Node::Write { - collect: _, - data: _, - indices: _, - } => { - // Phis, reduces, and writes update the generation to the write. - let objs = inputs[0].objs.iter().map(|(obj, _)| (*obj, id)).collect(); - GenerationLattice { objs } - } - _ => GenerationLattice::top(), - } - }); - - // Second, we generate anti-dependence edges from the dataflow analysis. - // There are four cases where an anti-dependence edge is generated: - // - // 1. A read node and a write node share an object and generation pair on - // their `collect` input. - // 2. A read node and a call node share an object and generation pair, where - // the pair is on the read's `collect` input and the pair is on any input - // of the call node AND the call node is a mutator of the object. - // 3. A call node and a write node share an object and generation pair, - // where the pair is on any input of the call node and the pair is on the - // write's `collect` input. - // 4. A call node and another call node share an object and generation pair, - // where the pair is on any input of both call nodes AND the second call - // node is a mutator of the object. - let mut reads_writes_calls_mut_calls_per_pair: BTreeMap< - (CollectionObjectID, NodeID), - (Vec<NodeID>, Vec<NodeID>, Vec<NodeID>, Vec<NodeID>), - > = BTreeMap::new(); - for (idx, node) in function.nodes.iter().enumerate() { - let id = NodeID::new(idx); - match node { - Node::Read { - collect, - indices: _, - } => { - for pair in lattice[collect.idx()].objs.iter() { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .0 - .push(id); - } - } - Node::Write { - collect, - data, - indices: _, - } => { - for pair in lattice[collect.idx()].objs.iter() { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .1 - .push(id); - } - - // When a write takes a collection on its `data` input, it - // memcpys that collection into the mutated collection. This is - // a read. 
- if !objects.objects(*data).is_empty() { - for pair in lattice[collect.idx()].objs.iter() { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .0 - .push(id); - } - } - } - Node::Call { - control: _, - function: _, - dynamic_constants: _, - ref args, - } => { - for arg in args { - for pair in lattice[arg.idx()].objs.iter() { - if objects.mutators(pair.0).contains(&id) { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .3 - .push(id); - } else { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .2 - .push(id); - } - } - } - } - _ => {} - } - } - - // Once we've grouped reads / writes / calls by pairs, we create pair-wise - // anti-dependence edges. Due to loops, a write may technically anti-depend - // on a read where the read depends on the write, but we don't want to - // generate that anti-dependence edge, since it'll create a cycle during - // backend code generation. Thus, if the mutator in an anti-dependence is - // the same as the generation of the current pair, don't generate the edge. - let mut antideps = vec![]; - for ((_, gen), (reads, writes, calls, mut_calls)) in reads_writes_calls_mut_calls_per_pair { - // Case 1: - for read in reads.iter() { - for write in writes.iter() { - if *write != gen && *read != *write { - antideps.push((*read, *write)); - } - } - } - - // Case 2: - for read in reads.iter() { - for mut_call in mut_calls.iter() { - if *mut_call != gen && *read != *mut_call { - antideps.push((*read, *mut_call)); - } - } - } - - // Case 3: - for call in calls.iter().chain(mut_calls.iter()) { - for write in writes.iter() { - if *write != gen && *call != *write { - antideps.push((*call, *write)); - } - } - } - - // Case 4: - for call in calls.iter().chain(mut_calls.iter()) { - for mut_call in mut_calls.iter() { - if *mut_call != gen && *call != *mut_call { - antideps.push((*call, *mut_call)); - } - } - } - } - - antideps -} - -/* - * Utility to make a map from node to anti-dependency uses (map mutator -> - * reads). - */ -pub fn flip_antideps(antideps: &Vec<(NodeID, NodeID)>) -> BTreeMap<NodeID, Vec<NodeID>> { - let mut result: BTreeMap<NodeID, Vec<NodeID>> = BTreeMap::new(); - - for (read, mutator) in antideps { - result.entry(*mutator).or_default().push(*read); - } - - result -} - -/* - * Utility to make a map from node to anti-dependency users (map reads -> - * mutators). 
- */ -pub fn map_antideps(antideps: &Vec<(NodeID, NodeID)>) -> BTreeMap<NodeID, Vec<NodeID>> { - let mut result: BTreeMap<NodeID, Vec<NodeID>> = BTreeMap::new(); - - for (read, mutator) in antideps { - result.entry(*read).or_default().push(*mutator); - } - - result -} diff --git a/hercules_ir/src/collections.rs b/hercules_ir/src/collections.rs index 23d84b1b6629f0a477217d0657085d234bf6cfe1..8bb1b359fdbf27c44baaec5ac129419abb066331 100644 --- a/hercules_ir/src/collections.rs +++ b/hercules_ir/src/collections.rs @@ -285,8 +285,8 @@ pub fn collection_objects( Node::Read { collect: _, indices: _, - } - | Node::Write { + } if !module.types[typing[id.idx()].idx()].is_primitive() => inputs[0].clone(), + Node::Write { collect: _, data: _, indices: _, diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs index 5ef16bb1c61846a36b78e308767838c03fb8ede0..c05f2606b2d90ae06b10f47cd2028b8a3ca974f5 100644 --- a/hercules_ir/src/dot.rs +++ b/hercules_ir/src/dot.rs @@ -20,7 +20,6 @@ pub fn xdot_module( reverse_postorders: &Vec<Vec<NodeID>>, doms: Option<&Vec<DomTree>>, fork_join_maps: Option<&Vec<HashMap<NodeID, NodeID>>>, - bbs: Option<&Vec<BasicBlocks>>, ) { let mut tmp_path = temp_dir(); let mut rng = rand::thread_rng(); @@ -33,7 +32,6 @@ pub fn xdot_module( &reverse_postorders, doms, fork_join_maps, - bbs, &mut contents, ) .expect("PANIC: Unable to generate output file contents."); @@ -55,7 +53,6 @@ pub fn write_dot<W: Write>( reverse_postorders: &Vec<Vec<NodeID>>, doms: Option<&Vec<DomTree>>, fork_join_maps: Option<&Vec<HashMap<NodeID, NodeID>>>, - bbs: Option<&Vec<BasicBlocks>>, w: &mut W, ) -> std::fmt::Result { write_digraph_header(w)?; @@ -170,28 +167,6 @@ pub fn write_dot<W: Write>( } } - // Step 4: draw BB edges in olive. - if let Some(bbs) = bbs { - let bbs = &bbs[function_id.idx()]; - for node_idx in 0..bbs.0.len() { - let maybe_data = NodeID::new(node_idx); - let control = bbs.0[node_idx]; - if maybe_data != control { - write_edge( - maybe_data, - function_id, - control, - function_id, - true, - "olivedrab4, constraint=false", - "dotted", - &module, - w, - )?; - } - } - } - write_graph_footer(w)?; } diff --git a/hercules_ir/src/fork_join_analysis.rs b/hercules_ir/src/fork_join_analysis.rs new file mode 100644 index 0000000000000000000000000000000000000000..5fe6b13221144e7c1dcbaa645d5446da5c8a2a06 --- /dev/null +++ b/hercules_ir/src/fork_join_analysis.rs @@ -0,0 +1,131 @@ +extern crate bitvec; + +use std::collections::{HashMap, HashSet}; + +use self::bitvec::prelude::*; + +use crate::*; + +/* + * Top level function for creating a fork-join map. Map is from fork node ID to + * join node ID, since a join can easily determine the fork it corresponds to + * (that's the mechanism used to implement this analysis). This analysis depends + * on type information. + */ +pub fn fork_join_map(function: &Function, control: &Subgraph) -> HashMap<NodeID, NodeID> { + let mut fork_join_map = HashMap::new(); + for idx in 0..function.nodes.len() { + // We only care about join nodes. + if function.nodes[idx].is_join() { + // Iterate the control predecessors until finding a fork. Maintain a + // counter of unmatched fork-join pairs seen on the way, since fork- + // joins may be nested. Every join is dominated by their fork, so + // just iterate the first unseen predecessor of each control node. 
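+            //
+            // For example, in the nest fork1 -> fork2 -> join2 -> join1,
+            // walking up from join1 first sees join2 (unpaired becomes 1),
+            // then fork2 (unpaired drops back to 0), and finally fork1,
+            // which is the matching fork.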
+ let join_id = NodeID::new(idx); + let mut unpaired = 0; + let mut cursor = join_id; + let mut seen = HashSet::<NodeID>::new(); + let fork_id = loop { + cursor = control + .preds(cursor) + .filter(|pred| !seen.contains(pred)) + .next() + .unwrap(); + seen.insert(cursor); + + if function.nodes[cursor.idx()].is_join() { + unpaired += 1; + } else if function.nodes[cursor.idx()].is_fork() && unpaired > 0 { + unpaired -= 1; + } else if function.nodes[cursor.idx()].is_fork() { + break cursor; + } + }; + fork_join_map.insert(fork_id, join_id); + } + } + fork_join_map +} + +/* + * Find fork/join nests that each control node is inside of. Result is a map + * from each control node to a list of fork nodes. The fork nodes are listed in + * ascending order of nesting. + */ +pub fn compute_fork_join_nesting( + function: &Function, + dom: &DomTree, + fork_join_map: &HashMap<NodeID, NodeID>, +) -> HashMap<NodeID, Vec<NodeID>> { + // For each control node, ascend dominator tree, looking for fork nodes. For + // each fork node, make sure each control node isn't strictly dominated by + // the corresponding join node. + (0..function.nodes.len()) + .map(NodeID::new) + .filter(|id| dom.contains(*id)) + .map(|id| { + ( + id, + dom.ascend(id) + // Filter for forks that dominate this control node, + .filter(|id| function.nodes[id.idx()].is_fork()) + // where its corresponding join doesn't dominate the control + // node (if so, then this control is after the fork-join). + .filter(|fork_id| !dom.does_prop_dom(fork_join_map[&fork_id], id)) + .collect(), + ) + }) + .collect() +} + +/* + * Check if a data node dominates a control node. This involves checking all + * immediate control uses to see if they dominate the queried control node. + */ +pub fn does_data_dom_control( + function: &Function, + data: NodeID, + control: NodeID, + dom: &DomTree, +) -> bool { + let mut stack = vec![data]; + let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()]; + visited.set(data.idx(), true); + + while let Some(pop) = stack.pop() { + let node = &function.nodes[pop.idx()]; + + let imm_control = match node { + Node::Phi { control, data: _ } + | Node::Reduce { + control, + init: _, + reduct: _, + } + | Node::Call { + control, + function: _, + dynamic_constants: _, + args: _, + } => Some(*control), + _ if node.is_control() => Some(pop), + _ => { + for u in get_uses(node).as_ref() { + if !visited[u.idx()] { + visited.set(u.idx(), true); + stack.push(*u); + } + } + None + } + }; + + if let Some(imm_control) = imm_control + && !dom.does_dom(imm_control, control) + { + return false; + } + } + + true +} diff --git a/hercules_ir/src/gcm.rs b/hercules_ir/src/gcm.rs deleted file mode 100644 index 3718df9b00d0e262572d4602a9c9555ea9f6bb98..0000000000000000000000000000000000000000 --- a/hercules_ir/src/gcm.rs +++ /dev/null @@ -1,391 +0,0 @@ -extern crate bitvec; - -use std::collections::{HashMap, HashSet, VecDeque}; -use std::iter::{zip, FromIterator}; - -use self::bitvec::prelude::*; - -use crate::*; - -/* - * Basic block info consists of two things: - * - * 1. A map from node to block (named by control nodes). - * 2. For each node, which nodes are in its own block. - * - * Note that for #2, the structure is Vec<NodeID>, meaning the nodes are ordered - * inside the block. This order corresponds to the traversal order of the nodes - * in the block needed by the backend code generators. - */ -pub type BasicBlocks = (Vec<NodeID>, Vec<Vec<NodeID>>); - -/* - * Top level global code motion function. 
Assigns each data node to one of its - * immediate control use / user nodes, forming (unordered) basic blocks. Returns - * the control node / basic block each node is in. Takes in a partial - * partitioning that must be respected. Based on the schedule-early-schedule- - * late method from Cliff Click's PhD thesis. - */ -pub fn gcm( - function: &Function, - def_use: &ImmutableDefUseMap, - reverse_postorder: &Vec<NodeID>, - control_subgraph: &Subgraph, - dom: &DomTree, - antideps: &Vec<(NodeID, NodeID)>, - loops: &LoopTree, - fork_join_map: &HashMap<NodeID, NodeID>, -) -> BasicBlocks { - let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()]; - let back_edges = control_subgraph.back_edges(NodeID::new(0)); - let no_loop_reachability = - control_subgraph.pairwise_reachability(|src, dst| !back_edges.contains(&(src, dst))); - let antideps_users = map_antideps(antideps); - let antideps_uses = flip_antideps(antideps); - - // Step 1: assign the basic block locations of all nodes that must be in a - // specific block. This includes control nodes as well as some special data - // nodes, such as phis. - for idx in 0..function.nodes.len() { - match function.nodes[idx] { - Node::Phi { control, data: _ } => bbs[idx] = Some(control), - Node::ThreadID { - control, - dimension: _, - } => bbs[idx] = Some(control), - Node::Reduce { - control, - init: _, - reduct: _, - } => bbs[idx] = Some(control), - Node::Call { - control, - function: _, - dynamic_constants: _, - args: _, - } => bbs[idx] = Some(control), - Node::Parameter { index: _ } => bbs[idx] = Some(NodeID::new(0)), - Node::Constant { id: _ } => bbs[idx] = Some(NodeID::new(0)), - Node::DynamicConstant { id: _ } => bbs[idx] = Some(NodeID::new(0)), - _ if function.nodes[idx].is_control() => bbs[idx] = Some(NodeID::new(idx)), - _ => {} - } - } - - // Step 2: schedule early. Place nodes in the earliest position they could - // go - use worklist to iterate nodes. - let mut schedule_early = bbs.clone(); - let mut worklist = VecDeque::from(reverse_postorder.clone()); - while let Some(id) = worklist.pop_front() { - if schedule_early[id.idx()].is_some() { - continue; - } - - // For every use, check what block is its "schedule early" block. This - // node goes in the lowest block amongst those blocks. - let use_places: Option<Vec<NodeID>> = get_uses(&function.nodes[id.idx()]) - .as_ref() - .into_iter() - .map(|id| *id) - .map(|id| schedule_early[id.idx()]) - .collect(); - if let Some(use_places) = use_places { - // If every use has been placed, we can place this node as the - // lowest place in the domtree that dominates all of the use places. - let lowest = dom.lowest_amongst(use_places.into_iter()); - schedule_early[id.idx()] = Some(lowest); - } else { - // If not, then just push this node back on the worklist. - worklist.push_back(id); - } - } - - // Step 3: schedule late and pick each nodes final position. Since the late - // schedule of each node depends on the final positions of its users, these - // two steps must be fused. Compute their latest position, then use the - // control dependent + shallow loop heuristic to actually place them. - let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map - .into_iter() - .map(|(fork, join)| (*join, *fork)) - .collect(); - let mut worklist = VecDeque::from_iter(reverse_postorder.into_iter().map(|id| *id).rev()); - 'worklist: while let Some(id) = worklist.pop_front() { - if bbs[id.idx()].is_some() { - continue; - } - - // Calculate the least common ancestor of user blocks, a.k.a. 
the "late" - // schedule. - let calculate_lca = || -> Option<_> { - let mut lca = None; - // Helper to incrementally update the LCA. - let mut update_lca = |a| { - if let Some(acc) = lca { - lca = Some(dom.least_common_ancestor(acc, a)); - } else { - lca = Some(a); - } - }; - - // For every user, consider where we need to be to directly dominate the - // user. - for user in def_use.get_users(id).as_ref().into_iter().map(|id| *id) { - if let Node::Phi { control, data } = &function.nodes[user.idx()] { - // For phis, we need to dominate the block jumping to the phi in - // the slot that corresponds to our use. - for (control, data) in - zip(get_uses(&function.nodes[control.idx()]).as_ref(), data) - { - if id == *data { - update_lca(*control); - } - } - } else if let Node::Reduce { - control, - init, - reduct, - } = &function.nodes[user.idx()] - { - // For reduces, we need to either dominate the block right - // before the fork if we're the init input, or we need to - // dominate the join if we're the reduct input. - if id == *init { - let before_fork = function.nodes[join_fork_map[control].idx()] - .try_fork() - .unwrap() - .0; - update_lca(before_fork); - } else { - assert_eq!(id, *reduct); - update_lca(*control); - } - } else { - // For everything else, we just need to dominate the user. - update_lca(bbs[user.idx()]?); - } - } - - Some(lca) - }; - - // Check if all users have been placed. If one of them hasn't, then add - // this node back on to the worklist. - let Some(lca) = calculate_lca() else { - worklist.push_back(id); - continue; - }; - - // Check if all anti-dependency users have been placed. If one of them - // hasn't, then add this node back on to the worklist. We need to know - // where the anti-dependency users are, so that we can place this - // read "above" mutators that anti-depend on it. The condition for a - // potential placement location is that in the CFG *without loop back- - // edges* the mutator cannot reach the read. Ask Russel about why this - // works, hopefully I'll have a convincing argument by then ;). - let mut antidep_user_locations = vec![]; - for antidep_user in antideps_users.get(&id).unwrap_or(&vec![]) { - if let Some(location) = bbs[antidep_user.idx()] { - antidep_user_locations.push(location); - } else { - worklist.push_back(id); - continue 'worklist; - } - } - - // Look between the LCA and the schedule early location to place the - // node. - let schedule_early = schedule_early[id.idx()].unwrap(); - let mut chain = dom - // If the node has no users, then it doesn't really matter where we - // place it - just place it at the early placement. - .chain(lca.unwrap_or(schedule_early), schedule_early) - // Only allow locations that don't violate the anti-depence property - // listed above. - .filter(|location| { - !antidep_user_locations.iter().any(|antidep_user_location| { - antidep_user_location != location - && no_loop_reachability[antidep_user_location.idx()][location.idx()] - }) - }); - let mut location = chain.next().unwrap(); - while let Some(control_node) = chain.next() { - // If the next node further up the dominator tree is in a shallower - // loop nest or if we can get out of a reduce loop when we don't - // need to be in one, place this data node in a higher-up location. - let shallower_nest = if let (Some(old_nest), Some(new_nest)) = - (loops.nesting(location), loops.nesting(control_node)) - { - old_nest > new_nest - } else { - false - }; - // This will move all nodes that don't need to be in reduce loops - // outside of reduce loops. 
Nodes that do need to be in a reduce - // loop use the reduce node forming the loop, so the dominator chain - // will consist of one block, and this loop won't ever iterate. - let currently_at_join = function.nodes[location.idx()].is_join(); - if shallower_nest || currently_at_join { - location = control_node; - } - } - - bbs[id.idx()] = Some(location); - } - let bbs: Vec<_> = bbs.into_iter().map(Option::unwrap).collect(); - - // Step 4: determine the order of nodes inside each block. Use worklist to - // add nodes to blocks in order that obeys dependencies. - let mut order: Vec<Vec<NodeID>> = vec![vec![]; function.nodes.len()]; - let mut worklist = VecDeque::from_iter( - reverse_postorder - .into_iter() - .filter(|id| !function.nodes[id.idx()].is_control()), - ); - let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()]; - while let Some(id) = worklist.pop_front() { - let node = &function.nodes[id.idx()]; - if node.is_phi() - || node.is_reduce() - || get_uses(node) - .as_ref() - .into_iter() - .chain(antideps_uses.get(&id).into_iter().flatten()) - .all(|u| { - function.nodes[u.idx()].is_control() - || bbs[u.idx()] != bbs[id.idx()] - || visited[u.idx()] - }) - { - order[bbs[id.idx()].idx()].push(*id); - visited.set(id.idx(), true); - } else { - worklist.push_back(id); - } - } - - (bbs, order) -} - -/* - * Top level function for creating a fork-join map. Map is from fork node ID to - * join node ID, since a join can easily determine the fork it corresponds to - * (that's the mechanism used to implement this analysis). This analysis depends - * on type information. - */ -pub fn fork_join_map(function: &Function, control: &Subgraph) -> HashMap<NodeID, NodeID> { - let mut fork_join_map = HashMap::new(); - for idx in 0..function.nodes.len() { - // We only care about join nodes. - if function.nodes[idx].is_join() { - // Iterate the control predecessors until finding a fork. Maintain a - // counter of unmatched fork-join pairs seen on the way, since fork- - // joins may be nested. Every join is dominated by their fork, so - // just iterate the first unseen predecessor of each control node. - let join_id = NodeID::new(idx); - let mut unpaired = 0; - let mut cursor = join_id; - let mut seen = HashSet::<NodeID>::new(); - let fork_id = loop { - cursor = control - .preds(cursor) - .filter(|pred| !seen.contains(pred)) - .next() - .unwrap(); - seen.insert(cursor); - - if function.nodes[cursor.idx()].is_join() { - unpaired += 1; - } else if function.nodes[cursor.idx()].is_fork() && unpaired > 0 { - unpaired -= 1; - } else if function.nodes[cursor.idx()].is_fork() { - break cursor; - } - }; - fork_join_map.insert(fork_id, join_id); - } - } - fork_join_map -} - -/* - * Find fork/join nests that each control node is inside of. Result is a map - * from each control node to a list of fork nodes. The fork nodes are listed in - * ascending order of nesting. - */ -pub fn compute_fork_join_nesting( - function: &Function, - dom: &DomTree, - fork_join_map: &HashMap<NodeID, NodeID>, -) -> HashMap<NodeID, Vec<NodeID>> { - // For each control node, ascend dominator tree, looking for fork nodes. For - // each fork node, make sure each control node isn't strictly dominated by - // the corresponding join node. 
- (0..function.nodes.len()) - .map(NodeID::new) - .filter(|id| dom.contains(*id)) - .map(|id| { - ( - id, - dom.ascend(id) - // Filter for forks that dominate this control node, - .filter(|id| function.nodes[id.idx()].is_fork()) - // where its corresponding join doesn't dominate the control - // node (if so, then this control is after the fork-join). - .filter(|fork_id| !dom.does_prop_dom(fork_join_map[&fork_id], id)) - .collect(), - ) - }) - .collect() -} - -/* - * Check if a data node dominates a control node. This involves checking all - * immediate control uses to see if they dominate the queried control node. - */ -pub fn does_data_dom_control( - function: &Function, - data: NodeID, - control: NodeID, - dom: &DomTree, -) -> bool { - let mut stack = vec![data]; - let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()]; - visited.set(data.idx(), true); - - while let Some(pop) = stack.pop() { - let node = &function.nodes[pop.idx()]; - - let imm_control = match node { - Node::Phi { control, data: _ } - | Node::Reduce { - control, - init: _, - reduct: _, - } - | Node::Call { - control, - function: _, - dynamic_constants: _, - args: _, - } => Some(*control), - _ if node.is_control() => Some(pop), - _ => { - for u in get_uses(node).as_ref() { - if !visited[u.idx()] { - visited.set(u.idx(), true); - stack.push(*u); - } - } - None - } - }; - - if let Some(imm_control) = imm_control - && !dom.does_dom(imm_control, control) - { - return false; - } - } - - true -} diff --git a/hercules_ir/src/lib.rs b/hercules_ir/src/lib.rs index 05e5e2e860a122392a668a254d54a7a5917db3f4..32bbf6310ff7ea0415383dc3fd7176043de835ee 100644 --- a/hercules_ir/src/lib.rs +++ b/hercules_ir/src/lib.rs @@ -6,7 +6,6 @@ iter_intersperse )] -pub mod antideps; pub mod build; pub mod callgraph; pub mod collections; @@ -14,7 +13,7 @@ pub mod dataflow; pub mod def_use; pub mod dom; pub mod dot; -pub mod gcm; +pub mod fork_join_analysis; pub mod ir; pub mod loops; pub mod parse; @@ -22,7 +21,6 @@ pub mod subgraph; pub mod typecheck; pub mod verify; -pub use crate::antideps::*; pub use crate::build::*; pub use crate::callgraph::*; pub use crate::collections::*; @@ -30,7 +28,7 @@ pub use crate::dataflow::*; pub use crate::def_use::*; pub use crate::dom::*; pub use crate::dot::*; -pub use crate::gcm::*; +pub use crate::fork_join_analysis::*; pub use crate::ir::*; pub use crate::loops::*; pub use crate::parse::*; diff --git a/hercules_ir/src/loops.rs b/hercules_ir/src/loops.rs index 7c9a0a85949efcc248439031601b2fed17f0acf6..3ab3313fa43570e118e9cb690b464df3cc01c5de 100644 --- a/hercules_ir/src/loops.rs +++ b/hercules_ir/src/loops.rs @@ -25,6 +25,7 @@ use crate::*; pub struct LoopTree { root: NodeID, loops: HashMap<NodeID, (BitVec<u8, Lsb0>, NodeID)>, + inverse_loops: HashMap<NodeID, NodeID>, nesting: HashMap<NodeID, usize>, } @@ -45,6 +46,10 @@ impl LoopTree { header == self.root || self.loops[&header].0[is_in.idx()] } + pub fn header_of(&self, control_node: NodeID) -> Option<NodeID> { + self.inverse_loops.get(&control_node).map(|h| *h) + } + /* * Sometimes, we need to iterate the loop tree bottom-up. Just assemble the * order upfront. @@ -149,7 +154,16 @@ pub fn loops( }) .collect(); - // Step 6: compute loop tree nesting. + // Step 6: compute the inverse loop map - this maps control nodes to which + // loop they are in (keyed by header), if they are in one. 
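+    // This map backs the `header_of` query added above with a single lookup.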
+ let mut inverse_loops = HashMap::new(); + for (header, (contents, _)) in loops.iter() { + for idx in contents.iter_ones() { + inverse_loops.insert(NodeID::new(idx), *header); + } + } + + // Step 7: compute loop tree nesting. let mut nesting = HashMap::new(); let mut worklist: VecDeque<NodeID> = loops.keys().map(|id| *id).collect(); while let Some(header) = worklist.pop_front() { @@ -166,6 +180,7 @@ pub fn loops( LoopTree { root, loops, + inverse_loops, nesting, } } diff --git a/hercules_ir/src/subgraph.rs b/hercules_ir/src/subgraph.rs index 89e8bcc64febd6fe36ec69d0d3a68a0dc0eda348..a2aedadf0fbc996bc0eb46feae77ee7a526de491 100644 --- a/hercules_ir/src/subgraph.rs +++ b/hercules_ir/src/subgraph.rs @@ -203,6 +203,33 @@ impl Subgraph { edges } + pub fn rev_po(&self, root: NodeID) -> Vec<NodeID> { + let mut order = vec![]; + let mut stack = vec![]; + let mut visited = bitvec![u8, Lsb0; 0; self.original_num_nodes as usize]; + + stack.push(root); + visited.set(root.idx(), true); + + while let Some(pop) = stack.pop() { + if self.succs(pop).any(|succ| !visited[succ.idx()]) { + stack.push(pop); + for succ in self.succs(pop) { + if !visited[succ.idx()] { + visited.set(succ.idx(), true); + stack.push(succ); + break; + } + } + } else { + order.push(pop); + } + } + + order.reverse(); + order + } + pub fn pairwise_reachability<P>(&self, p: P) -> Vec<BitVec<u8, Lsb0>> where P: Fn(NodeID, NodeID) -> bool, diff --git a/hercules_ir/src/typecheck.rs b/hercules_ir/src/typecheck.rs index c657d5987f005a721ffe663ee22fa6b8fc877b43..d6862c354199dc748797e47d4f663f898df24d7b 100644 --- a/hercules_ir/src/typecheck.rs +++ b/hercules_ir/src/typecheck.rs @@ -984,10 +984,6 @@ fn typeflow( data: _, indices, } => { - if indices.len() == 0 { - return Error(String::from("Write node must have at least one index.")); - } - // Traverse the collect input's type tree downwards. if let (Concrete(mut collect_id), Concrete(data_id)) = (inputs[0], inputs[1]) { for index in indices.iter() { diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs index 0c97abff6429a76f03481542f03c9ba7cd09a5f3..4ff08e6927103ed39d8025ab1fac7bc52d52984a 100644 --- a/hercules_opt/src/editor.rs +++ b/hercules_opt/src/editor.rs @@ -25,6 +25,7 @@ pub struct FunctionEditor<'a> { // Wraps a mutable reference to a function. Doesn't provide access to this // reference directly, so that we can monitor edits. function: &'a mut Function, + function_id: FunctionID, // Keep a RefCell to (dynamic) constants and types to allow function changes // to update these constants: &'a RefCell<Vec<Constant>>, @@ -69,6 +70,7 @@ pub struct FunctionEdit<'a: 'b, 'b> { impl<'a: 'b, 'b> FunctionEditor<'a> { pub fn new( function: &'a mut Function, + function_id: FunctionID, constants: &'a RefCell<Vec<Constant>>, dynamic_constants: &'a RefCell<Vec<DynamicConstant>>, types: &'a RefCell<Vec<Type>>, @@ -87,6 +89,7 @@ impl<'a: 'b, 'b> FunctionEditor<'a> { FunctionEditor { function, + function_id, constants, dynamic_constants, types, @@ -218,6 +221,10 @@ impl<'a: 'b, 'b> FunctionEditor<'a> { &self.function } + pub fn func_id(&self) -> FunctionID { + self.function_id + } + pub fn get_dynamic_constants(&self) -> Ref<'_, Vec<DynamicConstant>> { self.dynamic_constants.borrow() } @@ -660,6 +667,7 @@ fn func(x: i32) -> i32 // Edit the function by replacing the add with a multiply. 
let mut editor = FunctionEditor::new( func, + FunctionID::new(0), &constants_ref, &dynamic_constants_ref, &types_ref, diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs new file mode 100644 index 0000000000000000000000000000000000000000..254524f9eb75865a0d0c480dd5aa7fd71115e374 --- /dev/null +++ b/hercules_opt/src/legalize_reference_semantics.rs @@ -0,0 +1,835 @@ +extern crate bitvec; +extern crate hercules_cg; +extern crate hercules_ir; + +use std::collections::{BTreeSet, HashMap, VecDeque}; +use std::iter::{empty, once, zip, FromIterator}; +use std::mem::take; + +use self::bitvec::prelude::*; + +use self::hercules_cg::*; +use self::hercules_ir::*; + +use crate::*; + +/* + * Top level function to legalize the reference semantics of a Hercules IR + * function. Hercules IR is a value semantics representation, meaning that all + * program state is in the form of copyable values, and mutation takes place by + * making a new value that is a copy of the old value with some modification. + * This representation is extremely convenient for optimization, but is not good + * for code generation, where we need to generate code with references to get + * good performance. Hercules IR can alternatively be interpreted using + * reference semantics, where pointers to collection objects are passed around, + * read from, and written to. However, the value semantics and reference + * semantics interpretation of a Hercules IR function may not be equal - this + * pass transforms a Hercules IR function such that its new value semantics is + * the same as its old value semantics and that its new reference semantics is + * the same as its new value semantics. This pass returns a placement of nodes + * into ordered basic blocks, since the reference semantics of a function + * depends on the order of execution with respect to anti-dependencies. Clones + * are inserted sparingly when there are two write users of a single collection + * or if a read user cannot be scheduled before a write user. + */ +pub fn legalize_reference_semantics( + editor: &mut FunctionEditor, + def_use: &ImmutableDefUseMap, + reverse_postorder: &Vec<NodeID>, + typing: &Vec<TypeID>, + control_subgraph: &Subgraph, + dom: &DomTree, + fork_join_map: &HashMap<NodeID, NodeID>, + loops: &LoopTree, + objects: &CollectionObjects, +) -> Option<BasicBlocks> { + // Repeatedly try to place nodes into basic blocks. If clones are induced, + // re-try. Specifically, repeat the following procedure until no new clones: + // + // 1. Attempt to place nodes in basic blocks. If a node can't be placed due + // to anti-dependency edges, induce a clone on the read and go back to + // step 1. + // 2. Check for any write-induced clones. If there are any, go back to step + // 1. + // + // Since each analysis needs to be re-calculated in each iteration, this + // function just implements the body of the described loop. The re-try logic + // is found in pass.rs. When a re-try is needed, no basic block assignment + // is returned. When a re-try isn't needed (no new clones were found), a + // basic block assignment is returned. 
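+    //
+    // A hypothetical caller-side sketch of that protocol (the real re-try
+    // loop lives in pass.rs):
+    //
+    //   while legalize_reference_semantics(&mut editor, ...).is_none() {
+    //       // Clones were induced - recompute the analyses and try again.
+    //   }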
+ let bbs = match basic_blocks( + editor.func(), + editor.func_id(), + def_use, + reverse_postorder, + control_subgraph, + dom, + loops, + fork_join_map, + objects, + ) { + Ok(bbs) => bbs, + Err((obj, reader)) => { + induce_clone(editor, typing, obj, reader); + return None; + } + }; + if materialize_clones(editor, typing, control_subgraph, objects, &bbs) { + None + } else { + Some(bbs) + } +} + +/* + * Top level global code motion function. Assigns each data node to one of its + * immediate control use / user nodes, forming (unordered) basic blocks. Returns + * the control node / basic block each node is in. Takes in a partial + * partitioning that must be respected. Based on the schedule-early-schedule- + * late method from Cliff Click's PhD thesis. May fail if an anti-dependency + * edge can't be satisfied - in this case, a clone that has to be induced is + * returned instead. + */ +fn basic_blocks( + function: &Function, + func_id: FunctionID, + def_use: &ImmutableDefUseMap, + reverse_postorder: &Vec<NodeID>, + control_subgraph: &Subgraph, + dom: &DomTree, + loops: &LoopTree, + fork_join_map: &HashMap<NodeID, NodeID>, + objects: &CollectionObjects, +) -> Result<BasicBlocks, (NodeID, NodeID)> { + let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()]; + + // Step 1: assign the basic block locations of all nodes that must be in a + // specific block. This includes control nodes as well as some special data + // nodes, such as phis. + for idx in 0..function.nodes.len() { + match function.nodes[idx] { + Node::Phi { control, data: _ } => bbs[idx] = Some(control), + Node::ThreadID { + control, + dimension: _, + } => bbs[idx] = Some(control), + Node::Reduce { + control, + init: _, + reduct: _, + } => bbs[idx] = Some(control), + Node::Call { + control, + function: _, + dynamic_constants: _, + args: _, + } => bbs[idx] = Some(control), + Node::Parameter { index: _ } => bbs[idx] = Some(NodeID::new(0)), + Node::Constant { id: _ } => bbs[idx] = Some(NodeID::new(0)), + Node::DynamicConstant { id: _ } => bbs[idx] = Some(NodeID::new(0)), + _ if function.nodes[idx].is_control() => bbs[idx] = Some(NodeID::new(idx)), + _ => {} + } + } + + // Step 2: schedule early. Place nodes in the earliest position they could + // go - use worklist to iterate nodes. + let mut schedule_early = bbs.clone(); + let mut worklist = VecDeque::from(reverse_postorder.clone()); + while let Some(id) = worklist.pop_front() { + if schedule_early[id.idx()].is_some() { + continue; + } + + // For every use, check what block is its "schedule early" block. This + // node goes in the lowest block amongst those blocks. + let use_places: Option<Vec<NodeID>> = get_uses(&function.nodes[id.idx()]) + .as_ref() + .into_iter() + .map(|id| *id) + .map(|id| schedule_early[id.idx()]) + .collect(); + if let Some(use_places) = use_places { + // If every use has been placed, we can place this node as the + // lowest place in the domtree that dominates all of the use places. + let lowest = dom.lowest_amongst(use_places.into_iter()); + schedule_early[id.idx()] = Some(lowest); + } else { + // If not, then just push this node back on the worklist. + worklist.push_back(id); + } + } + + // Step 3: find anti-dependence edges. An anti-dependence edge needs to be + // drawn between a collection reading node and a collection mutating node + // when the following conditions are true: + // + // 1: The reading and mutating nodes may involve the same collection. 
+ // 2: The node producing the collection used by the reading node is in a + // schedule early block that dominates the schedule early block of the + // mutating node. The node producing the collection used by the reading + // node may be an originator of a collection, phi or reduce, or mutator, + // but not forwarding read - forwarding reads are collapsed, and the + // bottom read is treated as reading from the transitive parent of the + // forwarding read(s). + let mut antideps = BTreeSet::new(); + for id in reverse_postorder.iter() { + // Find a terminating read node and the collections it reads. + let terminating_reads: BTreeSet<_> = + terminating_reads(function, func_id, *id, objects).collect(); + if !terminating_reads.is_empty() { + // Walk forwarding reads to find anti-dependency roots. + let mut workset = terminating_reads.clone(); + let mut roots = BTreeSet::new(); + while let Some(pop) = workset.pop_first() { + let forwarded: BTreeSet<_> = + forwarding_reads(function, func_id, pop, objects).collect(); + if forwarded.is_empty() { + roots.insert(pop); + } else { + workset.extend(forwarded); + } + } + + // For each root, find mutating nodes dominated by the root that + // modify an object read on any input of the current node (the + // terminating read). + // TODO: make this less outrageously inefficient. + let func_objects = &objects[&func_id]; + for root in roots.iter() { + let root_early = schedule_early[root.idx()].unwrap(); + let mut root_block_iterated_users: BTreeSet<NodeID> = BTreeSet::new(); + let mut workset = BTreeSet::new(); + workset.insert(*root); + while let Some(pop) = workset.pop_first() { + let users = def_use.get_users(pop).into_iter().filter(|user| { + !function.nodes[user.idx()].is_phi() + && !function.nodes[user.idx()].is_reduce() + && schedule_early[user.idx()].unwrap() == root_early + }); + workset.extend(users.clone()); + root_block_iterated_users.extend(users); + } + let read_objs: BTreeSet<_> = terminating_reads + .iter() + .map(|read_use| func_objects.objects(*read_use).into_iter()) + .flatten() + .map(|id| *id) + .collect(); + for mutator in reverse_postorder.iter() { + let mutator_early = schedule_early[mutator.idx()].unwrap(); + if dom.does_dom(root_early, mutator_early) + && (root_early != mutator_early + || root_block_iterated_users.contains(&mutator)) + && mutating_objects(function, func_id, *mutator, objects) + .any(|mutated| read_objs.contains(&mutated)) + { + antideps.insert((*id, *mutator)); + } + } + } + } + } + let mut antideps_uses = vec![vec![]; function.nodes.len()]; + let mut antideps_users = vec![vec![]; function.nodes.len()]; + for (reader, mutator) in antideps.iter() { + antideps_uses[mutator.idx()].push(*reader); + antideps_users[reader.idx()].push(*mutator); + } + + // Step 4: schedule late and pick each nodes final position. Since the late + // schedule of each node depends on the final positions of its users, these + // two steps must be fused. Compute their latest position, then use the + // control dependent + shallow loop heuristic to actually place them. + let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map + .into_iter() + .map(|(fork, join)| (*join, *fork)) + .collect(); + let mut worklist = VecDeque::from_iter(reverse_postorder.into_iter().map(|id| *id).rev()); + while let Some(id) = worklist.pop_front() { + if bbs[id.idx()].is_some() { + continue; + } + + // Calculate the least common ancestor of user blocks, a.k.a. the "late" + // schedule. 
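+        // `calculate_lca` returns None if some user hasn't been placed yet;
+        // in that case this node is pushed back onto the worklist below.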
+ let calculate_lca = || -> Option<_> { + let mut lca = None; + // Helper to incrementally update the LCA. + let mut update_lca = |a| { + if let Some(acc) = lca { + lca = Some(dom.least_common_ancestor(acc, a)); + } else { + lca = Some(a); + } + }; + + // For every user, consider where we need to be to directly dominate the + // user. + for user in def_use + .get_users(id) + .as_ref() + .into_iter() + .chain(antideps_users[id.idx()].iter()) + .map(|id| *id) + { + if let Node::Phi { control, data } = &function.nodes[user.idx()] { + // For phis, we need to dominate the block jumping to the phi in + // the slot that corresponds to our use. + for (control, data) in + zip(get_uses(&function.nodes[control.idx()]).as_ref(), data) + { + if id == *data { + update_lca(*control); + } + } + } else if let Node::Reduce { + control, + init, + reduct, + } = &function.nodes[user.idx()] + { + // For reduces, we need to either dominate the block right + // before the fork if we're the init input, or we need to + // dominate the join if we're the reduct input. + if id == *init { + let before_fork = function.nodes[join_fork_map[control].idx()] + .try_fork() + .unwrap() + .0; + update_lca(before_fork); + } else { + assert_eq!(id, *reduct); + update_lca(*control); + } + } else { + // For everything else, we just need to dominate the user. + update_lca(bbs[user.idx()]?); + } + } + + Some(lca) + }; + + // Check if all users have been placed. If one of them hasn't, then add + // this node back on to the worklist. + let Some(lca) = calculate_lca() else { + worklist.push_back(id); + continue; + }; + + // Look between the LCA and the schedule early location to place the + // node. + let schedule_early = schedule_early[id.idx()].unwrap(); + let mut chain = dom + // If the node has no users, then it doesn't really matter where we + // place it - just place it at the early placement. + .chain(lca.unwrap_or(schedule_early), schedule_early); + + if let Some(mut location) = chain.next() { + /* + while let Some(control_node) = chain.next() { + // If the next node further up the dominator tree is in a shallower + // loop nest or if we can get out of a reduce loop when we don't + // need to be in one, place this data node in a higher-up location. + let old_nest = loops + .header_of(location) + .map(|header| loops.nesting(header).unwrap()); + let new_nest = loops + .header_of(control_node) + .map(|header| loops.nesting(header).unwrap()); + let shallower_nest = if let (Some(old_nest), Some(new_nest)) = (old_nest, new_nest) + { + old_nest > new_nest + } else { + // If the new location isn't a loop, it's nesting level should + // be considered "shallower" if the current location is in a + // loop. + old_nest.is_some() + }; + // This will move all nodes that don't need to be in reduce loops + // outside of reduce loops. Nodes that do need to be in a reduce + // loop use the reduce node forming the loop, so the dominator chain + // will consist of one block, and this loop won't ever iterate. + let currently_at_join = function.nodes[location.idx()].is_join(); + if shallower_nest || currently_at_join { + location = control_node; + } + } + */ + + bbs[id.idx()] = Some(location); + } else { + // If there is no valid location for this node, then it's a reading + // node of a collection that can't be placed above a mutation that + // anti-depend uses it. Thus, a clone needs to be induced. + todo!() + } + } + let bbs: Vec<_> = bbs.into_iter().map(Option::unwrap).collect(); + + // Step 5: determine the order of nodes inside each block. 
Use worklist to + // add nodes to blocks in order that obeys dependencies. + let mut order: Vec<Vec<NodeID>> = vec![vec![]; function.nodes.len()]; + let mut worklist = VecDeque::from_iter( + reverse_postorder + .into_iter() + .filter(|id| !function.nodes[id.idx()].is_control()), + ); + let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()]; + let mut no_change_iters = 0; + while no_change_iters <= worklist.len() + && let Some(id) = worklist.pop_front() + { + let node = &function.nodes[id.idx()]; + if node.is_phi() + || node.is_reduce() + || get_uses(node) + .as_ref() + .into_iter() + .chain(antideps_uses[id.idx()].iter()) + .all(|u| { + function.nodes[u.idx()].is_control() + || bbs[u.idx()] != bbs[id.idx()] + || visited[u.idx()] + }) + { + order[bbs[id.idx()].idx()].push(*id); + visited.set(id.idx(), true); + no_change_iters = 0; + } else { + worklist.push_back(id); + no_change_iters += 1; + } + } + + if no_change_iters == 0 { + Ok((bbs, order)) + } else { + // If the worklist exited without finishing, then there's at least one + // reading node of a collection that is in a anti-depend + normal depend + // use cycle with a mutating node. This cycle must be broken by inducing + // a clone. + todo!() + } +} + +fn terminating_reads<'a>( + function: &'a Function, + func_id: FunctionID, + reader: NodeID, + objects: &'a CollectionObjects, +) -> Box<dyn Iterator<Item = NodeID> + 'a> { + match function.nodes[reader.idx()] { + Node::Read { + collect, + indices: _, + } if objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)), + Node::Write { + collect: _, + data, + indices: _, + } if !objects[&func_id].objects(data).is_empty() => Box::new(once(data)), + Node::Call { + control: _, + function: callee, + dynamic_constants: _, + ref args, + } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| { + let objects = &objects[&callee]; + let returns = objects.returned_objects(); + let param_obj = objects.param_to_object(idx)?; + if !objects.is_mutated(param_obj) && !returns.contains(¶m_obj) { + Some(*arg) + } else { + None + } + })), + _ => Box::new(empty()), + } +} + +fn forwarding_reads<'a>( + function: &'a Function, + func_id: FunctionID, + reader: NodeID, + objects: &'a CollectionObjects, +) -> Box<dyn Iterator<Item = NodeID> + 'a> { + match function.nodes[reader.idx()] { + Node::Read { + collect, + indices: _, + } if !objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)), + Node::Ternary { + op: TernaryOperator::Select, + first: _, + second, + third, + } if !objects[&func_id].objects(reader).is_empty() => { + Box::new(once(second).chain(once(third))) + } + Node::Call { + control: _, + function: callee, + dynamic_constants: _, + ref args, + } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| { + let objects = &objects[&callee]; + let returns = objects.returned_objects(); + let param_obj = objects.param_to_object(idx)?; + if !objects.is_mutated(param_obj) && returns.contains(¶m_obj) { + Some(*arg) + } else { + None + } + })), + _ => Box::new(empty()), + } +} + +fn mutating_objects<'a>( + function: &'a Function, + func_id: FunctionID, + mutator: NodeID, + objects: &'a CollectionObjects, +) -> Box<dyn Iterator<Item = CollectionObjectID> + 'a> { + match function.nodes[mutator.idx()] { + Node::Write { + collect, + data: _, + indices: _, + } => Box::new(objects[&func_id].objects(collect).into_iter().map(|id| *id)), + Node::Call { + control: _, + function: callee, + dynamic_constants: _, + ref args, + } => Box::new( + 
args.into_iter() + .enumerate() + .filter_map(move |(idx, arg)| { + let callee_objects = &objects[&callee]; + let param_obj = callee_objects.param_to_object(idx)?; + if callee_objects.is_mutated(param_obj) { + Some(objects[&func_id].objects(*arg).into_iter().map(|id| *id)) + } else { + None + } + }) + .flatten(), + ), + _ => Box::new(empty()), + } +} + +/* + * Top level function to materialize clones of collections. This transformation + * eliminates the possibility of multiple independent writes (including dynamic + * writes) to a single collection by introducing extra collection constants and + * inserting explicit clones. This allows us to make the simplifying assumption + * in the backend that collections have reference, rather than value, semantics. + * The pass calling this function is mandatory for correctness. + */ +fn materialize_clones( + editor: &mut FunctionEditor, + typing: &Vec<TypeID>, + control_subgraph: &Subgraph, + objects: &CollectionObjects, + bbs: &BasicBlocks, +) -> bool { + // First, run dataflow analysis to figure out which access to collections + // induce clones. This dataflow analysis depends on basic block assignments + // and is more analogous to standard dataflow analysis in CFG + SSA IRs. + // This is the only place this form is used, so just hardcode it here. + // + // This forward dataflow analysis tracks which collections are used at each + // program point. Collections are referred to using node IDs. Specifically: + // + // - Phi - a phi node adds its inputs to the used set and removes itself + // from the used set. If a phi uses an ID that is used along the edge of + // the corresponding predecessor, a clone is induced. + // - Select - a select node adds its inputs to the used set and removes + // itself from the used set. If either use is already used, a clone is + // induced. + // - Reduce - a reduce node adds its inputs to the used set and removes + // itself from the used set. If the `init` input is already used, a clone + // is induced. If the `reduct` input is used at the end of the basic block + // containing the reduce, then a clone is induced. At the end of the basic + // block, the reduce removes itself from the used set. + // - Read - a read node that reads a sub-collections from a collection, + // rather than reading a primitive type, adds its input to the used set + // and removes itself from the used set. If the `collect` input is already + // used, a clone is induced. + // - Write - a write node adds its `collect` input to the used set and + // removes itself from the used set. If the `collect` input is already + // used, a clone is induced. + // - Call - a call node adds any mutated input or input that may be returned + // to the used set and removes itself from the used set. If any mutated + // input is already used, a clone is induced. + // + // Reads of sub-collections (select, read, and call nodes) use a collection + // because they may have downstream writes that depend on the new "view" of + // the same collection. This does not include reads that "end" (the `data` + // input of a write). This analysis does not consider parallel mutations in + // fork-joins, which are handled separately later in this function. 
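+    //
+    // As a small, hypothetical example of the rules above: if two independent
+    // write nodes take the same constant array as their `collect` input, the
+    // write reached second finds the array already in the used set, so a
+    // clone is induced for it.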
+ let rev_po = control_subgraph.rev_po(NodeID::new(0)); + let mut total_num_pts = 0; + let mut bb_to_prefix_sum = vec![0; bbs.0.len()]; + for ((idx, bb), insts) in zip(bbs.0.iter().enumerate(), bbs.1.iter()) { + if idx == bb.idx() { + bb_to_prefix_sum[idx] = total_num_pts; + total_num_pts += insts.len() + 1; + } + } + // Lattice maps each program point to a set of used values. Top is that no + // nodes are used yet. + let nodes = &editor.func().nodes; + let func_id = editor.func_id(); + let mut lattice: Vec<BTreeSet<NodeID>> = vec![BTreeSet::new(); total_num_pts]; + loop { + let mut changed = false; + + for bb in rev_po.iter() { + // The lattice value of the first point is the meet of the + // predecessor terminating lattice values. + let mut top_value = take(&mut lattice[bb_to_prefix_sum[bb.idx()]]); + // Clearing `top_value` is not necessary since used nodes are never + // removed from lattice values, only added. + for pred in control_subgraph.preds(*bb) { + // It should not be possible in Hercules IR for a basic block to + // be one of its own predecessors. + assert_ne!(*bb, pred); + let last_pt = bbs.1[pred.idx()].len(); + for elem in lattice[bb_to_prefix_sum[pred.idx()] + last_pt].iter() { + changed |= top_value.insert(*elem); + } + } + lattice[bb_to_prefix_sum[bb.idx()]] = top_value; + + // The lattice value of following points are determined by their + // immediate preceding instructions. + let insts = &bbs.1[bb.idx()]; + for (prev_pt, inst) in insts.iter().enumerate() { + let mut new_value = take(&mut lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1]); + let prev_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt]; + match nodes[inst.idx()] { + Node::Phi { + control: _, + ref data, + } if !objects[&func_id].objects(*inst).is_empty() => { + for elem in data { + changed |= new_value.insert(*elem); + } + changed |= new_value.remove(inst); + } + Node::Ternary { + op: TernaryOperator::Select, + first: _, + second, + third, + } => { + if !objects[&func_id].objects(*inst).is_empty() { + changed |= new_value.insert(second); + changed |= new_value.insert(third); + changed |= new_value.remove(inst); + } + } + Node::Reduce { + control: _, + init, + reduct, + } if !objects[&func_id].objects(*inst).is_empty() => { + changed |= new_value.insert(init); + changed |= new_value.insert(reduct); + changed |= new_value.remove(inst); + } + Node::Read { + collect, + indices: _, + } if !objects[&func_id].objects(*inst).is_empty() => { + changed |= new_value.insert(collect); + changed |= new_value.remove(inst); + } + Node::Write { + collect, + data: _, + indices: _, + } => { + changed |= new_value.insert(collect); + changed |= new_value.remove(inst); + } + Node::Call { + control: _, + function: callee, + dynamic_constants: _, + ref args, + } => { + let callee_objects = &objects[&callee]; + for (param_idx, arg) in args.into_iter().enumerate() { + if callee_objects + .param_to_object(param_idx) + .map(|object| { + callee_objects.is_mutated(object) + || callee_objects.returned_objects().contains(&object) + }) + .unwrap_or(false) + { + changed |= new_value.insert(*arg); + } + } + changed |= new_value.remove(inst); + } + _ => { + for elem in prev_value { + changed |= new_value.insert(*elem); + } + } + } + lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1] = new_value; + } + + // Handle reduces in this block specially at the very end. 
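+            // Per the description above, a reduce removes itself from the
+            // used set only at the end of its block; the assert below rules
+            // out clones inside a reduction cycle for now.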
+
+            // Handle reduces in this block specially at the very end.
+            let last_pt = insts.len();
+            let mut bottom_value = take(&mut lattice[bb_to_prefix_sum[bb.idx()] + last_pt]);
+            for inst in insts.iter() {
+                if let Node::Reduce {
+                    control: _,
+                    init: _,
+                    reduct,
+                } = nodes[inst.idx()]
+                {
+                    assert!(
+                        bottom_value.contains(&reduct),
+                        "PANIC: Can't handle clones inside a reduction cycle currently."
+                    );
+                    changed |= bottom_value.remove(inst);
+                }
+            }
+            lattice[bb_to_prefix_sum[bb.idx()] + last_pt] = bottom_value;
+        }
+
+        if !changed {
+            break;
+        }
+    }
+
+    // Now that we've computed the used-collections dataflow analysis, use the
+    // results to materialize a clone whenever a node attempts to use an
+    // already used node.
+    let mut any_induced = false;
+    let nodes = nodes.clone();
+    for bb in rev_po.iter() {
+        let insts = &bbs.1[bb.idx()];
+        for (prev_pt, inst) in insts.iter().enumerate() {
+            let value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
+            match nodes[inst.idx()] {
+                Node::Phi {
+                    control: _,
+                    ref data,
+                } => {
+                    // In phis, check if an argument is already used in the
+                    // predecessor's bottom lattice value (phis need to be
+                    // path-sensitive).
+                    for (pred, arg) in zip(control_subgraph.preds(*bb), data) {
+                        let last_pt = bbs.1[pred.idx()].len();
+                        let bottom = &lattice[bb_to_prefix_sum[pred.idx()] + last_pt];
+                        if bottom.contains(arg) {
+                            induce_clone(editor, typing, *arg, *inst);
+                            any_induced = true;
+                        }
+                    }
+                }
+                Node::Ternary {
+                    op: TernaryOperator::Select,
+                    first: _,
+                    second,
+                    third,
+                } => {
+                    if value.contains(&second) {
+                        induce_clone(editor, typing, second, *inst);
+                        any_induced = true;
+                    }
+                    if value.contains(&third) {
+                        induce_clone(editor, typing, third, *inst);
+                        any_induced = true;
+                    }
+                }
+                Node::Reduce {
+                    control: _,
+                    init,
+                    reduct: _,
+                } => {
+                    if value.contains(&init) {
+                        induce_clone(editor, typing, init, *inst);
+                        any_induced = true;
+                    }
+                }
+                Node::Read {
+                    collect,
+                    indices: _,
+                } if !objects[&func_id].objects(*inst).is_empty() => {
+                    if value.contains(&collect) {
+                        induce_clone(editor, typing, collect, *inst);
+                        any_induced = true;
+                    }
+                }
+                Node::Write {
+                    collect,
+                    data: _,
+                    indices: _,
+                } => {
+                    if value.contains(&collect) {
+                        induce_clone(editor, typing, collect, *inst);
+                        any_induced = true;
+                    }
+                }
+                Node::Call {
+                    control: _,
+                    function: callee,
+                    dynamic_constants: _,
+                    ref args,
+                } => {
+                    let callee_objects = &objects[&callee];
+                    for (param_idx, arg) in args.into_iter().enumerate() {
+                        if callee_objects
+                            .param_to_object(param_idx)
+                            .map(|object| {
+                                callee_objects.is_mutated(object)
+                                    || callee_objects.returned_objects().contains(&object)
+                            })
+                            .unwrap_or(false)
+                            && value.contains(arg)
+                        {
+                            induce_clone(editor, typing, *arg, *inst);
+                            any_induced = true;
+                        }
+                    }
+                }
+                _ => {}
+            }
+        }
+    }
+    any_induced
+}
+
+/*
+ * Utility to insert a clone before a use of a collection.
+ */
+fn induce_clone(editor: &mut FunctionEditor, typing: &Vec<TypeID>, object: NodeID, user: NodeID) {
+    editor.edit(|mut edit| {
+        // Create the constant collection object for allocation.
+        let object_ty = typing[object.idx()];
+        let object_cons = edit.add_zero_constant(object_ty);
+        let cons_node = edit.add_node(Node::Constant { id: object_cons });
+
+        // Create the clone into the new constant collection.
+        let clone_node = edit.add_node(Node::Write {
+            collect: cons_node,
+            data: object,
+            indices: vec![].into_boxed_slice(),
+        });
+
+        // Make the user use the cloned object.
+        edit.replace_all_uses_where(object, clone_node, |id| *id == user)
+    });
+}
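
At the source level, the effect of induce_clone is that a conflicting use of a collection is redirected through a fresh zeroed collection plus a whole-object write (the empty index list). A rough Rust analogue with the copy written out explicitly (illustrative, not repository code):

    fn main() {
        // `let arr2 = arr` in Juno: under the backend's reference semantics
        // both names would alias one object. The pass redirects the aliasing
        // use through a zero constant plus a whole-object write, i.e. a copy:
        let mut arr = [2i32, 0, 0];
        let mut arr2 = [0i32; 3]; // the added zero constant
        arr2.copy_from_slice(&arr); // the added clone (write with no indices)
        arr[1] = 3; // mutations now land on distinct objects
        arr2[1] = 7;
        assert_eq!((arr[1], arr2[1]), (3, 7));
    }
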
diff --git a/hercules_opt/src/lib.rs b/hercules_opt/src/lib.rs
index 4a4011b1f19c62d741f8d30189998039a1dd1b30..a69ca5391f9876b73b7b969b3b15820faf3a85cf 100644
--- a/hercules_opt/src/lib.rs
+++ b/hercules_opt/src/lib.rs
@@ -10,7 +10,7 @@ pub mod forkify;
 pub mod gvn;
 pub mod inline;
 pub mod interprocedural_sroa;
-pub mod materialize_clones;
+pub mod legalize_reference_semantics;
 pub mod outline;
 pub mod pass;
 pub mod phi_elim;
@@ -30,7 +30,7 @@ pub use crate::forkify::*;
 pub use crate::gvn::*;
 pub use crate::inline::*;
 pub use crate::interprocedural_sroa::*;
-pub use crate::materialize_clones::*;
+pub use crate::legalize_reference_semantics::*;
 pub use crate::outline::*;
 pub use crate::pass::*;
 pub use crate::phi_elim::*;
diff --git a/hercules_opt/src/materialize_clones.rs b/hercules_opt/src/materialize_clones.rs
deleted file mode 100644
index 687ac10c87d595c6f53a3fff4435e14f6dc4f375..0000000000000000000000000000000000000000
--- a/hercules_opt/src/materialize_clones.rs
+++ /dev/null
@@ -1,21 +0,0 @@
-extern crate hercules_ir;
-
-use self::hercules_ir::*;
-
-use crate::*;
-
-/*
- * Top level function to materialize clones of collections. This transformation
- * eliminates the possibility of multiple independent writes (including dynamic
- * writes) to a single collection by introducing extra collection constants and
- * inserting explicit clones. This allows us to make the simplifying assumption
- * in the backend that collections have reference, rather than value, semantics.
- * The pass calling this function is mandatory for correctness.
- */
-pub fn materialize_clones(
-    editor: &mut FunctionEditor,
-    objects: &FunctionCollectionObjects,
-    bbs: &BasicBlocks,
-) {
-    todo!()
-}
diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs
index 70062bbcbce0001f0d07a3cfb25fbf2cd94d0433..84cedb7629e45abb3abad5eac841b24f224a2698 100644
--- a/hercules_opt/src/outline.rs
+++ b/hercules_opt/src/outline.rs
@@ -6,7 +6,7 @@ use std::sync::atomic::{AtomicUsize, Ordering};
 
 use self::hercules_ir::def_use::*;
 use self::hercules_ir::dom::*;
-use self::hercules_ir::gcm::*;
+use self::hercules_ir::fork_join_analysis::*;
 use self::hercules_ir::ir::*;
 use self::hercules_ir::subgraph::*;
 
diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs
index 3b7c81eda174a27fa544fe6dd136fcfc24cce695..d0449a4a57b2325fcb7c2a766e1d17bff2587ea5 100644
--- a/hercules_opt/src/pass.rs
+++ b/hercules_opt/src/pass.rs
@@ -38,8 +38,8 @@ pub enum Pass {
     DeleteUncalled,
     ForkSplit,
     Unforkify,
-    MaterializeClones,
     InferSchedules,
+    LegalizeReferenceSemantics,
     Verify,
     // Parameterized over whether analyses that aid visualization are necessary.
     // Useful to set to false if displaying a potentially broken module.
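
With the enum variant renamed, downstream pipelines request the pass under its new name, and ordering matters: the pass computes the basic blocks that codegen consumes. A hypothetical helper showing the intended ordering (mirroring the juno_frontend change further down; `schedule_backend_passes` is illustrative, not a real API in this repository):

    // Hypothetical helper: LegalizeReferenceSemantics both materializes clones
    // and computes basic blocks, so it must run immediately before Codegen,
    // which reads the bbs it stores on the pass manager.
    fn schedule_backend_passes(
        pm: &mut hercules_opt::pass::PassManager,
        output_dir: String,
        module_name: String,
    ) {
        pm.add_pass(hercules_opt::pass::Pass::LegalizeReferenceSemantics);
        pm.add_pass(hercules_opt::pass::Pass::Codegen(output_dir, module_name));
        pm.run_passes();
    }
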
@@ -72,7 +72,6 @@ pub struct PassManager {
     pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>,
     pub loops: Option<Vec<LoopTree>>,
     pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
-    pub antideps: Option<Vec<Vec<(NodeID, NodeID)>>>,
     pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
     pub bbs: Option<Vec<BasicBlocks>>,
     pub collection_objects: Option<CollectionObjects>,
@@ -94,7 +93,6 @@ impl PassManager {
             fork_join_nests: None,
             loops: None,
             reduce_cycles: None,
-            antideps: None,
            data_nodes_in_fork_joins: None,
             bbs: None,
             collection_objects: None,
@@ -238,28 +236,6 @@ impl PassManager {
         }
     }
 
-    pub fn make_antideps(&mut self) {
-        if self.antideps.is_none() {
-            self.make_reverse_postorders();
-            self.make_collection_objects();
-            self.antideps = Some(
-                zip(
-                    self.module.functions.iter(),
-                    zip(
-                        self.reverse_postorders.as_ref().unwrap().iter(),
-                        self.collection_objects.as_ref().unwrap().iter(),
-                    ),
-                )
-                // Fine since collection_objects is a BTreeMap - iteration order
-                // is fixed.
-                .map(|(function, (reverse_postorder, objects))| {
-                    antideps(function, reverse_postorder, objects.1)
-                })
-                .collect(),
-            );
-        }
-    }
-
     pub fn make_data_nodes_in_fork_joins(&mut self) {
         if self.data_nodes_in_fork_joins.is_none() {
             self.make_def_uses();
@@ -280,64 +256,6 @@ impl PassManager {
         }
     }
 
-    pub fn make_bbs(&mut self) {
-        if self.bbs.is_none() {
-            self.make_def_uses();
-            self.make_reverse_postorders();
-            self.make_control_subgraphs();
-            self.make_doms();
-            self.make_antideps();
-            self.make_loops();
-            self.make_fork_join_maps();
-            let def_uses = self.def_uses.as_ref().unwrap().iter();
-            let reverse_postorders = self.reverse_postorders.as_ref().unwrap().iter();
-            let control_subgraphs = self.control_subgraphs.as_ref().unwrap().iter();
-            let doms = self.doms.as_ref().unwrap().iter();
-            let antideps = self.antideps.as_ref().unwrap().iter();
-            let loops = self.loops.as_ref().unwrap().iter();
-            let fork_join_maps = self.fork_join_maps.as_ref().unwrap().iter();
-            self.bbs = Some(
-                zip(
-                    self.module.functions.iter(),
-                    zip(
-                        def_uses,
-                        zip(
-                            reverse_postorders,
-                            zip(
-                                control_subgraphs,
-                                zip(doms, zip(antideps, zip(loops, fork_join_maps))),
-                            ),
-                        ),
-                    ),
-                )
-                .map(
-                    |(
-                        function,
-                        (
-                            def_use,
-                            (
-                                reverse_postorder,
-                                (control_subgraph, (dom, (antideps, (loops, fork_join_map)))),
-                            ),
-                        ),
-                    )| {
-                        gcm(
-                            function,
-                            def_use,
-                            reverse_postorder,
-                            control_subgraph,
-                            dom,
-                            antideps,
-                            loops,
-                            fork_join_map,
-                        )
-                    },
-                )
-                .collect(),
-            );
-        }
-    }
-
     pub fn make_collection_objects(&mut self) {
         if self.collection_objects.is_none() {
             self.make_reverse_postorders();
@@ -375,6 +293,7 @@
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
         let mut editor = FunctionEditor::new(
             &mut self.module.functions[idx],
+            FunctionID::new(idx),
             &constants_ref,
             &dynamic_constants_ref,
             &types_ref,
@@ -409,6 +328,7 @@
             .map(|(i, f)| {
                 FunctionEditor::new(
                     f,
+                    FunctionID::new(i),
                     &constants_ref,
                     &dynamic_constants_ref,
                     &types_ref,
@@ -442,6 +362,7 @@
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
         let mut editor = FunctionEditor::new(
             &mut self.module.functions[idx],
+            FunctionID::new(idx),
            &constants_ref,
             &dynamic_constants_ref,
             &types_ref,
@@ -468,6 +389,7 @@
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
         let mut editor = FunctionEditor::new(
             &mut self.module.functions[idx],
+            FunctionID::new(idx),
             &constants_ref,
             &dynamic_constants_ref,
             &types_ref,
@@ -515,6 +437,7 @@
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
         let mut editor = FunctionEditor::new(
             &mut self.module.functions[idx],
+            FunctionID::new(idx),
             &constants_ref,
             &dynamic_constants_ref,
             &types_ref,
@@ -590,6 +513,7 @@
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
         let mut editor = FunctionEditor::new(
             &mut self.module.functions[idx],
+            FunctionID::new(idx),
             &constants_ref,
             &dynamic_constants_ref,
             &types_ref,
@@ -614,18 +538,21 @@
         let dynamic_constants_ref =
             RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
-        let mut editors: Vec<_> =
-            zip(self.module.functions.iter_mut(), def_uses.iter())
-                .map(|(func, def_use)| {
-                    FunctionEditor::new(
-                        func,
-                        &constants_ref,
-                        &dynamic_constants_ref,
-                        &types_ref,
-                        def_use,
-                    )
-                })
-                .collect();
+        let mut editors: Vec<_> = zip(
+            self.module.functions.iter_mut().enumerate(),
+            def_uses.iter(),
+        )
+        .map(|((idx, func), def_use)| {
+            FunctionEditor::new(
+                func,
+                FunctionID::new(idx),
+                &constants_ref,
+                &dynamic_constants_ref,
+                &types_ref,
+                def_use,
+            )
+        })
+        .collect();
 
         inline(&mut editors, callgraph);
 
         self.module.constants = constants_ref.take();
@@ -645,18 +572,21 @@
             RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
         let old_num_funcs = self.module.functions.len();
-        let mut editors: Vec<_> =
-            zip(self.module.functions.iter_mut(), def_uses.iter())
-                .map(|(func, def_use)| {
-                    FunctionEditor::new(
-                        func,
-                        &constants_ref,
-                        &dynamic_constants_ref,
-                        &types_ref,
-                        def_use,
-                    )
-                })
-                .collect();
+        let mut editors: Vec<_> = zip(
+            self.module.functions.iter_mut().enumerate(),
+            def_uses.iter(),
+        )
+        .map(|((idx, func), def_use)| {
+            FunctionEditor::new(
+                func,
+                FunctionID::new(idx),
+                &constants_ref,
+                &dynamic_constants_ref,
+                &types_ref,
+                def_use,
+            )
+        })
+        .collect();
         for editor in editors.iter_mut() {
             collapse_returns(editor);
             ensure_between_control_flow(editor);
@@ -678,18 +608,21 @@
         let dynamic_constants_ref =
             RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
-        let mut editors: Vec<_> =
-            zip(self.module.functions.iter_mut(), def_uses.iter())
-                .map(|(func, def_use)| {
-                    FunctionEditor::new(
-                        func,
-                        &constants_ref,
-                        &dynamic_constants_ref,
-                        &types_ref,
-                        def_use,
-                    )
-                })
-                .collect();
+        let mut editors: Vec<_> = zip(
+            self.module.functions.iter_mut().enumerate(),
+            def_uses.iter(),
+        )
+        .map(|((idx, func), def_use)| {
+            FunctionEditor::new(
+                func,
+                FunctionID::new(idx),
+                &constants_ref,
+                &dynamic_constants_ref,
+                &types_ref,
+                def_use,
+            )
+        })
+        .collect();
         let mut new_funcs = vec![];
         for (idx, editor) in editors.iter_mut().enumerate() {
             let new_func_id = FunctionID::new(old_num_funcs + new_funcs.len());
@@ -726,18 +659,21 @@
         // By default in an editor all nodes are mutable, which is desired in this case
         // since we are only modifying the IDs of functions that we call.
-        let mut editors: Vec<_> =
-            zip(self.module.functions.iter_mut(), def_uses.iter())
-                .map(|(func, def_use)| {
-                    FunctionEditor::new(
-                        func,
-                        &constants_ref,
-                        &dynamic_constants_ref,
-                        &types_ref,
-                        def_use,
-                    )
-                })
-                .collect();
+        let mut editors: Vec<_> = zip(
+            self.module.functions.iter_mut().enumerate(),
+            def_uses.iter(),
+        )
+        .map(|((idx, func), def_use)| {
+            FunctionEditor::new(
+                func,
+                FunctionID::new(idx),
+                &constants_ref,
+                &dynamic_constants_ref,
+                &types_ref,
+                def_use,
+            )
+        })
+        .collect();
 
         let new_idx = delete_uncalled(&mut editors, callgraph);
         self.module.constants = constants_ref.take();
@@ -768,6 +704,7 @@
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
         let mut editor = FunctionEditor::new(
             &mut self.module.functions[idx],
+            FunctionID::new(idx),
             &constants_ref,
             &dynamic_constants_ref,
             &types_ref,
@@ -796,6 +733,7 @@
         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
         let mut editor = FunctionEditor::new(
             &mut self.module.functions[idx],
+            FunctionID::new(idx),
             &constants_ref,
             &dynamic_constants_ref,
             &types_ref,
@@ -811,13 +749,24 @@
                 }
                 self.clear_analyses();
             }
-            Pass::MaterializeClones => {
+            Pass::LegalizeReferenceSemantics => loop {
                 self.make_def_uses();
+                self.make_reverse_postorders();
+                self.make_typing();
+                self.make_control_subgraphs();
+                self.make_doms();
+                self.make_fork_join_maps();
+                self.make_loops();
                 self.make_collection_objects();
-                self.make_bbs();
                 let def_uses = self.def_uses.as_ref().unwrap();
+                let reverse_postorders = self.reverse_postorders.as_ref().unwrap();
+                let typing = self.typing.as_ref().unwrap();
+                let doms = self.doms.as_ref().unwrap();
+                let fork_join_maps = self.fork_join_maps.as_ref().unwrap();
+                let loops = self.loops.as_ref().unwrap();
+                let control_subgraphs = self.control_subgraphs.as_ref().unwrap();
                 let collection_objects = self.collection_objects.as_ref().unwrap();
-                let bbs = self.bbs.as_ref().unwrap();
+                let mut bbs = vec![];
                 for idx in 0..self.module.functions.len() {
                     let constants_ref =
                         RefCell::new(std::mem::take(&mut self.module.constants));
@@ -826,16 +775,25 @@
                     let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                     let mut editor = FunctionEditor::new(
                         &mut self.module.functions[idx],
+                        FunctionID::new(idx),
                        &constants_ref,
                         &dynamic_constants_ref,
                         &types_ref,
                         &def_uses[idx],
                     );
-                    materialize_clones(
+                    if let Some(bb) = legalize_reference_semantics(
                         &mut editor,
-                        &collection_objects[&FunctionID::new(idx)],
-                        &bbs[idx],
-                    );
+                        &def_uses[idx],
+                        &reverse_postorders[idx],
+                        &typing[idx],
+                        &control_subgraphs[idx],
+                        &doms[idx],
+                        &fork_join_maps[idx],
+                        &loops[idx],
+                        collection_objects,
+                    ) {
+                        bbs.push(bb);
+                    }
 
                     self.module.constants = constants_ref.take();
                     self.module.dynamic_constants = dynamic_constants_ref.take();
@@ -844,7 +802,11 @@
                     self.module.functions[idx].delete_gravestones();
                 }
                 self.clear_analyses();
-            }
+                if bbs.len() == self.module.functions.len() {
+                    self.bbs = Some(bbs);
+                    break;
+                }
+            },
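
The `loop` around this arm is a fixed-point driver: legalize_reference_semantics apparently returns Some(basic blocks) only for a function that needed no rewrites this round, and any induced clone invalidates the analyses, so the manager recomputes them and retries until every function comes back clean. A standalone model of that retry structure (illustrative types only, not repository code):

    // `step` returns Some(result) when a function needed no rewrites, None
    // when it edited the function (invalidating the analyses the next round
    // would recompute).
    fn run_to_fixpoint(num_funcs: usize, mut step: impl FnMut(usize) -> Option<u32>) -> Vec<u32> {
        loop {
            let results: Vec<u32> = (0..num_funcs).filter_map(|idx| step(idx)).collect();
            if results.len() == num_funcs {
                return results; // every function came back clean this round
            }
            // Otherwise: analyses were invalidated; loop and try again.
        }
    }

    fn main() {
        // Function 0 needs one round of edits before it settles.
        let mut edits_left = 1;
        let bbs = run_to_fixpoint(2, |idx| {
            if idx == 0 && edits_left > 0 {
                edits_left -= 1;
                None
            } else {
                Some(idx as u32)
            }
        });
        assert_eq!(bbs, vec![0, 1]);
    }
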
            Pass::InferSchedules => {
                self.make_def_uses();
                self.make_fork_join_maps();
@@ -860,6 +822,7 @@
                let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                let mut editor = FunctionEditor::new(
                    &mut self.module.functions[idx],
+                    FunctionID::new(idx),
                    &constants_ref,
                    &dynamic_constants_ref,
                    &types_ref,
@@ -905,28 +868,23 @@
                if *force_analyses {
                    self.make_doms();
                    self.make_fork_join_maps();
-                    self.make_bbs();
                }
                xdot_module(
                    &self.module,
                    self.reverse_postorders.as_ref().unwrap(),
                    self.doms.as_ref(),
                    self.fork_join_maps.as_ref(),
-                    self.bbs.as_ref(),
                );
            }
            Pass::Codegen(output_dir, module_name) => {
                self.make_reverse_postorders();
                self.make_typing();
                self.make_control_subgraphs();
-                self.make_antideps();
-                self.make_bbs();
                self.make_collection_objects();
                self.make_callgraph();
                let reverse_postorders = self.reverse_postorders.as_ref().unwrap();
                let typing = self.typing.as_ref().unwrap();
                let control_subgraphs = self.control_subgraphs.as_ref().unwrap();
-                let antideps = self.antideps.as_ref().unwrap();
                let bbs = self.bbs.as_ref().unwrap();
                let collection_objects = self.collection_objects.as_ref().unwrap();
                let callgraph = self.callgraph.as_ref().unwrap();
@@ -945,7 +903,6 @@
                            &reverse_postorders[idx],
                            &typing[idx],
                            &control_subgraphs[idx],
-                            &antideps[idx],
                            &bbs[idx],
                            &mut llvm_ir,
                        )
@@ -956,7 +913,6 @@
                            &reverse_postorders[idx],
                            &typing[idx],
                            &control_subgraphs[idx],
-                            &antideps[idx],
                            &bbs[idx],
                            &collection_objects,
                            &callgraph,
@@ -1026,7 +982,6 @@
        self.fork_join_nests = None;
        self.loops = None;
        self.reduce_cycles = None;
-        self.antideps = None;
        self.data_nodes_in_fork_joins = None;
        self.bbs = None;
        self.collection_objects = None;
diff --git a/juno_frontend/src/lib.rs b/juno_frontend/src/lib.rs
index b18b29791b54aa267945ec8e658fdce069e250f3..0dd5cdd338db792a9725b22cc95527e5a08688c2 100644
--- a/juno_frontend/src/lib.rs
+++ b/juno_frontend/src/lib.rs
@@ -187,18 +187,21 @@ pub fn compile_ir(
     //add_pass!(pm, verify, Forkify);
     //add_pass!(pm, verify, ForkGuardElim);
     add_verified_pass!(pm, verify, DCE);
+    add_pass!(pm, verify, ForkSplit);
+    add_pass!(pm, verify, Unforkify);
+    add_pass!(pm, verify, GVN);
+    add_verified_pass!(pm, verify, DCE);
+    add_pass!(pm, verify, LegalizeReferenceSemantics);
     add_pass!(pm, verify, Outline);
     add_pass!(pm, verify, InterproceduralSROA);
     add_pass!(pm, verify, SROA);
     add_pass!(pm, verify, InferSchedules);
-    add_pass!(pm, verify, ForkSplit);
-    add_pass!(pm, verify, Unforkify);
-    add_pass!(pm, verify, GVN);
     add_verified_pass!(pm, verify, DCE);
     if x_dot {
         pm.add_pass(hercules_opt::pass::Pass::Xdot(true));
     }
+    add_pass!(pm, verify, LegalizeReferenceSemantics);
     pm.add_pass(hercules_opt::pass::Pass::Codegen(output_dir, module_name));
 
     pm.run_passes();
diff --git a/juno_samples/antideps/src/antideps.jn b/juno_samples/antideps/src/antideps.jn
index 5949c91a4e64bea9e3afa966f1f9f6a160d8553a..9efe71f10963aacf1620c4348abef6a74d8cb502 100644
--- a/juno_samples/antideps/src/antideps.jn
+++ b/juno_samples/antideps/src/antideps.jn
@@ -7,7 +7,20 @@ fn simple_antideps(a : usize, b : usize) -> i32 {
 }
 
 #[entry]
-fn complex_antideps(x : i32) -> i32 {
+fn loop_antideps(input : i32) -> i32 {
+  let arr1 : i32[1];
+  arr1[0] = 2;
+  let p1 = arr1[0];
+  while input > 10 {
+    arr1[0] = arr1[0] + 1;
+    input -= 10;
+  }
+  let p2 = arr1[0];
+  return p1 + p2;
+}
+
+#[entry]
+fn complex_antideps1(x : i32) -> i32 {
   let arr : i32[4];
   let arr2 : i32[12];
   arr[1] = 7 + arr2[0];
@@ -28,6 +41,23 @@
   return r;
 }
 
+#[entry]
+fn complex_antideps2(input : i32) -> i32 {
+  let arr1 : i32[2];
+  arr1[0] = 2;
+  arr1[1] = 3;
+  let p1 = arr1[0] + arr1[1];
+  if input > 0 {
+    while input > 10 {
+      arr1[0] = arr1[1] + input;
+      arr1[1] = arr1[0] + input;
+      input -= 10;
+    }
+  }
+  let p2 = arr1[0];
+  return p1 + p2;
+}
+
 #[entry]
 fn very_complex_antideps(x: usize) -> usize {
   let arr1 : usize[203];
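
As a sanity check of the expected constant asserted in main.rs below: complex_antideps2 has no aliasing, so a direct Rust port with a plain mutable array reproduces the arithmetic (illustrative port, not repository code):

    // Direct Rust port of complex_antideps2 above, for checking the constant.
    fn complex_antideps2(mut input: i32) -> i32 {
        let mut arr1 = [2i32, 3];
        let p1 = arr1[0] + arr1[1]; // 5
        if input > 0 {
            while input > 10 {
                arr1[0] = arr1[1] + input;
                arr1[1] = arr1[0] + input;
                input -= 10;
            }
        }
        let p2 = arr1[0];
        p1 + p2
    }

    fn main() {
        // For input 44, arr1[0] walks through 47, 125, 183, 221: 5 + 221 = 226.
        assert_eq!(complex_antideps2(44), 226);
    }
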
diff --git a/juno_samples/antideps/src/main.rs b/juno_samples/antideps/src/main.rs
index b0a991637bde67a0229fb749213927b8e14c06dd..a9c225b21bb7a8f3693f124471989fc9d4ebe23c 100644
--- a/juno_samples/antideps/src/main.rs
+++ b/juno_samples/antideps/src/main.rs
@@ -11,10 +11,18 @@ fn main() {
         println!("{}", output);
         assert_eq!(output, 5);
 
-        let output = complex_antideps(9).await;
+        let output = loop_antideps(11).await;
+        println!("{}", output);
+        assert_eq!(output, 5);
+
+        let output = complex_antideps1(9).await;
         println!("{}", output);
         assert_eq!(output, 20);
 
+        let output = complex_antideps2(44).await;
+        println!("{}", output);
+        assert_eq!(output, 226);
+
         let output = very_complex_antideps(3).await;
         println!("{}", output);
         assert_eq!(output, 144);
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index 17e345e51e80db27c0d2f21854e22abf1eefcdb8..a2d6cba0c275fdbe776964558de1ec800c1c2ab2 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -1,5 +1,5 @@
 #[entry]
-fn implicit_clone(input : i32) -> i32 {
+fn simple_implicit_clone(input : i32) -> i32 {
   let arr : i32[3];
   arr[0] = 2;
   let arr2 = arr;
@@ -7,3 +7,82 @@
   arr[2] = 4;
   return arr[0] + arr2[0] + arr[1] + arr2[1] + arr[2] + arr2[2];
 }
+
+#[entry]
+fn loop_implicit_clone(input : i32) -> i32 {
+  let arr : i32[3];
+  let r : i32 = 5;
+  while input > 0 {
+    r = arr[0];
+    let arr2 = arr;
+    let x = arr2[input as usize - input as usize];
+    arr2[input as usize - input as usize] = 9;
+    if x == 0 {
+      input -= arr2[0];
+    } else {
+      r = 99;
+      break;
+    }
+  }
+  return r + 7;
+}
+
+#[entry]
+fn no_implicit_clone(input : i32) -> i32 {
+  let arr : i32[2];
+  arr[0] = input;
+  while input > 0 {
+    arr[0] += 1;
+    input -= 1;
+  }
+  let arr2 : i32[1];
+  if input == 0 {
+    arr2[0] = 5;
+  } else {
+    arr2[0] = 3;
+  }
+  return arr[0] + arr2[0];
+}
+
+#[entry]
+fn complex_implicit_clone(input : i32) -> i32 {
+  let arr1 : i32[2];
+  let arr2 : i32[2];
+  let arr3 : i32[2];
+  let arr4 : i32[2];
+  arr1[0] = 7;
+  arr1[1] = 3;
+  arr2[0] = input;
+  arr2[1] = 45;
+  arr3[0] = -14;
+  arr3[1] = -5;
+  arr4[0] = -1;
+  arr4[1] = 0;
+  arr2 = arr4;
+  arr3 = arr2;
+  arr2 = arr1;
+  let p1 = arr1[0] + arr1[1] + arr2[0] + arr2[1] + arr3[0] + arr3[1] + arr4[0] + arr4[1]; // 18
+  arr4 = arr2;
+  let p2 = arr1[0] + arr1[1] + arr2[0] + arr2[1] + arr3[0] + arr3[1] + arr4[0] + arr4[1]; // 29
+  if input > 0 {
+    while input > 10 {
+      arr1[0] = arr1[1] + input;
+      arr1[1] = arr1[0] + input;
+      input -= 10;
+    }
+  }
+  let p3 = arr1[0]; // 592
+  let x : i32 = 0;
+  while input < 20 {
+    let arr5 : i32[2];
+    arr5[0] = 7;
+    let y = arr5[0] + arr5[1];
+    arr5 = arr4;
+    arr5[1] += 2;
+    y += arr5[1];
+    x += 12;
+    input += 1;
+  }
+  let p4 = x; // 204
+  return p1 + p2 + p3 + p4;
+}
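
loop_implicit_clone is the interesting case for this pass: the assignment `let arr2 = arr` inside the loop must become a copy, or the write `arr2[...] = 9` would be visible through `arr` on the next iteration. A rough Rust analogue with the copy made explicit shows why the expected output below is 7 (illustrative, not repository code; Juno collections here start zeroed, matching the zero collection constants the compiler materializes):

    fn loop_implicit_clone(mut input: i32) -> i32 {
        let arr = [0i32; 3]; // never written; arr[0] stays 0
        let mut r = 5;
        while input > 0 {
            r = arr[0]; // always 0
            let mut arr2 = arr; // the implicit clone, made explicit by the pass
            let x = arr2[input as usize - input as usize]; // index 0
            arr2[input as usize - input as usize] = 9;
            if x == 0 {
                input -= arr2[0]; // subtract 9 per iteration: 100, 91, ..., 1, -8
            } else {
                r = 99;
                break;
            }
        }
        r + 7 // 0 + 7 = 7
    }

    fn main() {
        assert_eq!(loop_implicit_clone(100), 7);
    }
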
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index ca7ddeb1571be6698f6a9c3971ef617b3a6fd4ca..45c722d783494d7fd5a9b9e91d4ea4db90ce4f0c 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -7,9 +7,21 @@ juno_build::juno!("implicit_clone");
 
 fn main() {
     async_std::task::block_on(async {
-        let output = implicit_clone(3).await;
+        let output = simple_implicit_clone(3).await;
         println!("{}", output);
         assert_eq!(output, 11);
+
+        let output = loop_implicit_clone(100).await;
+        println!("{}", output);
+        assert_eq!(output, 7);
+
+        let output = no_implicit_clone(4).await;
+        println!("{}", output);
+        assert_eq!(output, 13);
+
+        let output = complex_implicit_clone(73).await;
+        println!("{}", output);
+        assert_eq!(output, 843);
     });
 }
diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs
index 948459dd4badfbdf31fea25be70520df22ff5b6e..1c5b9d42b020b3acd7dbaba0db11ec734e83e574 100644
--- a/juno_samples/matmul/src/main.rs
+++ b/juno_samples/matmul/src/main.rs
@@ -47,7 +47,7 @@ fn main() {
             I * K * 4,
         );
     };
-    let tiled_c_bytes = matmul(I as u64, J as u64, K as u64, a_bytes, b_bytes).await;
+    let tiled_c_bytes = tiled_64_matmul(I as u64, J as u64, K as u64, a_bytes, b_bytes).await;
     let mut tiled_c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
     unsafe {
         copy_nonoverlapping(