Handle loop vs. simple induced clones better

9e5b344f · rarbore2 · 157e4645 · 9e5b344f · 9e5b344f · 9e5b344f
Commit 9e5b344f authored 6 months ago by rarbore2
--- a/hercules_ir/src/subgraph.rs
+++ b/hercules_ir/src/subgraph.rs
@@ -23,6 +23,7 @@ pub struct Subgraph {
    original_num_nodes: u32,
 }

+#[derive(Debug, Clone)]
 pub struct SubgraphIterator<'a> {
    nodes: &'a Vec<NodeID>,
    edges: &'a [u32],

--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -321,10 +321,11 @@ fn basic_blocks(
        // Look between the LCA and the schedule early location to place the
        // node.
        let schedule_early = schedule_early[id.idx()].unwrap();
+        let schedule_late = lca.unwrap_or(schedule_early);
        let mut chain = dom
            // If the node has no users, then it doesn't really matter where we
            // place it - just place it at the early placement.
-            .chain(lca.unwrap_or(schedule_early), schedule_early);
+            .chain(schedule_late, schedule_early);

        if let Some(mut location) = chain.next() {
            /*
@@ -539,10 +540,108 @@ fn materialize_clones(
    objects: &CollectionObjects,
    bbs: &BasicBlocks,
 ) -> bool {
-    // First, run dataflow analysis to figure out which access to collections
-    // induce clones. This dataflow analysis depends on basic block assignments
-    // and is more analogous to standard dataflow analysis in CFG + SSA IRs.
-    // This is the only place this form is used, so just hardcode it here.
+    let rev_po = control_subgraph.rev_po(NodeID::new(0));
+    let mut total_num_pts = 0;
+    let mut bb_to_prefix_sum = vec![0; bbs.0.len()];
+    for ((idx, bb), insts) in zip(bbs.0.iter().enumerate(), bbs.1.iter()) {
+        if idx == bb.idx() {
+            bb_to_prefix_sum[idx] = total_num_pts;
+            total_num_pts += insts.len() + 1;
+        }
+    }
+
+    // Calculate two lattices - one that includes back edges, and one that
+    // doesn't. We want to handle simple clones before loop induced clones, so
+    // we first materialize clones based on the no-back-edges lattice, and then
+    // based on the full lattice.
+    let mut no_back_edge_lattice: Vec<BTreeMap<NodeID, BTreeSet<NodeID>>> =
+        vec![BTreeMap::new(); total_num_pts];
+    used_collections_dataflow(
+        editor,
+        &mut no_back_edge_lattice,
+        &rev_po,
+        &bb_to_prefix_sum,
+        control_subgraph,
+        objects,
+        bbs,
+    );
+    let mut super_value = BTreeMap::new();
+    if find_clones(
+        editor,
+        &super_value,
+        &no_back_edge_lattice,
+        &rev_po,
+        &typing,
+        control_subgraph,
+        dom,
+        loops,
+        objects,
+        &bb_to_prefix_sum,
+        bbs,
+    ) {
+        return true;
+    }
+
+    // After inducing simple clones, calculate the full lattice and materialize
+    // any loop induced clones.
+    let mut lattice: Vec<BTreeMap<NodeID, BTreeSet<NodeID>>> = vec![BTreeMap::new(); total_num_pts];
+    loop {
+        let changed = used_collections_dataflow(
+            editor,
+            &mut lattice,
+            &rev_po,
+            &bb_to_prefix_sum,
+            control_subgraph,
+            objects,
+            bbs,
+        );
+        if !changed {
+            break;
+        }
+    }
+    for value in lattice.iter() {
+        meet(&mut super_value, value);
+    }
+    find_clones(
+        editor,
+        &super_value,
+        &lattice,
+        &rev_po,
+        &typing,
+        control_subgraph,
+        dom,
+        loops,
+        objects,
+        &bb_to_prefix_sum,
+        bbs,
+    )
+}
+
+fn meet(left: &mut BTreeMap<NodeID, BTreeSet<NodeID>>, right: &BTreeMap<NodeID, BTreeSet<NodeID>>) {
+    for (used, users) in right.into_iter() {
+        left.entry(*used).or_default().extend(users.into_iter());
+    }
+}
+
+/*
+ * Helper function to run a single iteration of the used collections dataflow
+ * analysis. Returns whether the lattice was changed. The lattice maps each
+ * program point to a set of used values and their possible users. Top is that
+ * no nodes are used yet.
+ */
+fn used_collections_dataflow(
+    editor: &FunctionEditor,
+    lattice: &mut Vec<BTreeMap<NodeID, BTreeSet<NodeID>>>,
+    rev_po: &Vec<NodeID>,
+    bb_to_prefix_sum: &Vec<usize>,
+    control_subgraph: &Subgraph,
+    objects: &CollectionObjects,
+    bbs: &BasicBlocks,
+) -> bool {
+    // Run dataflow analysis to figure out which accesses to collections induce
+    // clones. This dataflow analysis depends on basic block assignments and is
+    // more analogous to standard dataflow analysis in CFG + SSA IRs. This is
+    // the only place this form is used, so just hardcode it here.
    //
    // This forward dataflow analysis tracks which collections are used at each
    // program point, and by what user nodes. Collections are referred to using
@@ -575,363 +674,364 @@ fn materialize_clones(
    // "sub-view" of the same collection. This does not include reads that "end"
    // (most reads, some calls, the `data` input of a write). This analysis does
    // not consider parallel mutations in fork-joins.
-    let rev_po = control_subgraph.rev_po(NodeID::new(0));
-    let mut total_num_pts = 0;
-    let mut bb_to_prefix_sum = vec![0; bbs.0.len()];
-    for ((idx, bb), insts) in zip(bbs.0.iter().enumerate(), bbs.1.iter()) {
-        if idx == bb.idx() {
-            bb_to_prefix_sum[idx] = total_num_pts;
-            total_num_pts += insts.len() + 1;
-        }
-    }
-    // Lattice maps each program point to a set of used values and their
-    // possible users. Top is that no nodes are used yet.
    let nodes = &editor.func().nodes;
    let func_id = editor.func_id();
-    let meet = |left: &mut BTreeMap<NodeID, BTreeSet<NodeID>>,
-                right: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
-        for (used, users) in right.into_iter() {
-            left.entry(*used).or_default().extend(users.into_iter());
-        }
-    };
-    let mut lattice: Vec<BTreeMap<NodeID, BTreeSet<NodeID>>> = vec![BTreeMap::new(); total_num_pts];
-    loop {
-        let mut changed = false;
+    let mut changed = false;

-        for bb in rev_po.iter() {
-            // The lattice value of the first point is the meet of the
-            // predecessor terminating lattice values.
-            let old_top_value = &lattice[bb_to_prefix_sum[bb.idx()]];
-            let mut new_top_value = BTreeMap::new();
-            // Clearing `top_value` is not necessary since used nodes are never
-            // removed from lattice values, only added.
-            for pred in control_subgraph.preds(*bb) {
-                let last_pt = bbs.1[pred.idx()].len();
-                meet(
-                    &mut new_top_value,
-                    &lattice[bb_to_prefix_sum[pred.idx()] + last_pt],
-                );
-            }
-            changed |= *old_top_value != new_top_value;
-            lattice[bb_to_prefix_sum[bb.idx()]] = new_top_value;
+    for bb in rev_po.iter() {
+        // The lattice value of the first point is the meet of the
+        // predecessor terminating lattice values.
+        let old_top_value = &lattice[bb_to_prefix_sum[bb.idx()]];
+        let mut new_top_value = BTreeMap::new();
+        // Clearing `top_value` is not necessary since used nodes are never
+        // removed from lattice values, only added.
+        for pred in control_subgraph.preds(*bb) {
+            let last_pt = bbs.1[pred.idx()].len();
+            meet(
+                &mut new_top_value,
+                &lattice[bb_to_prefix_sum[pred.idx()] + last_pt],
+            );
+        }
+        changed |= *old_top_value != new_top_value;
+        lattice[bb_to_prefix_sum[bb.idx()]] = new_top_value;

-            // The lattice value of following points are determined by their
-            // immediate preceding instructions.
-            let insts = &bbs.1[bb.idx()];
-            for (prev_pt, inst) in insts.iter().enumerate() {
-                let old_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1];
-                let prev_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
-                let mut new_value = prev_value.clone();
-                match nodes[inst.idx()] {
-                    Node::Phi {
-                        control: _,
-                        ref data,
-                    } if !objects[&func_id].objects(*inst).is_empty() => {
-                        for elem in data {
-                            new_value.entry(*elem).or_default().insert(*inst);
-                        }
-                        new_value.remove(inst);
-                    }
-                    Node::Ternary {
-                        op: TernaryOperator::Select,
-                        first: _,
-                        second,
-                        third,
-                    }
-                    | Node::Reduce {
-                        control: _,
-                        init: second,
-                        reduct: third,
-                    } => {
-                        if !objects[&func_id].objects(*inst).is_empty() {
-                            new_value.entry(second).or_default().insert(*inst);
-                            new_value.entry(third).or_default().insert(*inst);
-                            new_value.remove(inst);
-                        }
-                    }
-                    Node::Read {
-                        collect,
-                        indices: _,
-                    } if !objects[&func_id].objects(*inst).is_empty() => {
-                        new_value.entry(collect).or_default().insert(*inst);
-                        new_value.remove(inst);
-                    }
-                    Node::Write {
-                        collect,
-                        data: _,
-                        indices: _,
-                    } => {
-                        new_value.entry(collect).or_default().insert(*inst);
-                        new_value.remove(inst);
+        // The lattice values of the following points are determined by their
+        // immediately preceding instructions.
+        let insts = &bbs.1[bb.idx()];
+        for (prev_pt, inst) in insts.iter().enumerate() {
+            let old_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1];
+            let prev_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
+            let mut new_value = prev_value.clone();
+            match nodes[inst.idx()] {
+                Node::Phi {
+                    control: _,
+                    ref data,
+                } if !objects[&func_id].objects(*inst).is_empty() => {
+                    for elem in data {
+                        new_value.entry(*elem).or_default().insert(*inst);
                    }
-                    Node::Call {
-                        control: _,
-                        function: callee,
-                        dynamic_constants: _,
-                        ref args,
-                    } => {
-                        let callee_objects = &objects[&callee];
-                        for (param_idx, arg) in args.into_iter().enumerate() {
-                            if callee_objects
-                                .param_to_object(param_idx)
-                                .map(|object| {
-                                    callee_objects.is_mutated(object)
-                                        || callee_objects.returned_objects().contains(&object)
-                                })
-                                .unwrap_or(false)
-                            {
-                                new_value.entry(*arg).or_default().insert(*inst);
-                            }
-                        }
+                    new_value.remove(inst);
+                }
+                Node::Ternary {
+                    op: TernaryOperator::Select,
+                    first: _,
+                    second,
+                    third,
+                }
+                | Node::Reduce {
+                    control: _,
+                    init: second,
+                    reduct: third,
+                } => {
+                    if !objects[&func_id].objects(*inst).is_empty() {
+                        new_value.entry(second).or_default().insert(*inst);
+                        new_value.entry(third).or_default().insert(*inst);
                        new_value.remove(inst);
                    }
-                    _ => {}
                }
-                changed |= *old_value != new_value;
-                lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1] = new_value;
-            }
-
-            // Handle reduces in this block specially at the very end.
-            let last_pt = insts.len();
-            let old_bottom_value = &lattice[bb_to_prefix_sum[bb.idx()] + last_pt];
-            let mut new_bottom_value = old_bottom_value.clone();
-            for inst in insts.iter() {
-                if let Node::Reduce {
+                Node::Read {
+                    collect,
+                    indices: _,
+                } if !objects[&func_id].objects(*inst).is_empty() => {
+                    new_value.entry(collect).or_default().insert(*inst);
+                    new_value.remove(inst);
+                }
+                Node::Write {
+                    collect,
+                    data: _,
+                    indices: _,
+                } => {
+                    new_value.entry(collect).or_default().insert(*inst);
+                    new_value.remove(inst);
+                }
+                Node::Call {
                    control: _,
-                    init: _,
-                    reduct,
-                } = nodes[inst.idx()]
-                {
-                    assert!(
-                        new_bottom_value.contains_key(&reduct),
-                        "PANIC: Can't handle clones inside a reduction cycle currently."
-                    );
-                    new_bottom_value.remove(inst);
+                    function: callee,
+                    dynamic_constants: _,
+                    ref args,
+                } => {
+                    let callee_objects = &objects[&callee];
+                    for (param_idx, arg) in args.into_iter().enumerate() {
+                        if callee_objects
+                            .param_to_object(param_idx)
+                            .map(|object| {
+                                callee_objects.is_mutated(object)
+                                    || callee_objects.returned_objects().contains(&object)
+                            })
+                            .unwrap_or(false)
+                        {
+                            new_value.entry(*arg).or_default().insert(*inst);
+                        }
+                    }
+                    new_value.remove(inst);
                }
+                _ => {}
            }
-            changed |= *old_bottom_value != new_bottom_value;
-            lattice[bb_to_prefix_sum[bb.idx()] + last_pt] = new_bottom_value;
+            changed |= *old_value != new_value;
+            lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1] = new_value;
        }

-        if !changed {
-            break;
+        // Handle reduces in this block specially at the very end.
+        let last_pt = insts.len();
+        let old_bottom_value = &lattice[bb_to_prefix_sum[bb.idx()] + last_pt];
+        let mut new_bottom_value = old_bottom_value.clone();
+        for inst in insts.iter() {
+            if let Node::Reduce {
+                control: _,
+                init: _,
+                reduct,
+            } = nodes[inst.idx()]
+            {
+                assert!(
+                    new_bottom_value.contains_key(&reduct),
+                    "PANIC: Can't handle clones inside a reduction cycle currently."
+                );
+                new_bottom_value.remove(inst);
+            }
        }
-    }
-    let mut super_value = BTreeMap::new();
-    for value in lattice.iter() {
-        meet(&mut super_value, value);
+        changed |= *old_bottom_value != new_bottom_value;
+        lattice[bb_to_prefix_sum[bb.idx()] + last_pt] = new_bottom_value;
    }

-    // Helper to induce a clone when an implicit clone is identified.
-    let nodes = nodes.clone();
-    let mut induce_clone = |object: NodeID,
-                            user: NodeID,
-                            value: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
-        // If `user` already used `object` and tries to use it again, then the
-        // clone is a "loop induced" clone. Otherwise, it's a simple clone.
-        if !value[&object].contains(&user) {
-            let success = editor.edit(|mut edit| {
-                // Create the constant collection object for allocation.
-                let object_ty = typing[object.idx()];
-                let object_cons = edit.add_zero_constant(object_ty);
-                let cons_node = edit.add_node(Node::Constant { id: object_cons });
+    changed
+}

-                // Create the clone into the new constant collection.
-                let clone_node = edit.add_node(Node::Write {
-                    collect: cons_node,
-                    data: object,
-                    indices: vec![].into_boxed_slice(),
-                });
+/*
+ * Helper function to induce a clone once an object with multiple users has been
+ * found.
+ */
+fn induce_clone(
+    editor: &mut FunctionEditor,
+    object: NodeID,
+    user: NodeID,
+    value: &BTreeMap<NodeID, BTreeSet<NodeID>>,
+    super_value: &BTreeMap<NodeID, BTreeSet<NodeID>>,
+    lattice: &Vec<BTreeMap<NodeID, BTreeSet<NodeID>>>,
+    rev_po: &Vec<NodeID>,
+    typing: &Vec<TypeID>,
+    control_subgraph: &Subgraph,
+    dom: &DomTree,
+    loops: &LoopTree,
+    bb_to_prefix_sum: &Vec<usize>,
+    bbs: &BasicBlocks,
+) {
+    // If `user` already used `object` and tries to use it again, then the
+    // clone is a "loop induced" clone. Otherwise, it's a simple clone.
+    if !value[&object].contains(&user) {
+        let success = editor.edit(|mut edit| {
+            // Create the constant collection object for allocation.
+            let object_ty = typing[object.idx()];
+            let object_cons = edit.add_zero_constant(object_ty);
+            let cons_node = edit.add_node(Node::Constant { id: object_cons });

-                // Make user use the cloned object.
-                edit.replace_all_uses_where(object, clone_node, |id| *id == user)
-            });
-            assert!(success);
-        } else {
-            // Figure out where to place that phi. This is the deepest
-            // loop header where `user` is responsible for making `object` used
-            // used at the top of the block, and the block dominates the block
-            // containing `user`. If `user` is a phi, then the region it's
-            // attached to is excluded from eligibility.
-            let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
-                lattice[bb_to_prefix_sum[bb.idx()]]
-                    .get(&object)
-                    .unwrap_or(&BTreeSet::new())
-                    .contains(&user)
-                    && dom.does_dom(*bb, bbs.0[user.idx()])
-                    && loops.contains(*bb)
-                    && loops.is_in_loop(*bb, bbs.0[user.idx()])
-                    && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
+            // Create the clone into the new constant collection.
+            let clone_node = edit.add_node(Node::Write {
+                collect: cons_node,
+                data: object,
+                indices: vec![].into_boxed_slice(),
            });
-            let top_block = eligible_blocks
-                .max_by_key(|bb| loops.nesting(*bb).unwrap())
-                .unwrap();
-            assert!(editor.func().nodes[top_block.idx()].is_region());

-            // Figure out the users of `object` that we need to phi back
-            // upwards. Assign each user a number indicating how far down the
-            // user chain it is, higher is farther down. This is used for
-            // picking the most downstream user later.
-            let mut users: BTreeMap<NodeID, usize> = BTreeMap::new();
-            let mut workset: BTreeSet<NodeID> = BTreeSet::new();
-            workset.insert(object);
-            let mut chain_ordering = 1;
-            while let Some(pop) = workset.pop_first() {
-                let iterated_users: BTreeSet<_> = super_value
-                    .get(&pop)
-                    .map(|users| users.into_iter())
-                    .into_iter()
-                    .flatten()
-                    .map(|id| *id)
-                    .filter(|iterated_user| loops.is_in_loop(top_block, bbs.0[iterated_user.idx()]))
-                    .collect();
-                workset.extend(iterated_users.iter().filter(|id| !users.contains_key(id)));
-                for user in iterated_users {
-                    *users.entry(user).or_default() = chain_ordering;
-                    chain_ordering += 1;
-                }
+            // Make user use the cloned object.
+            edit.replace_all_uses_where(object, clone_node, |id| *id == user)
+        });
+        assert!(success);
+    } else {
+        // Figure out where to place that phi. This is the deepest
+        // loop header where `user` is responsible for making `object` used
+        // at the top of the block, and the block dominates the block
+        // containing `user`. If `user` is a phi, then the region it's
+        // attached to is excluded from eligibility.
+        let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
+            lattice[bb_to_prefix_sum[bb.idx()]]
+                .get(&object)
+                .unwrap_or(&BTreeSet::new())
+                .contains(&user)
+                && dom.does_dom(*bb, bbs.0[user.idx()])
+                && loops.contains(*bb)
+                && loops.is_in_loop(*bb, bbs.0[user.idx()])
+                && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
+        });
+        let top_block = eligible_blocks
+            .max_by_key(|bb| loops.nesting(*bb).unwrap())
+            .unwrap();
+        assert!(editor.func().nodes[top_block.idx()].is_region());
+
+        // Figure out the users of `object` that we need to phi back
+        // upwards. Assign each user a number indicating how far down the
+        // user chain it is, higher is farther down. This is used for
+        // picking the most downstream user later.
+        let mut users: BTreeMap<NodeID, usize> = BTreeMap::new();
+        let mut workset: BTreeSet<NodeID> = BTreeSet::new();
+        workset.insert(object);
+        let mut chain_ordering = 1;
+        assert!(!super_value.is_empty());
+        while let Some(pop) = workset.pop_first() {
+            let iterated_users: BTreeSet<_> = super_value
+                .get(&pop)
+                .map(|users| users.into_iter())
+                .into_iter()
+                .flatten()
+                .map(|id| *id)
+                .filter(|iterated_user| loops.is_in_loop(top_block, bbs.0[iterated_user.idx()]))
+                .collect();
+            workset.extend(iterated_users.iter().filter(|id| !users.contains_key(id)));
+            for user in iterated_users {
+                *users.entry(user).or_default() = chain_ordering;
+                chain_ordering += 1;
            }
+        }

-            // The fringe users may not dominate any predecessors of the loop
-            // header. The following is some Juno code that exposes this:
-            //
-            // fn problematic(a : size) -> i32 {
-            //   for i = 0 to a {
-            //     let arr : i32[1];
-            //     for j = 0 to a {
-            //       arr[0] = 1;
-            //     }
-            //   }
-            //   return 0;
-            // }
-            //
-            // Note that `arr` induces a clone each iteration, since its value
-            // needs to be reset to all zeros. However, it should also be noted
-            // that the most fringe user of `arr`, the write inside the inner
-            // loop, does not dominate the bottom of the outer loop. Thus, we
-            // need to insert a phi in the bottom block of the outer loop to
-            // retrieve either the write, or `arr` before the inner loop. The
-            // general version of this problem requires the following solution.
-            // Our goal is to figure out which downstream user represents
-            // `object` at each block in the loop. We first assign each block
-            // containing a user the most downstream user it contains. Then, we
-            // create a dummy phi for every region (including the header) in the
-            // loop, which is the downstream user for that block. Then, every
-            // other block is assigned the downstream user of its single
-            // predecessor. This basically amounts to recreating SSA for
-            // `object` inside the loop.
-            let mut user_per_loop_bb = BTreeMap::new();
-            let mut added_phis = BTreeMap::new();
-            let mut top_phi = NodeID::new(0);
-            // Assign existing users.
-            for (user, ordering) in users.iter() {
-                let bb = bbs.0[user.idx()];
-                if let Some(old_user) = user_per_loop_bb.get(&bb)
-                    && users[old_user] > *ordering
-                {
-                } else {
-                    user_per_loop_bb.insert(bb, *user);
-                }
+        // The fringe users may not dominate any predecessors of the loop
+        // header. The following is some Juno code that exposes this:
+        //
+        // fn problematic(a : size) -> i32 {
+        //   for i = 0 to a {
+        //     let arr : i32[1];
+        //     for j = 0 to a {
+        //       arr[0] = 1;
+        //     }
+        //   }
+        //   return 0;
+        // }
+        //
+        // Note that `arr` induces a clone each iteration, since its value
+        // needs to be reset to all zeros. However, it should also be noted
+        // that the most fringe user of `arr`, the write inside the inner
+        // loop, does not dominate the bottom of the outer loop. Thus, we
+        // need to insert a phi in the bottom block of the outer loop to
+        // retrieve either the write, or `arr` before the inner loop. The
+        // general version of this problem requires the following solution.
+        // Our goal is to figure out which downstream user represents
+        // `object` at each block in the loop. We first assign each block
+        // containing a user the most downstream user it contains. Then, we
+        // create a dummy phi for every region (including the header) in the
+        // loop, which is the downstream user for that block. Then, every
+        // other block is assigned the downstream user of its single
+        // predecessor. This basically amounts to recreating SSA for
+        // `object` inside the loop.
+        let mut user_per_loop_bb = BTreeMap::new();
+        let mut added_phis = BTreeMap::new();
+        let mut top_phi = NodeID::new(0);
+        // Assign existing users.
+        for (user, ordering) in users.iter() {
+            let bb = bbs.0[user.idx()];
+            if let Some(old_user) = user_per_loop_bb.get(&bb)
+                && users[old_user] > *ordering
+            {
+            } else {
+                user_per_loop_bb.insert(bb, *user);
            }
-            // Assign dummy phis.
-            for bb in loops.nodes_in_loop(top_block) {
-                if (!user_per_loop_bb.contains_key(&bb) || bb == top_block)
-                    && editor.func().nodes[bb.idx()].is_region()
-                {
-                    let success = editor.edit(|mut edit| {
-                        let phi_node = edit.add_node(Node::Phi {
-                            control: bb,
-                            data: empty().collect(),
-                        });
-                        if bb != top_block || !user_per_loop_bb.contains_key(&bb) {
-                            user_per_loop_bb.insert(bb, phi_node);
-                        }
-                        if bb == top_block {
-                            top_phi = phi_node;
-                        }
-                        added_phis.insert(phi_node, bb);
-                        Ok(edit)
+        }
+        // Assign dummy phis.
+        for bb in loops.nodes_in_loop(top_block) {
+            if (!user_per_loop_bb.contains_key(&bb) || bb == top_block)
+                && editor.func().nodes[bb.idx()].is_region()
+            {
+                let success = editor.edit(|mut edit| {
+                    let phi_node = edit.add_node(Node::Phi {
+                        control: bb,
+                        data: empty().collect(),
                    });
-                    assert!(success);
-                }
+                    if bb != top_block || !user_per_loop_bb.contains_key(&bb) {
+                        user_per_loop_bb.insert(bb, phi_node);
+                    }
+                    if bb == top_block {
+                        top_phi = phi_node;
+                    }
+                    added_phis.insert(phi_node, bb);
+                    Ok(edit)
+                });
+                assert!(success);
            }
-            // Assign users for the rest of the blocks.
-            for bb in rev_po.iter().filter(|bb| loops.is_in_loop(top_block, **bb)) {
-                if !user_per_loop_bb.contains_key(&bb) {
-                    assert!(control_subgraph.preds(*bb).count() == 1);
-                    user_per_loop_bb.insert(
-                        *bb,
-                        user_per_loop_bb[&control_subgraph.preds(*bb).next().unwrap()],
-                    );
-                }
+        }
+        // Assign users for the rest of the blocks.
+        for bb in rev_po.iter().filter(|bb| loops.is_in_loop(top_block, **bb)) {
+            if !user_per_loop_bb.contains_key(&bb) {
+                assert!(control_subgraph.preds(*bb).count() == 1);
+                user_per_loop_bb.insert(
+                    *bb,
+                    user_per_loop_bb[&control_subgraph.preds(*bb).next().unwrap()],
+                );
            }
+        }

-            // Induce the clone.
-            let success = editor.edit(|mut edit| {
-                // Create the constant collection object for allocation.
-                let object_ty = typing[object.idx()];
-                let object_cons = edit.add_zero_constant(object_ty);
-                let cons_node = edit.add_node(Node::Constant { id: object_cons });
+        // Induce the clone.
+        let success = editor.edit(|mut edit| {
+            // Create the constant collection object for allocation.
+            let object_ty = typing[object.idx()];
+            let object_cons = edit.add_zero_constant(object_ty);
+            let cons_node = edit.add_node(Node::Constant { id: object_cons });

-                // Create the phis.
-                let mut phi_map = BTreeMap::new();
-                let mut real_phis = BTreeSet::new();
-                for (dummy, bb) in added_phis {
-                    let real = edit.add_node(Node::Phi {
-                        control: bb,
-                        data: control_subgraph
-                            .preds(bb)
-                            .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
-                            .collect(),
-                    });
-                    phi_map.insert(dummy, real);
-                    real_phis.insert(real);
-                }
-
-                // Create the clone into the phi.
-                let real_top_phi = phi_map[&top_phi];
-                let clone_node = edit.add_node(Node::Write {
-                    collect: real_top_phi,
-                    data: object,
-                    indices: vec![].into_boxed_slice(),
+            // Create the phis.
+            let mut phi_map = BTreeMap::new();
+            let mut real_phis = BTreeSet::new();
+            for (dummy, bb) in added_phis {
+                let real = edit.add_node(Node::Phi {
+                    control: bb,
+                    data: control_subgraph
+                        .preds(bb)
+                        .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
+                        .collect(),
                });
+                phi_map.insert(dummy, real);
+                real_phis.insert(real);
+            }

-                // Make users use the cloned object.
-                edit = edit.replace_all_uses_where(object, clone_node, |id| {
-                    id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
-                })?;
+            // Create the clone into the phi.
+            let real_top_phi = phi_map[&top_phi];
+            let clone_node = edit.add_node(Node::Write {
+                collect: real_top_phi,
+                data: object,
+                indices: vec![].into_boxed_slice(),
+            });

-                // Get rid of the dummy phis.
-                for (dummy, real) in phi_map {
-                    edit = edit.replace_all_uses(dummy, real)?;
-                    edit = edit.delete_node(dummy)?;
-                }
+            // Make users use the cloned object.
+            edit = edit.replace_all_uses_where(object, clone_node, |id| {
+                id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
+            })?;

-                // Make phis use the clone instead of the top phi.
-                edit =
-                    edit.replace_all_uses_where(real_top_phi, clone_node, |id| *id != clone_node)?;
+            // Get rid of the dummy phis.
+            for (dummy, real) in phi_map {
+                edit = edit.replace_all_uses(dummy, real)?;
+                edit = edit.delete_node(dummy)?;
+            }

-                Ok(edit)
-            });
-            assert!(success);
+            // Make phis use the clone instead of the top phi.
+            edit.replace_all_uses_where(real_top_phi, clone_node, |id| *id != clone_node)
+        });
+        assert!(success);

-            // De-duplicate phis.
-            gvn(editor, false);
+        // De-duplicate phis.
+        gvn(editor, false);

-            // Get rid of unused phis.
-            dce(editor);
+        // Get rid of unused phis.
+        dce(editor);

-            // Simplify phis.
-            phi_elim(editor);
-        }
-    };
+        // Simplify phis.
+        phi_elim(editor);
+    }
+}

-    // Now that we've computed the used collections dataflow analysis, use the
-    // results to materialize a clone whenever a node attempts to use an already
-    // used node. As soon as any clone is found, return since that clone needs
-    // to get placed before other clones can be discovered. Traverse blocks in
-    // postorder so that clones inside loops are discovered before loop-induced
-    // clones.
+/*
+ * Helper function to analyze the lattice values at each program point and find
+ * writes that have multiple dynamic users. As soon as one such use is found, a
+ * clone is induced for it and `true` is returned.
+ */
+fn find_clones(
+    editor: &mut FunctionEditor,
+    super_value: &BTreeMap<NodeID, BTreeSet<NodeID>>,
+    lattice: &Vec<BTreeMap<NodeID, BTreeSet<NodeID>>>,
+    rev_po: &Vec<NodeID>,
+    typing: &Vec<TypeID>,
+    control_subgraph: &Subgraph,
+    dom: &DomTree,
+    loops: &LoopTree,
+    objects: &CollectionObjects,
+    bb_to_prefix_sum: &Vec<usize>,
+    bbs: &BasicBlocks,
+) -> bool {
+    let nodes = &editor.func().nodes;
+    let func_id = editor.func_id();
    for bb in rev_po.iter().rev() {
        let insts = &bbs.1[bb.idx()];
        // Accumulate predecessor bottom used sets for phis. Phis are special in
@@ -955,7 +1055,21 @@ fn materialize_clones(
                            bottom.clone()
                        });
                        if bottom.contains_key(&arg) {
-                            induce_clone(*arg, *inst, bottom);
+                            induce_clone(
+                                editor,
+                                *arg,
+                                *inst,
+                                bottom,
+                                super_value,
+                                lattice,
+                                rev_po,
+                                typing,
+                                control_subgraph,
+                                dom,
+                                loops,
+                                bb_to_prefix_sum,
+                                bbs,
+                            );
                            return true;
                        } else {
                            // Subsequent phis using `arg` along the same
@@ -971,11 +1085,39 @@ fn materialize_clones(
                    third,
                } => {
                    if value.contains_key(&second) {
-                        induce_clone(second, *inst, &value);
+                        induce_clone(
+                            editor,
+                            second,
+                            *inst,
+                            &value,
+                            super_value,
+                            lattice,
+                            rev_po,
+                            typing,
+                            control_subgraph,
+                            dom,
+                            loops,
+                            bb_to_prefix_sum,
+                            bbs,
+                        );
                        return true;
                    }
                    if value.contains_key(&third) {
-                        induce_clone(third, *inst, &value);
+                        induce_clone(
+                            editor,
+                            third,
+                            *inst,
+                            &value,
+                            super_value,
+                            lattice,
+                            rev_po,
+                            typing,
+                            control_subgraph,
+                            dom,
+                            loops,
+                            bb_to_prefix_sum,
+                            bbs,
+                        );
                        return true;
                    }
                }
@@ -985,7 +1127,21 @@ fn materialize_clones(
                    reduct: _,
                } => {
                    if value.contains_key(&init) {
-                        induce_clone(init, *inst, &value);
+                        induce_clone(
+                            editor,
+                            init,
+                            *inst,
+                            &value,
+                            super_value,
+                            lattice,
+                            rev_po,
+                            typing,
+                            control_subgraph,
+                            dom,
+                            loops,
+                            bb_to_prefix_sum,
+                            bbs,
+                        );
                        return true;
                    }
                }
@@ -994,7 +1150,21 @@ fn materialize_clones(
                    indices: _,
                } if !objects[&func_id].objects(*inst).is_empty() => {
                    if value.contains_key(&collect) {
-                        induce_clone(collect, *inst, &value);
+                        induce_clone(
+                            editor,
+                            collect,
+                            *inst,
+                            &value,
+                            super_value,
+                            lattice,
+                            rev_po,
+                            typing,
+                            control_subgraph,
+                            dom,
+                            loops,
+                            bb_to_prefix_sum,
+                            bbs,
+                        );
                        return true;
                    }
                }
@@ -1004,7 +1174,21 @@ fn materialize_clones(
                    indices: _,
                } => {
                    if value.contains_key(&collect) {
-                        induce_clone(collect, *inst, &value);
+                        induce_clone(
+                            editor,
+                            collect,
+                            *inst,
+                            &value,
+                            super_value,
+                            lattice,
+                            rev_po,
+                            typing,
+                            control_subgraph,
+                            dom,
+                            loops,
+                            bb_to_prefix_sum,
+                            bbs,
+                        );
                        return true;
                    }
                }
@@ -1025,7 +1209,21 @@ fn materialize_clones(
                            .unwrap_or(false)
                            && value.contains_key(arg)
                        {
-                            induce_clone(*arg, *inst, value);
+                            induce_clone(
+                                editor,
+                                *arg,
+                                *inst,
+                                value,
+                                super_value,
+                                lattice,
+                                rev_po,
+                                typing,
+                                control_subgraph,
+                                dom,
+                                loops,
+                                bb_to_prefix_sum,
+                                bbs,
+                            );
                            return true;
                        }
                    }

--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -64,16 +64,32 @@ fn tricky_loop_implicit_clone(a : usize, b : usize) -> i32 {
 fn tricky2_loop_implicit_clone(a : usize, b : usize) -> i32 {
  let x = 0;
  for i = 0 to 3 {
-    let arr : i32[1];
+    let arr1 : i32[1];
+    let arr2 : i32[1];
    if a == b {
-      arr[0] = 6;
+      arr1[0] = 6;
    } else {
-      arr[0] = 9;
+      arr2[0] = 9;
    }
+    arr1[0] = 2;
    for j = 0 to 4 {
-      arr[0] += 1;
+      arr2[0] += 1;
+    }
+    x += arr2[0];
+  }
+  return x;
+}
+
+#[entry]
+fn tricky3_loop_implicit_clone(a : usize, b : usize) -> usize {
+  let x = 0;
+  for i = 0 to b {
+    let arr1 : usize[10];
+    let arr2 : usize[10];
+    arr1[1] = 1;
+    for kk = 0 to 10 {
+      arr2[kk] += arr1[kk];
    }
-    x += arr[0];
  }
  return x;
 }

--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -27,6 +27,10 @@ fn main() {
        println!("{}", output);
        assert_eq!(output, 39);

+        let output = tricky3_loop_implicit_clone(5, 7).await;
+        println!("{}", output);
+        assert_eq!(output, 0);
+
        let output = no_implicit_clone(4).await;
        println!("{}", output);
        assert_eq!(output, 13);