From 525ac54fa89972926ca0f3852176586c45e7e5b4 Mon Sep 17 00:00:00 2001
From: rarbore2 <rarbore2@illinois.edu>
Date: Wed, 8 Jan 2025 12:57:53 -0600
Subject: [PATCH] Proper loop induced clone detection

---
 hercules_ir/src/dom.rs                        |  20 +
 hercules_ir/src/ir.rs                         |  13 +
 hercules_opt/src/dce.rs                       |   3 +-
 hercules_opt/src/gvn.rs                       |  12 +-
 .../src/legalize_reference_semantics.rs       | 405 +++++++++++++-----
 hercules_opt/src/pass.rs                      |   8 +-
 juno_frontend/src/lib.rs                      |   2 +
 .../implicit_clone/src/implicit_clone.jn      |  51 +++
 juno_samples/implicit_clone/src/main.rs       |  12 +
 juno_samples/matmul/src/matmul.jn             |  11 +-
 10 files changed, 421 insertions(+), 116 deletions(-)

diff --git a/hercules_ir/src/dom.rs b/hercules_ir/src/dom.rs
index f9046b43..2c0f085d 100644
--- a/hercules_ir/src/dom.rs
+++ b/hercules_ir/src/dom.rs
@@ -77,6 +77,26 @@ impl DomTree {
         .1
     }
 
+    /*
+     * Find the node with the shallowest level in the dom tree amongst the nodes
+     * given.
+     */
+    pub fn shallowest_amongst<I>(&self, x: I) -> NodeID
+    where
+        I: Iterator<Item = NodeID>,
+    {
+        x.map(|x| {
+            if x == self.root {
+                (0, x)
+            } else {
+                (self.idom[&x].0, x)
+            }
+        })
+        .min_by(|x, y| x.0.cmp(&y.0))
+        .unwrap()
+        .1
+    }
+
     /*
      * Find the least common ancestor in the tree of two nodes. This is an
      * ancestor of the two nodes that is as far down the tree as possible.
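As a rough illustration of what `shallowest_amongst` computes, here is a standalone sketch (not hercules_ir code; it uses a plain depth map and `u32` ids in place of the crate's `DomTree` and `NodeID`):

    use std::collections::HashMap;

    // Standalone sketch: given each node's depth in the dominator tree (the
    // root is at depth 0), pick the candidate closest to the root. The real
    // method reads this depth out of `idom[&x].0` and special-cases the root,
    // which is not stored in the `idom` map.
    fn shallowest_amongst(depth: &HashMap<u32, u32>, candidates: impl Iterator<Item = u32>) -> u32 {
        candidates
            .map(|x| (depth[&x], x))
            .min_by_key(|(level, _)| *level)
            .unwrap()
            .1
    }

    fn main() {
        let depth = HashMap::from([(0, 0), (1, 1), (2, 2), (3, 2)]);
        assert_eq!(shallowest_amongst(&depth, [2, 3, 1].into_iter()), 1);
    }
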
diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs
index 11d23e61..8bb8e4ef 100644
--- a/hercules_ir/src/ir.rs
+++ b/hercules_ir/src/ir.rs
@@ -1296,6 +1296,19 @@ impl Node {
         }
     }
 
+    pub fn try_write(&self) -> Option<(NodeID, NodeID, &[Index])> {
+        if let Node::Write {
+            collect,
+            data,
+            indices,
+        } = self
+        {
+            Some((*collect, *data, indices))
+        } else {
+            None
+        }
+    }
+
     pub fn is_zero_constant(&self, constants: &Vec<Constant>) -> bool {
         if let Node::Constant { id } = self
             && constants[id.idx()].is_zero()
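A short usage sketch for the new accessor (the `node` binding below is illustrative, not from this patch): the GVN change further down relies on exactly this shape to recognize a clone, i.e. a write with an empty index list that copies a whole collection.

    // Illustrative fragment, assuming some `node: &Node` is in scope.
    let is_clone = node
        .try_write()
        .map(|(_collect, _data, indices)| indices.is_empty())
        .unwrap_or(false);
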
diff --git a/hercules_opt/src/dce.rs b/hercules_opt/src/dce.rs
index 75268694..839e9ad1 100644
--- a/hercules_opt/src/dce.rs
+++ b/hercules_opt/src/dce.rs
@@ -6,8 +6,7 @@ use self::hercules_ir::ir::*;
 use crate::*;
 
 /*
- * Top level function to run dead code elimination. Deletes nodes by setting
- * nodes to gravestones. Works with a function already containing gravestones.
+ * Top level function to run dead code elimination.
  */
 pub fn dce(editor: &mut FunctionEditor) {
     // Create worklist (starts as all nodes).
diff --git a/hercules_opt/src/gvn.rs b/hercules_opt/src/gvn.rs
index a9db0cd8..5f7138e5 100644
--- a/hercules_opt/src/gvn.rs
+++ b/hercules_opt/src/gvn.rs
@@ -11,7 +11,7 @@ use crate::*;
  * fairly simple compared to in a normal CFG. Needs access to constants for
  * identity function simplification.
  */
-pub fn gvn(editor: &mut FunctionEditor) {
+pub fn gvn(editor: &mut FunctionEditor, gvn_constants_and_clones: bool) {
     // Create worklist (starts as all nodes) and value number hashmap.
     let mut worklist: Vec<NodeID> = (0..editor.func().nodes.len()).map(NodeID::new).collect();
     let mut value_numbers: HashMap<Node, NodeID> = HashMap::new();
@@ -28,7 +28,15 @@ pub fn gvn(editor: &mut FunctionEditor) {
         // Next, check if there is a value number for this simplified value yet.
         if let Some(number) = value_numbers.get(&editor.func().nodes[value.idx()]) {
             // If the number is this worklist item, there's nothing to be done.
-            if *number == work {
+            // Also, don't GVN constants and clones (whole-collection writes)
+            // when `gvn_constants_and_clones` is false.
+            if *number == work
+                || (!gvn_constants_and_clones
+                    && (editor.func().nodes[work.idx()].is_constant()
+                        || editor.func().nodes[work.idx()]
+                            .try_write()
+                            .map(|(_, _, indices)| indices.is_empty())
+                            .unwrap_or(false)))
+            {
                 continue;
             }
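To make the new exemption concrete, here is a standalone model of the check (illustrative only; the enum below is a stripped-down stand-in for hercules_ir's `Node`, with `u32` ids and plain index lists). Value-numbering structurally identical zero constants and clones would merge allocations that clone analysis needs to keep distinct, which is why callers below pass `false`.

    // Standalone model of the exemption (not the real hercules_ir types).
    #[allow(dead_code)]
    enum Node {
        Constant { id: u32 },
        Write { collect: u32, data: u32, indices: Vec<u32> },
        Add { left: u32, right: u32 },
    }

    // A node is exempt from GVN when `gvn_constants_and_clones` is false if it
    // is a constant or a clone, i.e. a write with no indices that overwrites an
    // entire collection.
    fn exempt_from_gvn(node: &Node) -> bool {
        match node {
            Node::Constant { .. } => true,
            Node::Write { indices, .. } => indices.is_empty(),
            _ => false,
        }
    }

    fn main() {
        assert!(exempt_from_gvn(&Node::Constant { id: 0 }));
        assert!(exempt_from_gvn(&Node::Write { collect: 1, data: 2, indices: vec![] }));
        assert!(!exempt_from_gvn(&Node::Write { collect: 1, data: 2, indices: vec![0] }));
        assert!(!exempt_from_gvn(&Node::Add { left: 3, right: 4 }));
    }
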
 
diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index 254524f9..3c89c7da 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -2,9 +2,8 @@ extern crate bitvec;
 extern crate hercules_cg;
 extern crate hercules_ir;
 
-use std::collections::{BTreeSet, HashMap, VecDeque};
+use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
 use std::iter::{empty, once, zip, FromIterator};
-use std::mem::take;
 
 use self::bitvec::prelude::*;
 
@@ -70,11 +69,11 @@ pub fn legalize_reference_semantics(
     ) {
         Ok(bbs) => bbs,
         Err((obj, reader)) => {
-            induce_clone(editor, typing, obj, reader);
+            todo!();
             return None;
         }
     };
-    if materialize_clones(editor, typing, control_subgraph, objects, &bbs) {
+    if materialize_clones(editor, typing, control_subgraph, dom, loops, objects, &bbs) {
         None
     } else {
         Some(bbs)
@@ -223,6 +222,7 @@ fn basic_blocks(
                             || root_block_iterated_users.contains(&mutator))
                         && mutating_objects(function, func_id, *mutator, objects)
                             .any(|mutated| read_objs.contains(&mutated))
+                        && id != mutator
                     {
                         antideps.insert((*id, *mutator));
                     }
@@ -534,6 +534,8 @@ fn materialize_clones(
     editor: &mut FunctionEditor,
     typing: &Vec<TypeID>,
     control_subgraph: &Subgraph,
+    dom: &DomTree,
+    loops: &LoopTree,
     objects: &CollectionObjects,
     bbs: &BasicBlocks,
 ) -> bool {
@@ -543,7 +545,8 @@ fn materialize_clones(
     // This is the only place this form is used, so just hardcode it here.
     //
     // This forward dataflow analysis tracks which collections are used at each
-    // program point. Collections are referred to using node IDs. Specifically:
+    // program point, and which nodes may use them. Collections are referred to
+    // using node IDs. Specifically:
     //
     // - Phi - a phi node adds its inputs to the used set and removes itself
     //   from the used set. If a phi uses an ID that is used along the edge of
@@ -567,11 +570,11 @@ fn materialize_clones(
     //   to the used set and removes itself from the used set. If any mutated
     //   input is already used, a clone is induced.
     //
-    // Reads of sub-collections (select, read, and call nodes) use a collection
-    // because they may have downstream writes that depend on the new "view" of
-    // the same collection. This does not include reads that "end" (the `data`
-    // input of a write). This analysis does not consider parallel mutations in
-    // fork-joins, which are handled separately later in this function.
+    // Reads of sub-collections (select, some read, and some call nodes) use a
+    // collection because they may have downstream writes that depend on the new
+    // "sub-view" of the same collection. This does not include reads that "end"
+    // (most reads, some calls, the `data` input of a write). This analysis does
+    // not consider parallel mutations in fork-joins.
     let rev_po = control_subgraph.rev_po(NodeID::new(0));
     let mut total_num_pts = 0;
     let mut bb_to_prefix_sum = vec![0; bbs.0.len()];
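The map-of-users lattice described above can be modeled in isolation as follows (a sketch with `u32` in place of `NodeID`; the real lattice and its meet are defined in the next hunk). Each program point maps every possibly-used collection to the set of nodes that may have used it, and the meet is a key-wise union of user sets, so the analysis only ever grows toward a fixed point.

    use std::collections::{BTreeMap, BTreeSet};

    // One lattice value: used collection -> nodes that may have used it.
    type Used = BTreeMap<u32, BTreeSet<u32>>;

    // Meet of two program points: key-wise union of the user sets.
    fn meet(left: &mut Used, right: &Used) {
        for (used, users) in right {
            left.entry(*used).or_default().extend(users.iter().copied());
        }
    }

    fn main() {
        let mut a: Used = BTreeMap::from([(10, BTreeSet::from([1]))]);
        let b: Used = BTreeMap::from([(10, BTreeSet::from([2])), (11, BTreeSet::from([3]))]);
        meet(&mut a, &b);
        assert_eq!(a[&10], BTreeSet::from([1, 2]));
        assert_eq!(a[&11], BTreeSet::from([3]));
    }
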
@@ -581,82 +584,85 @@ fn materialize_clones(
             total_num_pts += insts.len() + 1;
         }
     }
-    // Lattice maps each program point to a set of used values. Top is that no
-    // nodes are used yet.
+    // Lattice maps each program point to a set of used values and their
+    // possible users. Top is that no nodes are used yet.
     let nodes = &editor.func().nodes;
     let func_id = editor.func_id();
-    let mut lattice: Vec<BTreeSet<NodeID>> = vec![BTreeSet::new(); total_num_pts];
+    let meet = |left: &mut BTreeMap<NodeID, BTreeSet<NodeID>>,
+                right: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
+        for (used, users) in right.into_iter() {
+            left.entry(*used).or_default().extend(users.into_iter());
+        }
+    };
+    let mut lattice: Vec<BTreeMap<NodeID, BTreeSet<NodeID>>> = vec![BTreeMap::new(); total_num_pts];
     loop {
         let mut changed = false;
 
         for bb in rev_po.iter() {
             // The lattice value of the first point is the meet of the
             // predecessor terminating lattice values.
-            let mut top_value = take(&mut lattice[bb_to_prefix_sum[bb.idx()]]);
+            let old_top_value = &lattice[bb_to_prefix_sum[bb.idx()]];
+            let mut new_top_value = BTreeMap::new();
-            // Clearing `top_value` is not necessary since used nodes are never
-            // removed from lattice values, only added.
             for pred in control_subgraph.preds(*bb) {
-                // It should not be possible in Hercules IR for a basic block to
-                // be one of its own predecessors.
-                assert_ne!(*bb, pred);
                 let last_pt = bbs.1[pred.idx()].len();
-                for elem in lattice[bb_to_prefix_sum[pred.idx()] + last_pt].iter() {
-                    changed |= top_value.insert(*elem);
-                }
+                meet(
+                    &mut new_top_value,
+                    &lattice[bb_to_prefix_sum[pred.idx()] + last_pt],
+                );
             }
-            lattice[bb_to_prefix_sum[bb.idx()]] = top_value;
+            changed |= *old_top_value != new_top_value;
+            lattice[bb_to_prefix_sum[bb.idx()]] = new_top_value;
 
             // The lattice value of following points are determined by their
             // immediate preceding instructions.
             let insts = &bbs.1[bb.idx()];
             for (prev_pt, inst) in insts.iter().enumerate() {
-                let mut new_value = take(&mut lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1]);
+                let old_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1];
                 let prev_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
+                let mut new_value = prev_value.clone();
                 match nodes[inst.idx()] {
                     Node::Phi {
                         control: _,
                         ref data,
                     } if !objects[&func_id].objects(*inst).is_empty() => {
                         for elem in data {
-                            changed |= new_value.insert(*elem);
+                            new_value.entry(*elem).or_default().insert(*inst);
                         }
-                        changed |= new_value.remove(inst);
+                        new_value.remove(inst);
                     }
                     Node::Ternary {
                         op: TernaryOperator::Select,
                         first: _,
                         second,
                         third,
+                    }
+                    | Node::Reduce {
+                        control: _,
+                        init: second,
+                        reduct: third,
                     } => {
                         if !objects[&func_id].objects(*inst).is_empty() {
-                            changed |= new_value.insert(second);
-                            changed |= new_value.insert(third);
-                            changed |= new_value.remove(inst);
+                            new_value.entry(second).or_default().insert(*inst);
+                            new_value.entry(third).or_default().insert(*inst);
+                            new_value.remove(inst);
                         }
                     }
-                    Node::Reduce {
-                        control: _,
-                        init,
-                        reduct,
-                    } if !objects[&func_id].objects(*inst).is_empty() => {
-                        changed |= new_value.insert(init);
-                        changed |= new_value.insert(reduct);
-                        changed |= new_value.remove(inst);
-                    }
                     Node::Read {
                         collect,
                         indices: _,
                     } if !objects[&func_id].objects(*inst).is_empty() => {
-                        changed |= new_value.insert(collect);
-                        changed |= new_value.remove(inst);
+                        new_value.entry(collect).or_default().insert(*inst);
+                        new_value.remove(inst);
                     }
                     Node::Write {
                         collect,
                         data: _,
                         indices: _,
                     } => {
-                        changed |= new_value.insert(collect);
-                        changed |= new_value.remove(inst);
+                        new_value.entry(collect).or_default().insert(*inst);
+                        new_value.remove(inst);
                     }
                     Node::Call {
                         control: _,
@@ -674,23 +680,21 @@ fn materialize_clones(
                                 })
                                 .unwrap_or(false)
                             {
-                                changed |= new_value.insert(*arg);
+                                new_value.entry(*arg).or_default().insert(*inst);
                             }
                         }
-                        changed |= new_value.remove(inst);
-                    }
-                    _ => {
-                        for elem in prev_value {
-                            changed |= new_value.insert(*elem);
-                        }
+                        new_value.remove(inst);
                     }
+                    _ => {}
                 }
+                changed |= *old_value != new_value;
                 lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1] = new_value;
             }
 
             // Handle reduces in this block specially at the very end.
             let last_pt = insts.len();
-            let mut bottom_value = take(&mut lattice[bb_to_prefix_sum[bb.idx()] + last_pt]);
+            let old_bottom_value = &lattice[bb_to_prefix_sum[bb.idx()] + last_pt];
+            let mut new_bottom_value = old_bottom_value.clone();
             for inst in insts.iter() {
                 if let Node::Reduce {
                     control: _,
@@ -699,27 +703,241 @@ fn materialize_clones(
                 } = nodes[inst.idx()]
                 {
                     assert!(
-                        bottom_value.contains(&reduct),
+                        new_bottom_value.contains_key(&reduct),
                         "PANIC: Can't handle clones inside a reduction cycle currently."
                     );
-                    changed |= bottom_value.remove(inst);
+                    new_bottom_value.remove(inst);
                 }
             }
-            lattice[bb_to_prefix_sum[bb.idx()] + last_pt] = bottom_value;
+            changed |= *old_bottom_value != new_bottom_value;
+            lattice[bb_to_prefix_sum[bb.idx()] + last_pt] = new_bottom_value;
         }
 
         if !changed {
             break;
         }
     }
+    let mut super_value = BTreeMap::new();
+    for value in lattice.iter() {
+        meet(&mut super_value, value);
+    }
+
+    // Helper to induce a clone when an implicit clone is identified.
+    let nodes = nodes.clone();
+    let mut induce_clone = |object: NodeID,
+                            user: NodeID,
+                            value: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
+        // If `user` already used `object` and tries to use it again, then the
+        // clone is a "loop induced" clone. Otherwise, it's a simple clone.
+        if !value[&object].contains(&user) {
+            let success = editor.edit(|mut edit| {
+                // Create the constant collection object for allocation.
+                let object_ty = typing[object.idx()];
+                let object_cons = edit.add_zero_constant(object_ty);
+                let cons_node = edit.add_node(Node::Constant { id: object_cons });
+
+                // Create the clone into the new constant collection.
+                let clone_node = edit.add_node(Node::Write {
+                    collect: cons_node,
+                    data: object,
+                    indices: vec![].into_boxed_slice(),
+                });
+
+                // Make user use the cloned object.
+                edit.replace_all_uses_where(object, clone_node, |id| *id == user)
+            });
+            assert!(success);
+        } else {
+            // Figure out where to place the phi for the loop induced clone.
+            // This is the deepest loop header where `user` is responsible for
+            // making `object` used at the top of the block and where the block
+            // dominates the block containing `user`. If `user` is a phi, then
+            // the region it's attached to is excluded from eligibility.
+            let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
+                lattice[bb_to_prefix_sum[bb.idx()]]
+                    .get(&object)
+                    .unwrap_or(&BTreeSet::new())
+                    .contains(&user)
+                    && dom.does_dom(*bb, bbs.0[user.idx()])
+                    && loops.contains(*bb)
+                    && loops.is_in_loop(*bb, bbs.0[user.idx()])
+                    && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
+            });
+            let top_block = eligible_blocks
+                .max_by_key(|bb| loops.nesting(*bb).unwrap())
+                .unwrap();
+            assert!(editor.func().nodes[top_block.idx()].is_region());
+
+            // Figure out the users of `object` that we need to phi back
+            // upwards. Assign each user a number indicating how far down the
+            // user chain it is, higher is farther down. This is used for
+            // picking the most downstream user later.
+            let mut users: BTreeMap<NodeID, usize> = BTreeMap::new();
+            let mut workset: BTreeSet<NodeID> = BTreeSet::new();
+            workset.insert(object);
+            let mut chain_ordering = 1;
+            while let Some(pop) = workset.pop_first() {
+                let iterated_users: BTreeSet<_> = super_value
+                    .get(&pop)
+                    .map(|users| users.into_iter())
+                    .into_iter()
+                    .flatten()
+                    .map(|id| *id)
+                    .filter(|iterated_user| loops.is_in_loop(top_block, bbs.0[iterated_user.idx()]))
+                    .collect();
+                workset.extend(iterated_users.iter().filter(|id| !users.contains_key(*id)));
+                for user in iterated_users {
+                    *users.entry(user).or_default() = chain_ordering;
+                    chain_ordering += 1;
+                }
+            }
+
+            // The fringe users may not dominate any predecessors of the loop
+            // header. The following is some Juno code that exposes this:
+            //
+            // fn problematic(a : size) -> i32 {
+            //   for i = 0 to a {
+            //     let arr : i32[1];
+            //     for j = 0 to a {
+            //       arr[0] = 1;
+            //     }
+            //   }
+            //   return 0;
+            // }
+            //
+            // Note that `arr` induces a clone each iteration, since its value
+            // needs to be reset to all zeros. However, the most fringe user of
+            // `arr`, the write inside the inner loop, does not dominate the
+            // bottom of the outer loop. Thus, we
+            // need to insert a phi in the bottom block of the outer loop to
+            // retrieve either the write, or `arr` before the inner loop. The
+            // general version of this problem requires the following solution.
+            // Our goal is to figure out which downstream user represents
+            // `object` at each block in the loop. We first assign each block
+            // containing a user the most downstream user it contains. Then, we
+            // create a dummy phi for every region (including the header) in the
+            // loop, which is the downstream user for that block. Then, every
+            // other block is assigned the downstream user of its single
+            // predecessor. This basically amounts to recreating SSA for
+            // `object` inside the loop.
+            let mut user_per_loop_bb = BTreeMap::new();
+            let mut added_phis = BTreeMap::new();
+            let mut top_phi = NodeID::new(0);
+            // Assign existing users.
+            for (user, ordering) in users.iter() {
+                let bb = bbs.0[user.idx()];
+                if let Some(old_user) = user_per_loop_bb.get(&bb)
+                    && users[old_user] > *ordering
+                {
+                } else {
+                    user_per_loop_bb.insert(bb, *user);
+                }
+            }
+            // Assign dummy phis.
+            for bb in loops.nodes_in_loop(top_block) {
+                if (!user_per_loop_bb.contains_key(&bb) || bb == top_block)
+                    && editor.func().nodes[bb.idx()].is_region()
+                {
+                    let success = editor.edit(|mut edit| {
+                        let phi_node = edit.add_node(Node::Phi {
+                            control: bb,
+                            data: empty().collect(),
+                        });
+                        if bb != top_block || !user_per_loop_bb.contains_key(&bb) {
+                            user_per_loop_bb.insert(bb, phi_node);
+                        }
+                        if bb == top_block {
+                            top_phi = phi_node;
+                        }
+                        added_phis.insert(phi_node, bb);
+                        Ok(edit)
+                    });
+                    assert!(success);
+                }
+            }
+            // Assign users for the rest of the blocks.
+            for bb in rev_po.iter().filter(|bb| loops.is_in_loop(top_block, **bb)) {
+                if !user_per_loop_bb.contains_key(bb) {
+                    assert!(control_subgraph.preds(*bb).count() == 1);
+                    user_per_loop_bb.insert(
+                        *bb,
+                        user_per_loop_bb[&control_subgraph.preds(*bb).next().unwrap()],
+                    );
+                }
+            }
+
+            // Induce the clone.
+            let success = editor.edit(|mut edit| {
+                // Create the constant collection object for allocation.
+                let object_ty = typing[object.idx()];
+                let object_cons = edit.add_zero_constant(object_ty);
+                let cons_node = edit.add_node(Node::Constant { id: object_cons });
+
+                // Create the phis.
+                let mut phi_map = BTreeMap::new();
+                let mut real_phis = BTreeSet::new();
+                for (dummy, bb) in added_phis {
+                    let real = edit.add_node(Node::Phi {
+                        control: bb,
+                        data: control_subgraph
+                            .preds(bb)
+                            .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
+                            .collect(),
+                    });
+                    phi_map.insert(dummy, real);
+                    real_phis.insert(real);
+                }
+
+                // Create the clone into the phi.
+                let real_top_phi = phi_map[&top_phi];
+                let clone_node = edit.add_node(Node::Write {
+                    collect: real_top_phi,
+                    data: object,
+                    indices: vec![].into_boxed_slice(),
+                });
+
+                // Make users use the cloned object.
+                edit = edit.replace_all_uses_where(object, clone_node, |id| {
+                    id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
+                })?;
+
+                // Get rid of the dummy phis.
+                for (dummy, real) in phi_map {
+                    edit = edit.replace_all_uses(dummy, real)?;
+                    edit = edit.delete_node(dummy)?;
+                }
+
+                // Make phis use the clone instead of the top phi.
+                edit =
+                    edit.replace_all_uses_where(real_top_phi, clone_node, |id| *id != clone_node)?;
+
+                Ok(edit)
+            });
+            assert!(success);
+
+            // De-duplicate phis.
+            gvn(editor, false);
+
+            // Get rid of unused phis.
+            dce(editor);
+
+            // Simplify phis.
+            phi_elim(editor);
+        }
+    };
 
     // Now that we've computed the used collections dataflow analysis, use the
     // results to materialize a clone whenever a node attempts to use an already
-    // used node.
-    let mut any_induced = false;
-    let nodes = nodes.clone();
-    for bb in rev_po.iter() {
+    // used node. As soon as any clone is found, return, since that clone needs
+    // to be placed before further clones can be discovered. Traverse blocks in
+    // postorder so that clones inside loops are discovered before loop-induced
+    // clones.
+    for bb in rev_po.iter().rev() {
         let insts = &bbs.1[bb.idx()];
+        // Accumulate predecessor bottom used sets for phis. Phis are special in
+        // that they need to be path-sensitive, but multiple phis in a single
+        // block may use a single collection, and that needs to induce a clone.
+        let mut phi_acc_bottoms = BTreeMap::new();
         for (prev_pt, inst) in insts.iter().enumerate() {
             let value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
             match nodes[inst.idx()] {
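The user-numbering step above is the densest part of the loop induced clone logic, so here is a standalone model of just that step (illustrative: `u32` replaces `NodeID`, a plain map stands in for `super_value`, and the real code additionally filters users down to those whose blocks sit inside the target loop). Starting from the cloned object, it walks the used-by edges and hands out increasing ordering numbers, so the most downstream user in each block can be picked afterwards.

    use std::collections::{BTreeMap, BTreeSet};

    // Assign every transitive user of `object` an ordering number; later
    // numbers mean further downstream. Revisited users get bumped to a later
    // number, mirroring the re-assignment in the pass.
    fn order_users(object: u32, users_of: &BTreeMap<u32, BTreeSet<u32>>) -> BTreeMap<u32, usize> {
        let mut ordering: BTreeMap<u32, usize> = BTreeMap::new();
        let mut workset: BTreeSet<u32> = BTreeSet::from([object]);
        let mut next = 1;
        while let Some(pop) = workset.pop_first() {
            for &user in users_of.get(&pop).into_iter().flatten() {
                if !ordering.contains_key(&user) {
                    workset.insert(user);
                }
                ordering.insert(user, next);
                next += 1;
            }
        }
        ordering
    }

    fn main() {
        // 0 is the object; it is used by 1, which is used by 2 and 3; 3 is
        // also used by 2, so 2 is the most downstream user.
        let users_of = BTreeMap::from([
            (0, BTreeSet::from([1])),
            (1, BTreeSet::from([2, 3])),
            (3, BTreeSet::from([2])),
        ]);
        let ordering = order_users(0, &users_of);
        assert!(ordering[&2] > ordering[&3] && ordering[&3] > ordering[&1]);
    }
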
@@ -731,11 +949,18 @@ fn materialize_clones(
                     // predecessor's bottom lattice value (phis need to be path-
                     // sensitive).
                     for (pred, arg) in zip(control_subgraph.preds(*bb), data) {
-                        let last_pt = bbs.1[pred.idx()].len();
-                        let bottom = &lattice[bb_to_prefix_sum[pred.idx()] + last_pt];
-                        if bottom.contains(arg) {
-                            induce_clone(editor, typing, *arg, *inst);
-                            any_induced = true;
+                        let bottom = phi_acc_bottoms.entry(pred).or_insert_with(|| {
+                            let last_pt = bbs.1[pred.idx()].len();
+                            let bottom = &lattice[bb_to_prefix_sum[pred.idx()] + last_pt];
+                            bottom.clone()
+                        });
+                        if bottom.contains_key(arg) {
+                            induce_clone(*arg, *inst, bottom);
+                            return true;
+                        } else {
+                            // Record this phi as a user, so that a subsequent
+                            // phi using `arg` along the same predecessor
+                            // induces a clone.
+                            bottom.insert(*arg, once(*inst).collect());
                         }
                     }
                 }
@@ -745,13 +970,13 @@ fn materialize_clones(
                     second,
                     third,
                 } => {
-                    if value.contains(&second) {
-                        induce_clone(editor, typing, second, *inst);
-                        any_induced = true;
+                    if value.contains_key(&second) {
+                        induce_clone(second, *inst, &value);
+                        return true;
                     }
-                    if value.contains(&third) {
-                        induce_clone(editor, typing, third, *inst);
-                        any_induced = true;
+                    if value.contains_key(&third) {
+                        induce_clone(third, *inst, &value);
+                        return true;
                     }
                 }
                 Node::Reduce {
@@ -759,18 +984,18 @@ fn materialize_clones(
                     init,
                     reduct: _,
                 } => {
-                    if value.contains(&init) {
-                        induce_clone(editor, typing, init, *inst);
-                        any_induced = true;
+                    if value.contains_key(&init) {
+                        induce_clone(init, *inst, &value);
+                        return true;
                     }
                 }
                 Node::Read {
                     collect,
                     indices: _,
                 } if !objects[&func_id].objects(*inst).is_empty() => {
-                    if value.contains(&collect) {
-                        induce_clone(editor, typing, collect, *inst);
-                        any_induced = true;
+                    if value.contains_key(&collect) {
+                        induce_clone(collect, *inst, &value);
+                        return true;
                     }
                 }
                 Node::Write {
@@ -778,9 +1003,9 @@ fn materialize_clones(
                     data: _,
                     indices: _,
                 } => {
-                    if value.contains(&collect) {
-                        induce_clone(editor, typing, collect, *inst);
-                        any_induced = true;
+                    if value.contains_key(&collect) {
+                        induce_clone(collect, *inst, &value);
+                        return true;
                     }
                 }
                 Node::Call {
@@ -798,10 +1023,10 @@ fn materialize_clones(
                                     || callee_objects.returned_objects().contains(&object)
                             })
                             .unwrap_or(false)
-                            && value.contains(arg)
+                            && value.contains_key(arg)
                         {
-                            induce_clone(editor, typing, *arg, *inst);
-                            any_induced = true;
+                            induce_clone(*arg, *inst, value);
+                            return true;
                         }
                     }
                 }
@@ -809,27 +1034,5 @@ fn materialize_clones(
             }
         }
     }
-    any_induced
-}
-
-/*
- * Utility to insert a clone before a use of a collection.
- */
-fn induce_clone(editor: &mut FunctionEditor, typing: &Vec<TypeID>, object: NodeID, user: NodeID) {
-    editor.edit(|mut edit| {
-        // Create the constant collection object for allocation.
-        let object_ty = typing[object.idx()];
-        let object_cons = edit.add_zero_constant(object_ty);
-        let cons_node = edit.add_node(Node::Constant { id: object_cons });
-
-        // Create the clone into the new constant collection.
-        let clone_node = edit.add_node(Node::Write {
-            collect: cons_node,
-            data: object,
-            indices: vec![].into_boxed_slice(),
-        });
-
-        // Make user use the cloned object.
-        edit.replace_all_uses_where(object, clone_node, |id| *id == user)
-    });
+    false
 }
diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs
index d0449a4a..baeaae87 100644
--- a/hercules_opt/src/pass.rs
+++ b/hercules_opt/src/pass.rs
@@ -395,7 +395,7 @@ impl PassManager {
                             &types_ref,
                             &def_uses[idx],
                         );
-                        gvn(&mut editor);
+                        gvn(&mut editor, false);
 
                         self.module.constants = constants_ref.take();
                         self.module.dynamic_constants = dynamic_constants_ref.take();
@@ -828,7 +828,11 @@ impl PassManager {
                             &types_ref,
                             &def_uses[idx],
                         );
-                        infer_parallel_reduce(&mut editor, &fork_join_maps[idx], &reduce_cycles[idx]);
+                        infer_parallel_reduce(
+                            &mut editor,
+                            &fork_join_maps[idx],
+                            &reduce_cycles[idx],
+                        );
                         infer_parallel_fork(&mut editor, &fork_join_maps[idx]);
                         infer_vectorizable(&mut editor, &fork_join_maps[idx]);
                         infer_tight_associative(&mut editor, &reduce_cycles[idx]);
diff --git a/juno_frontend/src/lib.rs b/juno_frontend/src/lib.rs
index 0dd5cdd3..075c7a45 100644
--- a/juno_frontend/src/lib.rs
+++ b/juno_frontend/src/lib.rs
@@ -201,6 +201,8 @@ pub fn compile_ir(
         pm.add_pass(hercules_opt::pass::Pass::Xdot(true));
     }
 
+    add_pass!(pm, verify, LegalizeReferenceSemantics);
+    add_verified_pass!(pm, verify, DCE);
     add_pass!(pm, verify, LegalizeReferenceSemantics);
     pm.add_pass(hercules_opt::pass::Pass::Codegen(output_dir, module_name));
     pm.run_passes();
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index a2d6cba0..d06a6498 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -27,6 +27,57 @@ fn loop_implicit_clone(input : i32) -> i32 {
   return r + 7;
 }
 
+#[entry]
+fn double_loop_implicit_clone(a : usize) -> usize {
+  for i = 0 to a {
+    let arr : i32[1];
+    for j = 0 to a {
+      arr[0] = 1;
+    }
+  }
+  return 42;
+}
+
+#[entry]
+fn tricky_loop_implicit_clone(a : usize, b : usize) -> i32 {
+  let x = 0;
+  for j = 0 to 2 {
+    for i = 0 to 5 {
+      let arr : i32[3];
+      let arr2 : i32[1];
+      if a == b {
+        arr[a] += 7;
+      } else {
+        arr[a] += 1;
+      }
+      for k = 0 to (a + b - 1) {
+        arr[a] += 2;
+        arr2[0] += 1;
+      }
+      x += arr[b];
+    }
+  }
+  return x;
+}
+
+#[entry]
+fn tricky2_loop_implicit_clone(a : usize, b : usize) -> i32 {
+  let x = 0;
+  for i = 0 to 3 {
+    let arr : i32[1];
+    if a == b {
+      arr[0] = 6;
+    } else {
+      arr[0] = 9;
+    }
+    for j = 0 to 4 {
+      arr[0] += 1;
+    }
+    x += arr[0];
+  }
+  return x;
+}
+
 #[entry]
 fn no_implicit_clone(input : i32) -> i32 {
   let arr : i32[2];
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index 45c722d7..c0adaaef 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -15,6 +15,18 @@ fn main() {
         println!("{}", output);
         assert_eq!(output, 7);
 
+        let output = double_loop_implicit_clone(3).await;
+        println!("{}", output);
+        assert_eq!(output, 42);
+
+        let output = tricky_loop_implicit_clone(2, 2).await;
+        println!("{}", output);
+        assert_eq!(output, 130);
+
+        let output = tricky2_loop_implicit_clone(2, 3).await;
+        println!("{}", output);
+        assert_eq!(output, 39);
+
         let output = no_implicit_clone(4).await;
         println!("{}", output);
         assert_eq!(output, 13);
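For reference, the expected values above follow directly from the new Juno samples: `double_loop_implicit_clone` only exercises the clones and returns 42; `tricky_loop_implicit_clone(2, 2)` takes the `a == b` branch, so each of the 2 * 5 inner iterations computes arr[2] = 7 + 3 * 2 = 13 and x accumulates to 10 * 13 = 130; `tricky2_loop_implicit_clone(2, 3)` takes the else branch, so each of the 3 outer iterations computes arr[0] = 9 + 4 = 13 and x accumulates to 3 * 13 = 39. In `tricky_loop_implicit_clone`, the `+=` updates start from a freshly zeroed `arr` on every `i` iteration, which is exactly the loop induced clone this patch materializes.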
diff --git a/juno_samples/matmul/src/matmul.jn b/juno_samples/matmul/src/matmul.jn
index 775bb382..f59cc215 100644
--- a/juno_samples/matmul/src/matmul.jn
+++ b/juno_samples/matmul/src/matmul.jn
@@ -20,10 +20,8 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]
   
   for bi = 0 to n / 64 {
     for bk = 0 to l / 64 {
-      // TODO: make these all the same size, clone analysis should undo GVN's
-      // combining of these three arrays.
-      let atile : i32[66, 64];
-      let btile : i32[65, 64];
+      let atile : i32[64, 64];
+      let btile : i32[64, 64];
       let ctile : i32[64, 64];
 
       for tile_idx = 0 to m / 64 {
@@ -31,11 +29,6 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]
 	  for tk = 0 to 64 {
 	    atile[ti, tk] = a[bi * 64 + ti, tile_idx * 64 + tk];
 	    btile[ti, tk] = b[tile_idx * 64 + ti, bk * 64 + tk];
-	    // TODO: remove setting ctile to zero explicitly, clone analysis
-	    // should see a lack of a phi for ctile in the block loops and
-	    // induce a copy of an initial value of ctile (all zeros) on each
-	    // iteration of the block loops.
-	    ctile[ti, tk] = 0;
 	  }
 	}
         for ti = 0 to 64 {
-- 
GitLab