From 5fa92052468fbe317c0c8b54921972e7a3ff7160 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 6 Jan 2025 11:21:05 -0800
Subject: [PATCH 01/14] Fix handling of clones with phis

---
 hercules_opt/src/legalize_reference_semantics.rs | 15 +++++++++++++--
 juno_samples/matmul/src/matmul.jn                |  6 ++----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index 254524f9..cd7a3d5a 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -720,6 +720,10 @@ fn materialize_clones(
     let nodes = nodes.clone();
     for bb in rev_po.iter() {
         let insts = &bbs.1[bb.idx()];
+        // Accumulate predecessor bottom used sets for phis. Phis are special in
+        // that they need to be path sensitive, but multiple phis in a single
+        // block may use a single collection, and that needs to induce a clone.
+        let mut phi_acc_bottoms = HashMap::new();
         for (prev_pt, inst) in insts.iter().enumerate() {
             let value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
             match nodes[inst.idx()] {
@@ -731,11 +735,18 @@ fn materialize_clones(
                     // predecessor's bottom lattice value (phis need to be path-
                     // sensitive).
                     for (pred, arg) in zip(control_subgraph.preds(*bb), data) {
-                        let last_pt = bbs.1[pred.idx()].len();
-                        let bottom = &lattice[bb_to_prefix_sum[pred.idx()] + last_pt];
+                        let bottom = phi_acc_bottoms.entry(pred).or_insert_with(|| {
+                            let last_pt = bbs.1[pred.idx()].len();
+                            let bottom = &lattice[bb_to_prefix_sum[pred.idx()] + last_pt];
+                            bottom.clone()
+                        });
                         if bottom.contains(arg) {
                             induce_clone(editor, typing, *arg, *inst);
                             any_induced = true;
+                        } else {
+                            // Subsequent phis using `arg` along the same
+                            // predecessor induce a clone.
+                            bottom.insert(*arg);
                         }
                     }
                 }
diff --git a/juno_samples/matmul/src/matmul.jn b/juno_samples/matmul/src/matmul.jn
index 775bb382..16a262b9 100644
--- a/juno_samples/matmul/src/matmul.jn
+++ b/juno_samples/matmul/src/matmul.jn
@@ -20,10 +20,8 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]
   
   for bi = 0 to n / 64 {
     for bk = 0 to l / 64 {
-      // TODO: make these all the same size, clone analysis should undo GVN's
-      // combining of these three arrays.
-      let atile : i32[66, 64];
-      let btile : i32[65, 64];
+      let atile : i32[64, 64];
+      let btile : i32[64, 64];
       let ctile : i32[64, 64];
 
       for tile_idx = 0 to m / 64 {
-- 
GitLab


From a4b47f937a6e4d888251bb507a7b94ec0a3629c4 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 6 Jan 2025 15:40:34 -0800
Subject: [PATCH 02/14] Do enough analysis to identify loop-induced clones

---
 .../src/legalize_reference_semantics.rs       | 200 ++++++++++--------
 1 file changed, 107 insertions(+), 93 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index cd7a3d5a..b3a106d5 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -2,9 +2,8 @@ extern crate bitvec;
 extern crate hercules_cg;
 extern crate hercules_ir;
 
-use std::collections::{BTreeSet, HashMap, VecDeque};
+use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
 use std::iter::{empty, once, zip, FromIterator};
-use std::mem::take;
 
 use self::bitvec::prelude::*;
 
@@ -70,7 +69,7 @@ pub fn legalize_reference_semantics(
     ) {
         Ok(bbs) => bbs,
         Err((obj, reader)) => {
-            induce_clone(editor, typing, obj, reader);
+            todo!();
             return None;
         }
     };
@@ -543,7 +542,8 @@ fn materialize_clones(
     // This is the only place this form is used, so just hardcode it here.
     //
     // This forward dataflow analysis tracks which collections are used at each
-    // program point. Collections are referred to using node IDs. Specifically:
+    // program point, and by what user nodes. Collections are referred to using
+    // node IDs. Specifically:
     //
     // - Phi - a phi node adds its inputs to the used set and removes itself
     //   from the used set. If a phi uses an ID that is used along the edge of
@@ -567,11 +567,11 @@ fn materialize_clones(
     //   to the used set and removes itself from the used set. If any mutated
     //   input is already used, a clone is induced.
     //
-    // Reads of sub-collections (select, read, and call nodes) use a collection
-    // because they may have downstream writes that depend on the new "view" of
-    // the same collection. This does not include reads that "end" (the `data`
-    // input of a write). This analysis does not consider parallel mutations in
-    // fork-joins, which are handled separately later in this function.
+    // Reads of sub-collections (select, some read, and some call nodes) use a
+    // collection because they may have downstream writes that depend on the new
+    // "sub-view" of the same collection. This does not include reads that "end"
+    // (most reads, some calls, the `data` input of a write). This analysis does
+    // not consider parallel mutations in fork-joins.
     let rev_po = control_subgraph.rev_po(NodeID::new(0));
     let mut total_num_pts = 0;
     let mut bb_to_prefix_sum = vec![0; bbs.0.len()];
@@ -581,82 +581,85 @@ fn materialize_clones(
             total_num_pts += insts.len() + 1;
         }
     }
-    // Lattice maps each program point to a set of used values. Top is that no
-    // nodes are used yet.
+    // Lattice maps each program point to a set of used values and their
+    // possible users. Top is that no nodes are used yet.
     let nodes = &editor.func().nodes;
     let func_id = editor.func_id();
-    let mut lattice: Vec<BTreeSet<NodeID>> = vec![BTreeSet::new(); total_num_pts];
+    let meet = |left: &mut BTreeMap<NodeID, BTreeSet<NodeID>>,
+                right: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
+        for (used, users) in right.into_iter() {
+            left.entry(*used).or_default().extend(users.into_iter());
+        }
+    };
+    let mut lattice: Vec<BTreeMap<NodeID, BTreeSet<NodeID>>> = vec![BTreeMap::new(); total_num_pts];
     loop {
         let mut changed = false;
 
         for bb in rev_po.iter() {
             // The lattice value of the first point is the meet of the
             // predecessor terminating lattice values.
-            let mut top_value = take(&mut lattice[bb_to_prefix_sum[bb.idx()]]);
+            let old_top_value = &lattice[bb_to_prefix_sum[bb.idx()]];
+            let mut new_top_value = BTreeMap::new();
             // Clearing `top_value` is not necessary since used nodes are never
             // removed from lattice values, only added.
             for pred in control_subgraph.preds(*bb) {
-                // It should not be possible in Hercules IR for a basic block to
-                // be one of its own predecessors.
-                assert_ne!(*bb, pred);
                 let last_pt = bbs.1[pred.idx()].len();
-                for elem in lattice[bb_to_prefix_sum[pred.idx()] + last_pt].iter() {
-                    changed |= top_value.insert(*elem);
-                }
+                meet(
+                    &mut new_top_value,
+                    &lattice[bb_to_prefix_sum[pred.idx()] + last_pt],
+                );
             }
-            lattice[bb_to_prefix_sum[bb.idx()]] = top_value;
+            changed |= *old_top_value != new_top_value;
+            lattice[bb_to_prefix_sum[bb.idx()]] = new_top_value;
 
             // The lattice value of following points are determined by their
             // immediate preceding instructions.
             let insts = &bbs.1[bb.idx()];
             for (prev_pt, inst) in insts.iter().enumerate() {
-                let mut new_value = take(&mut lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1]);
+                let old_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1];
                 let prev_value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
+                let mut new_value = prev_value.clone();
                 match nodes[inst.idx()] {
                     Node::Phi {
                         control: _,
                         ref data,
                     } if !objects[&func_id].objects(*inst).is_empty() => {
                         for elem in data {
-                            changed |= new_value.insert(*elem);
+                            new_value.entry(*elem).or_default().insert(*inst);
                         }
-                        changed |= new_value.remove(inst);
+                        new_value.remove(inst);
                     }
                     Node::Ternary {
                         op: TernaryOperator::Select,
                         first: _,
                         second,
                         third,
+                    }
+                    | Node::Reduce {
+                        control: _,
+                        init: second,
+                        reduct: third,
                     } => {
                         if !objects[&func_id].objects(*inst).is_empty() {
-                            changed |= new_value.insert(second);
-                            changed |= new_value.insert(third);
-                            changed |= new_value.remove(inst);
+                            new_value.entry(second).or_default().insert(*inst);
+                            new_value.entry(third).or_default().insert(*inst);
+                            new_value.remove(inst);
                         }
                     }
-                    Node::Reduce {
-                        control: _,
-                        init,
-                        reduct,
-                    } if !objects[&func_id].objects(*inst).is_empty() => {
-                        changed |= new_value.insert(init);
-                        changed |= new_value.insert(reduct);
-                        changed |= new_value.remove(inst);
-                    }
                     Node::Read {
                         collect,
                         indices: _,
                     } if !objects[&func_id].objects(*inst).is_empty() => {
-                        changed |= new_value.insert(collect);
-                        changed |= new_value.remove(inst);
+                        new_value.entry(collect).or_default().insert(*inst);
+                        new_value.remove(inst);
                     }
                     Node::Write {
                         collect,
                         data: _,
                         indices: _,
                     } => {
-                        changed |= new_value.insert(collect);
-                        changed |= new_value.remove(inst);
+                        new_value.entry(collect).or_default().insert(*inst);
+                        new_value.remove(inst);
                     }
                     Node::Call {
                         control: _,
@@ -674,23 +677,21 @@ fn materialize_clones(
                                 })
                                 .unwrap_or(false)
                             {
-                                changed |= new_value.insert(*arg);
+                                new_value.entry(*arg).or_default().insert(*inst);
                             }
                         }
-                        changed |= new_value.remove(inst);
-                    }
-                    _ => {
-                        for elem in prev_value {
-                            changed |= new_value.insert(*elem);
-                        }
+                        new_value.remove(inst);
                     }
+                    _ => {}
                 }
+                changed |= *old_value != new_value;
                 lattice[bb_to_prefix_sum[bb.idx()] + prev_pt + 1] = new_value;
             }
 
             // Handle reduces in this block specially at the very end.
             let last_pt = insts.len();
-            let mut bottom_value = take(&mut lattice[bb_to_prefix_sum[bb.idx()] + last_pt]);
+            let old_bottom_value = &lattice[bb_to_prefix_sum[bb.idx()] + last_pt];
+            let mut new_bottom_value = old_bottom_value.clone();
             for inst in insts.iter() {
                 if let Node::Reduce {
                     control: _,
@@ -699,13 +700,14 @@ fn materialize_clones(
                 } = nodes[inst.idx()]
                 {
                     assert!(
-                        bottom_value.contains(&reduct),
+                        new_bottom_value.contains_key(&reduct),
                         "PANIC: Can't handle clones inside a reduction cycle currently."
                     );
-                    changed |= bottom_value.remove(inst);
+                    new_bottom_value.remove(inst);
                 }
             }
-            lattice[bb_to_prefix_sum[bb.idx()] + last_pt] = bottom_value;
+            changed |= *old_bottom_value != new_bottom_value;
+            lattice[bb_to_prefix_sum[bb.idx()] + last_pt] = new_bottom_value;
         }
 
         if !changed {
@@ -715,15 +717,17 @@ fn materialize_clones(
 
     // Now that we've computed the used collections dataflow analysis, use the
     // results to materialize a clone whenever a node attempts to use an already
-    // used node.
-    let mut any_induced = false;
+    // used node. As soon as any clone is found, return since that clone needs
+    // to get placed before other clones can be discovered. Traverse blocks in
+    // postorder so that clones inside loops are discovered before loop-induced
+    // clones.
     let nodes = nodes.clone();
-    for bb in rev_po.iter() {
+    for bb in rev_po.iter().rev() {
         let insts = &bbs.1[bb.idx()];
         // Accumulate predecessor bottom used sets for phis. Phis are special in
         // that they need to be path sensitive, but multiple phis in a single
         // block may use a single collection, and that needs to induce a clone.
-        let mut phi_acc_bottoms = HashMap::new();
+        let mut phi_acc_bottoms = BTreeMap::new();
         for (prev_pt, inst) in insts.iter().enumerate() {
             let value = &lattice[bb_to_prefix_sum[bb.idx()] + prev_pt];
             match nodes[inst.idx()] {
@@ -740,13 +744,13 @@ fn materialize_clones(
                             let bottom = &lattice[bb_to_prefix_sum[pred.idx()] + last_pt];
                             bottom.clone()
                         });
-                        if bottom.contains(arg) {
-                            induce_clone(editor, typing, *arg, *inst);
-                            any_induced = true;
+                        if let Some(other_users) = bottom.get(&arg) {
+                            induce_clone(editor, typing, *arg, *inst, other_users);
+                            return true;
                         } else {
                             // Subsequent phis using `arg` along the same
                             // predecessor induce a clone.
-                            bottom.insert(*arg);
+                            bottom.insert(*arg, once(*inst).collect());
                         }
                     }
                 }
@@ -756,13 +760,13 @@ fn materialize_clones(
                     second,
                     third,
                 } => {
-                    if value.contains(&second) {
-                        induce_clone(editor, typing, second, *inst);
-                        any_induced = true;
+                    if let Some(other_users) = value.get(&second) {
+                        induce_clone(editor, typing, second, *inst, other_users);
+                        return true;
                     }
-                    if value.contains(&third) {
-                        induce_clone(editor, typing, third, *inst);
-                        any_induced = true;
+                    if let Some(other_users) = value.get(&third) {
+                        induce_clone(editor, typing, third, *inst, other_users);
+                        return true;
                     }
                 }
                 Node::Reduce {
@@ -770,18 +774,18 @@ fn materialize_clones(
                     init,
                     reduct: _,
                 } => {
-                    if value.contains(&init) {
-                        induce_clone(editor, typing, init, *inst);
-                        any_induced = true;
+                    if let Some(other_users) = value.get(&init) {
+                        induce_clone(editor, typing, init, *inst, other_users);
+                        return true;
                     }
                 }
                 Node::Read {
                     collect,
                     indices: _,
                 } if !objects[&func_id].objects(*inst).is_empty() => {
-                    if value.contains(&collect) {
-                        induce_clone(editor, typing, collect, *inst);
-                        any_induced = true;
+                    if let Some(other_users) = value.get(&collect) {
+                        induce_clone(editor, typing, collect, *inst, other_users);
+                        return true;
                     }
                 }
                 Node::Write {
@@ -789,9 +793,9 @@ fn materialize_clones(
                     data: _,
                     indices: _,
                 } => {
-                    if value.contains(&collect) {
-                        induce_clone(editor, typing, collect, *inst);
-                        any_induced = true;
+                    if let Some(other_users) = value.get(&collect) {
+                        induce_clone(editor, typing, collect, *inst, other_users);
+                        return true;
                     }
                 }
                 Node::Call {
@@ -809,10 +813,10 @@ fn materialize_clones(
                                     || callee_objects.returned_objects().contains(&object)
                             })
                             .unwrap_or(false)
-                            && value.contains(arg)
+                            && let Some(other_users) = value.get(arg)
                         {
-                            induce_clone(editor, typing, *arg, *inst);
-                            any_induced = true;
+                            induce_clone(editor, typing, *arg, *inst, other_users);
+                            return true;
                         }
                     }
                 }
@@ -820,27 +824,37 @@ fn materialize_clones(
             }
         }
     }
-    any_induced
+    false
 }
 
 /*
  * Utility to insert a clone before a use of a collection.
  */
-fn induce_clone(editor: &mut FunctionEditor, typing: &Vec<TypeID>, object: NodeID, user: NodeID) {
-    editor.edit(|mut edit| {
-        // Create the constant collection object for allocation.
-        let object_ty = typing[object.idx()];
-        let object_cons = edit.add_zero_constant(object_ty);
-        let cons_node = edit.add_node(Node::Constant { id: object_cons });
-
-        // Create the clone into the new constant collection.
-        let clone_node = edit.add_node(Node::Write {
-            collect: cons_node,
-            data: object,
-            indices: vec![].into_boxed_slice(),
+fn induce_clone(
+    editor: &mut FunctionEditor,
+    typing: &Vec<TypeID>,
+    object: NodeID,
+    user: NodeID,
+    other_users: &BTreeSet<NodeID>,
+) {
+    if other_users.contains(&user) {
+        editor.edit(|mut edit| {
+            // Create the constant collection object for allocation.
+            let object_ty = typing[object.idx()];
+            let object_cons = edit.add_zero_constant(object_ty);
+            let cons_node = edit.add_node(Node::Constant { id: object_cons });
+
+            // Create the clone into the new constant collection.
+            let clone_node = edit.add_node(Node::Write {
+                collect: cons_node,
+                data: object,
+                indices: vec![].into_boxed_slice(),
+            });
+
+            // Make user use the cloned object.
+            edit.replace_all_uses_where(object, clone_node, |id| *id == user)
         });
-
-        // Make user use the cloned object.
-        edit.replace_all_uses_where(object, clone_node, |id| *id == user)
-    });
+    } else {
+        todo!()
+    }
 }
-- 
GitLab


From b01a31f3b3134cd1c40a6fa272441b6b7210c2cb Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 6 Jan 2025 22:27:43 -0800
Subject: [PATCH 03/14] Handle cloning writes in loops

---
 hercules_ir/src/dom.rs                        |  20 +++
 .../src/legalize_reference_semantics.rs       | 170 +++++++++++++-----
 juno_frontend/src/lib.rs                      |   2 +
 3 files changed, 145 insertions(+), 47 deletions(-)

diff --git a/hercules_ir/src/dom.rs b/hercules_ir/src/dom.rs
index f9046b43..2c0f085d 100644
--- a/hercules_ir/src/dom.rs
+++ b/hercules_ir/src/dom.rs
@@ -77,6 +77,26 @@ impl DomTree {
         .1
     }
 
+    /*
+     * Find the node with the shallowest level in the dom tree amongst the nodes
+     * given.
+     */
+    pub fn shallowest_amongst<I>(&self, x: I) -> NodeID
+    where
+        I: Iterator<Item = NodeID>,
+    {
+        x.map(|x| {
+            if x == self.root {
+                (0, x)
+            } else {
+                (self.idom[&x].0, x)
+            }
+        })
+        .min_by(|x, y| x.0.cmp(&y.0))
+        .unwrap()
+        .1
+    }
+
     /*
      * Find the least common ancestor in the tree of two nodes. This is an
      * ancestor of the two nodes that is as far down the tree as possible.
diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index b3a106d5..ae5b5c3f 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -73,7 +73,7 @@ pub fn legalize_reference_semantics(
             return None;
         }
     };
-    if materialize_clones(editor, typing, control_subgraph, objects, &bbs) {
+    if materialize_clones(editor, typing, control_subgraph, dom, objects, &bbs) {
         None
     } else {
         Some(bbs)
@@ -533,6 +533,7 @@ fn materialize_clones(
     editor: &mut FunctionEditor,
     typing: &Vec<TypeID>,
     control_subgraph: &Subgraph,
+    dom: &DomTree,
     objects: &CollectionObjects,
     bbs: &BasicBlocks,
 ) -> bool {
@@ -722,6 +723,113 @@ fn materialize_clones(
     // postorder so that clones inside loops are discovered before loop-induced
     // clones.
     let nodes = nodes.clone();
+    let mut induce_clone = |object: NodeID,
+                            user: NodeID,
+                            value: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
+        if !value[&object].contains(&user) {
+            editor.edit(|mut edit| {
+                // Create the constant collection object for allocation.
+                let object_ty = typing[object.idx()];
+                let object_cons = edit.add_zero_constant(object_ty);
+                let cons_node = edit.add_node(Node::Constant { id: object_cons });
+
+                // Create the clone into the new constant collection.
+                let clone_node = edit.add_node(Node::Write {
+                    collect: cons_node,
+                    data: object,
+                    indices: vec![].into_boxed_slice(),
+                });
+
+                // Make user use the cloned object.
+                edit.replace_all_uses_where(object, clone_node, |id| *id == user)
+            });
+        } else {
+            // Figure out the downstream users of `user` that we need to phi
+            // back upwards.
+            let mut user_iterated_users: BTreeSet<NodeID> = BTreeSet::new();
+            let mut workset = BTreeSet::new();
+            workset.insert(user);
+            while let Some(pop) = workset.pop_first() {
+                user_iterated_users.insert(pop);
+                workset.extend(
+                    value
+                        .get(&pop)
+                        .map(|users| users.into_iter())
+                        .into_iter()
+                        .flatten(),
+                );
+            }
+
+            // Figure out which downstream users aren't themselves used -
+            // these are the nodes that get cycled through the phi.
+            let fringe_users: BTreeSet<NodeID> = user_iterated_users
+                .into_iter()
+                .filter(|user| !value.contains_key(&user))
+                .collect();
+
+            // Figure out where to place that phi. This is the shallowest
+            // node in the domtree where `user` is responsible for making
+            // `object` used at the top of the block, and the block
+            // dominates the block containing `user`.
+            let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
+                lattice[bb_to_prefix_sum[bb.idx()]]
+                    .get(&object)
+                    .unwrap_or(&BTreeSet::new())
+                    .contains(&user)
+                    && dom.does_dom(*bb, bbs.0[user.idx()])
+            });
+            let top_block = dom.shallowest_amongst(eligible_blocks);
+            assert!(editor.func().nodes[top_block.idx()].is_region());
+
+            // Figure out which fringe users, if any, dominate each
+            // predecessor of the top block. If none dominate a predecessor,
+            // then the input for that predecessor is the new collection
+            // constant.
+            let mut fringe_user_per_pred = vec![None; control_subgraph.preds(top_block).count()];
+            for fringe_user in fringe_users {
+                for (idx, pred) in control_subgraph.preds(top_block).enumerate() {
+                    if dom.does_dom(bbs.0[fringe_user.idx()], pred) {
+                        assert!(fringe_user_per_pred[idx].is_none(), "PANIC: Multiple fringe users of a cloning write in a loop dominate the same predecessor of the loop header. This shouldn't be possible!");
+                        fringe_user_per_pred[idx] = Some(fringe_user);
+                    }
+                }
+            }
+            assert!(fringe_user_per_pred.iter().any(|user|user.is_none()), "PANIC: No predecessor of the loop header containing a cloning write has no fringe user of that write. This shouldn't be possible!");
+
+            // Induce the clone.
+            editor.edit(|mut edit| {
+                // Create the constant collection object for allocation.
+                let object_ty = typing[object.idx()];
+                let object_cons = edit.add_zero_constant(object_ty);
+                let cons_node = edit.add_node(Node::Constant { id: object_cons });
+
+                // Create the phi.
+                let phi_node = edit.add_node(Node::Phi {
+                    control: top_block,
+                    data: fringe_user_per_pred
+                        .into_iter()
+                        .map(|user| {
+                            if let Some(user) = user {
+                                user
+                            } else {
+                                cons_node
+                            }
+                        })
+                        .collect(),
+                });
+
+                // Create the clone into the phi.
+                let clone_node = edit.add_node(Node::Write {
+                    collect: phi_node,
+                    data: object,
+                    indices: vec![].into_boxed_slice(),
+                });
+
+                // Make user use the cloned object.
+                edit.replace_all_uses_where(object, clone_node, |id| *id == user)
+            });
+        }
+    };
     for bb in rev_po.iter().rev() {
         let insts = &bbs.1[bb.idx()];
         // Accumulate predecessor bottom used sets for phis. Phis are special in
@@ -744,8 +852,8 @@ fn materialize_clones(
                             let bottom = &lattice[bb_to_prefix_sum[pred.idx()] + last_pt];
                             bottom.clone()
                         });
-                        if let Some(other_users) = bottom.get(&arg) {
-                            induce_clone(editor, typing, *arg, *inst, other_users);
+                        if bottom.contains_key(&arg) {
+                            induce_clone(*arg, *inst, bottom);
                             return true;
                         } else {
                             // Subsequent phis using `arg` along the same
@@ -760,12 +868,12 @@ fn materialize_clones(
                     second,
                     third,
                 } => {
-                    if let Some(other_users) = value.get(&second) {
-                        induce_clone(editor, typing, second, *inst, other_users);
+                    if value.contains_key(&second) {
+                        induce_clone(second, *inst, &value);
                         return true;
                     }
-                    if let Some(other_users) = value.get(&third) {
-                        induce_clone(editor, typing, third, *inst, other_users);
+                    if value.contains_key(&third) {
+                        induce_clone(third, *inst, &value);
                         return true;
                     }
                 }
@@ -774,8 +882,8 @@ fn materialize_clones(
                     init,
                     reduct: _,
                 } => {
-                    if let Some(other_users) = value.get(&init) {
-                        induce_clone(editor, typing, init, *inst, other_users);
+                    if value.contains_key(&init) {
+                        induce_clone(init, *inst, &value);
                         return true;
                     }
                 }
@@ -783,8 +891,8 @@ fn materialize_clones(
                     collect,
                     indices: _,
                 } if !objects[&func_id].objects(*inst).is_empty() => {
-                    if let Some(other_users) = value.get(&collect) {
-                        induce_clone(editor, typing, collect, *inst, other_users);
+                    if value.contains_key(&collect) {
+                        induce_clone(collect, *inst, &value);
                         return true;
                     }
                 }
@@ -793,8 +901,8 @@ fn materialize_clones(
                     data: _,
                     indices: _,
                 } => {
-                    if let Some(other_users) = value.get(&collect) {
-                        induce_clone(editor, typing, collect, *inst, other_users);
+                    if value.contains_key(&collect) {
+                        induce_clone(collect, *inst, &value);
                         return true;
                     }
                 }
@@ -813,9 +921,9 @@ fn materialize_clones(
                                     || callee_objects.returned_objects().contains(&object)
                             })
                             .unwrap_or(false)
-                            && let Some(other_users) = value.get(arg)
+                            && value.contains_key(arg)
                         {
-                            induce_clone(editor, typing, *arg, *inst, other_users);
+                            induce_clone(*arg, *inst, value);
                             return true;
                         }
                     }
@@ -826,35 +934,3 @@ fn materialize_clones(
     }
     false
 }
-
-/*
- * Utility to insert a clone before a use of a collection.
- */
-fn induce_clone(
-    editor: &mut FunctionEditor,
-    typing: &Vec<TypeID>,
-    object: NodeID,
-    user: NodeID,
-    other_users: &BTreeSet<NodeID>,
-) {
-    if other_users.contains(&user) {
-        editor.edit(|mut edit| {
-            // Create the constant collection object for allocation.
-            let object_ty = typing[object.idx()];
-            let object_cons = edit.add_zero_constant(object_ty);
-            let cons_node = edit.add_node(Node::Constant { id: object_cons });
-
-            // Create the clone into the new constant collection.
-            let clone_node = edit.add_node(Node::Write {
-                collect: cons_node,
-                data: object,
-                indices: vec![].into_boxed_slice(),
-            });
-
-            // Make user use the cloned object.
-            edit.replace_all_uses_where(object, clone_node, |id| *id == user)
-        });
-    } else {
-        todo!()
-    }
-}
diff --git a/juno_frontend/src/lib.rs b/juno_frontend/src/lib.rs
index 0dd5cdd3..075c7a45 100644
--- a/juno_frontend/src/lib.rs
+++ b/juno_frontend/src/lib.rs
@@ -201,6 +201,8 @@ pub fn compile_ir(
         pm.add_pass(hercules_opt::pass::Pass::Xdot(true));
     }
 
+    add_pass!(pm, verify, LegalizeReferenceSemantics);
+    add_verified_pass!(pm, verify, DCE);
     add_pass!(pm, verify, LegalizeReferenceSemantics);
     pm.add_pass(hercules_opt::pass::Pass::Codegen(output_dir, module_name));
     pm.run_passes();
-- 
GitLab


From 7041352d9e4a45e044ae9a244292ccb23567507c Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 6 Jan 2025 22:35:42 -0800
Subject: [PATCH 04/14] Fix how top block is picked

---
 hercules_opt/src/legalize_reference_semantics.rs | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index ae5b5c3f..3ee393c0 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -73,7 +73,7 @@ pub fn legalize_reference_semantics(
             return None;
         }
     };
-    if materialize_clones(editor, typing, control_subgraph, dom, objects, &bbs) {
+    if materialize_clones(editor, typing, control_subgraph, dom, loops, objects, &bbs) {
         None
     } else {
         Some(bbs)
@@ -534,6 +534,7 @@ fn materialize_clones(
     typing: &Vec<TypeID>,
     control_subgraph: &Subgraph,
     dom: &DomTree,
+    loops: &LoopTree,
     objects: &CollectionObjects,
     bbs: &BasicBlocks,
 ) -> bool {
@@ -767,18 +768,21 @@ fn materialize_clones(
                 .filter(|user| !value.contains_key(&user))
                 .collect();
 
-            // Figure out where to place that phi. This is the shallowest
-            // node in the domtree where `user` is responsible for making
-            // `object` used at the top of the block, and the block
-            // dominates the block containing `user`.
+            // Figure out where to place that phi. This is the deepest
+            // loop header where `user` is responsible for making `object` used
+            // used at the top of the block, and the block dominates the block
+            // containing `user`.
             let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
                 lattice[bb_to_prefix_sum[bb.idx()]]
                     .get(&object)
                     .unwrap_or(&BTreeSet::new())
                     .contains(&user)
                     && dom.does_dom(*bb, bbs.0[user.idx()])
+                    && loops.contains(*bb)
             });
-            let top_block = dom.shallowest_amongst(eligible_blocks);
+            let top_block = eligible_blocks
+                .max_by_key(|bb| loops.nesting(*bb).unwrap())
+                .unwrap();
             assert!(editor.func().nodes[top_block.idx()].is_region());
 
             // Figure out which fringe users, if any, dominate each
-- 
GitLab


From b8dcc346362e83a479e1eac12b217509374f2c54 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Mon, 6 Jan 2025 22:57:34 -0800
Subject: [PATCH 05/14] Fixes for double-loop clone

---
 hercules_opt/src/legalize_reference_semantics.rs  | 10 ++++++++--
 juno_samples/implicit_clone/src/implicit_clone.jn | 13 +++++++++++++
 juno_samples/implicit_clone/src/main.rs           |  4 ++++
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index 3ee393c0..b124aecf 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -222,6 +222,7 @@ fn basic_blocks(
                             || root_block_iterated_users.contains(&mutator))
                         && mutating_objects(function, func_id, *mutator, objects)
                             .any(|mutated| read_objs.contains(&mutated))
+                        && id != mutator
                     {
                         antideps.insert((*id, *mutator));
                     }
@@ -757,7 +758,10 @@ fn materialize_clones(
                         .get(&pop)
                         .map(|users| users.into_iter())
                         .into_iter()
-                        .flatten(),
+                        .flatten()
+                        .filter(|iterated_user| {
+                            dom.does_dom(bbs.0[user.idx()], bbs.0[iterated_user.idx()])
+                        }),
                 );
             }
 
@@ -771,7 +775,8 @@ fn materialize_clones(
             // Figure out where to place that phi. This is the deepest
             // loop header where `user` is responsible for making `object` used
             // used at the top of the block, and the block dominates the block
-            // containing `user`.
+            // containing `user`. If `user` is a phi, then the region it's
+            // attached to is excluded from eligibility.
             let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
                 lattice[bb_to_prefix_sum[bb.idx()]]
                     .get(&object)
@@ -779,6 +784,7 @@ fn materialize_clones(
                     .contains(&user)
                     && dom.does_dom(*bb, bbs.0[user.idx()])
                     && loops.contains(*bb)
+                    && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
             });
             let top_block = eligible_blocks
                 .max_by_key(|bb| loops.nesting(*bb).unwrap())
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index a2d6cba0..a4a28559 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -27,6 +27,19 @@ fn loop_implicit_clone(input : i32) -> i32 {
   return r + 7;
 }
 
+#[entry]
+fn double_loop_implicit_clone(a : usize, b : usize) -> i32 {
+  let x = 0;
+  for j = 0 to 2 {
+    for i = 0 to 5 {
+      let arr : i32[3];
+      arr[a] += 7;
+      x += arr[b];
+    }
+  }
+  return x / 2;
+}
+
 #[entry]
 fn no_implicit_clone(input : i32) -> i32 {
   let arr : i32[2];
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index 45c722d7..036e4edb 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -15,6 +15,10 @@ fn main() {
         println!("{}", output);
         assert_eq!(output, 7);
 
+        let output = double_loop_implicit_clone(2, 2).await;
+        println!("{}", output);
+        assert_eq!(output, 35);
+
         let output = no_implicit_clone(4).await;
         println!("{}", output);
         assert_eq!(output, 13);
-- 
GitLab


From 061a191e46f5af7e896153522dbd155ba9deca8f Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 7 Jan 2025 10:57:10 -0800
Subject: [PATCH 06/14] Fix for more complex loops

---
 .../src/legalize_reference_semantics.rs       | 57 ++++++++++---------
 .../implicit_clone/src/implicit_clone.jn      |  6 +-
 2 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index b124aecf..6d41c773 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -746,32 +746,6 @@ fn materialize_clones(
                 edit.replace_all_uses_where(object, clone_node, |id| *id == user)
             });
         } else {
-            // Figure out the downstream users of `user` that we need to phi
-            // back upwards.
-            let mut user_iterated_users: BTreeSet<NodeID> = BTreeSet::new();
-            let mut workset = BTreeSet::new();
-            workset.insert(user);
-            while let Some(pop) = workset.pop_first() {
-                user_iterated_users.insert(pop);
-                workset.extend(
-                    value
-                        .get(&pop)
-                        .map(|users| users.into_iter())
-                        .into_iter()
-                        .flatten()
-                        .filter(|iterated_user| {
-                            dom.does_dom(bbs.0[user.idx()], bbs.0[iterated_user.idx()])
-                        }),
-                );
-            }
-
-            // Figure out which downstream users aren't themselves used -
-            // these are the nodes that get cycled through the phi.
-            let fringe_users: BTreeSet<NodeID> = user_iterated_users
-                .into_iter()
-                .filter(|user| !value.contains_key(&user))
-                .collect();
-
             // Figure out where to place that phi. This is the deepest
             // loop header where `user` is responsible for making `object` used
             // used at the top of the block, and the block dominates the block
@@ -791,6 +765,33 @@ fn materialize_clones(
                 .unwrap();
             assert!(editor.func().nodes[top_block.idx()].is_region());
 
+            // `user` may just be the canary in the coal mine - there may be
+            // multiple users of `object` in this loop. Find all of them.
+            let all_loop_users: BTreeSet<_> = value[&object]
+                .iter()
+                .map(|id| *id)
+                .filter(|user| loops.is_in_loop(top_block, bbs.0[user.idx()]))
+                .collect();
+
+            // Figure out the fringe users of the users of `object` that we need
+            // to phi back upwards.
+            let mut fringe_users: BTreeSet<NodeID> = BTreeSet::new();
+            let mut workset = all_loop_users;
+            while let Some(pop) = workset.pop_first() {
+                let iterated_users: BTreeSet<_> = value
+                    .get(&pop)
+                    .map(|users| users.into_iter())
+                    .into_iter()
+                    .flatten()
+                    .filter(|iterated_user| loops.is_in_loop(top_block, bbs.0[iterated_user.idx()]))
+                    .collect();
+                if iterated_users.is_empty() {
+                    fringe_users.insert(pop);
+                } else {
+                    workset.extend(iterated_users);
+                }
+            }
+
             // Figure out which fringe users, if any, dominate each
             // predecessor of the top block. If none dominate a predecessor,
             // then the input for that predecessor is the new collection
@@ -836,7 +837,9 @@ fn materialize_clones(
                 });
 
                 // Make user use the cloned object.
-                edit.replace_all_uses_where(object, clone_node, |id| *id == user)
+                edit.replace_all_uses_where(object, clone_node, |id| {
+                    id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
+                })
             });
         }
     };
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index a4a28559..a575ae59 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -33,7 +33,11 @@ fn double_loop_implicit_clone(a : usize, b : usize) -> i32 {
   for j = 0 to 2 {
     for i = 0 to 5 {
       let arr : i32[3];
-      arr[a] += 7;
+      if a == b {
+        arr[a] += 7;
+      } else {
+        arr[a] += 1;
+      }
       x += arr[b];
     }
   }
-- 
GitLab


From 97f5bb6dc749c46f5b12aaa8835e7e0c9e9d44a7 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 7 Jan 2025 13:46:35 -0800
Subject: [PATCH 07/14] More complex loops are broken

---
 .../src/legalize_reference_semantics.rs       | 184 ++++++++++++++----
 .../implicit_clone/src/implicit_clone.jn      |  18 +-
 juno_samples/implicit_clone/src/main.rs       |   6 +-
 3 files changed, 168 insertions(+), 40 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index 6d41c773..5e80ec73 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -718,18 +718,13 @@ fn materialize_clones(
         }
     }
 
-    // Now that we've computed the used collections dataflow analysis, use the
-    // results to materialize a clone whenever a node attempts to use an already
-    // used node. As soon as any clone is found, return since that clone needs
-    // to get placed before other clones can be discovered. Traverse blocks in
-    // postorder so that clones inside loops are discovered before loop-induced
-    // clones.
+    // Helper to induce a clone when an implicit clone is identified.
     let nodes = nodes.clone();
     let mut induce_clone = |object: NodeID,
                             user: NodeID,
                             value: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
         if !value[&object].contains(&user) {
-            editor.edit(|mut edit| {
+            let success = editor.edit(|mut edit| {
                 // Create the constant collection object for allocation.
                 let object_ty = typing[object.idx()];
                 let object_cons = edit.add_zero_constant(object_ty);
@@ -745,6 +740,7 @@ fn materialize_clones(
                 // Make user use the cloned object.
                 edit.replace_all_uses_where(object, clone_node, |id| *id == user)
             });
+            assert!(success);
         } else {
             // Figure out where to place that phi. This is the deepest
             // loop header where `user` is responsible for making `object` used
@@ -758,6 +754,7 @@ fn materialize_clones(
                     .contains(&user)
                     && dom.does_dom(*bb, bbs.0[user.idx()])
                     && loops.contains(*bb)
+                    && loops.is_in_loop(*bb, bbs.0[user.idx()])
                     && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
             });
             let top_block = eligible_blocks
@@ -765,50 +762,149 @@ fn materialize_clones(
                 .unwrap();
             assert!(editor.func().nodes[top_block.idx()].is_region());
 
-            // `user` may just be the canary in the coal mine - there may be
-            // multiple users of `object` in this loop. Find all of them.
-            let all_loop_users: BTreeSet<_> = value[&object]
-                .iter()
-                .map(|id| *id)
-                .filter(|user| loops.is_in_loop(top_block, bbs.0[user.idx()]))
-                .collect();
-
-            // Figure out the fringe users of the users of `object` that we need
-            // to phi back upwards.
-            let mut fringe_users: BTreeSet<NodeID> = BTreeSet::new();
-            let mut workset = all_loop_users;
+            // Figure out the users of `object` that we need to phi back
+            // upwards. Assign each user a number indicating how far down the
+            // user chain it is, higher is farther down. This is used for
+            // picking the most downstream user later.
+            let mut users: BTreeMap<NodeID, usize> = BTreeMap::new();
+            let mut workset = BTreeSet::new();
+            workset.insert(object);
+            let mut chain_ordering = 1;
             while let Some(pop) = workset.pop_first() {
                 let iterated_users: BTreeSet<_> = value
                     .get(&pop)
                     .map(|users| users.into_iter())
                     .into_iter()
                     .flatten()
+                    .map(|id| *id)
                     .filter(|iterated_user| loops.is_in_loop(top_block, bbs.0[iterated_user.idx()]))
                     .collect();
-                if iterated_users.is_empty() {
-                    fringe_users.insert(pop);
-                } else {
-                    workset.extend(iterated_users);
+                workset.extend(&iterated_users);
+                for user in iterated_users {
+                    *users.entry(user).or_default() = chain_ordering;
+                    chain_ordering += 1;
                 }
             }
 
-            // Figure out which fringe users, if any, dominate each
-            // predecessor of the top block. If none dominate a predecessor,
-            // then the input for that predecessor is the new collection
-            // constant.
-            let mut fringe_user_per_pred = vec![None; control_subgraph.preds(top_block).count()];
-            for fringe_user in fringe_users {
-                for (idx, pred) in control_subgraph.preds(top_block).enumerate() {
-                    if dom.does_dom(bbs.0[fringe_user.idx()], pred) {
-                        assert!(fringe_user_per_pred[idx].is_none(), "PANIC: Multiple fringe users of a cloning write in a loop dominate the same predecessor of the loop header. This shouldn't be possible!");
-                        fringe_user_per_pred[idx] = Some(fringe_user);
+            // The fringe users may not dominate any predecessors of the loop
+            // header. The following is some Juno code that exposes this:
+            //
+            // fn problematic(a : size) -> i32 {
+            //   for i = 0 to a {
+            //     let arr : i32[1];
+            //     for j = 0 to a {
+            //       arr[0] = 1;
+            //     }
+            //   }
+            //   return 0;
+            // }
+            //
+            // Note that `arr` induces a clone each iteration, since its value
+            // needs to be reset to all zeros. However, it should also be noted
+            // that the most fringe user of `arr`, the write inside the inner
+            // loop, does not dominate the bottom of the outer loop. Thus, we
+            // need to insert a phi in the bottom block of the outer loop to
+            // retrieve either the write, or `arr` before the inner loop. Create
+            // a dummy phi that will be replaced with the real phi we create
+            // later. This is needed for finding downstream users below.
+            let mut dummy_phi = NodeID::new(0);
+            let success = editor.edit(|mut edit| {
+                dummy_phi = edit.add_node(Node::Phi {
+                    control: top_block,
+                    data: empty().collect(),
+                });
+                Ok(edit)
+            });
+            assert!(success);
+            users.insert(dummy_phi, 0);
+
+            // Figure out which downstream user is phi-ed back up on each
+            // predecessor. For a predecessor not in the loop, no downstream
+            // user is phi-ed back up, rather the newly allocation collection
+            // is used.
+            let mut downstream_user_per_pred =
+                vec![None; control_subgraph.preds(top_block).count()];
+            let mut added_phi_bbs: BTreeMap<NodeID, NodeID> = BTreeMap::new();
+            added_phi_bbs.insert(dummy_phi, top_block);
+            let lowest_dominating_user =
+                |bb: NodeID,
+                 users: &BTreeMap<NodeID, usize>,
+                 added_phi_bbs: &BTreeMap<NodeID, NodeID>| {
+                    users
+                        .iter()
+                        .filter(|(user, _)| {
+                            let user_bb = added_phi_bbs
+                                .get(user)
+                                .unwrap_or_else(|| &bbs.0[user.idx()]);
+                            dom.does_dom(*user_bb, bb)
+                        })
+                        .max_by_key(|(_, order)| *order)
+                        .map(|(user, _)| *user)
+                };
+            for (idx, pred) in control_subgraph.preds(top_block).enumerate() {
+                if !loops.is_in_loop(top_block, pred) {
+                    continue;
+                }
+
+                // Figure out which, if any, downstream user dominates this
+                // predecessor. If none do, traverse predecessors until reaching
+                // a region, and figure out which downstream users dominate the
+                // region - create a phi joining those users, and the phi now
+                // dominates the predecessor. This is recursive. Created phis
+                // get added to `users`, so that duplicate phis aren't made. The
+                // basic blocks of the added phis are also tracked, since they
+                // are needed for domination checking.
+                let mut downstream_user = NodeID::new(0);
+                let mut worklist = VecDeque::from(vec![pred]);
+                while let Some(bb) = worklist.pop_front() {
+                    if let Some(lowest_dominating_user) =
+                        lowest_dominating_user(bb, &users, &added_phi_bbs)
+                    {
+                        // If there is a dominating user of this block, then
+                        // nothing needs to be added - record this as the
+                        // downstream user, and the last assignment of this form
+                        // will be from the immediate predecessor of the loop
+                        // header.
+                        downstream_user = lowest_dominating_user;
+                    } else if let Some(preds_lowest_dominating_users) = control_subgraph
+                        .preds(bb)
+                        .map(|pred| lowest_dominating_user(pred, &users, &added_phi_bbs))
+                        .collect::<Option<Vec<_>>>()
+                    {
+                        // If there isn't a dominating user of this block yet,
+                        // but each predecessor has a dominating user, create a
+                        // phi over those users.
+                        assert!(preds_lowest_dominating_users.len() > 1);
+                        let success = editor.edit(|mut edit| {
+                            let phi_node = edit.add_node(Node::Phi {
+                                control: bb,
+                                data: preds_lowest_dominating_users.into_boxed_slice(),
+                            });
+                            added_phi_bbs.insert(phi_node, bb);
+                            users.insert(phi_node, chain_ordering);
+                            chain_ordering += 1;
+                            downstream_user = phi_node;
+                            Ok(edit)
+                        });
+                        assert!(success);
+                    } else {
+                        // If the predecessors don't have dominating users yet,
+                        // push them to the stack. Also push ourselves back on
+                        // the stack to be revisited.
+                        for pred in control_subgraph.preds(bb) {
+                            assert!(loops.is_in_loop(top_block, pred));
+                            worklist.push_back(pred);
+                        }
+                        worklist.push_back(bb);
                     }
                 }
+                downstream_user_per_pred[idx] = Some(downstream_user);
             }
-            assert!(fringe_user_per_pred.iter().any(|user|user.is_none()), "PANIC: No predecessor of the loop header containing a cloning write has no fringe user of that write. This shouldn't be possible!");
+            assert!(downstream_user_per_pred.iter().any(|user|user.is_none()), "PANIC: No predecessor of the loop header containing a cloning write has no downstream user of that write. This shouldn't be possible!");
+            assert!(downstream_user_per_pred.iter().any(|user|user.is_some()), "PANIC: No predecessor of the loop header containing a cloning write has a downstream user of that write. This shouldn't be possible!");
 
             // Induce the clone.
-            editor.edit(|mut edit| {
+            let success = editor.edit(|mut edit| {
                 // Create the constant collection object for allocation.
                 let object_ty = typing[object.idx()];
                 let object_cons = edit.add_zero_constant(object_ty);
@@ -817,7 +913,7 @@ fn materialize_clones(
                 // Create the phi.
                 let phi_node = edit.add_node(Node::Phi {
                     control: top_block,
-                    data: fringe_user_per_pred
+                    data: downstream_user_per_pred
                         .into_iter()
                         .map(|user| {
                             if let Some(user) = user {
@@ -837,12 +933,24 @@ fn materialize_clones(
                 });
 
                 // Make user use the cloned object.
-                edit.replace_all_uses_where(object, clone_node, |id| {
+                let edit = edit.replace_all_uses_where(object, clone_node, |id| {
                     id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
-                })
+                })?;
+
+                // Get rid of the dummy phi.
+                let edit = edit.replace_all_uses(dummy_phi, phi_node)?;
+                edit.delete_node(dummy_phi)
             });
+            assert!(success);
         }
     };
+
+    // Now that we've computed the used collections dataflow analysis, use the
+    // results to materialize a clone whenever a node attempts to use an already
+    // used node. As soon as any clone is found, return since that clone needs
+    // to get placed before other clones can be discovered. Traverse blocks in
+    // postorder so that clones inside loops are discovered before loop-induced
+    // clones.
     for bb in rev_po.iter().rev() {
         let insts = &bbs.1[bb.idx()];
         // Accumulate predecessor bottom used sets for phis. Phis are special in
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index a575ae59..7017942c 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -27,21 +27,37 @@ fn loop_implicit_clone(input : i32) -> i32 {
   return r + 7;
 }
 
+#[entry]
+fn tricky_loop_implicit_clone(a : usize) -> i32 {
+  for i = 0 to a {
+    let arr : i32[1];
+    for j = 0 to a {
+      arr[0] = 1;
+    }
+  }
+  return 42;
+}
+
 #[entry]
 fn double_loop_implicit_clone(a : usize, b : usize) -> i32 {
   let x = 0;
   for j = 0 to 2 {
     for i = 0 to 5 {
       let arr : i32[3];
+      //let arr2 : i32[1];
       if a == b {
         arr[a] += 7;
       } else {
         arr[a] += 1;
       }
+      for k = 0 to (a + b - 1) {
+        arr[a] += 2;
+	//arr2[0] += 1;
+      }
       x += arr[b];
     }
   }
-  return x / 2;
+  return x;
 }
 
 #[entry]
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index 036e4edb..508de775 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -15,9 +15,13 @@ fn main() {
         println!("{}", output);
         assert_eq!(output, 7);
 
+        let output = tricky_loop_implicit_clone(3).await;
+        println!("{}", output);
+        assert_eq!(output, 42);
+
         let output = double_loop_implicit_clone(2, 2).await;
         println!("{}", output);
-        assert_eq!(output, 35);
+        assert_eq!(output, 130);
 
         let output = no_implicit_clone(4).await;
         println!("{}", output);
-- 
GitLab


From 734c6183c663d5ef56189f446f72ac8576c403ec Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 7 Jan 2025 15:41:38 -0800
Subject: [PATCH 08/14] Fix loop induced clones

---
 .../src/legalize_reference_semantics.rs       | 184 +++++++-----------
 1 file changed, 71 insertions(+), 113 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index 5e80ec73..21a42056 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -804,104 +804,60 @@ fn materialize_clones(
             // that the most fringe user of `arr`, the write inside the inner
             // loop, does not dominate the bottom of the outer loop. Thus, we
             // need to insert a phi in the bottom block of the outer loop to
-            // retrieve either the write, or `arr` before the inner loop. Create
-            // a dummy phi that will be replaced with the real phi we create
-            // later. This is needed for finding downstream users below.
-            let mut dummy_phi = NodeID::new(0);
-            let success = editor.edit(|mut edit| {
-                dummy_phi = edit.add_node(Node::Phi {
-                    control: top_block,
-                    data: empty().collect(),
-                });
-                Ok(edit)
-            });
-            assert!(success);
-            users.insert(dummy_phi, 0);
-
-            // Figure out which downstream user is phi-ed back up on each
-            // predecessor. For a predecessor not in the loop, no downstream
-            // user is phi-ed back up, rather the newly allocation collection
-            // is used.
-            let mut downstream_user_per_pred =
-                vec![None; control_subgraph.preds(top_block).count()];
-            let mut added_phi_bbs: BTreeMap<NodeID, NodeID> = BTreeMap::new();
-            added_phi_bbs.insert(dummy_phi, top_block);
-            let lowest_dominating_user =
-                |bb: NodeID,
-                 users: &BTreeMap<NodeID, usize>,
-                 added_phi_bbs: &BTreeMap<NodeID, NodeID>| {
-                    users
-                        .iter()
-                        .filter(|(user, _)| {
-                            let user_bb = added_phi_bbs
-                                .get(user)
-                                .unwrap_or_else(|| &bbs.0[user.idx()]);
-                            dom.does_dom(*user_bb, bb)
-                        })
-                        .max_by_key(|(_, order)| *order)
-                        .map(|(user, _)| *user)
-                };
-            for (idx, pred) in control_subgraph.preds(top_block).enumerate() {
-                if !loops.is_in_loop(top_block, pred) {
-                    continue;
+            // retrieve either the write, or `arr` before the inner loop. The
+            // general version of this problem requires the following solution.
+            // Our goal is to figure out which downstream user represents
+            // `object` at each block in the loop. We first assign each block
+            // containing a user the most downstream user it contains. Then, we
+            // create a dummy phi for every region (including the header) in the
+            // loop, which is the downstream user for that block. Then, every
+            // other block is assigned the downstream user of its single
+            // predecessor. This basically amounts to recreating SSA for
+            // `object` inside the loop.
+            let mut user_per_loop_bb = BTreeMap::new();
+            let mut added_phis = BTreeMap::new();
+            let mut top_phi = NodeID::new(0);
+            // Assign existing users.
+            for (user, ordering) in users.iter() {
+                if let Some(old_user) = user_per_loop_bb.get(&bbs.0[user.idx()])
+                    && users[old_user] > *ordering
+                {
+                } else {
+                    user_per_loop_bb.insert(bbs.0[user.idx()], *user);
                 }
-
-                // Figure out which, if any, downstream user dominates this
-                // predecessor. If none do, traverse predecessors until reaching
-                // a region, and figure out which downstream users dominate the
-                // region - create a phi joining those users, and the phi now
-                // dominates the predecessor. This is recursive. Created phis
-                // get added to `users`, so that duplicate phis aren't made. The
-                // basic blocks of the added phis are also tracked, since they
-                // are needed for domination checking.
-                let mut downstream_user = NodeID::new(0);
-                let mut worklist = VecDeque::from(vec![pred]);
-                while let Some(bb) = worklist.pop_front() {
-                    if let Some(lowest_dominating_user) =
-                        lowest_dominating_user(bb, &users, &added_phi_bbs)
-                    {
-                        // If there is a dominating user of this block, then
-                        // nothing needs to be added - record this as the
-                        // downstream user, and the last assignment of this form
-                        // will be from the immediate predecessor of the loop
-                        // header.
-                        downstream_user = lowest_dominating_user;
-                    } else if let Some(preds_lowest_dominating_users) = control_subgraph
-                        .preds(bb)
-                        .map(|pred| lowest_dominating_user(pred, &users, &added_phi_bbs))
-                        .collect::<Option<Vec<_>>>()
-                    {
-                        // If there isn't a dominating user of this block yet,
-                        // but each predecessor has a dominating user, create a
-                        // phi over those users.
-                        assert!(preds_lowest_dominating_users.len() > 1);
-                        let success = editor.edit(|mut edit| {
-                            let phi_node = edit.add_node(Node::Phi {
-                                control: bb,
-                                data: preds_lowest_dominating_users.into_boxed_slice(),
-                            });
-                            added_phi_bbs.insert(phi_node, bb);
-                            users.insert(phi_node, chain_ordering);
-                            chain_ordering += 1;
-                            downstream_user = phi_node;
-                            Ok(edit)
+            }
+            // Assign dummy phis.
+            for bb in loops.nodes_in_loop(top_block) {
+                if (!user_per_loop_bb.contains_key(&bb) || bb == top_block)
+                    && editor.func().nodes[bb.idx()].is_region()
+                {
+                    let success = editor.edit(|mut edit| {
+                        let phi_node = edit.add_node(Node::Phi {
+                            control: bb,
+                            data: empty().collect(),
                         });
-                        assert!(success);
-                    } else {
-                        // If the predecessors don't have dominating users yet,
-                        // push them to the stack. Also push ourselves back on
-                        // the stack to be revisited.
-                        for pred in control_subgraph.preds(bb) {
-                            assert!(loops.is_in_loop(top_block, pred));
-                            worklist.push_back(pred);
+                        if bb != top_block || !user_per_loop_bb.contains_key(&bb) {
+                            user_per_loop_bb.insert(bb, phi_node);
                         }
-                        worklist.push_back(bb);
-                    }
+                        if bb == top_block {
+                            top_phi = phi_node;
+                        }
+                        added_phis.insert(phi_node, bb);
+                        Ok(edit)
+                    });
+                    assert!(success);
+                }
+            }
+            // Assign users for the rest of the blocks.
+            for bb in rev_po.iter().filter(|bb| loops.is_in_loop(top_block, **bb)) {
+                if !user_per_loop_bb.contains_key(&bb) {
+                    assert!(control_subgraph.preds(*bb).count() == 1);
+                    user_per_loop_bb.insert(
+                        *bb,
+                        user_per_loop_bb[&control_subgraph.preds(*bb).next().unwrap()],
+                    );
                 }
-                downstream_user_per_pred[idx] = Some(downstream_user);
             }
-            assert!(downstream_user_per_pred.iter().any(|user|user.is_none()), "PANIC: No predecessor of the loop header containing a cloning write has no downstream user of that write. This shouldn't be possible!");
-            assert!(downstream_user_per_pred.iter().any(|user|user.is_some()), "PANIC: No predecessor of the loop header containing a cloning write has a downstream user of that write. This shouldn't be possible!");
 
             // Induce the clone.
             let success = editor.edit(|mut edit| {
@@ -910,36 +866,38 @@ fn materialize_clones(
                 let object_cons = edit.add_zero_constant(object_ty);
                 let cons_node = edit.add_node(Node::Constant { id: object_cons });
 
-                // Create the phi.
-                let phi_node = edit.add_node(Node::Phi {
-                    control: top_block,
-                    data: downstream_user_per_pred
-                        .into_iter()
-                        .map(|user| {
-                            if let Some(user) = user {
-                                user
-                            } else {
-                                cons_node
-                            }
-                        })
-                        .collect(),
-                });
-
                 // Create the clone into the phi.
                 let clone_node = edit.add_node(Node::Write {
-                    collect: phi_node,
+                    collect: top_phi,
                     data: object,
                     indices: vec![].into_boxed_slice(),
                 });
 
+                // Create the phis.
+                let mut real_phis = BTreeMap::new();
+                for (dummy, bb) in added_phis {
+                    let real = edit.add_node(Node::Phi {
+                        control: bb,
+                        data: control_subgraph
+                            .preds(bb)
+                            .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
+                            .collect(),
+                    });
+                    real_phis.insert(dummy, real);
+                }
+
                 // Make user use the cloned object.
-                let edit = edit.replace_all_uses_where(object, clone_node, |id| {
+                edit = edit.replace_all_uses_where(object, clone_node, |id| {
                     id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
                 })?;
 
-                // Get rid of the dummy phi.
-                let edit = edit.replace_all_uses(dummy_phi, phi_node)?;
-                edit.delete_node(dummy_phi)
+                // Get rid of the dummy phis.
+                for (dummy, real) in real_phis {
+                    edit = edit.replace_all_uses(dummy, real)?;
+                    edit = edit.delete_node(dummy)?;
+                }
+
+                Ok(edit)
             });
             assert!(success);
         }
-- 
GitLab


From 371c2df209ee07c98f1f4023fb71a1f138cb3981 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 7 Jan 2025 16:47:42 -0800
Subject: [PATCH 09/14] DCE is needed

---
 hercules_opt/src/dce.rs                          | 3 +--
 hercules_opt/src/legalize_reference_semantics.rs | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hercules_opt/src/dce.rs b/hercules_opt/src/dce.rs
index 75268694..839e9ad1 100644
--- a/hercules_opt/src/dce.rs
+++ b/hercules_opt/src/dce.rs
@@ -6,8 +6,7 @@ use self::hercules_ir::ir::*;
 use crate::*;
 
 /*
- * Top level function to run dead code elimination. Deletes nodes by setting
- * nodes to gravestones. Works with a function already containing gravestones.
+ * Top level function to run dead code elimination.
  */
 pub fn dce(editor: &mut FunctionEditor) {
     // Create worklist (starts as all nodes).
diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index 21a42056..4c467b4f 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -900,6 +900,8 @@ fn materialize_clones(
                 Ok(edit)
             });
             assert!(success);
+
+            dce(editor);
         }
     };
 
-- 
GitLab


From 45a630a1d86d971f2864e33dd50c62b2df59d885 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 7 Jan 2025 17:34:11 -0800
Subject: [PATCH 10/14] More fixes

---
 .../src/legalize_reference_semantics.rs       | 347 +++++++++---------
 1 file changed, 178 insertions(+), 169 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index 4c467b4f..c4f58745 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -717,193 +717,202 @@ fn materialize_clones(
             break;
         }
     }
+    let mut super_value = BTreeMap::new();
+    for value in lattice.iter() {
+        meet(&mut super_value, value);
+    }
 
     // Helper to induce a clone when an implicit clone is identified.
     let nodes = nodes.clone();
-    let mut induce_clone = |object: NodeID,
-                            user: NodeID,
-                            value: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
-        if !value[&object].contains(&user) {
-            let success = editor.edit(|mut edit| {
-                // Create the constant collection object for allocation.
-                let object_ty = typing[object.idx()];
-                let object_cons = edit.add_zero_constant(object_ty);
-                let cons_node = edit.add_node(Node::Constant { id: object_cons });
-
-                // Create the clone into the new constant collection.
-                let clone_node = edit.add_node(Node::Write {
-                    collect: cons_node,
-                    data: object,
-                    indices: vec![].into_boxed_slice(),
+    let mut induce_clone =
+        |object: NodeID, user: NodeID, value: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
+            // If `user` already used `object` and tries to use it again, then the
+            // clone is a "loop induced" clone. Otherwise, it's a simple clone.
+            if !value[&object].contains(&user) {
+                let success = editor.edit(|mut edit| {
+                    // Create the constant collection object for allocation.
+                    let object_ty = typing[object.idx()];
+                    let object_cons = edit.add_zero_constant(object_ty);
+                    let cons_node = edit.add_node(Node::Constant { id: object_cons });
+
+                    // Create the clone into the new constant collection.
+                    let clone_node = edit.add_node(Node::Write {
+                        collect: cons_node,
+                        data: object,
+                        indices: vec![].into_boxed_slice(),
+                    });
+
+                    // Make user use the cloned object.
+                    edit.replace_all_uses_where(object, clone_node, |id| *id == user)
+                });
+                assert!(success);
+            } else {
+                // Figure out where to place that phi. This is the deepest
+                // loop header where `user` is responsible for making `object` used
+                // used at the top of the block, and the block dominates the block
+                // containing `user`. If `user` is a phi, then the region it's
+                // attached to is excluded from eligibility.
+                let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
+                    lattice[bb_to_prefix_sum[bb.idx()]]
+                        .get(&object)
+                        .unwrap_or(&BTreeSet::new())
+                        .contains(&user)
+                        && dom.does_dom(*bb, bbs.0[user.idx()])
+                        && loops.contains(*bb)
+                        && loops.is_in_loop(*bb, bbs.0[user.idx()])
+                        && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
                 });
+                let top_block = eligible_blocks
+                    .max_by_key(|bb| loops.nesting(*bb).unwrap())
+                    .unwrap();
+                assert!(editor.func().nodes[top_block.idx()].is_region());
 
-                // Make user use the cloned object.
-                edit.replace_all_uses_where(object, clone_node, |id| *id == user)
-            });
-            assert!(success);
-        } else {
-            // Figure out where to place that phi. This is the deepest
-            // loop header where `user` is responsible for making `object` used
-            // used at the top of the block, and the block dominates the block
-            // containing `user`. If `user` is a phi, then the region it's
-            // attached to is excluded from eligibility.
-            let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
-                lattice[bb_to_prefix_sum[bb.idx()]]
-                    .get(&object)
-                    .unwrap_or(&BTreeSet::new())
-                    .contains(&user)
-                    && dom.does_dom(*bb, bbs.0[user.idx()])
-                    && loops.contains(*bb)
-                    && loops.is_in_loop(*bb, bbs.0[user.idx()])
-                    && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
-            });
-            let top_block = eligible_blocks
-                .max_by_key(|bb| loops.nesting(*bb).unwrap())
-                .unwrap();
-            assert!(editor.func().nodes[top_block.idx()].is_region());
-
-            // Figure out the users of `object` that we need to phi back
-            // upwards. Assign each user a number indicating how far down the
-            // user chain it is, higher is farther down. This is used for
-            // picking the most downstream user later.
-            let mut users: BTreeMap<NodeID, usize> = BTreeMap::new();
-            let mut workset = BTreeSet::new();
-            workset.insert(object);
-            let mut chain_ordering = 1;
-            while let Some(pop) = workset.pop_first() {
-                let iterated_users: BTreeSet<_> = value
-                    .get(&pop)
-                    .map(|users| users.into_iter())
-                    .into_iter()
-                    .flatten()
-                    .map(|id| *id)
-                    .filter(|iterated_user| loops.is_in_loop(top_block, bbs.0[iterated_user.idx()]))
-                    .collect();
-                workset.extend(&iterated_users);
-                for user in iterated_users {
-                    *users.entry(user).or_default() = chain_ordering;
-                    chain_ordering += 1;
+                // Figure out the users of `object` that we need to phi back
+                // upwards. Assign each user a number indicating how far down the
+                // user chain it is, higher is farther down. This is used for
+                // picking the most downstream user later.
+                let mut users: BTreeMap<NodeID, usize> = BTreeMap::new();
+                let mut workset: BTreeSet<NodeID> = BTreeSet::new();
+                workset.insert(object);
+                let mut chain_ordering = 1;
+                while let Some(pop) = workset.pop_first() {
+                    let iterated_users: BTreeSet<_> = super_value
+                        .get(&pop)
+                        .map(|users| users.into_iter())
+                        .into_iter()
+                        .flatten()
+                        .map(|id| *id)
+                        .filter(|iterated_user| {
+                            loops.is_in_loop(top_block, bbs.0[iterated_user.idx()])
+                                && dom.does_dom(bbs.0[pop.idx()], bbs.0[iterated_user.idx()])
+                        })
+                        .collect();
+                    workset.extend(iterated_users.iter().filter(|id| !users.contains_key(id)));
+                    for user in iterated_users {
+                        *users.entry(user).or_default() = chain_ordering;
+                        chain_ordering += 1;
+                    }
                 }
-            }
 
-            // The fringe users may not dominate any predecessors of the loop
-            // header. The following is some Juno code that exposes this:
-            //
-            // fn problematic(a : size) -> i32 {
-            //   for i = 0 to a {
-            //     let arr : i32[1];
-            //     for j = 0 to a {
-            //       arr[0] = 1;
-            //     }
-            //   }
-            //   return 0;
-            // }
-            //
-            // Note that `arr` induces a clone each iteration, since its value
-            // needs to be reset to all zeros. However, it should also be noted
-            // that the most fringe user of `arr`, the write inside the inner
-            // loop, does not dominate the bottom of the outer loop. Thus, we
-            // need to insert a phi in the bottom block of the outer loop to
-            // retrieve either the write, or `arr` before the inner loop. The
-            // general version of this problem requires the following solution.
-            // Our goal is to figure out which downstream user represents
-            // `object` at each block in the loop. We first assign each block
-            // containing a user the most downstream user it contains. Then, we
-            // create a dummy phi for every region (including the header) in the
-            // loop, which is the downstream user for that block. Then, every
-            // other block is assigned the downstream user of its single
-            // predecessor. This basically amounts to recreating SSA for
-            // `object` inside the loop.
-            let mut user_per_loop_bb = BTreeMap::new();
-            let mut added_phis = BTreeMap::new();
-            let mut top_phi = NodeID::new(0);
-            // Assign existing users.
-            for (user, ordering) in users.iter() {
-                if let Some(old_user) = user_per_loop_bb.get(&bbs.0[user.idx()])
-                    && users[old_user] > *ordering
-                {
-                } else {
-                    user_per_loop_bb.insert(bbs.0[user.idx()], *user);
+                // The fringe users may not dominate any predecessors of the loop
+                // header. The following is some Juno code that exposes this:
+                //
+                // fn problematic(a : size) -> i32 {
+                //   for i = 0 to a {
+                //     let arr : i32[1];
+                //     for j = 0 to a {
+                //       arr[0] = 1;
+                //     }
+                //   }
+                //   return 0;
+                // }
+                //
+                // Note that `arr` induces a clone each iteration, since its value
+                // needs to be reset to all zeros. However, it should also be noted
+                // that the most fringe user of `arr`, the write inside the inner
+                // loop, does not dominate the bottom of the outer loop. Thus, we
+                // need to insert a phi in the bottom block of the outer loop to
+                // retrieve either the write, or `arr` before the inner loop. The
+                // general version of this problem requires the following solution.
+                // Our goal is to figure out which downstream user represents
+                // `object` at each block in the loop. We first assign each block
+                // containing a user the most downstream user it contains. Then, we
+                // create a dummy phi for every region (including the header) in the
+                // loop, which is the downstream user for that block. Then, every
+                // other block is assigned the downstream user of its single
+                // predecessor. This basically amounts to recreating SSA for
+                // `object` inside the loop.
+                let mut user_per_loop_bb = BTreeMap::new();
+                let mut added_phis = BTreeMap::new();
+                let mut top_phi = NodeID::new(0);
+                // Assign existing users.
+                for (user, ordering) in users.iter() {
+                    let bb = bbs.0[user.idx()];
+                    if let Some(old_user) = user_per_loop_bb.get(&bb)
+                        && users[old_user] > *ordering
+                    {
+                    } else {
+                        user_per_loop_bb.insert(bb, *user);
+                    }
                 }
-            }
-            // Assign dummy phis.
-            for bb in loops.nodes_in_loop(top_block) {
-                if (!user_per_loop_bb.contains_key(&bb) || bb == top_block)
-                    && editor.func().nodes[bb.idx()].is_region()
-                {
-                    let success = editor.edit(|mut edit| {
-                        let phi_node = edit.add_node(Node::Phi {
-                            control: bb,
-                            data: empty().collect(),
+                // Assign dummy phis.
+                for bb in loops.nodes_in_loop(top_block) {
+                    if (!user_per_loop_bb.contains_key(&bb) || bb == top_block)
+                        && editor.func().nodes[bb.idx()].is_region()
+                    {
+                        let success = editor.edit(|mut edit| {
+                            let phi_node = edit.add_node(Node::Phi {
+                                control: bb,
+                                data: empty().collect(),
+                            });
+                            if bb != top_block || !user_per_loop_bb.contains_key(&bb) {
+                                user_per_loop_bb.insert(bb, phi_node);
+                            }
+                            if bb == top_block {
+                                top_phi = phi_node;
+                            }
+                            added_phis.insert(phi_node, bb);
+                            Ok(edit)
                         });
-                        if bb != top_block || !user_per_loop_bb.contains_key(&bb) {
-                            user_per_loop_bb.insert(bb, phi_node);
-                        }
-                        if bb == top_block {
-                            top_phi = phi_node;
-                        }
-                        added_phis.insert(phi_node, bb);
-                        Ok(edit)
-                    });
-                    assert!(success);
+                        assert!(success);
+                    }
                 }
-            }
-            // Assign users for the rest of the blocks.
-            for bb in rev_po.iter().filter(|bb| loops.is_in_loop(top_block, **bb)) {
-                if !user_per_loop_bb.contains_key(&bb) {
-                    assert!(control_subgraph.preds(*bb).count() == 1);
-                    user_per_loop_bb.insert(
-                        *bb,
-                        user_per_loop_bb[&control_subgraph.preds(*bb).next().unwrap()],
-                    );
+                // Assign users for the rest of the blocks.
+                for bb in rev_po.iter().filter(|bb| loops.is_in_loop(top_block, **bb)) {
+                    if !user_per_loop_bb.contains_key(&bb) {
+                        assert!(control_subgraph.preds(*bb).count() == 1);
+                        user_per_loop_bb.insert(
+                            *bb,
+                            user_per_loop_bb[&control_subgraph.preds(*bb).next().unwrap()],
+                        );
+                    }
                 }
-            }
 
-            // Induce the clone.
-            let success = editor.edit(|mut edit| {
-                // Create the constant collection object for allocation.
-                let object_ty = typing[object.idx()];
-                let object_cons = edit.add_zero_constant(object_ty);
-                let cons_node = edit.add_node(Node::Constant { id: object_cons });
-
-                // Create the clone into the phi.
-                let clone_node = edit.add_node(Node::Write {
-                    collect: top_phi,
-                    data: object,
-                    indices: vec![].into_boxed_slice(),
-                });
+                // Induce the clone.
+                let success = editor.edit(|mut edit| {
+                    // Create the constant collection object for allocation.
+                    let object_ty = typing[object.idx()];
+                    let object_cons = edit.add_zero_constant(object_ty);
+                    let cons_node = edit.add_node(Node::Constant { id: object_cons });
 
-                // Create the phis.
-                let mut real_phis = BTreeMap::new();
-                for (dummy, bb) in added_phis {
-                    let real = edit.add_node(Node::Phi {
-                        control: bb,
-                        data: control_subgraph
-                            .preds(bb)
-                            .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
-                            .collect(),
+                    // Create the phis.
+                    let mut real_phis = BTreeMap::new();
+                    for (dummy, bb) in added_phis {
+                        let real = edit.add_node(Node::Phi {
+                            control: bb,
+                            data: control_subgraph
+                                .preds(bb)
+                                .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
+                                .collect(),
+                        });
+                        real_phis.insert(dummy, real);
+                    }
+
+                    // Create the clone into the phi.
+                    let clone_node = edit.add_node(Node::Write {
+                        collect: real_phis[&top_phi],
+                        data: object,
+                        indices: vec![].into_boxed_slice(),
                     });
-                    real_phis.insert(dummy, real);
-                }
 
-                // Make user use the cloned object.
-                edit = edit.replace_all_uses_where(object, clone_node, |id| {
-                    id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
-                })?;
+                    // Make user use the cloned object.
+                    edit = edit.replace_all_uses_where(object, clone_node, |id| {
+                        id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
+                    })?;
 
-                // Get rid of the dummy phis.
-                for (dummy, real) in real_phis {
-                    edit = edit.replace_all_uses(dummy, real)?;
-                    edit = edit.delete_node(dummy)?;
-                }
+                    // Get rid of the dummy phis.
+                    for (dummy, real) in real_phis {
+                        edit = edit.replace_all_uses(dummy, real)?;
+                        edit = edit.delete_node(dummy)?;
+                    }
 
-                Ok(edit)
-            });
-            assert!(success);
+                    Ok(edit)
+                });
+                assert!(success);
 
-            dce(editor);
-        }
-    };
+                dce(editor);
+            }
+        };
 
     // Now that we've computed the used collections dataflow analysis, use the
     // results to materialize a clone whenever a node attempts to use an already
-- 
GitLab


From 54df5a02de5694aa00a14f460b1c74c9c95297ab Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 7 Jan 2025 17:52:07 -0800
Subject: [PATCH 11/14] ugh

---
 hercules_ir/src/ir.rs                         | 13 +++++++++++
 hercules_opt/src/gvn.rs                       | 12 ++++++++--
 .../src/legalize_reference_semantics.rs       |  4 ++++
 hercules_opt/src/pass.rs                      |  8 +++++--
 .../implicit_clone/src/implicit_clone.jn      | 23 +++++++++++++++----
 juno_samples/implicit_clone/src/main.rs       |  4 ++--
 juno_samples/matmul/src/matmul.jn             |  5 ++--
 7 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs
index 11d23e61..8bb8e4ef 100644
--- a/hercules_ir/src/ir.rs
+++ b/hercules_ir/src/ir.rs
@@ -1296,6 +1296,19 @@ impl Node {
         }
     }
 
+    pub fn try_write(&self) -> Option<(NodeID, NodeID, &[Index])> {
+        if let Node::Write {
+            collect,
+            data,
+            indices,
+        } = self
+        {
+            Some((*collect, *data, indices))
+        } else {
+            None
+        }
+    }
+
     pub fn is_zero_constant(&self, constants: &Vec<Constant>) -> bool {
         if let Node::Constant { id } = self
             && constants[id.idx()].is_zero()
diff --git a/hercules_opt/src/gvn.rs b/hercules_opt/src/gvn.rs
index a9db0cd8..5f7138e5 100644
--- a/hercules_opt/src/gvn.rs
+++ b/hercules_opt/src/gvn.rs
@@ -11,7 +11,7 @@ use crate::*;
  * fairly simple compared to in a normal CFG. Needs access to constants for
  * identity function simplification.
  */
-pub fn gvn(editor: &mut FunctionEditor) {
+pub fn gvn(editor: &mut FunctionEditor, gvn_constants_and_clones: bool) {
     // Create worklist (starts as all nodes) and value number hashmap.
     let mut worklist: Vec<NodeID> = (0..editor.func().nodes.len()).map(NodeID::new).collect();
     let mut value_numbers: HashMap<Node, NodeID> = HashMap::new();
@@ -28,7 +28,15 @@ pub fn gvn(editor: &mut FunctionEditor) {
         // Next, check if there is a value number for this simplified value yet.
         if let Some(number) = value_numbers.get(&editor.func().nodes[value.idx()]) {
             // If the number is this worklist item, there's nothing to be done.
-            if *number == work {
+            // Also, don't GVN constants and clones if indicated to not do so.
+            if *number == work
+                || (!gvn_constants_and_clones
+                    && (editor.func().nodes[work.idx()].is_constant()
+                        || editor.func().nodes[work.idx()]
+                            .try_write()
+                            .map(|(_, _, indices)| indices.is_empty())
+                            .unwrap_or(false)))
+            {
                 continue;
             }
 
diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index c4f58745..f60c8a7f 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -910,6 +910,10 @@ fn materialize_clones(
                 });
                 assert!(success);
 
+                // De-duplicate phis.
+                gvn(editor, false);
+
+                // Get rid of unused phis.
                 dce(editor);
             }
         };
diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs
index d0449a4a..d2ef3f28 100644
--- a/hercules_opt/src/pass.rs
+++ b/hercules_opt/src/pass.rs
@@ -395,7 +395,7 @@ impl PassManager {
                             &types_ref,
                             &def_uses[idx],
                         );
-                        gvn(&mut editor);
+                        gvn(&mut editor, true);
 
                         self.module.constants = constants_ref.take();
                         self.module.dynamic_constants = dynamic_constants_ref.take();
@@ -828,7 +828,11 @@ impl PassManager {
                             &types_ref,
                             &def_uses[idx],
                         );
-                        infer_parallel_reduce(&mut editor, &fork_join_maps[idx], &reduce_cycles[idx]);
+                        infer_parallel_reduce(
+                            &mut editor,
+                            &fork_join_maps[idx],
+                            &reduce_cycles[idx],
+                        );
                         infer_parallel_fork(&mut editor, &fork_join_maps[idx]);
                         infer_vectorizable(&mut editor, &fork_join_maps[idx]);
                         infer_tight_associative(&mut editor, &reduce_cycles[idx]);
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index 7017942c..a66cfc21 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -28,7 +28,7 @@ fn loop_implicit_clone(input : i32) -> i32 {
 }
 
 #[entry]
-fn tricky_loop_implicit_clone(a : usize) -> i32 {
+fn double_loop_implicit_clone(a : usize) -> usize {
   for i = 0 to a {
     let arr : i32[1];
     for j = 0 to a {
@@ -39,12 +39,12 @@ fn tricky_loop_implicit_clone(a : usize) -> i32 {
 }
 
 #[entry]
-fn double_loop_implicit_clone(a : usize, b : usize) -> i32 {
+fn tricky_loop_implicit_clone(a : usize, b : usize) -> i32 {
   let x = 0;
   for j = 0 to 2 {
     for i = 0 to 5 {
       let arr : i32[3];
-      //let arr2 : i32[1];
+      let arr2 : i32[1];
       if a == b {
         arr[a] += 7;
       } else {
@@ -52,11 +52,26 @@ fn double_loop_implicit_clone(a : usize, b : usize) -> i32 {
       }
       for k = 0 to (a + b - 1) {
         arr[a] += 2;
-	//arr2[0] += 1;
+	arr2[0] += 1;
       }
       x += arr[b];
     }
   }
+  return x + 70;
+}
+
+#[entry]
+fn tricky2_loop_implicit_clone(a : usize, b : usize) -> i32 {
+  let x = 0;
+  for i = 0 to 3 {
+    let arr : i32[1];
+    if a == b {
+      arr[0] = 6;
+    } else {
+      arr[0] = 9;
+    }
+    x += arr[0];
+  }
   return x;
 }
 
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index 508de775..764b1df4 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -15,11 +15,11 @@ fn main() {
         println!("{}", output);
         assert_eq!(output, 7);
 
-        let output = tricky_loop_implicit_clone(3).await;
+        let output = double_loop_implicit_clone(3).await;
         println!("{}", output);
         assert_eq!(output, 42);
 
-        let output = double_loop_implicit_clone(2, 2).await;
+        let output = tricky_loop_implicit_clone(2, 2).await;
         println!("{}", output);
         assert_eq!(output, 130);
 
diff --git a/juno_samples/matmul/src/matmul.jn b/juno_samples/matmul/src/matmul.jn
index 16a262b9..b9ca2949 100644
--- a/juno_samples/matmul/src/matmul.jn
+++ b/juno_samples/matmul/src/matmul.jn
@@ -20,8 +20,8 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]
   
   for bi = 0 to n / 64 {
     for bk = 0 to l / 64 {
-      let atile : i32[64, 64];
-      let btile : i32[64, 64];
+      let atile : i32[66, 64];
+      let btile : i32[65, 64];
       let ctile : i32[64, 64];
 
       for tile_idx = 0 to m / 64 {
@@ -33,7 +33,6 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]
 	    // should see a lack of a phi for ctile in the block loops and
 	    // induce a copy of an initial value of ctile (all zeros) on each
 	    // iteration of the block loops.
-	    ctile[ti, tk] = 0;
 	  }
 	}
         for ti = 0 to 64 {
-- 
GitLab


From d190f629b11bb41f23a710a4be82782bde313a0d Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 7 Jan 2025 18:07:27 -0800
Subject: [PATCH 12/14] ugh fix

---
 .../src/legalize_reference_semantics.rs       | 350 +++++++++---------
 .../implicit_clone/src/implicit_clone.jn      |   1 +
 2 files changed, 175 insertions(+), 176 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index f60c8a7f..f76fe831 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -724,199 +724,197 @@ fn materialize_clones(
 
     // Helper to induce a clone when an implicit clone is identified.
     let nodes = nodes.clone();
-    let mut induce_clone =
-        |object: NodeID, user: NodeID, value: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
-            // If `user` already used `object` and tries to use it again, then the
-            // clone is a "loop induced" clone. Otherwise, it's a simple clone.
-            if !value[&object].contains(&user) {
-                let success = editor.edit(|mut edit| {
-                    // Create the constant collection object for allocation.
-                    let object_ty = typing[object.idx()];
-                    let object_cons = edit.add_zero_constant(object_ty);
-                    let cons_node = edit.add_node(Node::Constant { id: object_cons });
+    let mut induce_clone = |object: NodeID,
+                            user: NodeID,
+                            value: &BTreeMap<NodeID, BTreeSet<NodeID>>| {
+        // If `user` already used `object` and tries to use it again, then the
+        // clone is a "loop induced" clone. Otherwise, it's a simple clone.
+        if !value[&object].contains(&user) {
+            let success = editor.edit(|mut edit| {
+                // Create the constant collection object for allocation.
+                let object_ty = typing[object.idx()];
+                let object_cons = edit.add_zero_constant(object_ty);
+                let cons_node = edit.add_node(Node::Constant { id: object_cons });
 
-                    // Create the clone into the new constant collection.
-                    let clone_node = edit.add_node(Node::Write {
-                        collect: cons_node,
-                        data: object,
-                        indices: vec![].into_boxed_slice(),
-                    });
-
-                    // Make user use the cloned object.
-                    edit.replace_all_uses_where(object, clone_node, |id| *id == user)
-                });
-                assert!(success);
-            } else {
-                // Figure out where to place that phi. This is the deepest
-                // loop header where `user` is responsible for making `object` used
-                // used at the top of the block, and the block dominates the block
-                // containing `user`. If `user` is a phi, then the region it's
-                // attached to is excluded from eligibility.
-                let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
-                    lattice[bb_to_prefix_sum[bb.idx()]]
-                        .get(&object)
-                        .unwrap_or(&BTreeSet::new())
-                        .contains(&user)
-                        && dom.does_dom(*bb, bbs.0[user.idx()])
-                        && loops.contains(*bb)
-                        && loops.is_in_loop(*bb, bbs.0[user.idx()])
-                        && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
+                // Create the clone into the new constant collection.
+                let clone_node = edit.add_node(Node::Write {
+                    collect: cons_node,
+                    data: object,
+                    indices: vec![].into_boxed_slice(),
                 });
-                let top_block = eligible_blocks
-                    .max_by_key(|bb| loops.nesting(*bb).unwrap())
-                    .unwrap();
-                assert!(editor.func().nodes[top_block.idx()].is_region());
 
-                // Figure out the users of `object` that we need to phi back
-                // upwards. Assign each user a number indicating how far down the
-                // user chain it is, higher is farther down. This is used for
-                // picking the most downstream user later.
-                let mut users: BTreeMap<NodeID, usize> = BTreeMap::new();
-                let mut workset: BTreeSet<NodeID> = BTreeSet::new();
-                workset.insert(object);
-                let mut chain_ordering = 1;
-                while let Some(pop) = workset.pop_first() {
-                    let iterated_users: BTreeSet<_> = super_value
-                        .get(&pop)
-                        .map(|users| users.into_iter())
-                        .into_iter()
-                        .flatten()
-                        .map(|id| *id)
-                        .filter(|iterated_user| {
-                            loops.is_in_loop(top_block, bbs.0[iterated_user.idx()])
-                                && dom.does_dom(bbs.0[pop.idx()], bbs.0[iterated_user.idx()])
-                        })
-                        .collect();
-                    workset.extend(iterated_users.iter().filter(|id| !users.contains_key(id)));
-                    for user in iterated_users {
-                        *users.entry(user).or_default() = chain_ordering;
-                        chain_ordering += 1;
-                    }
+                // Make user use the cloned object.
+                edit.replace_all_uses_where(object, clone_node, |id| *id == user)
+            });
+            assert!(success);
+        } else {
+            // Figure out where to place that phi. This is the deepest
+            // loop header where `user` is responsible for making `object` used
+            // used at the top of the block, and the block dominates the block
+            // containing `user`. If `user` is a phi, then the region it's
+            // attached to is excluded from eligibility.
+            let eligible_blocks = rev_po.iter().map(|bb| *bb).filter(|bb| {
+                lattice[bb_to_prefix_sum[bb.idx()]]
+                    .get(&object)
+                    .unwrap_or(&BTreeSet::new())
+                    .contains(&user)
+                    && dom.does_dom(*bb, bbs.0[user.idx()])
+                    && loops.contains(*bb)
+                    && loops.is_in_loop(*bb, bbs.0[user.idx()])
+                    && (!editor.func().nodes[user.idx()].is_phi() || *bb != bbs.0[user.idx()])
+            });
+            let top_block = eligible_blocks
+                .max_by_key(|bb| loops.nesting(*bb).unwrap())
+                .unwrap();
+            assert!(editor.func().nodes[top_block.idx()].is_region());
+
+            // Figure out the users of `object` that we need to phi back
+            // upwards. Assign each user a number indicating how far down the
+            // user chain it is, higher is farther down. This is used for
+            // picking the most downstream user later.
+            let mut users: BTreeMap<NodeID, usize> = BTreeMap::new();
+            let mut workset: BTreeSet<NodeID> = BTreeSet::new();
+            workset.insert(object);
+            let mut chain_ordering = 1;
+            while let Some(pop) = workset.pop_first() {
+                let iterated_users: BTreeSet<_> = super_value
+                    .get(&pop)
+                    .map(|users| users.into_iter())
+                    .into_iter()
+                    .flatten()
+                    .map(|id| *id)
+                    .filter(|iterated_user| loops.is_in_loop(top_block, bbs.0[iterated_user.idx()]))
+                    .collect();
+                workset.extend(iterated_users.iter().filter(|id| !users.contains_key(id)));
+                for user in iterated_users {
+                    *users.entry(user).or_default() = chain_ordering;
+                    chain_ordering += 1;
                 }
+            }
 
-                // The fringe users may not dominate any predecessors of the loop
-                // header. The following is some Juno code that exposes this:
-                //
-                // fn problematic(a : size) -> i32 {
-                //   for i = 0 to a {
-                //     let arr : i32[1];
-                //     for j = 0 to a {
-                //       arr[0] = 1;
-                //     }
-                //   }
-                //   return 0;
-                // }
-                //
-                // Note that `arr` induces a clone each iteration, since its value
-                // needs to be reset to all zeros. However, it should also be noted
-                // that the most fringe user of `arr`, the write inside the inner
-                // loop, does not dominate the bottom of the outer loop. Thus, we
-                // need to insert a phi in the bottom block of the outer loop to
-                // retrieve either the write, or `arr` before the inner loop. The
-                // general version of this problem requires the following solution.
-                // Our goal is to figure out which downstream user represents
-                // `object` at each block in the loop. We first assign each block
-                // containing a user the most downstream user it contains. Then, we
-                // create a dummy phi for every region (including the header) in the
-                // loop, which is the downstream user for that block. Then, every
-                // other block is assigned the downstream user of its single
-                // predecessor. This basically amounts to recreating SSA for
-                // `object` inside the loop.
-                let mut user_per_loop_bb = BTreeMap::new();
-                let mut added_phis = BTreeMap::new();
-                let mut top_phi = NodeID::new(0);
-                // Assign existing users.
-                for (user, ordering) in users.iter() {
-                    let bb = bbs.0[user.idx()];
-                    if let Some(old_user) = user_per_loop_bb.get(&bb)
-                        && users[old_user] > *ordering
-                    {
-                    } else {
-                        user_per_loop_bb.insert(bb, *user);
-                    }
+            // The fringe users may not dominate any predecessors of the loop
+            // header. The following is some Juno code that exposes this:
+            //
+            // fn problematic(a : size) -> i32 {
+            //   for i = 0 to a {
+            //     let arr : i32[1];
+            //     for j = 0 to a {
+            //       arr[0] = 1;
+            //     }
+            //   }
+            //   return 0;
+            // }
+            //
+            // Note that `arr` induces a clone each iteration, since its value
+            // needs to be reset to all zeros. However, it should also be noted
+            // that the most fringe user of `arr`, the write inside the inner
+            // loop, does not dominate the bottom of the outer loop. Thus, we
+            // need to insert a phi in the bottom block of the outer loop to
+            // retrieve either the write, or `arr` before the inner loop. The
+            // general version of this problem requires the following solution.
+            // Our goal is to figure out which downstream user represents
+            // `object` at each block in the loop. We first assign each block
+            // containing a user the most downstream user it contains. Then, we
+            // create a dummy phi for every region (including the header) in the
+            // loop, which is the downstream user for that block. Then, every
+            // other block is assigned the downstream user of its single
+            // predecessor. This basically amounts to recreating SSA for
+            // `object` inside the loop.
+            let mut user_per_loop_bb = BTreeMap::new();
+            let mut added_phis = BTreeMap::new();
+            let mut top_phi = NodeID::new(0);
+            // Assign existing users.
+            for (user, ordering) in users.iter() {
+                let bb = bbs.0[user.idx()];
+                if let Some(old_user) = user_per_loop_bb.get(&bb)
+                    && users[old_user] > *ordering
+                {
+                } else {
+                    user_per_loop_bb.insert(bb, *user);
                 }
-                // Assign dummy phis.
-                for bb in loops.nodes_in_loop(top_block) {
-                    if (!user_per_loop_bb.contains_key(&bb) || bb == top_block)
-                        && editor.func().nodes[bb.idx()].is_region()
-                    {
-                        let success = editor.edit(|mut edit| {
-                            let phi_node = edit.add_node(Node::Phi {
-                                control: bb,
-                                data: empty().collect(),
-                            });
-                            if bb != top_block || !user_per_loop_bb.contains_key(&bb) {
-                                user_per_loop_bb.insert(bb, phi_node);
-                            }
-                            if bb == top_block {
-                                top_phi = phi_node;
-                            }
-                            added_phis.insert(phi_node, bb);
-                            Ok(edit)
+            }
+            // Assign dummy phis.
+            for bb in loops.nodes_in_loop(top_block) {
+                if (!user_per_loop_bb.contains_key(&bb) || bb == top_block)
+                    && editor.func().nodes[bb.idx()].is_region()
+                {
+                    let success = editor.edit(|mut edit| {
+                        let phi_node = edit.add_node(Node::Phi {
+                            control: bb,
+                            data: empty().collect(),
                         });
-                        assert!(success);
-                    }
+                        if bb != top_block || !user_per_loop_bb.contains_key(&bb) {
+                            user_per_loop_bb.insert(bb, phi_node);
+                        }
+                        if bb == top_block {
+                            top_phi = phi_node;
+                        }
+                        added_phis.insert(phi_node, bb);
+                        Ok(edit)
+                    });
+                    assert!(success);
                 }
-                // Assign users for the rest of the blocks.
-                for bb in rev_po.iter().filter(|bb| loops.is_in_loop(top_block, **bb)) {
-                    if !user_per_loop_bb.contains_key(&bb) {
-                        assert!(control_subgraph.preds(*bb).count() == 1);
-                        user_per_loop_bb.insert(
-                            *bb,
-                            user_per_loop_bb[&control_subgraph.preds(*bb).next().unwrap()],
-                        );
-                    }
+            }
+            // Assign users for the rest of the blocks.
+            for bb in rev_po.iter().filter(|bb| loops.is_in_loop(top_block, **bb)) {
+                if !user_per_loop_bb.contains_key(&bb) {
+                    assert!(control_subgraph.preds(*bb).count() == 1);
+                    user_per_loop_bb.insert(
+                        *bb,
+                        user_per_loop_bb[&control_subgraph.preds(*bb).next().unwrap()],
+                    );
                 }
+            }
 
-                // Induce the clone.
-                let success = editor.edit(|mut edit| {
-                    // Create the constant collection object for allocation.
-                    let object_ty = typing[object.idx()];
-                    let object_cons = edit.add_zero_constant(object_ty);
-                    let cons_node = edit.add_node(Node::Constant { id: object_cons });
+            // Induce the clone.
+            let success = editor.edit(|mut edit| {
+                // Create the constant collection object for allocation.
+                let object_ty = typing[object.idx()];
+                let object_cons = edit.add_zero_constant(object_ty);
+                let cons_node = edit.add_node(Node::Constant { id: object_cons });
 
-                    // Create the phis.
-                    let mut real_phis = BTreeMap::new();
-                    for (dummy, bb) in added_phis {
-                        let real = edit.add_node(Node::Phi {
-                            control: bb,
-                            data: control_subgraph
-                                .preds(bb)
-                                .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
-                                .collect(),
-                        });
-                        real_phis.insert(dummy, real);
-                    }
-
-                    // Create the clone into the phi.
-                    let clone_node = edit.add_node(Node::Write {
-                        collect: real_phis[&top_phi],
-                        data: object,
-                        indices: vec![].into_boxed_slice(),
+                // Create the phis.
+                let mut real_phis = BTreeMap::new();
+                for (dummy, bb) in added_phis {
+                    let real = edit.add_node(Node::Phi {
+                        control: bb,
+                        data: control_subgraph
+                            .preds(bb)
+                            .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
+                            .collect(),
                     });
+                    real_phis.insert(dummy, real);
+                }
 
-                    // Make user use the cloned object.
-                    edit = edit.replace_all_uses_where(object, clone_node, |id| {
-                        id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
-                    })?;
+                // Create the clone into the phi.
+                let clone_node = edit.add_node(Node::Write {
+                    collect: real_phis[&top_phi],
+                    data: object,
+                    indices: vec![].into_boxed_slice(),
+                });
 
-                    // Get rid of the dummy phis.
-                    for (dummy, real) in real_phis {
-                        edit = edit.replace_all_uses(dummy, real)?;
-                        edit = edit.delete_node(dummy)?;
-                    }
+                // Make users use the cloned object.
+                edit = edit.replace_all_uses_where(object, clone_node, |id| {
+                    id.idx() < bbs.0.len() && loops.is_in_loop(top_block, bbs.0[id.idx()])
+                })?;
 
-                    Ok(edit)
-                });
-                assert!(success);
+                // Get rid of the dummy phis.
+                for (dummy, real) in real_phis {
+                    edit = edit.replace_all_uses(dummy, real)?;
+                    edit = edit.delete_node(dummy)?;
+                }
 
-                // De-duplicate phis.
-                gvn(editor, false);
+                Ok(edit)
+            });
+            assert!(success);
 
-                // Get rid of unused phis.
-                dce(editor);
-            }
-        };
+            // De-duplicate phis.
+            gvn(editor, false);
+
+            // Get rid of unused phis.
+            dce(editor);
+        }
+    };
 
     // Now that we've computed the used collections dataflow analysis, use the
     // results to materialize a clone whenever a node attempts to use an already
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index a66cfc21..64fa34da 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -70,6 +70,7 @@ fn tricky2_loop_implicit_clone(a : usize, b : usize) -> i32 {
     } else {
       arr[0] = 9;
     }
+    arr[0] = 7;
     x += arr[0];
   }
   return x;
-- 
GitLab


From 6864929c97765625edbb362605c3870cde0ab29d Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 7 Jan 2025 18:42:26 -0800
Subject: [PATCH 13/14] finally

---
 .../src/legalize_reference_semantics.rs        | 18 ++++++++++++++----
 .../implicit_clone/src/implicit_clone.jn       |  6 ++++--
 juno_samples/implicit_clone/src/main.rs        |  4 ++++
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/hercules_opt/src/legalize_reference_semantics.rs b/hercules_opt/src/legalize_reference_semantics.rs
index f76fe831..3c89c7da 100644
--- a/hercules_opt/src/legalize_reference_semantics.rs
+++ b/hercules_opt/src/legalize_reference_semantics.rs
@@ -874,7 +874,8 @@ fn materialize_clones(
                 let cons_node = edit.add_node(Node::Constant { id: object_cons });
 
                 // Create the phis.
-                let mut real_phis = BTreeMap::new();
+                let mut phi_map = BTreeMap::new();
+                let mut real_phis = BTreeSet::new();
                 for (dummy, bb) in added_phis {
                     let real = edit.add_node(Node::Phi {
                         control: bb,
@@ -883,12 +884,14 @@ fn materialize_clones(
                             .map(|pred| *user_per_loop_bb.get(&pred).unwrap_or(&cons_node))
                             .collect(),
                     });
-                    real_phis.insert(dummy, real);
+                    phi_map.insert(dummy, real);
+                    real_phis.insert(real);
                 }
 
                 // Create the clone into the phi.
+                let real_top_phi = phi_map[&top_phi];
                 let clone_node = edit.add_node(Node::Write {
-                    collect: real_phis[&top_phi],
+                    collect: real_top_phi,
                     data: object,
                     indices: vec![].into_boxed_slice(),
                 });
@@ -899,11 +902,15 @@ fn materialize_clones(
                 })?;
 
                 // Get rid of the dummy phis.
-                for (dummy, real) in real_phis {
+                for (dummy, real) in phi_map {
                     edit = edit.replace_all_uses(dummy, real)?;
                     edit = edit.delete_node(dummy)?;
                 }
 
+                // Make phis use the clone instead of the top phi.
+                edit =
+                    edit.replace_all_uses_where(real_top_phi, clone_node, |id| *id != clone_node)?;
+
                 Ok(edit)
             });
             assert!(success);
@@ -913,6 +920,9 @@ fn materialize_clones(
 
             // Get rid of unused phis.
             dce(editor);
+
+            // Simplify phis.
+            phi_elim(editor);
         }
     };
 
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index 64fa34da..d06a6498 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -57,7 +57,7 @@ fn tricky_loop_implicit_clone(a : usize, b : usize) -> i32 {
       x += arr[b];
     }
   }
-  return x + 70;
+  return x;
 }
 
 #[entry]
@@ -70,7 +70,9 @@ fn tricky2_loop_implicit_clone(a : usize, b : usize) -> i32 {
     } else {
       arr[0] = 9;
     }
-    arr[0] = 7;
+    for j = 0 to 4 {
+      arr[0] += 1;
+    }
     x += arr[0];
   }
   return x;
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index 764b1df4..c0adaaef 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -23,6 +23,10 @@ fn main() {
         println!("{}", output);
         assert_eq!(output, 130);
 
+        let output = tricky2_loop_implicit_clone(2, 3).await;
+        println!("{}", output);
+        assert_eq!(output, 39);
+
         let output = no_implicit_clone(4).await;
         println!("{}", output);
         assert_eq!(output, 13);
-- 
GitLab


From e632507e46eea340313a216c566b10a59ed547a2 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 7 Jan 2025 18:49:30 -0800
Subject: [PATCH 14/14] Hack because I want to sleep

---
 hercules_opt/src/pass.rs          | 2 +-
 juno_samples/matmul/src/matmul.jn | 8 ++------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs
index d2ef3f28..baeaae87 100644
--- a/hercules_opt/src/pass.rs
+++ b/hercules_opt/src/pass.rs
@@ -395,7 +395,7 @@ impl PassManager {
                             &types_ref,
                             &def_uses[idx],
                         );
-                        gvn(&mut editor, true);
+                        gvn(&mut editor, false);
 
                         self.module.constants = constants_ref.take();
                         self.module.dynamic_constants = dynamic_constants_ref.take();
diff --git a/juno_samples/matmul/src/matmul.jn b/juno_samples/matmul/src/matmul.jn
index b9ca2949..f59cc215 100644
--- a/juno_samples/matmul/src/matmul.jn
+++ b/juno_samples/matmul/src/matmul.jn
@@ -20,8 +20,8 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]
   
   for bi = 0 to n / 64 {
     for bk = 0 to l / 64 {
-      let atile : i32[66, 64];
-      let btile : i32[65, 64];
+      let atile : i32[64, 64];
+      let btile : i32[64, 64];
       let ctile : i32[64, 64];
 
       for tile_idx = 0 to m / 64 {
@@ -29,10 +29,6 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]
 	  for tk = 0 to 64 {
 	    atile[ti, tk] = a[bi * 64 + ti, tile_idx * 64 + tk];
 	    btile[ti, tk] = b[tile_idx * 64 + ti, bk * 64 + tk];
-	    // TODO: remove setting ctile to zero explicitly, clone analysis
-	    // should see a lack of a phi for ctile in the block loops and
-	    // induce a copy of an initial value of ctile (all zeros) on each
-	    // iteration of the block loops.
 	  }
 	}
         for ti = 0 to 64 {
-- 
GitLab