rarbore2 · rarbore2 · 2b60a19f · 3d1e5b15 · 37567e75 · 65aae3b1
--- a/hercules_opt/src/gcm.rs

+ 67

− 10
+++ b/hercules_opt/src/gcm.rs

+ 67

− 10
 @@ -152,6 +152,7 @@ pub fn gcm(
    let backing_allocation = object_allocation(
        editor,
        typing,
+        fork_join_nest,
        &node_colors,
        &alignments,
        &liveness,
 @@ -1053,19 +1054,22 @@ fn add_extra_collection_dims(
    devices: &Vec<Device>,
    bbs: &BasicBlocks,
 ) -> bool {
-    if devices[editor.func_id().idx()] == Device::AsyncRust
-        && editor.func().name == "_1_laplacian_estimate"
-    {
+    if devices[editor.func_id().idx()] == Device::AsyncRust {
        // Look for collection constant nodes inside fork-joins that are mutated
        // inside the fork-join, aren't involved in any of the reduces of the
        // fork-join, and have a user that isn't a direct read based on all of
        // the thread IDs.
-        let nodes = &editor.func().nodes;
        let fco = &objects[&editor.func_id()];
-        for id in editor.node_ids().filter(|id| {
-            nodes[id.idx()].is_constant() && !editor.get_type(typing[id.idx()]).is_primitive()
-        }) {
+        let candidates: Vec<_> = editor
+            .node_ids()
+            .filter(|id| {
+                editor.func().nodes[id.idx()].is_constant()
+                    && !editor.get_type(typing[id.idx()]).is_primitive()
+            })
+            .collect();
+        for id in candidates {
            // Check all of the above conditions.
+            let nodes = &editor.func().nodes;
            if editor.get_users(id).len() != 1 {
                continue;
            }
 @@ -1115,7 +1119,43 @@ fn add_extra_collection_dims(

            // We know that this collection needs to be replicated across the
            // fork-join dimensions, so do that.
-            todo!()
+            let ty = typing[id.idx()];
+            let num_dims: Vec<_> = forks
+                .into_iter()
+                .rev()
+                .map(|id| nodes[id.idx()].try_fork().unwrap().1.len())
+                .collect();
+            let factors = forks
+                .into_iter()
+                .rev()
+                .flat_map(|id| nodes[id.idx()].try_fork().unwrap().1.into_iter())
+                .map(|dc| *dc)
+                .collect();
+            let array_ty = Type::Array(ty, factors);
+            let success = editor.edit(|mut edit| {
+                let new_ty = edit.add_type(array_ty);
+                let new_cons = edit.add_zero_constant(new_ty);
+                let new_cons = edit.add_node(Node::Constant { id: new_cons });
+                let mut tids = vec![];
+                for (fork, num_dims) in forks.into_iter().rev().zip(num_dims) {
+                    for dim in 0..num_dims {
+                        tids.push(edit.add_node(Node::ThreadID {
+                            control: *fork,
+                            dimension: dim,
+                        }));
+                    }
+                }
+                let read = edit.add_node(Node::Read {
+                    collect: new_cons,
+                    indices: Box::new([Index::Position(tids.into_boxed_slice())]),
+                });
+                edit.sub_edit(id, new_cons);
+                edit = edit.replace_all_uses(id, read)?;
+                edit = edit.delete_node(id)?;
+                Ok(edit)
+            });
+            assert!(success);
+            return true;
        }
    }
    false
 @@ -1601,6 +1641,7 @@ fn type_size(edit: &mut FunctionEdit, ty_id: TypeID, alignments: &Vec<usize>) ->
 fn object_allocation(
    editor: &mut FunctionEditor,
    typing: &Vec<TypeID>,
+    fork_join_nest: &HashMap<NodeID, Vec<NodeID>>,
    node_colors: &FunctionNodeColors,
    alignments: &Vec<usize>,
    _liveness: &Liveness,
 @@ -1626,7 +1667,7 @@ fn object_allocation(
                    }
                }
                Node::Call {
-                    control: _,
+                    control,
                    function: callee,
                    ref dynamic_constants,
                    args: _,
 @@ -1656,9 +1697,25 @@ fn object_allocation(
                                callee_backing_size,
                                &mut edit,
                            );
+                            // Multiply the backing allocation size of the
+                            // callee by the number of parallel threads that
+                            // will call the function.
+                            let forks = &fork_join_nest[&control];
+                            let factors: Vec<_> = forks
+                                .into_iter()
+                                .rev()
+                                .flat_map(|id| edit.get_node(*id).try_fork().unwrap().1.into_iter())
+                                .map(|dc| *dc)
+                                .collect();
+                            let mut multiplied_callee_backing_size = callee_backing_size;
+                            for factor in factors {
+                                multiplied_callee_backing_size = edit.add_dynamic_constant(
+                                    DynamicConstant::mul(multiplied_callee_backing_size, factor),
+                                );
+                            }
                            *total = edit.add_dynamic_constant(DynamicConstant::add(
                                *total,
-                                callee_backing_size,
+                                multiplied_callee_backing_size,
                            ));
                        }
                    }