GPU backend

Merged prathi3 requested to merge gpu-cg into main
1 file, +43 −48
@@ -47,12 +47,17 @@ pub fn gpu_codegen<W: Write>(
.map(NodeID::new)
.collect();
// The fork-reduce map should contain every reduce under some fork's key
let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new();
// The reduct-reduce map should contain all non-parallel, non-associative
// reduces under some key. Unlike the fork map, the reduct map is not involved
// in any assertions; it lives here for convenience but could be moved.
let reduct_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new();
for reduce_node in &reduce_nodes {
if let Node::Reduce {
control,
init: _,
reduct: _,
reduct,
} = &function.nodes[reduce_node.idx()]
{
match function.nodes[control.idx()] {
@@ -71,6 +76,13 @@ pub fn gpu_codegen<W: Write>(
panic!("Reduce's control must be a join or region node");
}
}
if !function.schedules[reduce_node.idx()].contains(&Schedule::ParallelReduce)
&& !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) {
reduct_reduce_map
.entry(*reduct)
.or_default()
.push(*reduce_node);
}
}
}
for idx in 0..function.nodes.len() {
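For illustration, a minimal runnable sketch of the entry/or_default pattern used to build reduct_reduce_map above (NodeID here is a simplified stand-in for the project's type, and the (reduce, reduct) pairs are hypothetical):

use std::collections::HashMap;

// Simplified stand-in for the project's NodeID type.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct NodeID(usize);

fn main() {
    // Hypothetical (reduce, reduct) pairs: each reduce is keyed by its reduct.
    let reduces = [(NodeID(5), NodeID(9)), (NodeID(6), NodeID(9))];
    let mut reduct_reduce_map: HashMap<NodeID, Vec<NodeID>> = HashMap::new();
    for (reduce, reduct) in reduces {
        // entry().or_default() creates an empty Vec the first time a key is
        // seen, then appends; multiple reduces can share one reduct.
        reduct_reduce_map.entry(reduct).or_default().push(reduce);
    }
    assert_eq!(reduct_reduce_map[&NodeID(9)], vec![NodeID(5), NodeID(6)]);
}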
@@ -160,6 +172,7 @@ pub fn gpu_codegen<W: Write>(
bbs,
kernel_params,
fork_reduce_map,
reduct_reduce_map,
label_data_for_phi,
return_type_id,
};
@@ -187,6 +200,7 @@ struct GPUContext<'a> {
bbs: &'a Vec<NodeID>,
kernel_params: &'a GPUKernelParams,
fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>,
reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>,
label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>,
return_type_id: &'a TypeID,
}
@@ -367,7 +381,9 @@ impl GPUContext<'_> {
}
// Emit helper registers that are used throughout the kernel: alignment
// is for proper dynamic shared memory allocation
// is for proper dynamic shared memory allocation, and max_variant_size is
// for selecting a variant during read/write copies, since we don't keep the
// tag (it isn't needed, and it can double a sum's memory usage due to alignment)
fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
write!(w, "\tsize_t alignment;\n")?;
write!(w, "\tsize_t max_variant_size;\n")?;
@@ -555,45 +571,6 @@ impl GPUContext<'_> {
}
}
// /*
// * For each parallel reduce with a reduct write, meaning it's at the end of
// * a potential parallel reduction chain, we walk back to beginning of chain
// * and update the write's collect to be the beginning's init.
// */
// fn update_write_collects(&self) -> HashMap<NodeID, NodeID> {
// let mut write_collect_map = HashMap::new();
// let mut parallel_reduces: HashSet<NodeID> = (0..self.function.nodes.len())
// .map(NodeID::new)
// .filter(|&node_id| {
// self.function.schedules[node_id.idx()].contains(&Schedule::ParallelReduce)
// })
// .collect();
// for reduce in parallel_reduces.clone() {
// if let Node::Reduce {
// control: _,
// init,
// reduct,
// } = &self.function.nodes[reduce.idx()]
// && let Node::Write { .. } = &self.function.nodes[reduct.idx()]
// {
// parallel_reduces.remove(&reduce);
// while parallel_reduces.contains(&init) {
// let Node::Reduce {
// control: _,
// init,
// reduct: _,
// } = &self.function.nodes[init.idx()]
// else {
// panic!("Expected reduce node");
// };
// parallel_reduces.remove(&init);
// }
// write_collect_map.insert(*reduct, *init);
// }
// }
// write_collect_map
// }
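For context on what the removed helper did, a minimal sketch of the chain walk it described (simplified types; the prev map standing in for reduce-to-init edges is hypothetical):

use std::collections::HashMap;

// Simplified stand-in for walking a reduction chain back to its head:
// follow each node's init predecessor until no predecessor remains.
fn chain_head(start: usize, prev: &HashMap<usize, usize>) -> usize {
    let mut cur = start;
    while let Some(&p) = prev.get(&cur) {
        cur = p;
    }
    cur
}

fn main() {
    // Chain 3 -> 2 -> 1: the write's collect would be updated to the
    // chain head's init, i.e. node 1's.
    let prev = HashMap::from([(3, 2), (2, 1)]);
    assert_eq!(chain_head(3, &prev), 1);
}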
fn codegen_data_control(
&self,
root_forks: &Vec<NodeID>,
@@ -684,10 +661,8 @@ impl GPUContext<'_> {
_ => { panic!("Unsupported state for ThreadID") }
}
}
Node::Reduce { control: _, init, reduct: _ } => {
let init_val = self.get_value(*init, false, false);
write!(w, "{}{} = {};\n", tabs, declare_variable, init_val)?;
}
// The fork initializes the reduce, and the reduct updates it
Node::Reduce { control: _, init: _, reduct: _ } => {}
// Parameters emitted at top
Node::Parameter { index: _ } => {}
Node::Constant { id: cons_id } => {
@@ -889,16 +864,25 @@ impl GPUContext<'_> {
}
}
if let Some(phis) = self.label_data_for_phi.get(&id) {
let val = self.get_value(id, false, false);
for phi in phis {
let phi_val = self.get_value(*phi, false, false);
write!(
w,
"{}{} = {};\n",
tabs,
self.get_value(*phi, false, false),
self.get_value(id, false, false),
phi_val,
val,
)?;
}
}
if let Some(reduces) = self.reduct_reduce_map.get(&id) {
let val = self.get_value(id, true, false);
for reduce in reduces {
let reduce_val = self.get_value(*reduce, false, false);
write!(w, "{}{} = {};\n", tabs, reduce_val, val)?;
}
}
Ok(())
}
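To make the emitted shape concrete, a small self-contained sketch of the reduct-update emission above, with a String in place of the real writer (names like add_7 and reduce_3 are hypothetical):

use std::fmt::Write;

// Each reduce that names this node as its reduct gets reassigned to the
// node's freshly computed value, mirroring the write! loop above.
fn emit_reduct_updates(
    w: &mut String,
    tabs: &str,
    val: &str,
    reduce_vals: &[&str],
) -> std::fmt::Result {
    for reduce_val in reduce_vals {
        write!(w, "{}{} = {};\n", tabs, reduce_val, val)?;
    }
    Ok(())
}

fn main() {
    let mut w = String::new();
    // Hypothetical names: the add node "add_7" is the reduct of "reduce_3".
    emit_reduct_updates(&mut w, "\t", "add_7", &["reduce_3"]).unwrap();
    assert_eq!(w, "\treduce_3 = add_7;\n");
}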
@@ -937,7 +921,18 @@ impl GPUContext<'_> {
Node::Fork {
control: _,
factors: _,
} => {}
} => {
// Emitting reduces before the fork allows the reduce to be
// used outside of the fork.
for &reduce in self.fork_reduce_map.get(&id).unwrap() {
let reduce_val = self.get_value(reduce, true, false);
let Node::Reduce { control: _, init, reduct: _ } = &self.function.nodes[reduce.idx()] else {
panic!("Expected reduce node");
};
let init_val = self.get_value(*init, true, false);
write!(w, "{}{} = {};\n", tabs, reduce_val, init_val)?;
}
}
Node::Join { control: _ } => {}
Node::Return { control: _, data } => {
if self.types[self.typing[data.idx()].idx()].is_primitive() {
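A plain-Rust analogy for the invariant above (the fork initializes the reduce, the reduct updates it): initializing the accumulator before the loop is what makes it usable after the loop, which is why the reduce's initialization is emitted at the Fork rather than inside the body.

fn main() {
    let data = [1, 2, 3, 4];
    // "Fork" point: the reduce variable is initialized from init here,
    // before the parallel region, so it stays in scope afterwards.
    let mut reduce = 0; // init
    for x in data {
        // "Reduct" point: each iteration overwrites the reduce variable.
        reduce = reduce + x;
    }
    // Because initialization happened before the loop/fork, the final
    // value is usable outside of it.
    assert_eq!(reduce, 10);
}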