
GPU backend

Merged prathi3 requested to merge gpu-cg into main
1 file  +240 −70
@@ -85,6 +85,7 @@ pub fn gpu_codegen<W: Write>(
};
let (fork_forward_adjacency, fork_join_map) = make_fork_structures();
// Maybe can delete
let map_join_reduce = || -> HashMap<NodeID, Vec<NodeID>> {
let reduce_nodes: Vec<NodeID> = (0..function.nodes.len())
.filter(|idx| function.nodes[*idx].is_reduce())
@@ -219,6 +220,7 @@ impl GPUContext<'_> {
"
#include <assert.h>
#include <stdio.h>
#include <stddef.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <mma.h>
@@ -293,7 +295,7 @@ impl GPUContext<'_> {
for (id, ty) in self.types.iter().enumerate() {
match ty {
Type::Product(ref product_ty_ids) => {
write!(w, "\nstruct Product_{} {{\n", id)?;
write!(w, "\ntypedef struct Product_{} {{\n", id)?;
for (i, product_ty_id) in product_ty_ids.iter().enumerate() {
write!(
w,
@@ -302,10 +304,10 @@ impl GPUContext<'_> {
i
)?;
}
write!(w, "}};\n")?;
write!(w, "}} Product_{};\n", id)?;
}
Type::Summation(ref summation_ty_ids) => {
write!(w, "\nstruct Summation_{} {{\n\t union {{\n", id)?;
write!(w, "\ntypedef struct Summation_{} {{\n\t union {{\n", id)?;
for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() {
write!(
w,
@@ -314,7 +316,7 @@ impl GPUContext<'_> {
i
)?;
}
write!(w, "\t}};\n\tuint8_t tag;\n}};\n")?;
write!(w, "\t}};\n\tuint8_t tag;\n}} Summation_{};\n", id)?;
}
_ => {}
}
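For reference, the emitted definitions come out roughly like the following sketch (the type ids, member types, and the field_<i> naming here are illustrative assumptions):

#include <stdint.h>

typedef struct Product_3 {
    float field_0;
    int32_t field_1;
} Product_3;

typedef struct Summation_4 {
    union {
        float field_0;
        int64_t field_1;
    };
    uint8_t tag;
} Summation_4;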
@@ -348,7 +350,7 @@ impl GPUContext<'_> {
write!(w, "{} p{}", self.get_type(*ty, true), idx)?;
}
// We convert originally non-void functions to void functions by adding a
// return parameter. For now we ignore the case where return was derived
// from a parameter through reads and writes, and instead always memcpy.
let return_index = self.function.nodes.iter().position(|node| node.is_return());
if let Some(return_index) = return_index {
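As a sketch of that convention, a function that originally returned a scalar is emitted as a void kernel with an extra output parameter, roughly like this (the kernel and parameter names are assumed):

__global__ void kernel_foo(int p0, float* p1, int* ret) {
    int result = p0 + 1;  // stand-in for the computed return value
    *ret = result;        // the original return value leaves through the added parameter
}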
@@ -424,7 +426,7 @@ impl GPUContext<'_> {
Ok(())
}
// Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the join has no user reduce nodes, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred.
// Construct block forks by greedily accepting while: a) each fork join is strictly nested, meaning it has no other neighboring fork joins, b) the forks are parallel reduce forks, c) the total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. A smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred.
fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> {
let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect();
for (_, children) in self.fork_forward_adjacency.iter() {
@@ -442,7 +444,7 @@ impl GPUContext<'_> {
let mut curr_fork = root_forks[0];
while let Some(join) = self.fork_join_map.get(&curr_fork) {
let children = &self.fork_forward_adjacency[&curr_fork];
if children.len() != 1 || !self.join_reduce_map.contains_key(join) {
if children.len() != 1 || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) {
break;
}
curr_fork = children[0];
@@ -556,7 +558,7 @@ impl GPUContext<'_> {
}
}
// 2. Emit data flow for nodes assigned to those basic blocks
// 2. Emit data flow for nodes assigned to basic blocks in block sink
// 2a. All phi registers first
self.codegen_phi_registers(w, |id| control_nodes_between.contains(&self.bbs[id.idx()]))?;
// 2b. All other data nodes
@@ -578,6 +580,7 @@ impl GPUContext<'_> {
{
self.codegen_data_node(
*id,
1,
&mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body,
)?;
visited.insert(id);
@@ -598,13 +601,10 @@ impl GPUContext<'_> {
Ok(())
}
fn codegen_data_node(
&self,
id: NodeID,
w: &mut String,
) -> Result<(), Error> {
fn codegen_data_node(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> {
// For now, shared memory is only used when creating an array
let declare_variable = self.get_value(id, true, false).to_string();
let tabs = "\t".repeat(num_tabs);
match &self.function.nodes[id.idx()] {
// Phi registers were already emitted.
Node::Phi {
@@ -628,7 +628,8 @@ impl GPUContext<'_> {
Type::Boolean => {
write!(
w,
"\t{} = !{};\n",
"{}{} = !{};\n",
tabs,
declare_variable,
self.get_value(*input, false, false),
)?;
@@ -636,7 +637,8 @@ impl GPUContext<'_> {
ty if ty.is_fixed() => {
write!(
w,
"\t{} = ~{};\n",
"{}{} = ~{};\n",
tabs,
declare_variable,
self.get_value(*input, false, false),
)?;
@@ -647,7 +649,8 @@ impl GPUContext<'_> {
ty if ty.is_signed() || ty.is_float() => {
write!(
w,
"\t{} = -{};\n",
"{}{} = -{};\n",
tabs,
declare_variable,
self.get_value(*input, false, false),
)?;
@@ -659,7 +662,8 @@ impl GPUContext<'_> {
UnaryOperator::Cast(dst_ty_id) => {
write!(
w,
"\t{} = static_cast<{}>({});\n",
"{}{} = static_cast<{}>({});\n",
tabs,
declare_variable,
self.get_type(*dst_ty_id, false),
self.get_value(*input, false, false),
@@ -672,36 +676,29 @@ impl GPUContext<'_> {
match (op, &self.types[self.typing[left.idx()].idx()]) {
(BinaryOperator::Rem, Type::Float32) => write!(
w,
"\t{} = fmodf({}, {});\n",
declare_variable,
left_val,
right_val,
"{}{} = fmodf({}, {});\n",
tabs, declare_variable, left_val, right_val,
)?,
(BinaryOperator::Rem, Type::Float64) => write!(
w,
"\t{} = fmod({}, {});\n",
declare_variable,
left_val,
right_val,
"{}{} = fmod({}, {});\n",
tabs, declare_variable, left_val, right_val,
)?,
// Doesn't need a special function, but applies to the bool type (|| and && rather than | and &)
(BinaryOperator::Or, Type::Boolean) => write!(
w,
"\t{} = {} || {};\n",
declare_variable,
left_val,
right_val,
"{}{} = {} || {};\n",
tabs, declare_variable, left_val, right_val,
)?,
(BinaryOperator::And, Type::Boolean) => write!(
w,
"\t{} = {} && {};\n",
declare_variable,
left_val,
right_val,
"{}{} = {} && {};\n",
tabs, declare_variable, left_val, right_val,
)?,
(op, _) => write!(
w,
"\t{} = {} {} {};\n",
"{}{} = {} {} {};\n",
tabs,
declare_variable,
left_val,
match op {
@@ -726,11 +723,17 @@ impl GPUContext<'_> {
)?,
};
}
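Illustratively, these special cases emit statements along the following lines (value names are made up and declarations are elided):

v12 = fmodf(v7, v8);   // Rem on Float32
v13 = fmod(v9, v10);   // Rem on Float64
v14 = v4 || v5;        // Or on Boolean
v15 = v2 + v3;         // any other op falls through to the generic form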
Node::Ternary {op, first, second, third} => match op {
Node::Ternary {
op,
first,
second,
third,
} => match op {
TernaryOperator::Select => {
write!(
w,
"\t{} = {} ? {} : {};\n",
"{}{} = {} ? {} : {};\n",
tabs,
declare_variable,
self.get_value(*first, false, false),
self.get_value(*second, false, false),
@@ -743,30 +746,53 @@ impl GPUContext<'_> {
let func_name = self.codegen_intrinsic(intrinsic, ty);
write!(
w,
"\t{} = {}({});\n",
"{}{} = {}({});\n",
tabs,
declare_variable,
func_name,
self.get_value(args[0], false, false),
)?;
}
Node::Read { collect, indices } => {
let index_ptr_name = self.codegen_indices(*collect, indices);
// If it's a parameter node then copy from global memory, else
// reference from shared memory or registers.
// If it's a parameter node then copy from global memory, else
// from shared memory or registers.
if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] {
// We parallelize copies from global memory across threads for
// array types, either immediate or nested in the collection.
if self.types[self.typing[id.idx()].idx()].is_primitive() {
write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?;
} else {
self.codegen_global_to_shared(id, declare_variable, index_ptr_name, indices.len(), true, w)?;
}
let index_ptr_name = self.codegen_indices_global(*collect, indices);
self.codegen_copy_to_from_global(
true,
self.typing[id.idx()],
&declare_variable,
&index_ptr_name,
Some(indices.len()),
true,
num_tabs,
w,
)?;
} else {
write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?;
let index_ptr_name = self.codegen_indices_local(*collect, indices);
write!(w, "{}{} = {};\n", tabs, declare_variable, index_ptr_name)?;
}
}
Node::Write {collect: _, data: _, indices: _} => {
// TODO
Node::Write { collect, data, indices } => {
let data_variable = self.get_value(*data, false, false);
// If it's a parameter node then copy to global memory, else
// to shared memory or registers
if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] {
let index_ptr_name = self.codegen_indices_global(*collect, indices);
self.codegen_copy_to_from_global(
false,
self.typing[id.idx()],
&data_variable,
&index_ptr_name,
Some(indices.len()),
true,
num_tabs,
w,
)?;
} else {
let index_ptr_name = self.codegen_indices_local(*collect, indices);
write!(w, "{}{} = {};\n", tabs, index_ptr_name, data_variable)?;
}
}
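For a non-parameter collection, the write is thus a plain store through the locally indexed reference, e.g. (names assumed):

v14.field_1 = v9;   // write into a product held in registers
a3[v5][v6] = v10;   // write into a shared-memory array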
_ => {
panic!("Unsupported node type")
@@ -848,7 +874,7 @@ impl GPUContext<'_> {
let Type::Array(element_type, extents) = &self.types[type_id.idx()] else {
panic!("Expected array type")
};
// For now we do element-wise alignment, later could consider (n-1)d array
// alignment. Then we "allocate" from the single dynamic shared memory buffer
// by using and updating the offset.
let element_size = format!("sizeof({})", self.get_type(*element_type, false));
@@ -857,18 +883,41 @@ impl GPUContext<'_> {
.map(|id| format!("dc{}", id.idx()))
.collect::<Vec<_>>()
.join("*");
write!(w, ";\n\talignment = {};\n\tdynamic_shared_offset =
write!(
w,
";\n\talignment = {};\n\tdynamic_shared_offset =
(dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} =
reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t
dynamic_shared_offset += {}", element_size, name, self.get_type(*element_type, false), array_size)?;
dynamic_shared_offset += {}",
element_size,
name,
self.get_type(*element_type, false),
array_size
)?;
}
}
Ok(())
}
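Per array, the emitted allocation then reads roughly as follows inside the kernel (a sketch: the a4 name and float element type are assumed, and alignment / dynamic_shared_offset are taken to be declared earlier in the generated preamble):

extern __shared__ char dynamic_shared[];
// ...
alignment = sizeof(float);
dynamic_shared_offset = (dynamic_shared_offset + alignment - 1) / alignment * alignment;
a4 = reinterpret_cast<float*>(&dynamic_shared[dynamic_shared_offset]);
dynamic_shared_offset += sizeof(float) * dc0 * dc1;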
fn codegen_global_to_shared(&self, id: NodeID, declare_variable: String, index_ptr_name: String, array_depth: Option<usize>, outermost: bool, w: &mut String) -> Result<(), Error> {
match &self.types[self.typing[id.idx()].idx()] {
Type::Array(_, extents) => {
// Used for reads and writes due to identical logic. data_variable is the
// resulting reference for reads, and is the source for writes. Writes don't
// emit a new reference.
fn codegen_copy_to_from_global(
&self,
is_read: bool,
type_id: TypeID,
data_variable: &String,
index_ptr_name: &String,
array_depth: Option<usize>,
parallelize: bool,
num_tabs: usize,
w: &mut String,
) -> Result<(), Error> {
let tabs = "\t".repeat(num_tabs);
let lhs = if is_read { data_variable } else { index_ptr_name };
let rhs = if is_read { index_ptr_name } else { data_variable };
match &self.types[type_id.idx()] {
Type::Array(element_type_id, extents) => {
let array_depth = array_depth.unwrap();
let rem_array_size = extents
.iter()
@@ -878,23 +927,95 @@ impl GPUContext<'_> {
.collect::<Vec<_>>()
.join("*");
let mut running_div_factor = "1".to_string();
write!(w, "\tfor (int i = threadIdx.x; i < {}; i += {}) {{\n", rem_array_size, self.kernel_attrs.num_threads)?;
let mut indices = vec![];
for i in (array_depth..extents.len()).rev() {
indices.push(format!("[(({}) / ({})) % dc{}]", rem_array_size, running_div_factor, extents[i].idx()));
running_div_factor = format!("{} * {}", running_div_factor, format!("dc{}", extents[i].idx()));
// Decompose the flat copy-loop index i into this dimension's index.
indices.push(format!(
"[((i) / ({})) % dc{}]",
running_div_factor,
extents[i].idx()
));
running_div_factor = format!(
"{} * {}",
running_div_factor,
format!("dc{}", extents[i].idx())
);
}
let indices_str = indices.join("");
// TODO: condition by primitive vs collection, if latter then recurse
// with outermost = false
write!(w, "\t\t{}{} = {}{};\n", declare_variable, indices_str, index_ptr_name, indices_str)?;
// Parallelizing only affects loop bounds
let begin_copy = if parallelize {
format!(
"{}for (int i = threadIdx.x; i < {}; i += {}) {{\n",
tabs, rem_array_size, self.kernel_attrs.num_threads
)
} else {
format!("{}for (int i = 0; i < {}; i++) {{\n", tabs, rem_array_size)
};
write!(w, "{}", begin_copy)?;
self.codegen_copy_to_from_global(
is_read,
*element_type_id,
&format!("{}{}", data_variable, indices_str),
&format!("{}{}", index_ptr_name, indices_str),
None,
false,
num_tabs + 1,
w,
)?;
let end_copy = if parallelize {
format!("{}}}\n{}__syncthreads();\n", tabs, tabs)
} else {
format!("{}}}\n", tabs)
};
write!(w, "{}", end_copy)?;
}
Type::Product(fields) => {
for (field_idx, field) in fields.iter().enumerate() {
self.codegen_copy_to_from_global(
is_read,
*field,
&format!("{}.field_{}", data_variable, field_idx),
&format!("{}.field_{}", index_ptr_name, field_idx),
None,
false,
num_tabs + 1,
w,
)?;
}
}
Type::Summation(fields) => {
// First copy the tag
write!(w, "{}{}.tag = {}.tag;\n", tabs, lhs, rhs)?;
// Then copy the active field based on the tag
write!(w, "{}switch({}.tag) {{\n", tabs, rhs)?;
for (variant_idx, field) in fields.iter().enumerate() {
write!(w, "{}\tcase {}: {{\n", tabs, variant_idx)?;
// Recursively copy the field's contents
self.codegen_copy_to_from_global(
is_read,
*field,
&format!("{}.field_{}", data_variable, variant_idx),
&format!("{}.field_{}", index_ptr_name, variant_idx),
None,
false,
num_tabs + 2,
w
)?;
write!(w, "{}\t\tbreak;\n", tabs)?;
write!(w, "{}\t}}\n", tabs)?;
}
write!(w, "{}}}\n", tabs)?;
}
// Primitive types
_ => {
write!(w, "{}{} = {};\n", tabs, lhs, rhs)?;
}
// TODO: handle product and summation collections
}
Ok(())
}
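For a 1-D float array parameter read, the generated copy comes out along these lines (a sketch: names, the thread count of 128, and the simplified index arithmetic are assumptions):

for (int i = threadIdx.x; i < dc0; i += 128) {
    a2[i] = p1[i];
}
__syncthreads();

When parallelize is false (the recursive calls for nested collections), the same loop is emitted as for (int i = 0; i < dc0; i++) and the trailing __syncthreads() is dropped.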
fn codegen_indices(&self, collect: NodeID, indices: &[Index]) -> String {
// Use normal indexing for local collections
fn codegen_indices_local(&self, collect: NodeID, indices: &[Index]) -> String {
let mut index_ptr_name = format!("{}", self.get_value(collect, false, false));
for index in indices {
match index {
@@ -905,15 +1026,54 @@ impl GPUContext<'_> {
index_ptr_name.push_str(&format!(".field_{}", variant));
}
Index::Position(indices) => {
index_ptr_name.push_str(&indices
index_ptr_name.push_str(
&indices
.iter()
.map(|index| format!("[{}]", self.get_value(*index, false, false)))
.collect::<Vec<_>>()
.join(""),
);
}
}
}
index_ptr_name
}
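For example, reading field 2 of a local product and then indexing into a nested array produces a reference string like (names assumed):

v3.field_2[v10][v11]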
// Use arithmetic for global collections as they're accessed as pointers
fn codegen_indices_global(&self, collect: NodeID, indices: &[Index]) -> String {
let mut index_ptr_name = format!("{}[0", self.get_value(collect, false, false));
let type_id = self.typing[collect.idx()];
for index in indices {
match index {
Index::Field(field) => {
let offset = (0..*field)
.map(|i| format!("offsetof({}, field_{})", self.get_type(type_id, false), i))
.collect::<Vec<_>>()
.join(" + ");
index_ptr_name.push_str(&format!(" + {}", offset));
}
// Variants of summations have zero offset
Index::Variant(_) => {}
Index::Position(array_indices) => {
let Type::Array(_, extents) = &self.types[self.typing[collect.idx()].idx()] else {
panic!("Expected array type")
};
let mut cumulative_offset = "1 * ".to_string()
+ extents
.iter()
.enumerate()
.filter(|(i, _)| *i >= array_indices.len())
.map(|(_, id)| format!("dc{}", id.idx()))
.collect::<Vec<_>>()
.join(" * ")
.as_str();
for index in array_indices.iter().rev() {
cumulative_offset = format!("{} * ({} + ", cumulative_offset, self.get_value(*index, false, false));
}
index_ptr_name.push_str(&format!(" + {}{}", cumulative_offset, ")".repeat(array_indices.len())));
}
}
}
index_ptr_name
format!("{}]", index_ptr_name)
}
fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String {
@@ -1050,7 +1210,17 @@ impl GPUContext<'_> {
panic!("Parameters shouldn't be re-initialized")
}
format!("p{}", index)
} else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] {
} else if let Node::Write { collect, data: _, indices: _ } = &self.function.nodes[id.idx()] {
if ty {
panic!("Writes shouldn't be initialized, they're replaced with the referenced collection")
}
if make_pointer {
panic!("Writes shouldn't be called as pointer")
}
self.get_value(*collect, false, false)
} else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()]
{
// Shmem/register arrays have special formatting
let mut declare_array = format!(
"{} (*{}{})",
self.get_type(*element_type, false),
@@ -1077,7 +1247,7 @@ impl GPUContext<'_> {
}
}
// make_pointer enforces static pointer and not recursive or array pointer:
// multi-d arrays are single pointers with custom indexing.
fn get_type(&self, id: TypeID, make_pointer: bool) -> String {
match &self.types[id.idx()] {