From cd747c23d0972f6004fd4f321ecd502fd9906929 Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Wed, 22 Jan 2025 09:13:44 -0600
Subject: [PATCH] gpu: use collection origins to return parameter-backed collections directly

---
 hercules_cg/src/fork_tree.rs |  0
 hercules_cg/src/gpu.rs       | 98 +++++++++++++++++++-----------------
 2 files changed, 52 insertions(+), 46 deletions(-)
 create mode 100644 hercules_cg/src/fork_tree.rs

diff --git a/hercules_cg/src/fork_tree.rs b/hercules_cg/src/fork_tree.rs
new file mode 100644
index 00000000..e69de29b
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index fb9526bf..df95f63f 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -28,9 +28,8 @@ pub fn gpu_codegen<W: Write>(
     /*
      * We assert the following:
      * - Fork node must have >= 1 Reduce nodes
-     * - If the returned data type is a collection, it must have
-     *   originated from a single known parameter. Can relax to allow
-     *   one of multiple parameters.
+     * - (Checked later in code) If the returned data type is a collection, it
+     *   must have originated from one or more parameters.
      *
      * We don't assert but assume the following:
      * - max_num_blocks in KernelParams is within constraint of 1D grid size. This
@@ -165,6 +164,7 @@ pub fn gpu_codegen<W: Write>(
         typing,
         control_subgraph,
         bbs,
+        collection_objects,
         kernel_params,
         def_use_map,
         fork_join_map,
@@ -190,6 +190,7 @@ struct GPUContext<'a> {
     typing: &'a Vec<TypeID>,
     control_subgraph: &'a Subgraph,
     bbs: &'a BasicBlocks,
+    collection_objects: &'a FunctionCollectionObjects,
     kernel_params: &'a GPUKernelParams,
     def_use_map: &'a ImmutableDefUseMap,
     fork_join_map: &'a HashMap<NodeID, NodeID>,
@@ -247,7 +248,13 @@ impl GPUContext<'_> {
     fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> {
         // Emit all code up to the "goto" to Start's block
         let mut top = String::new();
-        self.codegen_kernel_begin(&mut top)?;
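+        // If the function returns exactly one collection object, record the parameter
+        // it originates from; the host wrapper can then return that parameter directly
+        // instead of routing the result through a separate kernel return argument.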
+        let return_parameter = if self.collection_objects.returned_objects().len() == 1 {
+            Some(self.collection_objects.origin(*self.collection_objects.returned_objects()
+                .first().unwrap()).try_parameter().unwrap())
+        } else {
+            None
+        };
+        self.codegen_kernel_begin(return_parameter.is_none(), &mut top)?;
         let mut dynamic_shared_offset = "0".to_string();
         self.codegen_dynamic_constants(&mut top)?;
         self.codegen_declare_data(&mut top)?;
@@ -271,7 +278,7 @@ impl GPUContext<'_> {
             (1, 1)
         } else {
             // Create structures and determine block and thread parallelization strategy
-            let (fork_tree, fork_control_map) = self.make_fork_structures(&self.fork_join_map);
+            let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map);
             let (root_forks, num_blocks) =
                 self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks);
             let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks);
@@ -308,14 +315,14 @@ impl GPUContext<'_> {
 
         // Emit host launch code
         let mut host_launch = String::new();
-        self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?;
+        self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, return_parameter, &mut host_launch)?;
         write!(w, "{}", host_launch)?;
 
         Ok(())
     }
 
     // Emit kernel headers, signature, arguments, and dynamic shared memory declaration
-    fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> {
+    fn codegen_kernel_begin(&self, has_ret_var: bool, w: &mut String) -> Result<(), Error> {
         write!(w, "
 #include <assert.h>
 #include <stdio.h>
@@ -367,13 +374,15 @@ namespace cg = cooperative_groups;
             };
             write!(w, "{} p{}", param_type, idx)?;
         }
-        if !first_param {
-            write!(w, ", ")?;
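+        // Only emit the trailing "ret" argument when a separate return slot is
+        // needed, i.e. when the return value does not alias a known parameter.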
+        if has_ret_var {
+            if !first_param {
+                write!(w, ", ")?;
+            }
+            write!(w, "void* __restrict__ ret")?;
         }
-        write!(
-            w,
-            "void* __restrict__ ret",
-        )?;
 
         // Type is char since it's simplest to use single bytes for indexing
         // and it's required for heterogeneous Product and Summation types.
@@ -480,7 +489,7 @@ namespace cg = cooperative_groups;
         Ok(())
     }
 
-    fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> {
+    fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, return_parameter: Option<usize>, w: &mut String) -> Result<(), Error> {
         // The following steps are for host-side C function arguments, but we also
         // need to pass arguments to kernel, so we keep track of the arguments here.
         let ret_type = self.get_type(self.function.return_type, false);
@@ -512,20 +521,27 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             write!(pass_args, "p{}", idx)?;
         }
         write!(w, ") {{\n")?;
-        // Allocate return parameter and lift to kernel argument
-        let ret_type_pnt = self.get_type(self.function.return_type, true);
-        write!(w, "\t{} ret;\n", ret_type_pnt)?;
-        if !first_param {
-            write!(pass_args, ", ")?;
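+        // When the returned collection aliases a parameter, skip the device-side
+        // return slot entirely: no cudaMalloc, no extra kernel argument, and no
+        // copy back to the host; the wrapper just returns that parameter.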
+        let has_ret_var = return_parameter.is_none();
+        if has_ret_var {
+            // Allocate return parameter and lift to kernel argument
+            let ret_type_pnt = self.get_type(self.function.return_type, true);
+            write!(w, "\t{} ret;\n", ret_type_pnt)?;
+            if !first_param {
+                write!(pass_args, ", ")?;
+            }
+            write!(pass_args, "ret")?;
+            write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
         }
-        write!(pass_args, "ret")?;
-        write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
         write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?;
         write!(w, "\tcudaDeviceSynchronize();\n")?;
-        // Copy return from device to host, whether it's primitive value or collection pointer
-        write!(w, "\t{} host_ret;\n", ret_type)?;
-        write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?;
-        write!(w, "\treturn host_ret;\n")?;
+        if has_ret_var {
+            // Copy return from device to host, whether it's primitive value or collection pointer
+            write!(w, "\t{} host_ret;\n", ret_type)?;
+            write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?;
+            write!(w, "\treturn host_ret;\n")?;
+        } else {
+            write!(w, "\treturn p{};\n", return_parameter.unwrap())?;
+        }
         write!(w, "}}\n")?;
         Ok(())
     }
@@ -585,8 +601,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             panic!("Expected fork node");
         };
         let fork_size = self.multiply_fork_factors(factors);
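+        // Block-level reduces may only produce collections that originate from
+        // function parameters; enforce that before picking the block count.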
+        let reduces = &self.fork_reduce_map[root_fork];
+        assert!(reduces.iter().all(|reduce| {
+            self.collection_objects.objects(*reduce).iter().all(|object| {
+                self.collection_objects.origin(*object).try_parameter().is_some()
+            })
+        }), "All collection reduces in block fork must originate from parameters");
         if let Some(fork_size) = fork_size
             && fork_size <= max_num_blocks
+            && fork_size.is_power_of_two()
             && self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork)
         {
             (root_forks, fork_size)
@@ -1206,8 +1229,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                     )?;
                 }
             }
-            // If we read collection, distribute elements among threads with cg
-            // sync after. If we read primitive, copy read on all threads.
+            // A primitive read requires a load after the pointer math; a collection
+            // read just takes the computed pointer.
             Node::Read { collect, indices } => {
                 let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()]));
                 let data_type_id = self.typing[id.idx()];
@@ -1215,27 +1237,11 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                     let type_name = self.get_type(data_type_id, true);
                     write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?;
                 } else {
-                    if KernelState::OutBlock == state && num_blocks.unwrap() > 1 {
-                        panic!("GPU can't guarantee correctness for multi-block collection reads");
-                    }
-                    let cg_tile = match state {
-                        KernelState::OutBlock | KernelState::InBlock => "block".to_string(),
-                        KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId),
-                    };
-                    // Divide up "elements", which are collection size divided
-                    // by element size, among threads.
-                    let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects));
-                    write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
-                    write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, define_variable, collect_with_indices)?;
-                    write!(w, "{}}}\n", tabs)?;
-                    write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
-                    write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, define_variable, cg_tile, data_size, cg_tile, cg_tile, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile)?;
-                    write!(w, "{}}}\n", tabs)?;
-                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    write!(w, "{}{} = {};\n", tabs, define_variable, collect_with_indices)?;
                 }
             }
-            // Write is same as read, but when writing a primitive, we need to gate with
-            // a thread rank check.
+            // A primitive write needs a thread-rank gate for safety; a collection
+            // write is a memcpy that we distribute among the threads.
             Node::Write {
                 collect,
                 data,
-- 
GitLab