From cd747c23d0972f6004fd4f321ecd502fd9906929 Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Wed, 22 Jan 2025 09:13:44 -0600
Subject: [PATCH] gpu: use collection origins to return parameters directly

---
 hercules_cg/src/fork_tree.rs |  0
 hercules_cg/src/gpu.rs       | 98 +++++++++++++++++++-----------------
 2 files changed, 52 insertions(+), 46 deletions(-)
 create mode 100644 hercules_cg/src/fork_tree.rs

diff --git a/hercules_cg/src/fork_tree.rs b/hercules_cg/src/fork_tree.rs
new file mode 100644
index 00000000..e69de29b
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index fb9526bf..df95f63f 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -28,9 +28,8 @@ pub fn gpu_codegen<W: Write>(
     /*
      * We assert the following:
      * - Fork node must have >= 1 Reduce nodes
-     * - If the returned data type is a collection, it must have
-     *   originated from a single known parameter. Can relax to allow
-     *   one of multiple parameters.
+     * - (Asserted later in code) If the returned data type is a collection,
+     *   it must have originated from one or more parameters.
      *
      * We don't assert but assume the following:
      * - max_num_blocks in KernelParams is within constraint of 1D grid size. This
@@ -165,6 +164,7 @@ pub fn gpu_codegen<W: Write>(
         typing,
         control_subgraph,
         bbs,
+        collection_objects,
         kernel_params,
         def_use_map,
         fork_join_map,
@@ -190,6 +190,7 @@ struct GPUContext<'a> {
     typing: &'a Vec<TypeID>,
     control_subgraph: &'a Subgraph,
     bbs: &'a BasicBlocks,
+    collection_objects: &'a FunctionCollectionObjects,
     kernel_params: &'a GPUKernelParams,
     def_use_map: &'a ImmutableDefUseMap,
     fork_join_map: &'a HashMap<NodeID, NodeID>,
@@ -247,7 +248,13 @@ impl GPUContext<'_> {
     fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> {
         // Emit all code up to the "goto" to Start's block
         let mut top = String::new();
-        self.codegen_kernel_begin(&mut top)?;
+        let return_parameter = if self.collection_objects.returned_objects().len() == 1 {
+            Some(self.collection_objects.origin(*self.collection_objects.returned_objects()
+                .first().unwrap()).try_parameter().unwrap())
+        } else {
+            None
+        };
+        self.codegen_kernel_begin(return_parameter.is_none(), &mut top)?;
         let mut dynamic_shared_offset = "0".to_string();
         self.codegen_dynamic_constants(&mut top)?;
         self.codegen_declare_data(&mut top)?;
@@ -271,7 +278,7 @@ impl GPUContext<'_> {
             (1, 1)
         } else {
             // Create structures and determine block and thread parallelization strategy
-            let (fork_tree, fork_control_map) = self.make_fork_structures(&self.fork_join_map);
+            let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map);
             let (root_forks, num_blocks) =
                 self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks);
             let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks);
@@ -308,14 +315,14 @@ impl GPUContext<'_> {
 
         // Emit host launch code
         let mut host_launch = String::new();
-        self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?;
+        self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, return_parameter, &mut host_launch)?;
         write!(w, "{}", host_launch)?;
 
         Ok(())
     }
 
     // Emit kernel headers, signature, arguments, and dynamic shared memory declaration
-    fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> {
+    fn codegen_kernel_begin(&self, has_ret_var: bool, w: &mut String) -> Result<(), Error> {
         write!(w, "
 #include <assert.h>
 #include <stdio.h>
@@ -367,13 +374,15 @@ namespace cg = cooperative_groups;
             };
             write!(w, "{} 
p{}", param_type, idx)?; } - if !first_param { - write!(w, ", ")?; + if has_ret_var { + if !first_param { + write!(w, ", ")?; + } + write!( + w, + "void* __restrict__ ret", + )?; } - write!( - w, - "void* __restrict__ ret", - )?; // Type is char since it's simplest to use single bytes for indexing // and it's required for heterogeneous Product and Summation types. @@ -480,7 +489,7 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { + fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, return_parameter: Option<usize>, w: &mut String) -> Result<(), Error> { // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let ret_type = self.get_type(self.function.return_type, false); @@ -512,20 +521,27 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(pass_args, "p{}", idx)?; } write!(w, ") {{\n")?; - // Allocate return parameter and lift to kernel argument - let ret_type_pnt = self.get_type(self.function.return_type, true); - write!(w, "\t{} ret;\n", ret_type_pnt)?; - if !first_param { - write!(pass_args, ", ")?; + let has_ret_var = return_parameter.is_none(); + if has_ret_var { + // Allocate return parameter and lift to kernel argument + let ret_type_pnt = self.get_type(self.function.return_type, true); + write!(w, "\t{} ret;\n", ret_type_pnt)?; + if !first_param { + write!(pass_args, ", ")?; + } + write!(pass_args, "ret")?; + write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; } - write!(pass_args, "ret")?; - write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; write!(w, "\tcudaDeviceSynchronize();\n")?; - // Copy return from device to host, whether it's primitive value or collection pointer - write!(w, "\t{} host_ret;\n", ret_type)?; - write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; - write!(w, "\treturn host_ret;\n")?; + if has_ret_var { + // Copy return from device to host, whether it's primitive value or collection pointer + write!(w, "\t{} host_ret;\n", ret_type)?; + write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; + write!(w, "\treturn host_ret;\n")?; + } else { + write!(w, "\treturn p{};\n", return_parameter.unwrap())?; + } write!(w, "}}\n")?; Ok(()) } @@ -585,8 +601,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; panic!("Expected fork node"); }; let fork_size = self.multiply_fork_factors(factors); + let reduces = &self.fork_reduce_map[root_fork]; + assert!(reduces.iter().all(|reduce| { + self.collection_objects.objects(*reduce).iter().all(|object| { + self.collection_objects.origin(*object).try_parameter().is_some() + }) + }), "All collection reduces in block fork must originate from parameters"); if let Some(fork_size) = fork_size && fork_size <= max_num_blocks + && fork_size.is_power_of_two() && self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { (root_forks, fork_size) @@ -1206,8 +1229,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; )?; } } - // If we read collection, distribute elements among threads with cg - // sync after. If we read primitive, copy read on all threads. 
+            // A primitive read requires a load after the pointer math; a collection read just binds the computed pointer.
             Node::Read { collect, indices } => {
                 let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()]));
                 let data_type_id = self.typing[id.idx()];
@@ -1215,27 +1237,11 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                     let type_name = self.get_type(data_type_id, true);
                     write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?;
                 } else {
-                    if KernelState::OutBlock == state && num_blocks.unwrap() > 1 {
-                        panic!("GPU can't guarantee correctness for multi-block collection reads");
-                    }
-                    let cg_tile = match state {
-                        KernelState::OutBlock | KernelState::InBlock => "block".to_string(),
-                        KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId),
-                    };
-                    // Divide up "elements", which are collection size divided
-                    // by element size, among threads.
-                    let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects));
-                    write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
-                    write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, define_variable, collect_with_indices)?;
-                    write!(w, "{}}}\n", tabs)?;
-                    write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
-                    write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, define_variable, cg_tile, data_size, cg_tile, cg_tile, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile)?;
-                    write!(w, "{}}}\n", tabs)?;
-                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    write!(w, "{}{} = {};\n", tabs, define_variable, collect_with_indices)?;
                 }
             }
-            // Write is same as read, but when writing a primitive, we need to gate with
-            // a thread rank check.
+            // A primitive write needs a thread-rank gate for safety; a collection
+            // write is a memcpy that we distribute among threads.
             Node::Write {
                 collect,
                 data,
-- 
GitLab
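
Note: for concreteness, the host wrapper emitted by the modified
codegen_launch_code looks roughly like the sketch below. The function and
kernel names, the float/float* types, and the <<<1, 1, 0>>> launch
configuration are illustrative assumptions, not values taken from this
patch.

    #include <cuda_runtime.h>

    // Hypothetical stand-ins for the generated *_gpu kernels.
    __global__ void scalar_gpu(float* p0, void* __restrict__ ret) {
        *reinterpret_cast<float*>(ret) = p0[0];
    }
    __global__ void passthrough_gpu(float* p0) {
        p0[0] = 0.0f;
    }

    // Case 1: return_parameter is None (has_ret_var == true). The wrapper
    // allocates a device-side ret slot, passes it as the trailing kernel
    // argument, and copies the value back to the host after synchronization.
    extern "C" float scalar(float* p0) {
        float* ret;
        cudaMalloc((void**)&ret, sizeof(float));
        scalar_gpu<<<1, 1, 0>>>(p0, ret);
        cudaDeviceSynchronize();
        float host_ret;
        cudaMemcpy(&host_ret, ret, sizeof(float), cudaMemcpyDeviceToHost);
        return host_ret;
    }

    // Case 2: the returned collection originates from parameter 0
    // (return_parameter == Some(0)). No ret slot, no cudaMalloc, and no
    // cudaMemcpy; the wrapper returns the parameter pointer directly.
    extern "C" float* passthrough(float* p0) {
        passthrough_gpu<<<1, 1, 0>>>(p0);
        cudaDeviceSynchronize();
        return p0;
    }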
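
Note: on the device side, the patched Read arm emits a plain load for a
primitive and now just binds the computed pointer for a collection, in place
of the old cooperative-groups distributed copy and sync. A minimal sketch,
with hypothetical variable names and fixed offsets standing in for the
pointer math produced by codegen_collect:

    __global__ void read_sketch(char* __restrict__ p0) {
        // Primitive read: dereference the computed address through a cast.
        float v1 = *reinterpret_cast<float*>(p0 + 16);
        // Collection read (new behavior): alias the computed address; no
        // per-element copy loop and no cg sync are emitted anymore.
        char* v2 = p0 + 32;
        (void)v1;
        (void)v2;
    }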