From 1c92d5b86684fc302a591657731df75ff9e3b8a2 Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com>
Date: Fri, 10 Jan 2025 18:53:09 -0600
Subject: [PATCH] gpu: track dynamic shared memory size on the host, add debug
 launch harness

Track the running dynamic shared memory offset in a host-side Rust
string so the total size is known when the launch code is emitted, and
pass it as the third kernel launch parameter. Add a run_debug mode to
codegen_launch_code that emits a standalone main() which allocates
dummy arguments, launches the kernel, and checks for errors, plus an
optional per-goto printf in codegen_gotos. Also remove leftover debug
println!s, stub out get_extra_dim_collects until the extra-dim logic is
settled, start the thread-quota fold at 1 instead of 0, and fix the
emitted row-major offset computation for array indexing. Sketches of
the emitted code follow the diff.

---
 hercules_cg/src/gpu.rs | 212 +++++++++++++++++++++++++++++------------
 1 file changed, 149 insertions(+), 63 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index a153b7ef..499ecce8 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -296,6 +296,7 @@ impl GPUContext<'_> {
         // Emit all code up to the "goto" to Start's block
         let mut top = String::new();
         self.codegen_kernel_begin(&mut top)?;
+        let mut dynamic_shared_offset = "0".to_string();
         self.codegen_dynamic_constants(&mut top)?;
         self.codegen_declare_data(&mut top)?;
         self.codegen_helpers(&mut top)?;
@@ -306,10 +307,11 @@ impl GPUContext<'_> {
 
         let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map);
         let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks);
-        println!("num_blocks: {}", num_blocks);
         let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks);
         let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork);
-        let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map);
+        // TODO: Uncomment and adjust once the extra-dim logic is settled.
+        // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map);
+        let extra_dim_collects = HashSet::new();
 
         // Setup for CUDA's "goto" for control flow between basic blocks.
         let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len())
@@ -333,19 +335,20 @@
             &fork_control_map,
             &fork_thread_quota_map,
             &extra_dim_collects,
+            &mut dynamic_shared_offset,
             num_threads,
             &mut gotos,
         )?;
 
         // Emit all GPU kernel code from previous steps
         let mut kernel_body = String::new();
-        self.codegen_gotos(&mut gotos, &mut kernel_body)?;
+        self.codegen_gotos(false, &mut gotos, &mut kernel_body)?;
         write!(w, "{}", kernel_body)?;
         write!(w, "}}\n")?;
 
         // Emit host launch code
         let mut host_launch = String::new();
-        self.codegen_launch_code(num_blocks, num_threads, &mut host_launch)?;
+        self.codegen_launch_code(true, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?;
         write!(w, "{}", host_launch)?;
 
         Ok(())
@@ -420,8 +423,9 @@ namespace cg = cooperative_groups;
         write!(w, ") {{\n")?;
         write!(w, "\textern __shared__ char dynamic_shared[];\n")?;
         // This will only get used by thread rank 0 in each block, since it
-        // does all shared memory "allocation"
-        write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?;
+        // does all shared memory "allocation". The actual offset state is
+        // tracked in a Rust-side string; this variable exists for readability.
+        write!(w, "\tuint64_t dynamic_shared_offset;\n")?;
 
         Ok(())
     }
 
@@ -489,11 +493,14 @@ namespace cg = cooperative_groups;
         Ok(())
     }
 
-    fn codegen_gotos(&self, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> {
+    fn codegen_gotos(&self, goto_debug: bool, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> {
         write!(w, "\n")?;
         for (id, goto) in gotos.iter() {
             let goto_block = self.get_block_name(*id, false);
             write!(w, "{}:\n", goto_block)?;
+            if goto_debug {
+                write!(w, "\tprintf(\"goto {}\\n\");\n", goto_block)?;
+            }
             write!(w, "{}", goto.init)?;
             if !goto.post_init.is_empty() {
                 let goto_block = self.get_block_name(*id, true);
@@ -506,47 +513,119 @@ namespace cg = cooperative_groups;
         Ok(())
     }
 
-    fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, w: &mut String) -> Result<(), Error> {
+    fn codegen_launch_code(&self, run_debug: bool, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> {
         write!(w, "
-int main(")?;
+int main() {{
+")?;
         // The following steps are for host-side C function arguments, but we also
         // need to pass arguments to kernel, so we keep track of the arguments here.
         let mut pass_args = String::new();
-        // The first set of parameters are dynamic constants.
-        let mut first_param = true;
-        for idx in 0..self.function.num_dynamic_constants {
-            if first_param {
-                first_param = false;
-            } else {
-                write!(w, ", ")?;
+        if run_debug {
+            // The first set of parameters are dynamic constants.
+            let mut first_param = true;
+            for idx in 0..self.function.num_dynamic_constants {
+                if first_param {
+                    first_param = false;
+                } else {
+                    write!(pass_args, ", ")?;
+                }
+                write!(w, "\tunsigned long long dc_p{} = 1ull;\n", idx)?;
+                write!(pass_args, "dc_p{}", idx)?;
+            }
+            self.codegen_dynamic_constants(w)?;
+            // The second set of parameters are normal arguments.
+            for (idx, ty) in self.function.param_types.iter().enumerate() {
+                if first_param {
+                    first_param = false;
+                } else {
+                    write!(pass_args, ", ")?;
+                }
+                let param_type = self.get_type(*ty, false);
+                if self.types[ty.idx()].is_primitive() {
+                    write!(w, "\t{} p{} = 1;\n", param_type, idx)?;
+                } else {
+                    let param_size = self.get_size(*ty, None, None);
+                    write!(w, "\t{} p{};\n", param_type, idx)?;
+                    write!(w, "\tif (cudaMalloc(&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?;
+                    write!(w, "\t\tprintf(\"Error allocating memory for parameter %d\\n\", {});\n", idx)?;
+                    write!(w, "\t\treturn -1;\n")?;
+                    write!(w, "\t}}\n")?;
+                }
+                write!(pass_args, "p{}", idx)?;
+            }
+            // Pull primitive return to a pointer parameter
+            if self.types[self.return_type_id.idx()].is_primitive() {
                 write!(pass_args, ", ")?;
+                let ret_type_no_pnt = self.get_type(*self.return_type_id, false);
+                let ret_type = self.get_type(*self.return_type_id, true);
+                write!(w, "\t{} ret;\n", ret_type)?;
+                write!(w, "\tif (cudaMalloc(&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?;
+                write!(w, "\t\tprintf(\"Error allocating memory for return value\\n\");\n")?;
+                write!(w, "\t\treturn -1;\n")?;
+                write!(w, "\t}}\n")?;
+                write!(pass_args, "ret")?;
             }
-            write!(w, "unsigned long long dc_p{}", idx)?;
-            write!(pass_args, "dc_p{}", idx)?;
-        }
-        // The second set of parameters are normal arguments.
-        for (idx, ty) in self.function.param_types.iter().enumerate() {
-            if first_param {
-                first_param = false;
-            } else {
+            write!(w, "\t{}<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?;
+            write!(w, "\tbool skip = false;\n")?;
+            write!(w, "\tcudaError_t err = cudaGetLastError();\n")?;
+            write!(w, "\tif (err != cudaSuccess) {{\n")?;
+            write!(w, "\t\tprintf(\"Error launching kernel: %s\\n\", cudaGetErrorString(err));\n")?;
+            write!(w, "\t\tskip = true;\n")?;
+            write!(w, "\t}}\n")?;
+            write!(w, "\tif (cudaDeviceSynchronize() != cudaSuccess && !skip) {{\n")?;
+            write!(w, "\t\tprintf(\"Error synchronizing device\\n\");\n")?;
+            write!(w, "\t\tskip = true;\n")?;
+            write!(w, "\t}}\n")?;
+            for (idx, ty) in self.function.param_types.iter().enumerate() {
+                if !self.types[ty.idx()].is_primitive() {
+                    write!(w, "\tcudaFree(p{});\n", idx)?;
+                }
+            }
+            if self.types[self.return_type_id.idx()].is_primitive() {
+                write!(w, "\tcudaFree(ret);\n")?;
+            }
+            write!(w, "\treturn 0;\n")?;
+            write!(w, "}}\n")?;
+        }
+
+        else {
+            // The first set of parameters are dynamic constants.
+            let mut first_param = true;
+            for idx in 0..self.function.num_dynamic_constants {
+                if first_param {
+                    first_param = false;
+                } else {
+                    write!(w, ", ")?;
+                    write!(pass_args, ", ")?;
+                }
+                write!(w, "unsigned long long dc_p{}", idx)?;
+                write!(pass_args, "dc_p{}", idx)?;
+            }
+            // The second set of parameters are normal arguments.
+            for (idx, ty) in self.function.param_types.iter().enumerate() {
+                if first_param {
+                    first_param = false;
+                } else {
+                    write!(w, ", ")?;
+                    write!(pass_args, ", ")?;
+                }
+                let param_type = self.get_type(*ty, false);
+                write!(w, "{} p{}", param_type, idx)?;
+                write!(pass_args, "p{}", idx)?;
+            }
+            // Pull primitive return to a pointer parameter
+            if self.types[self.return_type_id.idx()].is_primitive() {
                 write!(w, ", ")?;
                 write!(pass_args, ", ")?;
+                let ret_type = self.get_type(*self.return_type_id, true);
+                write!(w, "{} ret", ret_type)?;
+                write!(pass_args, "ret")?;
             }
-            let param_type = self.get_type(*ty, false);
-            write!(w, "{} p{}", param_type, idx)?;
-            write!(pass_args, "p{}", idx)?;
-        }
-        // Pull primitive return to a pointer parameter
-        if self.types[self.return_type_id.idx()].is_primitive() {
-            write!(w, ", ")?;
-            write!(pass_args, ", ")?;
-            let ret_type = self.get_type(*self.return_type_id, true);
-            write!(w, "{} ret", ret_type)?;
-            write!(pass_args, "ret")?;
+            write!(w, ") {{
+    {}<<<{}, {}, {}>>>({});
+}}", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?;
         }
-        write!(w, ") {{
-    {}<<<{}, {}>>>({});
-}}", self.function.name, num_blocks, num_threads, pass_args);
+
        Ok(())
    }

@@ -575,12 +654,10 @@ int main(")?;
             // Then get its nesting fork - index = 1 to not count itself!
let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0)); fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(control); - println!("fork_tree parent: {}, child: {}", nesting_fork.idx(), control.idx()); } // Here the desired fork is always the first fork let fork = forks.first().copied().unwrap_or(NodeID::new(0)); fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(control); - println!("fork_control_map parent: {}, child: {}", fork.idx(), control.idx()); (fork_tree, fork_control_map) }, ) @@ -676,7 +753,7 @@ int main(")?; .iter() .map(|child| (child, self.recurse_thread_quotas(*child, fork_tree, false))) .fold( - (HashMap::new(), HashMap::new(), 0), + (HashMap::new(), HashMap::new(), 1), |(mut subsubtree_map, mut children_quota_map, subtree_quota), (child, (curr_map, curr_quota, use_curr))| { subsubtree_map.extend(curr_map); if use_curr { @@ -771,8 +848,6 @@ int main(")?; (collect_const, users.iter().map(|user| control_fork_map[&self.bbs.0[user.idx()]]).collect()) }) .collect(); - // For now assert that each collection is used by a single fork and get - // parallel status, TODO: revisit collect_fork_users.iter() .filter(|(_, fork_users)| !fork_thread_quota_map.contains_key(fork_users.iter().next().unwrap())) .map(|(collect_const, _)| self.typing[collect_const.idx()]) @@ -790,6 +865,7 @@ int main(")?; fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -797,7 +873,6 @@ int main(")?; // Recall that this was tracked through a fake fork node with NodeID 0. let mut state = KernelState::OutBlock; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { - println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -805,14 +880,13 @@ int main(")?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists if block_fork.is_some() { state = KernelState::InBlock; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { - println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -820,7 +894,7 @@ int main(")?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } } @@ -836,6 +910,7 @@ int main(")?; 1, num_threads, extra_dim_collects, + dynamic_shared_offset, gotos, )?; } @@ -858,6 +933,7 @@ int main(")?; parent_quota: usize, num_threads: usize, extra_dim_collections: &HashSet<TypeID>, 
+ dynamic_shared_offset: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map @@ -879,7 +955,6 @@ int main(")?; HashSet::new() }; for control in fork_control_map.get(&curr_fork).unwrap() { - println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -895,6 +970,7 @@ int main(")?; Some(curr_fork), reducts.contains(data), extra_dim_collections, + dynamic_shared_offset, body, &mut tabs, )?; @@ -910,6 +986,7 @@ int main(")?; use_thread_quota, num_threads, extra_dim_collections, + dynamic_shared_offset, gotos, )?; } @@ -925,6 +1002,7 @@ int main(")?; nesting_fork: Option<NodeID>, is_special_reduct: bool, extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { @@ -1013,6 +1091,7 @@ int main(")?; *cons_id, true, Some(extra_dim_collects), + dynamic_shared_offset, w, *num_tabs, )?; @@ -1460,18 +1539,23 @@ int main(")?; panic!("Expected array type") }; let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); + let max_left_array_index = array_indices.len() - 1 - if has_extra_dim { 1 } else { 0 }; for (i, index) in array_indices.iter().skip(if has_extra_dim { 1 } else { 0 }).rev().enumerate() { cumulative_offset = format!( - "{} * ({} + {}", + "{} * ({}{}", cumulative_offset, self.get_value(*index, false, false), - format!("dc{}", extents[i].idx()) + if i != max_left_array_index { + format!(" + dc{}", extents[max_left_array_index - i].idx()) + } else { + "".to_string() + } ); } index_ptr.push_str(&format!( " + {}{}", cumulative_offset, - ")".repeat(array_indices.len()) + ")".repeat(array_indices.len() - if has_extra_dim { 1 } else { 0 }) )); if is_char { let element_size = self.get_size(*element_type, None, None); @@ -1501,6 +1585,7 @@ int main(")?; cons_id: ConstantID, allow_allocate: bool, extra_dim_collects: Option<&HashSet<TypeID>>, + dynamic_shared_offset: &mut String, w: &mut String, num_tabs: usize, ) -> Result<(), Error> { @@ -1523,9 +1608,10 @@ int main(")?; if allow_allocate { let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None, extra_dim_collects); - write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; - write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } let Type::Product(type_fields) = &self.types[type_id.idx()] else { panic!("Product constant should have product type") @@ -1541,11 +1627,12 @@ int main(")?; constant_fields[i], false, extra_dim_collects, + dynamic_shared_offset, w, num_tabs, ); } else if !field_constant.is_array() { - self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, w, num_tabs); + self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, dynamic_shared_offset, w, num_tabs); } } } @@ -1553,9 +1640,10 @@ int main(")?; if allow_allocate { let alignment = self.get_alignment(*type_id); let size = 
self.get_size(*type_id, None, extra_dim_collects); - write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; - write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } // No offset updating needed since all variants start at 0 let Type::Summation(variants) = &self.types[type_id.idx()] else { @@ -1570,11 +1658,12 @@ int main(")?; *field, false, extra_dim_collects, + dynamic_shared_offset, w, num_tabs, ); } else if !variant_constant.is_array() { - self.codegen_constant(name, *field, false, extra_dim_collects, w, num_tabs); + self.codegen_constant(name, *field, false, extra_dim_collects, dynamic_shared_offset, w, num_tabs); }; } Constant::Array(type_id) => { @@ -1587,13 +1676,10 @@ int main(")?; let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None, extra_dim_collects); let element_type = self.get_type(*element_type, true); - write!( - w, - "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {};\n", - tabs, alignment, alignment, alignment - )?; + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n", tabs, name, element_type)?; - write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } } Ok(()) -- GitLab
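
Notes on the emitted code (hand-written sketches, not compiler output):

1. Reassembled by hand from the write! calls above, the run_debug path
produces a harness along these lines for a hypothetical kernel my_func
taking one dynamic constant, one array parameter, and a primitive
return; grid, block, and shared-memory sizes are whatever codegen
computed:

    #include <cstdio>
    #include <cuda_runtime.h>

    // Hypothetical generated kernel: one dynamic constant, one array
    // parameter, and a primitive return pulled out to a pointer.
    __global__ void my_func(unsigned long long dc_p0, float *p0, int *ret) {
        if (threadIdx.x == 0 && blockIdx.x == 0) {
            *ret = (int)dc_p0;
        }
    }

    int main() {
        unsigned long long dc_p0 = 1ull; // dynamic constants default to 1
        float *p0;
        if (cudaMalloc(&p0, dc_p0 * sizeof(float)) != cudaSuccess) {
            printf("Error allocating memory for parameter %d\n", 0);
            return -1;
        }
        int *ret;
        if (cudaMalloc(&ret, sizeof(int)) != cudaSuccess) {
            printf("Error allocating memory for return value\n");
            return -1;
        }
        my_func<<<1, 32, 0>>>(dc_p0, p0, ret); // sizes filled in by codegen
        bool skip = false;
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("Error launching kernel: %s\n", cudaGetErrorString(err));
            skip = true;
        }
        if (cudaDeviceSynchronize() != cudaSuccess && !skip) {
            printf("Error synchronizing device\n");
            skip = true;
        }
        cudaFree(p0);
        cudaFree(ret);
        return 0;
    }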
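
2. On the kernel side, each collection is still carved out of the
single dynamic_shared buffer; the change is that the running offset
expression is now also accumulated in the host-side Rust string so its
final value can be passed as the third launch argument. A sketch with
hypothetical names (shared_alloc_sketch, arr, dc0):

    #include <cuda_runtime.h>
    #include <cstdint>

    __global__ void shared_alloc_sketch(unsigned long long dc0, float *out) {
        extern __shared__ char dynamic_shared[];
        uint64_t dynamic_shared_offset;

        // Emitted per allocation: round the running offset up to the type's
        // alignment, take a pointer at that offset, then advance by the size.
        // The Rust-side string tracks the same expression symbolically.
        dynamic_shared_offset = ((0 + 4 - 1) / 4) * 4; // align a float array
        float *arr = reinterpret_cast<float *>(dynamic_shared + dynamic_shared_offset);
        // host-side string becomes "((0 + 4 - 1) / 4) * 4 + dc0 * 4" here

        if (threadIdx.x == 0) {
            arr[dc0 - 1] = 1.0f;
            *out = arr[dc0 - 1];
        }
    }

    int main() {
        unsigned long long dc0 = 8ull;
        float *out;
        cudaMalloc(&out, sizeof(float));
        // Launch with the final value of the offset string as the third arg.
        shared_alloc_sketch<<<1, 32, ((0 + 4 - 1) / 4) * 4 + dc0 * 4>>>(dc0, out);
        cudaDeviceSynchronize();
        cudaFree(out);
        return 0;
    }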
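
3. Both allocation branches emit the same round-up before each
carve-out; ((offset + align - 1) / align) * align is the usual integer
trick for aligning offset up to the next multiple of align. A host-only
check with made-up values:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // 13 rounded up to a multiple of 8 is 16.
        uint64_t offset = 13, align = 8;
        uint64_t aligned = ((offset + align - 1) / align) * align;
        printf("%llu -> %llu\n", (unsigned long long)offset,
               (unsigned long long)aligned);
        return 0;
    }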
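
4. The array-indexing hunk fixes which extent is folded into each level
of the emitted offset: index i is now paired with
extents[max_left_array_index - i] rather than extents[i], and the
innermost index no longer gets a spurious extent added to it. For a
hypothetical 3-D access with extents dc_d0 x dc_d1 x dc_d2 and indices
i0, i1, i2, the loop now nests the expression as
1 * (i2 + dc_d2 * (i1 + dc_d1 * (i0))), the standard row-major offset
(the outermost extent dc_d0 never appears). A host-only check:

    #include <cstdio>

    int main() {
        unsigned long long dc_d1 = 4, dc_d2 = 5;
        unsigned long long i0 = 2, i1 = 3, i2 = 1;
        // Shape of the expression the fixed loop emits for a 3-D access:
        unsigned long long emitted = 1 * (i2 + dc_d2 * (i1 + dc_d1 * (i0)));
        // Reference row-major offset:
        unsigned long long expected = (i0 * dc_d1 + i1) * dc_d2 + i2;
        printf("emitted=%llu expected=%llu\n", emitted, expected); // 56, 56
        return 0;
    }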