From 2fed8e6b88b726494ed568bc2b6c85c7360991a5 Mon Sep 17 00:00:00 2001 From: Russel Arbore <prathi3@illinois.edu> Date: Fri, 31 Jan 2025 16:03:35 -0600 Subject: [PATCH] gpu non-tile works --- hercules_cg/src/gpu.rs | 84 +++++++++++++------ hercules_samples/matmul/src/gpu.sch | 1 + juno_samples/cava/src/gpu.sch | 2 - juno_samples/concat/src/main.rs | 37 +++++++- juno_samples/implicit_clone/src/main.rs | 8 -- juno_samples/matmul/src/cpu.sch | 20 ----- juno_samples/matmul/src/gpu.sch | 1 - juno_samples/nested_ccp/build.rs | 25 ++++-- juno_samples/nested_ccp/src/gpu.sch | 2 - juno_samples/nested_ccp/src/main.rs | 8 +- juno_samples/patterns/Cargo.toml | 3 + juno_samples/patterns/build.rs | 23 +++-- .../src/cpu.sch => patterns/src/gpu.sch} | 5 +- juno_samples/simple3/build.rs | 25 ++++-- juno_samples/simple3/src/gpu.sch | 2 - 15 files changed, 155 insertions(+), 91 deletions(-) delete mode 100644 juno_samples/matmul/src/cpu.sch rename juno_samples/{nested_ccp/src/cpu.sch => patterns/src/gpu.sch} (73%) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 341e143e..6c62ed76 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -23,6 +23,7 @@ pub fn gpu_codegen<W: Write>( typing: &Vec<TypeID>, control_subgraph: &Subgraph, bbs: &BasicBlocks, + backing_allocation: &FunctionBackingAllocation, collection_objects: &FunctionCollectionObjects, def_use_map: &ImmutableDefUseMap, fork_join_map: &HashMap<NodeID, NodeID>, @@ -168,6 +169,7 @@ pub fn gpu_codegen<W: Write>( typing, control_subgraph, bbs, + backing_allocation, collection_objects, def_use_map, fork_join_map, @@ -196,6 +198,7 @@ struct GPUContext<'a> { typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, bbs: &'a BasicBlocks, + backing_allocation: &'a FunctionBackingAllocation, collection_objects: &'a FunctionCollectionObjects, def_use_map: &'a ImmutableDefUseMap, fork_join_map: &'a HashMap<NodeID, NodeID>, @@ -352,8 +355,14 @@ namespace cg = cooperative_groups; "__global__ void __launch_bounds__({}) {}_gpu(", self.kernel_params.max_num_threads, self.function.name )?; - // The first set of parameters are dynamic constants. let mut first_param = true; + // The first parameter is a pointer to GPU backing memory, if it's + // needed. + if self.backing_allocation.contains_key(&Device::CUDA) { + first_param = false; + write!(w, "char* backing")?; + } + // The second set of parameters are dynamic constants. for idx in 0..self.function.num_dynamic_constants { if first_param { first_param = false; @@ -362,7 +371,7 @@ namespace cg = cooperative_groups; } write!(w, "unsigned long long dc_p{}", idx)?; } - // The second set of parameters are normal arguments. + // The third set of parameters are normal arguments. for (idx, ty) in self.function.param_types.iter().enumerate() { if first_param { first_param = false; @@ -403,38 +412,46 @@ namespace cg = cooperative_groups; * Emit calculation of all dynamic constants */ fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> { - for dc in dynamic_constants_bottom_up(self.dynamic_constants) { + for dc in dynamic_constants_bottom_up(&self.dynamic_constants) { let dc_val = format!("unsigned long long dc{}", dc.idx()); - match self.dynamic_constants[dc.idx()] { + match &self.dynamic_constants[dc.idx()] { DynamicConstant::Constant(val) => write!(w, "\t{} = {}ull;\n", dc_val, val)?, DynamicConstant::Parameter(idx) => { - if idx < self.function.num_dynamic_constants as usize { + if *idx < self.function.num_dynamic_constants as usize { write!(w, "\t{} = dc_p{};\n", dc_val, idx)? } else { write!(w, "\t{} = 0;\n", dc_val)? } } - DynamicConstant::Add(left, right) => { - write!(w, "\t{} = dc{} + dc{};\n", dc_val, left.idx(), right.idx())? + DynamicConstant::Add(args) => { + let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" + "); + write!(w, "\t{} = {};\n", dc_val, rhs)? + } + DynamicConstant::Mul(args) => { + let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" * "); + write!(w, "\t{} = {};\n", dc_val, rhs)? + } + DynamicConstant::Min(args) => { + let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("min(dc{}, ", arg.idx())).collect(); + let rhs_last = format!("dc{}", args.last().unwrap().idx()); + let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect(); + write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)? + } + DynamicConstant::Max(args) => { + let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("max(dc{}, ", arg.idx())).collect(); + let rhs_last = format!("dc{}", args.last().unwrap().idx()); + let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect(); + write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)? } DynamicConstant::Sub(left, right) => { write!(w, "\t{} = dc{} - dc{};\n", dc_val, left.idx(), right.idx())? } - DynamicConstant::Mul(left, right) => { - write!(w, "\t{} = dc{} * dc{};\n", dc_val, left.idx(), right.idx())? - } DynamicConstant::Div(left, right) => { write!(w, "\t{} = dc{} / dc{};\n", dc_val, left.idx(), right.idx())? } DynamicConstant::Rem(left, right) => { write!(w, "\t{} = dc{} % dc{};\n", dc_val, left.idx(), right.idx())? } - DynamicConstant::Min(left, right) => { - write!(w, "\t{} = min(dc{}, dc{});\n", dc_val, left.idx(), right.idx())? - } - DynamicConstant::Max(left, right) => { - write!(w, "\t{} = max(dc{}, dc{});\n", dc_val, left.idx(), right.idx())? - } } } Ok(()) @@ -502,8 +519,15 @@ namespace cg = cooperative_groups; let mut pass_args = String::new(); write!(w, " extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; - // The first set of parameters are dynamic constants. let mut first_param = true; + // The first parameter is a pointer to GPU backing memory, if it's + // needed. + if self.backing_allocation.contains_key(&Device::CUDA) { + first_param = false; + write!(w, "char* backing")?; + write!(pass_args, "backing")?; + } + // The second set of parameters are dynamic constants. for idx in 0..self.function.num_dynamic_constants { if first_param { first_param = false; @@ -514,7 +538,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(w, "unsigned long long dc_p{}", idx)?; write!(pass_args, "dc_p{}", idx)?; } - // The second set of parameters are normal arguments. + // The third set of parameters are normal arguments. for (idx, ty) in self.function.param_types.iter().enumerate() { if first_param { first_param = false; @@ -540,8 +564,13 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(pass_args, "ret")?; write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; } + write!(w, "\tcudaError_t err;\n"); write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; + write!(w, "\terr = cudaGetLastError();\n"); + write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n"); write!(w, "\tcudaDeviceSynchronize();\n")?; + write!(w, "\terr = cudaGetLastError();\n"); + write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n"); if has_ret_var { // Copy return from device to host, whether it's primitive value or collection pointer write!(w, "\t{} host_ret;\n", ret_type)?; @@ -979,17 +1008,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; Node::Parameter { index: _ } => {} // If the constant is primitive, it's stored in register so we repeat // for all threads. Otherwise, it can be inside or outside block fork. - // If inside, it's stored in shared memory so we only want to "allocate" - // and initialize it once. In either case, we then parallelize memset to 0. + // If inside, it's stored in shared memory so we "allocate" it once + // and parallelize memset to 0. If outside, we initialize as offset + // to backing, but if multi-block grid, don't memset to avoid grid-level sync. Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); let cg_tile = match state { KernelState::OutBlock | KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), }; - if !is_primitive && state == KernelState::OutBlock && is_block_parallel.is_some() && is_block_parallel.unwrap() { - panic!("GPU can't memset collection for multi-block grid"); - } if !is_primitive && state != KernelState::OutBlock { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; *num_tabs += 1; @@ -1007,9 +1034,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } if !is_primitive && state != KernelState::OutBlock { write!(w, "{}}}\n", tabs)?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; *num_tabs -= 1; } - if !is_primitive { + if !is_primitive && state == KernelState::OutBlock { + let (_, offsets) = &self.backing_allocation[&Device::CUDA]; + let offset = offsets[&id]; + write!(w, "{}{} = backing + dc{};\n", tabs, define_variable, offset.idx())?; + } + if !is_primitive && (state != KernelState::OutBlock || is_block_parallel.is_none() || !is_block_parallel.unwrap()) { let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects)); write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; @@ -1223,9 +1256,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; - if KernelState::OutBlock == state && is_block_parallel.is_some() && is_block_parallel.unwrap() { - panic!("GPU can't guarantee correctness for multi-block collection writes"); - } let cg_tile = match state { KernelState::OutBlock | KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index a4eb3240..c0a1a5ce 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -12,6 +12,7 @@ gvn(*); phi-elim(*); dce(*); +forkify(*); infer-schedules(*); gcm(*); diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index ace9082c..a5570b8d 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -17,5 +17,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*) diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs index 83534c9d..9674c2c5 100644 --- a/juno_samples/concat/src/main.rs +++ b/juno_samples/concat/src/main.rs @@ -2,16 +2,47 @@ use hercules_rt::runner; use hercules_rt::HerculesCPURef; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("concat"); fn main() { async_std::task::block_on(async { let mut r = runner!(concat_entry); - let output = r.run(7).await; - println!("{}", output); - assert_eq!(output, 42); + let mut a_data = [7, 7, 0]; + let mut b_data = [7, 7, 0, 0, 7, 7]; + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&mut a_data); + let b = HerculesCPURef::from_slice(&mut b_data); + let output = r.run(3, 6, a, b).await; + assert_eq!(output, 42); + const N: usize = 3; + let arr : Box<[i32]> = (2..=4).collect(); + let arr = HerculesCPURef::from_slice(&arr); + + let mut r = runner!(concat_switch); + let output = r.run(N as u64, 50, arr.clone()).await; + let result = output.as_slice::<i32>(); + println!("{:?}", result); + assert_eq!(result, [0, 1, 2, 3, 4]); + + let output = r.run(N as u64, 30, arr).await; + let result = output.as_slice::<i32>(); + println!("{:?}", result); + assert_eq!(result, [2, 3, 4, 0, 1]); + } + #[cfg(feature = "cuda")] + { + let mut a_data = [7, 7, 0]; + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data)); + let mut b_data = [7, 7, 0, 0, 7, 7]; + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data)); + let output = r.run(3, 6, a.get_ref(), b.get_ref()).await; + assert_eq!(output, 42); + } }); } diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs index 1e94ff89..c1f82528 100644 --- a/juno_samples/implicit_clone/src/main.rs +++ b/juno_samples/implicit_clone/src/main.rs @@ -8,42 +8,34 @@ fn main() { async_std::task::block_on(async { let mut r = runner!(simple_implicit_clone); let output = r.run(3).await; - println!("{}", output); assert_eq!(output, 11); let mut r = runner!(loop_implicit_clone); let output = r.run(100).await; - println!("{}", output); assert_eq!(output, 7); let mut r = runner!(double_loop_implicit_clone); let output = r.run(3).await; - println!("{}", output); assert_eq!(output, 42); let mut r = runner!(tricky_loop_implicit_clone); let output = r.run(2, 2).await; - println!("{}", output); assert_eq!(output, 130); let mut r = runner!(tricky2_loop_implicit_clone); let output = r.run(2, 3).await; - println!("{}", output); assert_eq!(output, 39); let mut r = runner!(tricky3_loop_implicit_clone); let output = r.run(5, 7).await; - println!("{}", output); assert_eq!(output, 7); let mut r = runner!(no_implicit_clone); let output = r.run(4).await; - println!("{}", output); assert_eq!(output, 13); let mut r = runner!(mirage_implicit_clone); let output = r.run(73).await; - println!("{}", output); assert_eq!(output, 843); }); } diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch deleted file mode 100644 index b256d73b..00000000 --- a/juno_samples/matmul/src/cpu.sch +++ /dev/null @@ -1,20 +0,0 @@ -gvn(*); -phi-elim(*); -dce(*); - -auto-outline(*); - -ip-sroa(*); -sroa(*); -dce(*); -gvn(*); -phi-elim(*); -dce(*); - -forkify(*); -infer-schedules(*); - -gcm(*); -float-collections(*); -dce(*); -gcm(*); diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index 205dee9c..3d3f919c 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -12,7 +12,6 @@ gvn(*); phi-elim(*); dce(*); -forkify(*); infer-schedules(*); gcm(*); diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs index ec111bc1..074937e7 100644 --- a/juno_samples/nested_ccp/build.rs +++ b/juno_samples/nested_ccp/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("nested_ccp.jn") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("nested_ccp.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("nested_ccp.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch index 4f36ddd8..c56d046a 100644 --- a/juno_samples/nested_ccp/src/gpu.sch +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -15,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs index 99ef150d..bc99a4bd 100644 --- a/juno_samples/nested_ccp/src/main.rs +++ b/juno_samples/nested_ccp/src/main.rs @@ -8,11 +8,11 @@ juno_build::juno!("nested_ccp"); fn main() { async_std::task::block_on(async { - let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); + let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]); #[cfg(not(feature = "cuda"))] { - let a = HerculesCPURefMut::from_slice(&mut a); + let a = HerculesCPURef::from_slice(&a); let b = HerculesCPURefMut::from_slice(&mut b); let mut r = runner!(ccp_example); let output_example = r.run(a).await; @@ -23,8 +23,8 @@ fn main() { } #[cfg(feature = "cuda")] { - let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); - let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); + let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b)); let mut r = runner!(ccp_example); let output_example = r.run(a.get_ref_mut()).await; let mut r = runner!(median_array); diff --git a/juno_samples/patterns/Cargo.toml b/juno_samples/patterns/Cargo.toml index a8dda157..bedaf7ca 100644 --- a/juno_samples/patterns/Cargo.toml +++ b/juno_samples/patterns/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_patterns" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/patterns/build.rs b/juno_samples/patterns/build.rs index 8ac92f00..625da0a5 100644 --- a/juno_samples/patterns/build.rs +++ b/juno_samples/patterns/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("patterns.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("patterns.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("patterns.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/patterns/src/gpu.sch similarity index 73% rename from juno_samples/nested_ccp/src/cpu.sch rename to juno_samples/patterns/src/gpu.sch index 7e6be7ee..3d9c8c9e 100644 --- a/juno_samples/nested_ccp/src/cpu.sch +++ b/juno_samples/patterns/src/gpu.sch @@ -2,7 +2,8 @@ gvn(*); phi-elim(*); dce(*); -auto-outline(*); +let out = auto-outline(*); +gpu(out.entry); ip-sroa(*); sroa(*); @@ -14,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs index bfd37cb5..58c2c5aa 100644 --- a/juno_samples/simple3/build.rs +++ b/juno_samples/simple3/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("simple3.jn") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("simple3.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("simple3.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch index 93e85c48..d6c2a9d6 100644 --- a/juno_samples/simple3/src/gpu.sch +++ b/juno_samples/simple3/src/gpu.sch @@ -15,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); -- GitLab