diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 341e143eb60bd082d79f2763be969b699e0e3753..6c62ed76392dabc10c0fc48bf20d56d8577ad99e 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -23,6 +23,7 @@ pub fn gpu_codegen<W: Write>(
     typing: &Vec<TypeID>,
     control_subgraph: &Subgraph,
     bbs: &BasicBlocks,
+    backing_allocation: &FunctionBackingAllocation,
     collection_objects: &FunctionCollectionObjects,
     def_use_map: &ImmutableDefUseMap,
     fork_join_map: &HashMap<NodeID, NodeID>,
@@ -168,6 +169,7 @@ pub fn gpu_codegen<W: Write>(
         typing,
         control_subgraph,
         bbs,
+        backing_allocation,
         collection_objects,
         def_use_map,
         fork_join_map,
@@ -196,6 +198,7 @@ struct GPUContext<'a> {
     typing: &'a Vec<TypeID>,
     control_subgraph: &'a Subgraph,
     bbs: &'a BasicBlocks,
+    backing_allocation: &'a FunctionBackingAllocation,
     collection_objects: &'a FunctionCollectionObjects,
     def_use_map: &'a ImmutableDefUseMap,
     fork_join_map: &'a HashMap<NodeID, NodeID>,
@@ -352,8 +355,14 @@ namespace cg = cooperative_groups;
             "__global__ void __launch_bounds__({}) {}_gpu(",
             self.kernel_params.max_num_threads, self.function.name
         )?;
-        // The first set of parameters are dynamic constants.
         let mut first_param = true;
+        // The first parameter is a pointer to GPU backing memory, if it's
+        // needed.
+        if self.backing_allocation.contains_key(&Device::CUDA) {
+            first_param = false;
+            write!(w, "char* backing")?;
+        }
+        // The second set of parameters are dynamic constants.
         for idx in 0..self.function.num_dynamic_constants {
             if first_param {
                 first_param = false;
@@ -362,7 +371,7 @@ namespace cg = cooperative_groups;
             }
             write!(w, "unsigned long long dc_p{}", idx)?;
         }
-        // The second set of parameters are normal arguments.
+        // The third set of parameters are normal arguments.
         for (idx, ty) in self.function.param_types.iter().enumerate() {
             if first_param {
                 first_param = false;
@@ -403,38 +412,46 @@ namespace cg = cooperative_groups;
      * Emit calculation of all dynamic constants
      */
     fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> {
-        for dc in dynamic_constants_bottom_up(self.dynamic_constants) {
+        for dc in dynamic_constants_bottom_up(&self.dynamic_constants) {
             let dc_val = format!("unsigned long long dc{}", dc.idx());
-            match self.dynamic_constants[dc.idx()] {
+            match &self.dynamic_constants[dc.idx()] {
                 DynamicConstant::Constant(val) => write!(w, "\t{} = {}ull;\n", dc_val, val)?,
                 DynamicConstant::Parameter(idx) => {
-                    if idx < self.function.num_dynamic_constants as usize {
+                    if *idx < self.function.num_dynamic_constants as usize {
                         write!(w, "\t{} = dc_p{};\n", dc_val, idx)?
                     } else {
                         write!(w, "\t{} = 0;\n", dc_val)?
                     }
                 }
-                DynamicConstant::Add(left, right) => {
-                    write!(w, "\t{} = dc{} + dc{};\n", dc_val, left.idx(), right.idx())?
+                DynamicConstant::Add(args) => {
+                    let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" + ");
+                    write!(w, "\t{} = {};\n", dc_val, rhs)?
+                }
+                DynamicConstant::Mul(args) => {
+                    let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" * ");
+                    write!(w, "\t{} = {};\n", dc_val, rhs)?
+                }
+                DynamicConstant::Min(args) => {
+                    let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("min(dc{}, ", arg.idx())).collect();
+                    let rhs_last = format!("dc{}", args.last().unwrap().idx());
+                    let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect();
+                    write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)?
+                }
+                DynamicConstant::Max(args) => {
+                    let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("max(dc{}, ", arg.idx())).collect();
+                    let rhs_last = format!("dc{}", args.last().unwrap().idx());
+                    let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect();
+                    write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)?
                 }
                 DynamicConstant::Sub(left, right) => {
                     write!(w, "\t{} = dc{} - dc{};\n", dc_val, left.idx(), right.idx())?
                 }
-                DynamicConstant::Mul(left, right) => {
-                    write!(w, "\t{} = dc{} * dc{};\n", dc_val, left.idx(), right.idx())?
-                }
                 DynamicConstant::Div(left, right) => {
                     write!(w, "\t{} = dc{} / dc{};\n", dc_val, left.idx(), right.idx())?
                 }
                 DynamicConstant::Rem(left, right) => {
                     write!(w, "\t{} = dc{} % dc{};\n", dc_val, left.idx(), right.idx())?
                 }
-                DynamicConstant::Min(left, right) => {
-                    write!(w, "\t{} = min(dc{}, dc{});\n", dc_val, left.idx(), right.idx())?
-                }
-                DynamicConstant::Max(left, right) => {
-                    write!(w, "\t{} = max(dc{}, dc{});\n", dc_val, left.idx(), right.idx())?
-                }
             }
         }
         Ok(())
@@ -502,8 +519,15 @@ namespace cg = cooperative_groups;
         let mut pass_args = String::new();
         write!(w, "
 extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
-        // The first set of parameters are dynamic constants.
         let mut first_param = true;
+        // The first parameter is a pointer to GPU backing memory, if it's
+        // needed.
+        if self.backing_allocation.contains_key(&Device::CUDA) {
+            first_param = false;
+            write!(w, "char* backing")?;
+            write!(pass_args, "backing")?;
+        }
+        // The second set of parameters are dynamic constants.
         for idx in 0..self.function.num_dynamic_constants {
             if first_param {
                 first_param = false;
@@ -514,7 +538,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             write!(w, "unsigned long long dc_p{}", idx)?;
             write!(pass_args, "dc_p{}", idx)?;
         }
-        // The second set of parameters are normal arguments.
+        // The third set of parameters are normal arguments.
         for (idx, ty) in self.function.param_types.iter().enumerate() {
             if first_param {
                 first_param = false;
@@ -540,8 +564,13 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             write!(pass_args, "ret")?;
             write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
         }
+        write!(w, "\tcudaError_t err;\n")?;
         write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?;
+        write!(w, "\terr = cudaGetLastError();\n")?;
+        write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n")?;
         write!(w, "\tcudaDeviceSynchronize();\n")?;
+        write!(w, "\terr = cudaGetLastError();\n")?;
+        write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n")?;
         if has_ret_var {
             // Copy return from device to host, whether it's primitive value or collection pointer
             write!(w, "\t{} host_ret;\n", ret_type)?;
@@ -979,17 +1008,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             Node::Parameter { index: _ } => {}
             // If the constant is primitive, it's stored in register so we repeat
             // for all threads. Otherwise, it can be inside or outside block fork.
-            // If inside, it's stored in shared memory so we only want to "allocate"
-            // and initialize it once. In either case, we then parallelize memset to 0.
+            // If inside, it's stored in shared memory so we "allocate" it once
+            // and parallelize memset to 0. If outside, we initialize as offset
+            // to backing, but if multi-block grid, don't memset to avoid grid-level sync.
             Node::Constant { id: cons_id } => {
                 let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
                 let cg_tile = match state {
                     KernelState::OutBlock | KernelState::InBlock => "block".to_string(),
                     KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId),
                 };
-                if !is_primitive && state == KernelState::OutBlock && is_block_parallel.is_some() && is_block_parallel.unwrap() {
-                    panic!("GPU can't memset collection for multi-block grid");
-                }
                 if !is_primitive && state != KernelState::OutBlock {
                     write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?;
                     *num_tabs += 1;
@@ -1007,9 +1034,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                 }
                 if !is_primitive && state != KernelState::OutBlock {
                     write!(w, "{}}}\n", tabs)?;
+                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
                     *num_tabs -= 1;
                 }
-                if !is_primitive {
+                if !is_primitive && state == KernelState::OutBlock {
+                    let (_, offsets) = &self.backing_allocation[&Device::CUDA];
+                    let offset = offsets[&id];
+                    write!(w, "{}{} = backing + dc{};\n", tabs, define_variable, offset.idx())?;
+                }
+                if !is_primitive && (state != KernelState::OutBlock || is_block_parallel.is_none() || !is_block_parallel.unwrap()) {
                     let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects));
                     write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
                     write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
@@ -1223,9 +1256,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                 let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()]));
                 let data_variable = self.get_value(*data, false, false);
                 let data_type_id = self.typing[data.idx()];
-                if KernelState::OutBlock == state && is_block_parallel.is_some() && is_block_parallel.unwrap() {
-                    panic!("GPU can't guarantee correctness for multi-block collection writes");
-                }
                 let cg_tile = match state {
                     KernelState::OutBlock | KernelState::InBlock => "block".to_string(),
                     KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId),
diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch
index a4eb324034bf102c3f49bbea9a9475eb864c4d58..c0a1a5cebd46a47378132d357b7c0fc83a4446a6 100644
--- a/hercules_samples/matmul/src/gpu.sch
+++ b/hercules_samples/matmul/src/gpu.sch
@@ -12,6 +12,7 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
+forkify(*);
 infer-schedules(*);
 
 gcm(*);
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index ace9082cbdb6a5b5512d88f435169df8eaf1238a..a5570b8d98324410aea9902cf29bce386126ce12 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -17,5 +17,3 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-dce(*);
-gcm(*)
diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs
index 83534c9d605c1a61c7854ac1e76d9bd7fa596a50..9674c2c54b328aefcb4e670dc7e9ec482f8b2508 100644
--- a/juno_samples/concat/src/main.rs
+++ b/juno_samples/concat/src/main.rs
@@ -2,16 +2,47 @@
 
 use hercules_rt::runner;
 use hercules_rt::HerculesCPURef;
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("concat");
 
 fn main() {
     async_std::task::block_on(async {
         let mut r = runner!(concat_entry);
-        let output = r.run(7).await;
-        println!("{}", output);
-        assert_eq!(output, 42);
+        let mut a_data = [7, 7, 0];
+        let mut b_data = [7, 7, 0, 0, 7, 7];
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&mut a_data);
+            let b = HerculesCPURef::from_slice(&mut b_data);
+            let output = r.run(3, 6, a, b).await;
+            assert_eq!(output, 42);
+            const N: usize = 3;
+            let arr : Box<[i32]> = (2..=4).collect();
+            let arr = HerculesCPURef::from_slice(&arr);
+
+            let mut r = runner!(concat_switch);
+            let output = r.run(N as u64, 50, arr.clone()).await;
+            let result = output.as_slice::<i32>();
+            println!("{:?}", result);
+            assert_eq!(result, [0, 1, 2, 3, 4]);
+
+            let output = r.run(N as u64, 30, arr).await;
+            let result = output.as_slice::<i32>();
+            println!("{:?}", result);
+            assert_eq!(result, [2, 3, 4, 0, 1]);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let mut a_data = [7, 7, 0];
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data));
+            let mut b_data = [7, 7, 0, 0, 7, 7];
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data));
+            let output = r.run(3, 6, a.get_ref(), b.get_ref()).await;
+            assert_eq!(output, 42);
+        }
     });
 }
 
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index 1e94ff8977a6a5e340b6180ceff777351dda088b..c1f82528de40598007291b9258ec1cf59acbcc07 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -8,42 +8,34 @@ fn main() {
     async_std::task::block_on(async {
         let mut r = runner!(simple_implicit_clone);
         let output = r.run(3).await;
-        println!("{}", output);
         assert_eq!(output, 11);
 
         let mut r = runner!(loop_implicit_clone);
         let output = r.run(100).await;
-        println!("{}", output);
         assert_eq!(output, 7);
 
         let mut r = runner!(double_loop_implicit_clone);
         let output = r.run(3).await;
-        println!("{}", output);
         assert_eq!(output, 42);
 
         let mut r = runner!(tricky_loop_implicit_clone);
         let output = r.run(2, 2).await;
-        println!("{}", output);
         assert_eq!(output, 130);
 
         let mut r = runner!(tricky2_loop_implicit_clone);
         let output = r.run(2, 3).await;
-        println!("{}", output);
         assert_eq!(output, 39);
 
         let mut r = runner!(tricky3_loop_implicit_clone);
         let output = r.run(5, 7).await;
-        println!("{}", output);
         assert_eq!(output, 7);
 
         let mut r = runner!(no_implicit_clone);
         let output = r.run(4).await;
-        println!("{}", output);
         assert_eq!(output, 13);
 
         let mut r = runner!(mirage_implicit_clone);
         let output = r.run(73).await;
-        println!("{}", output);
         assert_eq!(output, 843);
     });
 }
diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
deleted file mode 100644
index b256d73b9ac7a9b501df54a7b028613f10eff4ca..0000000000000000000000000000000000000000
--- a/juno_samples/matmul/src/cpu.sch
+++ /dev/null
@@ -1,20 +0,0 @@
-gvn(*);
-phi-elim(*);
-dce(*);
-
-auto-outline(*);
-
-ip-sroa(*);
-sroa(*);
-dce(*);
-gvn(*);
-phi-elim(*);
-dce(*);
-
-forkify(*);
-infer-schedules(*);
-
-gcm(*);
-float-collections(*);
-dce(*);
-gcm(*);
diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
index 205dee9ce3e5ec76f681a7fa9bd32deb2a7187b2..3d3f919cd26e4eba480540df06f839a8b86976b0 100644
--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
@@ -12,7 +12,6 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-forkify(*);
 infer-schedules(*);
 
 gcm(*);
diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs
index ec111bc1b9d57eda7416d89d7df06caeb60bc258..074937e7b0a0ce50032928f30c72feb39b5ecd79 100644
--- a/juno_samples/nested_ccp/build.rs
+++ b/juno_samples/nested_ccp/build.rs
@@ -1,11 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("nested_ccp.jn")
-        .unwrap()
-        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("nested_ccp.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("nested_ccp.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch
index 4f36ddd8102209a7a9da5223069d6617c499a4b0..c56d046a686ea8bddd8c9e9f91d88eb39a2eaf31 100644
--- a/juno_samples/nested_ccp/src/gpu.sch
+++ b/juno_samples/nested_ccp/src/gpu.sch
@@ -15,5 +15,3 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-dce(*);
-gcm(*);
diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs
index 99ef150d942d256c4475344822efcfb0cb6f693a..bc99a4bdd071ff19c70977e29241b76e3e249014 100644
--- a/juno_samples/nested_ccp/src/main.rs
+++ b/juno_samples/nested_ccp/src/main.rs
@@ -8,11 +8,11 @@ juno_build::juno!("nested_ccp");
 
 fn main() {
     async_std::task::block_on(async {
-        let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
+        let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
         let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]);
         #[cfg(not(feature = "cuda"))]
         {
-            let a = HerculesCPURefMut::from_slice(&mut a);
+            let a = HerculesCPURef::from_slice(&a);
             let b = HerculesCPURefMut::from_slice(&mut b);
             let mut r = runner!(ccp_example);
             let output_example = r.run(a).await;
@@ -23,8 +23,8 @@ fn main() {
         }
         #[cfg(feature = "cuda")]
         {
-            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
-            let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
+            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b));
             let mut r = runner!(ccp_example);
             let output_example = r.run(a.get_ref_mut()).await;
             let mut r = runner!(median_array);
diff --git a/juno_samples/patterns/Cargo.toml b/juno_samples/patterns/Cargo.toml
index a8dda157ff331ae9b1c5e1cb2a120db9bab3bb82..bedaf7ca01c0d7cfadbba5e11b8e94203ccda2b4 100644
--- a/juno_samples/patterns/Cargo.toml
+++ b/juno_samples/patterns/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_patterns"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/patterns/build.rs b/juno_samples/patterns/build.rs
index 8ac92f003e549a9aeb289001b8792ee4dcb51284..625da0a5fed548984b724aa89085c456ef22f12c 100644
--- a/juno_samples/patterns/build.rs
+++ b/juno_samples/patterns/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("patterns.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("patterns.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("patterns.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/patterns/src/gpu.sch
similarity index 73%
rename from juno_samples/nested_ccp/src/cpu.sch
rename to juno_samples/patterns/src/gpu.sch
index 7e6be7eefb05b7660d3c27ae9d937ce00cf79a0e..3d9c8c9e6ddb588fbaa96e3c27f78cf21abd8f1a 100644
--- a/juno_samples/nested_ccp/src/cpu.sch
+++ b/juno_samples/patterns/src/gpu.sch
@@ -2,7 +2,8 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-auto-outline(*);
+let out = auto-outline(*);
+gpu(out.entry);
 
 ip-sroa(*);
 sroa(*);
@@ -14,5 +15,3 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-dce(*);
-gcm(*);
diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs
index bfd37cb50d74bf09118b7b768600d132d7fbf9e1..58c2c5aab14e65facfe5154db13f453be411c55b 100644
--- a/juno_samples/simple3/build.rs
+++ b/juno_samples/simple3/build.rs
@@ -1,11 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("simple3.jn")
-        .unwrap()
-        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("simple3.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("simple3.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch
index 93e85c48ad442a6a9ed17cdf3e59bbe67989b73d..d6c2a9d666c11d4ee07ce89a573ac73d8ea25bff 100644
--- a/juno_samples/simple3/src/gpu.sch
+++ b/juno_samples/simple3/src/gpu.sch
@@ -15,5 +15,3 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-dce(*);
-gcm(*);
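
The variadic DynamicConstant::Min/Max arms added to codegen_dynamic_constants in hercules_cg/src/gpu.rs above fold an argument list into a chain of CUDA min()/max() calls by opening one call per argument except the last and then closing them all. Below is a minimal standalone sketch of that string-building step, using a hypothetical helper name (nested_dc_call) and plain usize indices in place of DynamicConstantID:

fn nested_dc_call(op: &str, args: &[usize]) -> String {
    // Sketch only: mirrors the rhs_but_last / rhs_last / rhs_end construction
    // in the diff; the helper name and usize indices are illustrative assumptions.
    assert!(!args.is_empty());
    // Every argument except the last opens a call: "min(dc0, min(dc1, ".
    let opens: String = args[..args.len() - 1]
        .iter()
        .map(|idx| format!("{}(dc{}, ", op, idx))
        .collect();
    // The last argument sits innermost; then close every opened call.
    let last = format!("dc{}", args[args.len() - 1]);
    let closes = ")".repeat(args.len() - 1);
    format!("{}{}{}", opens, last, closes)
}

fn main() {
    // A three-way minimum becomes a nested chain of the CUDA min() intrinsic.
    assert_eq!(nested_dc_call("min", &[0, 1, 2]), "min(dc0, min(dc1, dc2))");
    // A single argument degenerates to the bare dynamic constant.
    assert_eq!(nested_dc_call("max", &[3]), "dc3");
}

Like the emitted code in the diff, this assumes at least one argument; an empty Min/Max list would panic at the args.last().unwrap() call.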
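
The same file also changes the calling convention of the generated kernel and its host wrapper: when the function has a CUDA backing allocation, a char* backing pointer comes first, dynamic constants second, and normal arguments third. A rough sketch of that ordering logic, with hypothetical stand-ins (has_backing, num_dcs, param_types) for the GPUContext state and a simplified C type string per parameter (the real codegen derives each parameter's C type from the Hercules type):

fn kernel_param_list(has_backing: bool, num_dcs: u32, param_types: &[&str]) -> String {
    // Sketch only: approximates the first_param bookkeeping in the diff.
    let mut params = Vec::new();
    if has_backing {
        // One flat allocation; collection constants become dc offsets into it.
        params.push("char* backing".to_string());
    }
    for idx in 0..num_dcs {
        params.push(format!("unsigned long long dc_p{}", idx));
    }
    for (idx, ty) in param_types.iter().enumerate() {
        params.push(format!("{} p{}", ty, idx));
    }
    params.join(", ")
}

fn main() {
    // Two dynamic constants and two pointer arguments with backing memory:
    // "char* backing, unsigned long long dc_p0, unsigned long long dc_p1, float* p0, float* p1"
    println!("{}", kernel_param_list(true, 2, &["float*", "float*"]));
}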