diff --git a/Cargo.lock b/Cargo.lock index 4a9b889119f59ef0097453a9d22a3caef39ac7bc..ffb61f4d8beb387ba3c756617f5bb3c8ce18ef3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1209,6 +1209,16 @@ dependencies = [ "with_builtin_macros", ] +[[package]] +name = "juno_product_read" +version = "0.1.0" +dependencies = [ + "async-std", + "hercules_rt", + "juno_build", + "with_builtin_macros", +] + [[package]] name = "juno_schedule_test" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index eeb5e69d92410b2174c1db19886b0ff76f773257..3e86bad053f8847c9f10d18117635de1cfa76250 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,4 +33,5 @@ members = [ "juno_samples/edge_detection", "juno_samples/fork_join_tests", "juno_samples/multi_device", + "juno_samples/product_read", ] diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index e6b540aed6218f56e0a1fa33b0e4b1e1db09e9b0..d6461a1ee2d1208071090358bf05d7d34c2638d0 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -284,11 +284,7 @@ impl GPUContext<'_> { // If there are no forks, fast forward to single-block, single-thread codegen let (num_blocks, num_threads) = if self.fork_join_map.is_empty() { - self.codegen_data_control_no_forks( - &HashSet::new(), - &mut dynamic_shared_offset, - &mut gotos, - )?; + self.codegen_data_control_no_forks(&mut dynamic_shared_offset, &mut gotos)?; ("1".to_string(), "1".to_string()) } else { // Create structures and determine block and thread parallelization strategy @@ -298,10 +294,6 @@ impl GPUContext<'_> { self.get_thread_root_forks(&root_forks, self.fork_tree, is_block_parallel); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(self.fork_tree, thread_root_root_fork); - // TODO: Uncomment and adjust once we know logic of extra dim. This will affect constant - // collections, reads, and writes. - // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); - let extra_dim_collects = HashSet::new(); // Core function for the CUDA code of all data and control nodes. self.codegen_data_control( @@ -312,7 +304,6 @@ impl GPUContext<'_> { }, &thread_root_forks, &fork_thread_quota_map, - &extra_dim_collects, &mut dynamic_shared_offset, is_block_parallel, num_threads, @@ -859,25 +850,8 @@ extern \"C\" {} {}(", } } - /* - * All non reduced-over collections used in fork joins have an extra dimension. - * However, this is only useful if ThreadIDs run in parallel not serially, - * otherwise it's unnecessarily consuming shared memory. This function returns - * the set of collections that have an unnecessary extra dimension. - */ - fn get_extra_dim_collects( - &self, - fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, - fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, - ) -> HashSet<TypeID> { - // Determine which fork each collection is used in, and check if it's - // parallelized via the fork_thread_quota_map. - todo!() - } - fn codegen_data_control_no_forks( &self, - extra_dim_collects: &HashSet<TypeID>, dynamic_shared_offset: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -901,7 +875,6 @@ extern \"C\" {} {}(", None, None, false, - extra_dim_collects, dynamic_shared_offset, body, &mut tabs, @@ -919,7 +892,6 @@ extern \"C\" {} {}(", block_fork: Option<NodeID>, thread_root_forks: &HashSet<NodeID>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, - extra_dim_collects: &HashSet<TypeID>, dynamic_shared_offset: &mut String, is_block_parallel: bool, num_threads: usize, @@ -945,7 +917,6 @@ extern \"C\" {} {}(", None, None, false, - extra_dim_collects, dynamic_shared_offset, body, &mut tabs, @@ -979,7 +950,6 @@ extern \"C\" {} {}(", None, Some(block_fork.unwrap()), false, - extra_dim_collects, dynamic_shared_offset, body, &mut tabs, @@ -996,7 +966,6 @@ extern \"C\" {} {}(", fork_thread_quota_map, 1, num_threads, - extra_dim_collects, dynamic_shared_offset, gotos, )?; @@ -1017,7 +986,6 @@ extern \"C\" {} {}(", fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, parent_quota: usize, num_threads: usize, - extra_dim_collections: &HashSet<TypeID>, dynamic_shared_offset: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -1068,7 +1036,6 @@ extern \"C\" {} {}(", parallel_factor, Some(curr_fork), reducts.contains(data), - extra_dim_collections, dynamic_shared_offset, body, &mut tabs, @@ -1082,7 +1049,6 @@ extern \"C\" {} {}(", fork_thread_quota_map, use_thread_quota, num_threads, - extra_dim_collections, dynamic_shared_offset, gotos, )?; @@ -1099,7 +1065,6 @@ extern \"C\" {} {}(", parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, is_special_reduct: bool, - extra_dim_collects: &HashSet<TypeID>, dynamic_shared_offset: &mut String, w: &mut String, num_tabs: &mut usize, @@ -1206,7 +1171,6 @@ extern \"C\" {} {}(", define_variable.clone(), *cons_id, true, - Some(extra_dim_collects), dynamic_shared_offset, w, *num_tabs, @@ -1232,8 +1196,7 @@ extern \"C\" {} {}(", if !is_primitive && (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false)) { - let data_size = - self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects)); + let data_size = self.get_size(self.typing[id.idx()], None); write!( w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", @@ -1453,8 +1416,7 @@ extern \"C\" {} {}(", } // Read of primitive requires load after pointer math. Node::Read { collect, indices } => { - let collect_with_indices = - self.codegen_collect(*collect, indices, extra_dim_collects); + let collect_with_indices = self.codegen_collect(*collect, indices); let data_type_id = self.typing[id.idx()]; if self.types[data_type_id.idx()].is_primitive() { let type_name = self.get_type(data_type_id, true); @@ -1478,8 +1440,7 @@ extern \"C\" {} {}(", data, indices, } => { - let collect_with_indices = - self.codegen_collect(*collect, indices, extra_dim_collects); + let collect_with_indices = self.codegen_collect(*collect, indices); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; let cg_tile = match state { @@ -1498,7 +1459,7 @@ extern \"C\" {} {}(", )?; write!(w, "{}}}\n", tabs)?; } else { - let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); + let data_size = self.get_size(data_type_id, None); write!( w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", @@ -1754,31 +1715,31 @@ extern \"C\" {} {}(", * This function emits collection name + pointer math for the provided indices. * All collection types use char pointers. */ - fn codegen_collect( - &self, - collect: NodeID, - indices: &[Index], - extra_dim_collects: &HashSet<TypeID>, - ) -> String { + fn codegen_collect(&self, collect: NodeID, indices: &[Index]) -> String { let mut index_ptr = "0".to_string(); - let type_id = self.typing[collect.idx()]; + let mut type_id = self.typing[collect.idx()]; for index in indices { match index { Index::Field(field) => { - self.get_size(type_id, Some(*field), Some(extra_dim_collects)); + index_ptr.push_str(&format!(" + ({})", self.get_size(type_id, Some(*field)))); + type_id = if let Type::Product(fields) = &self.types[type_id.idx()] { + fields[*field] + } else { + panic!("Expected product type") + }; } // Variants of summations have zero offset - Index::Variant(_) => {} + Index::Variant(index) => { + type_id = if let Type::Summation(variants) = &self.types[type_id.idx()] { + variants[*index] + } else { + panic!("Expected summation type") + }; + } // Convert multi-d array index to 1-d index, and optionally // convert to single-byte index by multiplying by element size Index::Position(array_indices) => { - let has_extra_dim = extra_dim_collects.contains(&self.typing[collect.idx()]); - if has_extra_dim { - continue; - } - let Type::Array(element_type, extents) = - &self.types[self.typing[collect.idx()].idx()] - else { + let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { panic!("Expected array type") }; let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); @@ -1800,8 +1761,9 @@ extern \"C\" {} {}(", cumulative_offset, ")".repeat(array_indices.len()) )); - let element_size = self.get_size(*element_type, None, Some(extra_dim_collects)); - index_ptr.push_str(&format!(" * {}", element_size)); + let element_size = self.get_size(*element_type, None); + index_ptr.push_str(&format!(" * ({})", element_size)); + type_id = *element_type; } } } @@ -1825,7 +1787,6 @@ extern \"C\" {} {}(", name: String, cons_id: ConstantID, allow_allocate: bool, - extra_dim_collects: Option<&HashSet<TypeID>>, dynamic_shared_offset: &mut String, w: &mut String, num_tabs: usize, @@ -1850,7 +1811,7 @@ extern \"C\" {} {}(", Constant::Product(type_id, constant_fields) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects); + let size = self.get_size(*type_id, None); *dynamic_shared_offset = format!( "(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment @@ -1872,7 +1833,7 @@ extern \"C\" {} {}(", }; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call - let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects); + let offset = self.get_size(type_fields[i], Some(i)); let field_constant = &self.constants[constant_fields[i].idx()]; if field_constant.is_scalar() { let field_type = self.get_type(type_fields[i], true); @@ -1880,7 +1841,6 @@ extern \"C\" {} {}(", format!("*reinterpret_cast<{}>({}+{})", field_type, name, offset), constant_fields[i], false, - None, dynamic_shared_offset, w, num_tabs, @@ -1890,7 +1850,6 @@ extern \"C\" {} {}(", format!("{}+{}", name, offset), constant_fields[i], false, - extra_dim_collects, dynamic_shared_offset, w, num_tabs, @@ -1901,7 +1860,7 @@ extern \"C\" {} {}(", Constant::Summation(type_id, variant, field) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects); + let size = self.get_size(*type_id, None); *dynamic_shared_offset = format!( "(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment @@ -1930,21 +1889,12 @@ extern \"C\" {} {}(", format!("*reinterpret_cast<{}>({})", variant_type, name), *field, false, - extra_dim_collects, dynamic_shared_offset, w, num_tabs, )?; } else if !variant_constant.is_array() { - self.codegen_constant( - name, - *field, - false, - extra_dim_collects, - dynamic_shared_offset, - w, - num_tabs, - )?; + self.codegen_constant(name, *field, false, dynamic_shared_offset, w, num_tabs)?; }; } Constant::Array(type_id) => { @@ -1952,7 +1902,7 @@ extern \"C\" {} {}(", panic!("Nested array constant should not be re-allocated"); } let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects); + let size = self.get_size(*type_id, None); *dynamic_shared_offset = format!( "(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment @@ -1979,35 +1929,19 @@ extern \"C\" {} {}(", * and offset to 2nd field. This is useful for constant initialization and read/write * index math. */ - fn get_size( - &self, - type_id: TypeID, - num_fields: Option<usize>, - extra_dim_collects: Option<&HashSet<TypeID>>, - ) -> String { + fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { - let array_size = if extra_dim_collects.is_some() - && extra_dim_collects.unwrap().contains(&type_id) - { - "1".to_string() - } else { - multiply_dcs(extents) - }; - format!("{} * {}", self.get_alignment(*element_type), array_size) + assert!(num_fields.is_none()); + let array_size = multiply_dcs(extents); + format!("{} * {}", self.get_size(*element_type, None), array_size) } Type::Product(fields) => { - let num_fields = &num_fields.unwrap_or(fields.len()); - let with_field = fields + let num_fields = num_fields.unwrap_or(fields.len()); + fields .iter() - .enumerate() - .filter(|(i, _)| i < num_fields) - .map(|(_, id)| { - ( - self.get_size(*id, None, extra_dim_collects), - self.get_alignment(*id), - ) - }) + .take(num_fields) + .map(|id| (self.get_size(*id, None), self.get_alignment(*id))) .fold(String::from("0"), |acc, (size, align)| { if acc == "0" { size @@ -2017,31 +1951,23 @@ extern \"C\" {} {}(", acc, align, align, align, size ) } - }); - if num_fields < &fields.len() { - format!( - "{} - {}", - with_field, - self.get_size(fields[*num_fields], None, extra_dim_collects) - ) - } else { - with_field - } + }) } Type::Summation(variants) => { + assert!(num_fields.is_none()); // The argmax variant by size is not guaranteed to be same as // argmax variant by alignment, eg product of 3 4-byte primitives // vs 1 8-byte primitive, so we need to calculate both. - let max_size = variants - .iter() - .map(|id| self.get_size(*id, None, extra_dim_collects)) - .fold(String::from("0"), |acc, x| { + let max_size = variants.iter().map(|id| self.get_size(*id, None)).fold( + String::from("0"), + |acc, x| { if acc == "0" { x } else { format!("umax({}, {})", acc, x) } - }); + }, + ); let max_alignment = variants .iter() .map(|id| self.get_alignment(*id)) @@ -2052,7 +1978,10 @@ extern \"C\" {} {}(", max_size, max_alignment, max_alignment, max_alignment ) } - _ => format!("{}", self.get_alignment(type_id)), + _ => { + assert!(num_fields.is_none()); + format!("{}", self.get_alignment(type_id)) + } } } diff --git a/juno_samples/product_read/Cargo.toml b/juno_samples/product_read/Cargo.toml new file mode 100644 index 0000000000000000000000000000000000000000..d466f5550b77e426040842339684a1e8906b22fa --- /dev/null +++ b/juno_samples/product_read/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "juno_product_read" +version = "0.1.0" +authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] +edition = "2021" + +[[bin]] +name = "juno_product_read" +path = "src/main.rs" + +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + +[build-dependencies] +juno_build = { path = "../../juno_build" } + +[dependencies] +juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } +with_builtin_macros = "0.1.0" +async-std = "*" diff --git a/juno_samples/product_read/build.rs b/juno_samples/product_read/build.rs new file mode 100644 index 0000000000000000000000000000000000000000..2bd5172e661e65e2284a986e2d710cd890d71b90 --- /dev/null +++ b/juno_samples/product_read/build.rs @@ -0,0 +1,22 @@ +use juno_build::JunoCompiler; + +fn main() { + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("product_read.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("product_read.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } +} diff --git a/juno_samples/product_read/src/gpu.sch b/juno_samples/product_read/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..549b421561da50023719fd432fe8d943a4ab37f6 --- /dev/null +++ b/juno_samples/product_read/src/gpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.product_read); + +ip-sroa(*); +sroa(*); +crc(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +float-collections(*); +gcm(*); diff --git a/juno_samples/product_read/src/main.rs b/juno_samples/product_read/src/main.rs new file mode 100644 index 0000000000000000000000000000000000000000..5211098ceebd6d7b15871ba0dd73cdbefb993313 --- /dev/null +++ b/juno_samples/product_read/src/main.rs @@ -0,0 +1,20 @@ +#![feature(concat_idents)] + +use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox}; + +juno_build::juno!("product_read"); + +fn main() { + async_std::task::block_on(async { + let input = vec![(0, 1), (2, 3)]; + let input : HerculesImmBox<(i32, i32)> = HerculesImmBox::from(input.as_slice()); + let mut r = runner!(product_read); + let res : Vec<i32> = HerculesMutBox::from(r.run(input.to()).await).as_slice().to_vec(); + assert_eq!(res, vec![0, 1, 2, 3]); + }); +} + +#[test] +fn products_test() { + main(); +} diff --git a/juno_samples/product_read/src/product_read.jn b/juno_samples/product_read/src/product_read.jn new file mode 100644 index 0000000000000000000000000000000000000000..7bf74a105b32099341f299c38898f6f6c08eb467 --- /dev/null +++ b/juno_samples/product_read/src/product_read.jn @@ -0,0 +1,9 @@ +#[entry] +fn product_read(input: (i32, i32)[2]) -> i32[4] { + let result : i32[4]; + result[0] = input[0].0; + result[1] = input[0].1; + result[2] = input[1].0; + result[3] = input[1].1; + return result; +}