diff --git a/.gitignore b/.gitignore index f8a684ce5223884e46f65b7fa67b6328eadf3100..87af5349ee5aa22c4f3763c6f8905b69f2773bb2 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,12 @@ *.out *.ll *.c +*.cu *.o *.a *.hrt -.*.swp +*.png +*.swp .vscode +*_env +*.txt diff --git a/Cargo.lock b/Cargo.lock index 2802b8b14af4c87362529671057ebf4eadbc8c9e..49630436a0f0b90d8252824046c29f0e18b78af2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1217,9 +1217,9 @@ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libfuzzer-sys" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa" +checksum = "cf78f52d400cf2d84a3a973a78a592b4adc535739e0a5597a0da6f0c357adc75" dependencies = [ "arbitrary", "cc", @@ -2174,9 +2174,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee" dependencies = [ "indexmap", "serde", @@ -2444,9 +2444,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.24" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +checksum = "7e49d2d35d3fad69b39b94139037ecfb4f359f08958b9c11e7315ce770462419" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index a9197bf4fb4490ef0c25f37c787d766f9b6fae53..ced011a96c96793891228876314debeabcb561ed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,12 +5,12 @@ members = [ "hercules_ir", "hercules_opt", "hercules_rt", - + "juno_utils", "juno_frontend", "juno_scheduler", "juno_build", - + "hercules_test/hercules_interpreter", "hercules_test/hercules_tests", diff --git a/hercules_cg/Cargo.toml b/hercules_cg/Cargo.toml index cf0767de04c81e35974a7aca89e9085b543e1a9d..0952ee57e0dd9e4b8f7e9639e10a3f2b0fc35a72 100644 --- a/hercules_cg/Cargo.toml +++ b/hercules_cg/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" +[features] +cuda = [] + [dependencies] rand = "*" ordered-float = "*" diff --git a/hercules_cg/src/fork_tree.rs b/hercules_cg/src/fork_tree.rs new file mode 100644 index 0000000000000000000000000000000000000000..64a93160aabc5564c784bf0ff8b14e82045b81bb --- /dev/null +++ b/hercules_cg/src/fork_tree.rs @@ -0,0 +1,37 @@ +use std::collections::{HashMap, HashSet}; + +use crate::*; + +/* + * Construct a map from fork node to all control nodes (including itself) satisfying: + * a) domination by F + * b) no domination by F's join + * c) no domination by any other fork that's also dominated by F, where we do count self-domination + * Here too we include the non-fork start node, as key for all controls outside any fork. 
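+ *
+ * Illustrative shape of the result (hypothetical node names): for a function
+ * whose only fork is F, the returned map is roughly
+ *   { NodeID(0) -> controls outside any fork, F -> controls inside F (including F) }.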
+ */ +pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> { + let mut fork_control_map = HashMap::new(); + for (control, forks) in fork_join_nesting { + let fork = forks.first().copied().unwrap_or(NodeID::new(0)); + fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(*control); + } + fork_control_map +} + +/* Construct a map from each fork node F to all forks satisfying: + * a) domination by F + * b) no domination by F's join + * c) no domination by any other fork that's also dominated by F, where we don't count self-domination + * Note that the fork_tree also includes the non-fork start node, as unique root node. + */ +pub fn fork_tree(function: &Function, fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> { + let mut fork_tree = HashMap::new(); + for (control, forks) in fork_join_nesting { + if function.nodes[control.idx()].is_fork() { + fork_tree.entry(*control).or_insert_with(HashSet::new); + let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0)); + fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(*control); + } + } + fork_tree +} diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs new file mode 100644 index 0000000000000000000000000000000000000000..d7a6d258358920d8a42aea1530843c08e223022a --- /dev/null +++ b/hercules_cg/src/gpu.rs @@ -0,0 +1,1894 @@ +extern crate bitvec; +extern crate hercules_ir; + +use std::collections::{BTreeMap, HashMap, HashSet}; +use std::fmt::{Error, Write}; +use std::fs::{OpenOptions, File}; +use std::io::Write as _; + +use self::hercules_ir::*; + +use crate::*; + +/* + * The top level function to compile a Hercules IR function into CUDA + * kernel for execution on the GPU. We generate CUDA C textually, with a lot + * of similarities with the CPU LLVM generation plus custom GPU parallelization. + */ +pub fn gpu_codegen<W: Write>( + function: &Function, + types: &Vec<Type>, + constants: &Vec<Constant>, + dynamic_constants: &Vec<DynamicConstant>, + typing: &Vec<TypeID>, + control_subgraph: &Subgraph, + bbs: &BasicBlocks, + backing_allocation: &FunctionBackingAllocation, + collection_objects: &FunctionCollectionObjects, + def_use_map: &ImmutableDefUseMap, + fork_join_map: &HashMap<NodeID, NodeID>, + fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + w: &mut W, +) -> Result<(), Error> { + /* + * We assert the following: + * - Fork node must have >= 1 Reduce nodes + * - (Later in code) If the returned data type is a collection, it must have + * originated from (potentially multiple) parameter(s). + * + * We don't assert but assume the following: + * - max_num_blocks in KernelParams is within constraint of 1D grid size. This + * can be relaxed if we want to support larger grids. + * - Product types are packed with padding inserted for each element to + * be aligned for its type and for full product to be aligned to its + * largest element + * - Summation types must be aligned to their largest element + * + * Notes on GPU parallelization strategy and tips for IR transformations: + * - The top level block fork and any lower thread forks require a known Fork + * size. Thus for an otherwise parallelizable Fork with unknown size, + * consider splitting it into two Forks with one of known size. For block + * level, the known fork has to be the (only) top-most fork. 
+ * - The thread-level strategy is determined by starting at the most nested + * Forks and working outwards in a greedy manner, with caps by GPU spec. + * Thus, to ensure some outer Fork is parallelized, ensure the inner + * parallelizable Forks aren't too large or consider removing schedule + * annotations. + * - Tight-Associative reductions can only be efficiently implemented if + * different Hercules ThreadIDs correspond to consecutive CUDA threads. But + * this prevents nested parallelization since each parallel group must always + * be a contiguous tile of threads. We use a heuristic of choosing the larger + * factor when this results in a conflict between a Fork and it's subtree, + * but this choice may not be optimal. + * - A given Fork (not talking about its children) can only be parallelized + * if all its Reduces are Parallel-Reduce or Tight-Associative. So if the + * Fork contains expensive parallelizable operations, ensure all reductions + * are parallelizable or if not try pulling those out into a different Fork. + * - We do nothing to mitigate intra-warp divergence. To mitigate this, the + * IR, for example, should ensure the innermost parallelizable Forks either + * have factor >= warp size (32) or remove Fork/Reduce node schedule + * annotations. + * + * Main TODOs: + * - Fix dynamic shared memory allocation to reuse old shmem. The main case + * for improvement is when we have serialized forks with unused intermediate + * values from previous iterations. + * - Add mapping from Region node to Fork node if there's a reduce whose control + * is a Region not Join. + * - Matmul/Conv detection + * - Add float8, float16, bfloat16 dtypes if they come + */ + + let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) + .filter(|idx| function.nodes[*idx].is_reduce()) + .map(NodeID::new) + .collect(); + + + let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map + .iter() + .map(|(fork, join)| (*join, *fork)) + .collect(); + // Fork Reduce map should have all reduces contained in some key + let mut fork_reduce_map: HashMap<NodeID, Vec<NodeID>> = HashMap::new(); + // Reduct Reduce map should have all non-parallel and non-associative reduces + // contained in some key. Unlike Fork, Reduct is not involved in any assertions. + // It's placed here for convenience but can be moved. 
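+    // Illustrative example (hypothetical node names): for `r = reduce(j, init, a)`
+    // with `a = add(r, x)`, fork_reduce_map maps j's fork to [r] and
+    // reduct_reduce_map maps a to [r].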
+ let mut reduct_reduce_map: HashMap<NodeID, Vec<NodeID>> = HashMap::new(); + for reduce_node in &reduce_nodes { + if let Node::Reduce { + control, + init: _, + reduct, + } = &function.nodes[reduce_node.idx()] + { + match function.nodes[control.idx()] { + Node::Join {..} => { + let fork_node = join_fork_map[control]; + fork_reduce_map + .entry(fork_node) + .or_default() + .push(*reduce_node); + } + Node::Region { preds: _ } => { + // TODO: map region node to fork node + } + _ => { + panic!("Reduce's control must be a join or region node"); + } + } + reduct_reduce_map + .entry(*reduct) + .or_default() + .push(*reduce_node); + } + } + for idx in 0..function.nodes.len() { + if function.nodes[idx].is_fork() { + assert!(fork_reduce_map + .get(&NodeID::new(idx)) + .is_some_and(|reduces| !reduces.is_empty()), + "Fork node {} has no reduce nodes", + idx + ); + } + } + + // Map from control to pairs of data to update phi + // For each phi, we go to its region and get region's controls + let mut control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>> = HashMap::new(); + for (idx, node) in function.nodes.iter().enumerate() { + if let Node::Phi { control, data } = node { + let Node::Region { preds } = &function.nodes[control.idx()] else { + panic!("Phi's control must be a region node"); + }; + for (i, &pred) in preds.iter().enumerate() { + control_data_phi_map.entry(pred).or_default().push((data[i], NodeID::new(idx))); + } + } + } + + let return_parameter = if collection_objects.returned_objects().len() == 1 { + collection_objects.origin(*collection_objects.returned_objects() + .first().unwrap()).try_parameter() + } else { + None + }; + + let kernel_params = &GPUKernelParams { + max_num_threads: 1024, + threads_per_warp: 32, + }; + + let ctx = GPUContext { + function, + types, + constants, + dynamic_constants, + typing, + control_subgraph, + bbs, + backing_allocation, + collection_objects, + def_use_map, + fork_join_map, + fork_control_map, + fork_tree, + join_fork_map, + fork_reduce_map, + reduct_reduce_map, + control_data_phi_map, + return_parameter, + kernel_params, + }; + ctx.codegen_function(w) +} + +struct GPUKernelParams { + max_num_threads: usize, + threads_per_warp: usize, +} + +struct GPUContext<'a> { + function: &'a Function, + types: &'a Vec<Type>, + constants: &'a Vec<Constant>, + dynamic_constants: &'a Vec<DynamicConstant>, + typing: &'a Vec<TypeID>, + control_subgraph: &'a Subgraph, + bbs: &'a BasicBlocks, + backing_allocation: &'a FunctionBackingAllocation, + collection_objects: &'a FunctionCollectionObjects, + def_use_map: &'a ImmutableDefUseMap, + fork_join_map: &'a HashMap<NodeID, NodeID>, + fork_control_map: &'a HashMap<NodeID, HashSet<NodeID>>, + fork_tree: &'a HashMap<NodeID, HashSet<NodeID>>, + join_fork_map: HashMap<NodeID, NodeID>, + fork_reduce_map: HashMap<NodeID, Vec<NodeID>>, + reduct_reduce_map: HashMap<NodeID, Vec<NodeID>>, + control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>>, + return_parameter: Option<usize>, + kernel_params: &'a GPUKernelParams, +} + +/* + * For all control nodes besides forks, Init, Body, and Term compose the main basic + * block, with Init and Term populated by control flow (Init used only by Fork and + * Join) and Body populated by data flow. + * For serialized Fork nodes which may be jumped back to by corresponding Join node, + * init and post_init separate one-time code (currently just cooperative group + * creation) from repeated code. 
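+ *
+ * Roughly, each control node's section is emitted as (illustrative sketch;
+ * actual label names come from get_block_name):
+ *   bb_X:         // init, e.g. one-time cooperative group creation for a Fork
+ *   bb_X_post:    // post_init, re-entered when a serialized Join jumps back here
+ *   ...           // body: code for the data nodes in this basic block
+ *   goto bb_Y;    // term: control transfer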
+ */ +#[derive(Default, Debug)] +struct CudaGoto { + init: String, + post_init: String, + body: String, + term: String, +} + +/* + * KernelState is used for data and control node organization and generation. + * We define a block fork as one with each ThreadID being a block, and a thread + * fork as one with each ThreadID being a subset of threads within a block. + * OutBlock is outside a potential block fork at the full grid level, InBlock + * is inside a block fork but outside any thread forks, and InThread is inside + * a thread fork. + */ +#[derive(Clone, Copy, PartialEq, Debug)] +enum KernelState { + OutBlock, + InBlock, + InThread, +} + +/* + * CGType is used to track cooperative group types. UsePerId is the group of (CUDA) + * threads for a current ThreadID, Use is the union of such threads for all ThreadIDs + * in the current innermost Fork, and Available is Use plus additional threads not + * used in the current Fork. + */ +#[derive(Clone, Copy, PartialEq, Debug)] +enum CGType { + UsePerId, + Use, + Available, +} + +impl GPUContext<'_> { + fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { + // Emit all code up to the "goto" to Start's block + let mut top = String::new(); + self.codegen_kernel_begin(self.return_parameter.is_none(), &mut top)?; + let mut dynamic_shared_offset = "0".to_string(); + self.codegen_dynamic_constants(&mut top)?; + self.codegen_declare_data(&mut top)?; + self.codegen_helpers(&mut top)?; + self.codegen_goto_start(&mut top)?; + write!(w, "{}", top)?; + + // Setup for CUDA's "goto" for control flow between basic blocks. + let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_control()) + .map(|idx| { + let node_id = NodeID::new(idx); + let goto = CudaGoto::default(); + (node_id, goto) + }) + .collect(); + + // If there are no forks, fast forward to single-block, single-thread codegen + let (num_blocks, num_threads) = if self.fork_join_map.is_empty() { + self.codegen_data_control_no_forks(&HashSet::new(), &mut dynamic_shared_offset, &mut gotos)?; + ("1".to_string(), "1".to_string()) + } else { + // Create structures and determine block and thread parallelization strategy + let (root_forks, num_blocks, is_block_parallel) = + self.get_root_forks_and_num_blocks(self.fork_tree); + let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, self.fork_tree, is_block_parallel); + let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(self.fork_tree, thread_root_root_fork); + // TODO: Uncomment and adjust once we know logic of extra dim. This will affect constant + // collections, reads, and writes. + // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); + let extra_dim_collects = HashSet::new(); + + // Core function for the CUDA code of all data and control nodes. 
+ self.codegen_data_control( + if is_block_parallel { + Some(thread_root_root_fork) + } else { + None + }, + &thread_root_forks, + &fork_thread_quota_map, + &extra_dim_collects, + &mut dynamic_shared_offset, + is_block_parallel, + num_threads, + &mut gotos, + )?; + (num_blocks, num_threads.to_string()) + }; + + // Emit all GPU kernel code from previous steps + let mut kernel_body = String::new(); + self.codegen_gotos(false, &mut gotos, &mut kernel_body)?; + write!(w, "{}", kernel_body)?; + write!(w, "}}\n")?; + + // Emit host launch code + let mut host_launch = String::new(); + self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; + write!(w, "{}", host_launch)?; + + Ok(()) + } + + /* + * Emit kernel headers, signature, arguments, and dynamic shared memory declaration + */ + fn codegen_kernel_begin(&self, has_ret_var: bool, w: &mut String) -> Result<(), Error> { + write!(w, " +#include <assert.h> +#include <stdio.h> +#include <stddef.h> +#include <cuda.h> +#include <cuda_runtime.h> +#include <math_constants.h> +#include <mma.h> +#include <cooperative_groups.h> +#include <cooperative_groups/reduce.h> +namespace cg = cooperative_groups; + +#define uabs(a) (a) +#define umin(a, b) ((a) < (b) ? (a) : (b)) +#define umax(a, b) ((a) > (b) ? (a) : (b)) +#define powi(a, b) ({{ int res = 1; for(int i = 0; i < b; ++i) res *= a; res; }}) +#define roundi(a) (a) +#define isqrt(a) ((int)sqrtf((float)(a))) + +", + )?; + + write!( + w, + "__global__ void __launch_bounds__({}) {}_gpu(", + self.kernel_params.max_num_threads, self.function.name + )?; + let mut first_param = true; + // The first parameter is a pointer to GPU backing memory, if it's + // needed. + if self.backing_allocation.contains_key(&Device::CUDA) { + first_param = false; + write!(w, "char* backing")?; + } + // The second set of parameters are dynamic constants. + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + } + write!(w, "unsigned long long dc_p{}", idx)?; + } + // The third set of parameters are normal arguments. + for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + } + let param_type = if self.types[ty.idx()].is_primitive() { + self.get_type(*ty, false) + } else { + format!("{} __restrict__", self.get_type(*ty, false)) + }; + write!(w, "{} p{}", param_type, idx)?; + } + if has_ret_var { + if !first_param { + write!(w, ", ")?; + } + write!( + w, + "void* __restrict__ ret", + )?; + } + + // Type is char since it's simplest to use single bytes for indexing + // and it's required for heterogeneous Product and Summation types. + // Casting is later used for conversion to different types like int. + write!(w, ") {{\n")?; + write!(w, "\textern __shared__ char dynamic_shared[];\n")?; + // This will only get used by thread rank 0 in each block, since it + // does all shared memory "allocation". The actual state is preserved + // in Rust string and this offset is assigned to for ease of readability. 
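+        // E.g. (illustrative) a later shared-memory "allocation" for an 8-byte
+        // aligned collection x emits roughly:
+        //   dynamic_shared_offset = ((0 + 8 - 1) / 8) * 8;
+        //   x = dynamic_shared + dynamic_shared_offset;
+        // while the Rust-side offset string keeps accumulating the size expressions.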
+ write!(w, "\tuint64_t dynamic_shared_offset;\n")?; + + Ok(()) + } + + /* + * Emit calculation of all dynamic constants + */ + fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> { + for dc in dynamic_constants_bottom_up(&self.dynamic_constants) { + let dc_val = format!("unsigned long long dc{}", dc.idx()); + match &self.dynamic_constants[dc.idx()] { + DynamicConstant::Constant(val) => write!(w, "\t{} = {}ull;\n", dc_val, val)?, + DynamicConstant::Parameter(idx) => { + if *idx < self.function.num_dynamic_constants as usize { + write!(w, "\t{} = dc_p{};\n", dc_val, idx)? + } else { + write!(w, "\t{} = 0;\n", dc_val)? + } + } + DynamicConstant::Add(args) => { + let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" + "); + write!(w, "\t{} = {};\n", dc_val, rhs)? + } + DynamicConstant::Mul(args) => { + let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" * "); + write!(w, "\t{} = {};\n", dc_val, rhs)? + } + DynamicConstant::Min(args) => { + let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("min(dc{}, ", arg.idx())).collect(); + let rhs_last = format!("dc{}", args.last().unwrap().idx()); + let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect(); + write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)? + } + DynamicConstant::Max(args) => { + let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("max(dc{}, ", arg.idx())).collect(); + let rhs_last = format!("dc{}", args.last().unwrap().idx()); + let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect(); + write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)? + } + DynamicConstant::Sub(left, right) => { + write!(w, "\t{} = dc{} - dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Div(left, right) => { + write!(w, "\t{} = dc{} / dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Rem(left, right) => { + write!(w, "\t{} = dc{} % dc{};\n", dc_val, left.idx(), right.idx())? + } + } + } + Ok(()) + } + + /* + * To abide by c++ reassignment restrictions, we declare all data values + * upfront. + */ + fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> { + for id in (0..self.function.nodes.len()).map(NodeID::new) { + if !self.function.nodes[id.idx()].is_control() && + !self.function.nodes[id.idx()].is_dynamic_constant() && + !self.function.nodes[id.idx()].is_parameter() { + write!(w, "\t{};\n", self.get_value(id, true, false))?; + } + if self.function.nodes[id.idx()].is_phi() { + write!(w, "\t{}_tmp;\n", self.get_value(id, true, false))?; + } + } + Ok(()) + } + + /* + * Emit helper registers that are used throughout the kernel. grid and block + * are from CUDA's cooperative groups API and are used specifically for reads + * and writes. 
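+ * For example, outside thread forks collection memsets and writes are emitted
+ * as loops strided by block.thread_rank()/block.size(), and the final return
+ * value store is gated on grid.thread_rank() == 0.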
+ */ + fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { + write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; + write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?; + Ok(()) + } + + fn codegen_goto_start(&self, w: &mut String) -> Result<(), Error> { + let block_start = self.get_block_name(NodeID::new(0), false); + write!(w, "\tgoto {};\n", block_start)?; + Ok(()) + } + + fn codegen_gotos(&self, goto_debug: bool, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> { + write!(w, "\n")?; + for (id, goto) in gotos.iter() { + let goto_block = self.get_block_name(*id, false); + write!(w, "{}:\n", goto_block)?; + if goto_debug { + write!(w, "\tprintf(\"goto {}\\n\");\n", goto_block)?; + } + write!(w, "{}", goto.init)?; + if !goto.post_init.is_empty() { + let goto_block = self.get_block_name(*id, true); + write!(w, "{}:\n", goto_block)?; + write!(w, "{}", goto.post_init)?; + } + write!(w, "{}", goto.body)?; + write!(w, "{}\n", goto.term)?; + } + Ok(()) + } + + fn codegen_launch_code(&self, num_blocks: String, num_threads: String, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { + // The following steps are for host-side C function arguments, but we also + // need to pass arguments to kernel, so we keep track of the arguments here. + let ret_type = self.get_type(self.function.return_type, false); + let mut pass_args = String::new(); + write!(w, " +extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; + let mut first_param = true; + // The first parameter is a pointer to GPU backing memory, if it's + // needed. + if self.backing_allocation.contains_key(&Device::CUDA) { + first_param = false; + write!(w, "char* backing")?; + write!(pass_args, "backing")?; + } + // The second set of parameters are dynamic constants. + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; + } + write!(w, "unsigned long long dc_p{}", idx)?; + write!(pass_args, "dc_p{}", idx)?; + } + // The third set of parameters are normal arguments. 
+ for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; + } + let param_type = self.get_type(*ty, false); + write!(w, "{} p{}", param_type, idx)?; + write!(pass_args, "p{}", idx)?; + } + write!(w, ") {{\n")?; + // For case of dynamic block count + self.codegen_dynamic_constants(w)?; + let has_ret_var = self.return_parameter.is_none(); + if has_ret_var { + // Allocate return parameter and lift to kernel argument + let ret_type_pnt = self.get_type(self.function.return_type, true); + write!(w, "\t{} ret;\n", ret_type_pnt)?; + if !first_param { + write!(pass_args, ", ")?; + } + write!(pass_args, "ret")?; + write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; + } + write!(w, "\tcudaError_t err;\n"); + write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; + write!(w, "\terr = cudaGetLastError();\n"); + write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n"); + write!(w, "\tcudaDeviceSynchronize();\n")?; + write!(w, "\terr = cudaGetLastError();\n"); + write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n"); + if has_ret_var { + // Copy return from device to host, whether it's primitive value or collection pointer + write!(w, "\t{} host_ret;\n", ret_type)?; + write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; + write!(w, "\treturn host_ret;\n")?; + } else { + write!(w, "\treturn p{};\n", self.return_parameter.unwrap())?; + } + write!(w, "}}\n")?; + Ok(()) + } + + /* + * If tree has a single root fork of known size s <= max_num_blocks + * with parallel-fork schedule, then set num_blocks to s, else set num_blocks + * to 1. Also return the root fork(s) for parallelization strategy within + * threadblocks for threads and their eventual generation. + */ + fn get_root_forks_and_num_blocks( + &self, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + ) -> (HashSet<NodeID>, String, bool) { + let root_forks: HashSet<NodeID> = fork_tree.get(&NodeID::new(0)).unwrap().clone(); + if root_forks.len() != 1 { + return (root_forks, "1".to_string(), false); + } + + let root_fork = root_forks.iter().next().unwrap(); + let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else { + panic!("Expected fork node"); + }; + let reduces = &self.fork_reduce_map[root_fork]; + if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) + { + let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * "); + (root_forks, fork_size, true) + } else { + (root_forks, "1".to_string(), false) + } + } + + /* + * If there's a block fork, then thread root forks are it's child forks. If + * not, thread root forks are the root forks. This will be used to begin the + * thread fork tree traversal for codegen. + */ + fn get_thread_root_forks( + &self, + root_forks: &HashSet<NodeID>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + is_block_parallel: bool, + ) -> (NodeID, HashSet<NodeID>) { + if is_block_parallel { + let root_fork = root_forks.iter().next().unwrap(); + (*root_fork, fork_tree.get(&root_fork).unwrap().iter().copied().collect()) + } else { + (NodeID::new(0), root_forks.clone()) + } + } + + /* + * This analysis determines the parallelization strategy within threadblocks. 
+ * We run post-order traversal on the fork tree to get the thread quota per + * subtree. In particular, each fork starts with a base factor as the + * maximum over its descendants (leafs have base 1). We traverse up (details + * in helper) and pass the factor and a map from fork node to a tuple of + * (max quota of its siblings (including itself), its quota, its fork factor) + * from each node to its parents. The parent then compares the received quota + * of its subtree vs just it's own. If it's associative, it chooses the larger + * of the two, if not it can parallelize both if applicable and if they fit. + * + * Finally, the map is returned such that a node is in the map IFF it will + * be parallelized. If not, the fork will use the parent's quota and serialize + * over the Fork's ThreadIDs. Nodes may be removed from the map when traversing + * up the tree due to conflicting (due to associative or limit) ancestor of + * larger factor. + */ + fn get_thread_quotas( + &self, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + root_fork: NodeID, + ) -> (HashMap<NodeID, (usize, usize, usize)>, usize) { + let (tree_map, tree_quota, _) = self.recurse_thread_quotas(root_fork, fork_tree, true); + (tree_map, tree_quota) + } + + fn recurse_thread_quotas( + &self, + curr_fork: NodeID, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + is_root: bool, + ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) { + // Subsubtree map is the union of all keys for grandchildren and lower + // nodes. children_quota_map is a constructed map from parallelized children + // to their quota to update the subsubtree map at grandchildren level to + // subtreemap at children level. subtree_quota is cumulative factor of + // subtree and is then compared to this fork's factor. + let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree + .get(&curr_fork) + .unwrap() + .iter() + .map(|child| (child, self.recurse_thread_quotas(*child, fork_tree, false))) + .fold( + (HashMap::new(), HashMap::new(), 1), + |(mut subsubtree_map, mut children_quota_map, subtree_quota), (child, (curr_map, curr_quota, use_curr))| { + subsubtree_map.extend(curr_map); + if use_curr { + children_quota_map.insert(child, curr_quota); + } + (subsubtree_map, children_quota_map, subtree_quota.max(curr_quota)) + }, + ); + // First update children_quota_map items with full information and add + // to subsubtree_map + for (&child, quota) in children_quota_map.iter() { + let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else { + panic!("Expected fork node"); + }; + let fork_size = self.multiply_fork_factors(factors).unwrap(); + subsubtree_map.insert(*child, (subtree_quota, *quota, fork_size)); + } + let subtree_map = subsubtree_map; + if is_root { + return (subtree_map, subtree_quota, true) + } + // A node can only be considered for parallelization if: + // a) it has statically known size + // b) the known size is less than or equal to the max_num_threads + // c) the known size is a power of 2 + // d) all reduces are parallel-reduce or associative + // + // If not, just take the max cumulative factor of its subtree + let reduces = &self.fork_reduce_map[&curr_fork]; + if let Node::Fork { factors, .. 
} = &self.function.nodes[curr_fork.idx()] + && let Some(fork_size) = self.multiply_fork_factors(factors) + && fork_size <= self.kernel_params.max_num_threads + && fork_size.is_power_of_two() + && reduces.iter().all(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) + || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) + }) + { + // If there's an associative Reduce, parallelize the larger factor + // between the Fork and subtree + // Else, all Reduces must be only parallel-reduce, so parallelize + // both if they fit and the larger if not. + // The reason for this distinction is that we only perform Reduces over + // ThreadID-based values over consecutive CUDA threads, so there's no + // opportunity for further nested parallelization. In contrast, this + // restriction doesn't help for parallel Writes, so nested parallelization + // is possible. + if reduces.iter().any(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) + }) || fork_size > self.kernel_params.max_num_threads / subtree_quota { + if fork_size >= subtree_quota { + (HashMap::new(), fork_size, true) + } else { + (subtree_map, subtree_quota, false) + } + } else { + (subtree_map, fork_size * subtree_quota, true) + } + } else { + (subtree_map, subtree_quota, false) + } + } + + /* + * All non reduced-over collections used in fork joins have an extra dimension. + * However, this is only useful if ThreadIDs run in parallel not serially, + * otherwise it's unnecessarily consuming shared memory. This function returns + * the set of collections that have an unnecessary extra dimension. + */ + fn get_extra_dim_collects( + &self, + fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, + fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, + ) -> HashSet<TypeID> { + // Determine which fork each collection is used in, and check if it's + // parallelized via the fork_thread_quota_map. + todo!() + } + + fn codegen_data_control_no_forks( + &self, + extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_control()) + .try_for_each(|idx| -> Result<(), Error> { + let control = NodeID::new(idx); + let goto = gotos.get_mut(&control).unwrap(); + let init = &mut goto.init; + let post_init = &mut goto.post_init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(control, None, None, None, init, post_init, term)?; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node(*data, KernelState::OutBlock, Some(false), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + } + Ok(()) + }) + } + + /* + * Codegen for all control and data nodes. + */ + fn codegen_data_control( + &self, + block_fork: Option<NodeID>, + thread_root_forks: &HashSet<NodeID>, + fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, + extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, + is_block_parallel: bool, + num_threads: usize, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + // First emit data and control gen for each control node outside any fork. + // Recall that this was tracked through a fake fork node with NodeID 0. 
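+        // (the NodeID(0) key is produced by fork_control_map in fork_tree.rs for
+        // every control node not nested in any fork)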
+ let mut state = KernelState::OutBlock; + for control in self.fork_control_map.get(&NodeID::new(0)).unwrap() { + let goto = gotos.get_mut(control).unwrap(); + let init = &mut goto.init; + let post_init = &mut goto.post_init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node(*data, state, Some(is_block_parallel), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + } + } + // Then generate data and control for the single block fork if it exists + if block_fork.is_some() { + state = KernelState::InBlock; + for control in self.fork_control_map.get(&block_fork.unwrap()).unwrap() { + let goto = gotos.get_mut(control).unwrap(); + let init = &mut goto.init; + let post_init = &mut goto.post_init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node(*data, state, None, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + } + } + } + // Then generate for the thread fork tree through Fork node traversal. + state = KernelState::InThread; + for &root_fork in thread_root_forks { + self.codegen_data_control_traverse( + root_fork, + state, + fork_thread_quota_map, + 1, + num_threads, + extra_dim_collects, + dynamic_shared_offset, + gotos, + )?; + } + Ok(()) + } + + /* + * The important feature of this traversal is that we update the available + * thread quota, use thread quota, and parallel factor for each Fork node. + * Either this information is in the precomputed map, or we use the parent's + * quota with no parallel factor. 
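+ *
+ * Illustrative example: with max_num_threads = 1024, a fork P of size 8 whose
+ * only child fork C has size 64 (all reduces parallel) can have both levels
+ * parallelized; the quota map then holds C -> (64, 64, 64) and P -> (512, 512, 8),
+ * so P's controls run with use_thread_quota 512 (a 64-thread tile per ThreadID)
+ * and C's with 64 (one thread per ThreadID).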
+ */ + fn codegen_data_control_traverse( + &self, + curr_fork: NodeID, + state: KernelState, + fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, + parent_quota: usize, + num_threads: usize, + extra_dim_collections: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map + .get(&curr_fork) + .map(|(a, u, f)| (*a, *u, Some(*f))) + .unwrap_or((parent_quota, parent_quota, None)); + let reduces = &self.fork_reduce_map[&curr_fork]; + let reducts = if parallel_factor.is_some() { + reduces + .iter() + .map(|&reduce| { + let Node::Reduce { control: _, init: _, reduct} = &self.function.nodes[reduce.idx()] else { + panic!("Expected reduce node"); + }; + *reduct + }) + .collect() + } else { + HashSet::new() + }; + for control in self.fork_control_map.get(&curr_fork).unwrap() { + let goto = gotos.get_mut(control).unwrap(); + let init = &mut goto.init; + let post_init = &mut goto.post_init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor, init, post_init, term)?; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node( + *data, + state, + None, + Some(use_thread_quota), + parallel_factor, + Some(curr_fork), + reducts.contains(data), + extra_dim_collections, + dynamic_shared_offset, + body, + &mut tabs, + )?; + } + } + for child in self.fork_tree.get(&curr_fork).unwrap() { + self.codegen_data_control_traverse( + *child, + state, + fork_thread_quota_map, + use_thread_quota, + num_threads, + extra_dim_collections, + dynamic_shared_offset, + gotos, + )?; + } + Ok(()) + } + + fn codegen_data_node( + &self, + id: NodeID, + state: KernelState, + is_block_parallel: Option<bool>, + use_thread_quota: Option<usize>, + parallel_factor: Option<usize>, + nesting_fork: Option<NodeID>, + is_special_reduct: bool, + extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, + w: &mut String, + num_tabs: &mut usize, + ) -> Result<(), Error> { + let define_variable = self.get_value(id, false, false).to_string(); + let tabs = "\t".repeat(*num_tabs); + match &self.function.nodes[id.idx()] { + Node::Phi { control: _, data: _ } => { + write!(w, "{}{} = {}_tmp;\n", tabs, define_variable, define_variable)?; + } + Node::ThreadID { control, dimension } => { + let Node::Fork { factors, .. 
} = &self.function.nodes[control.idx()] else { + panic!("Expected ThreadID's control to be a fork node"); + }; + let divide = multiply_dcs(&factors[dimension + 1..]); + let modulo = format!("dc{}", factors[*dimension].idx()); + match state { + KernelState::InBlock => { + write!( + w, + "{}{} = (blockIdx.x / ({})) % {};\n", + tabs, define_variable, divide, modulo + )?; + } + KernelState::InThread => { + if parallel_factor.is_none() { + // No dependence on threadIdx.x because each used thread + // will run this Fork serially + let fork_iter = self.get_fork_iter(*control, false); + write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; + } else { + // We can directly use use_thread_quota and not worry about available + // because Fork basic block's init section already does gating + write!(w, "{}{} = (threadIdx.x % {}) / {};\n", tabs, define_variable, use_thread_quota.unwrap(), use_thread_quota.unwrap() / parallel_factor.unwrap())?; + } + } + _ => { + panic!("Unsupported state for ThreadID") + } + } + } + // The Reduce node only generates it's initialization, as reduct will + // perform the update. If serialized, add gate to prevent re-assignment + // when we hit this reduce again due to the control flow loop between + // the Fork and Join. + Node::Reduce { + control: _, + init, + reduct: _, + } => { + let init_val = self.get_value(*init, false, false); + if parallel_factor.is_none() && KernelState::InThread == state { + let Some(nesting_fork) = nesting_fork else { + panic!("Expected reduce to be nested in a fork node"); + }; + let fork_iter = self.get_fork_iter(nesting_fork, false); + write!(w, "{}if ({} == 0) {{\n", tabs, fork_iter)?; + write!(w, "{}\t{} = {};\n", tabs, define_variable, init_val)?; + write!(w, "{}}}\n", tabs)?; + } else { + write!(w, "{}{} = {};\n", tabs, define_variable, init_val)?; + } + } + // Parameters emitted at top + Node::Parameter { index: _ } => {} + // If the constant is primitive, it's stored in register so we repeat + // for all threads. Otherwise, it can be inside or outside block fork. + // If inside, it's stored in shared memory so we "allocate" it once + // and parallelize memset to 0. If outside, we initialize as offset + // to backing, but if multi-block grid, don't memset to avoid grid-level sync. 
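+            // E.g. (illustrative) a non-primitive InBlock constant x is emitted as:
+            //   if (block.thread_rank() == 0) { x = dynamic_shared + ...; }
+            //   block.sync();
+            //   for (int i = block.thread_rank(); i < size; i += block.size()) { *(x + i) = 0; }
+            //   block.sync();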
+ Node::Constant { id: cons_id } => { + let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); + let cg_tile = match state { + KernelState::OutBlock | KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), + }; + if !is_primitive && state != KernelState::OutBlock { + write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; + *num_tabs += 1; + } + if is_primitive || state != KernelState::OutBlock { + self.codegen_constant( + define_variable.clone(), + *cons_id, + true, + Some(extra_dim_collects), + dynamic_shared_offset, + w, + *num_tabs, + )?; + } + if !is_primitive && state != KernelState::OutBlock { + write!(w, "{}}}\n", tabs)?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + *num_tabs -= 1; + } + if !is_primitive && state == KernelState::OutBlock { + let (_, offsets) = &self.backing_allocation[&Device::CUDA]; + let offset = offsets[&id]; + write!(w, "{}{} = backing + dc{};\n", tabs, define_variable, offset.idx())?; + } + if !is_primitive && (state != KernelState::OutBlock || is_block_parallel.is_none() || !is_block_parallel.unwrap()) { + let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects)); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; + write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; + write!(w, "{}}}\n", tabs)?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + } + } + // Dynamic constants emitted at top + Node::DynamicConstant { id: _ } => {} + Node::Unary { op, input } => match op { + UnaryOperator::Not => match &self.types[self.typing[input.idx()].idx()] { + Type::Boolean => { + write!( + w, + "{}{} = !{};\n", + tabs, + define_variable, + self.get_value(*input, false, false), + )?; + } + ty if ty.is_fixed() => { + write!( + w, + "{}{} = ~{};\n", + tabs, + define_variable, + self.get_value(*input, false, false), + )?; + } + _ => panic!("Unsupported type for not operator"), + }, + UnaryOperator::Neg => match &self.types[self.typing[input.idx()].idx()] { + ty if ty.is_signed() || ty.is_float() => { + write!( + w, + "{}{} = -{};\n", + tabs, + define_variable, + self.get_value(*input, false, false), + )?; + } + _ => { + panic!("Unsupported type for neg operator") + } + }, + UnaryOperator::Cast(dst_ty_id) => { + write!( + w, + "{}{} = static_cast<{}>({});\n", + tabs, + define_variable, + self.get_type(*dst_ty_id, false), + self.get_value(*input, false, false), + )?; + } + }, + Node::Binary { op, left, right } => { + let mut left_val = self.get_value(*left, false, false); + let mut right_val = self.get_value(*right, false, false); + let id_type = self.typing[id.idx()]; + if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And + | BinaryOperator::Xor) && is_special_reduct { + // For parallelized associative Reduces, use the cooperative + // groups reduce API. Associative multiplication is not + // supported. We need to use CGType::Use not CGType::UsePerId + // because for parallelized reduction we only have one thread + // per ThreadID and the reduction is over Use, not UsePerId. 
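+                    // E.g. (illustrative, hypothetical names) for v = add(x, r) where r
+                    // is a parallelized associative reduce:
+                    //   v = cg::reduce(cg_tile_use, x, cg::plus<int>());
+                    //   v = v + r;   // the normal binop below then folds in the reduce value
+                    //   r = v;       // appended later via reduct_reduce_map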
+ let (reduce_val, non_reduce_val) = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[left.idx()] { + (left_val, right_val) + } else { + (right_val, left_val) + }; + // Special reduct is only enabled for thread parallelization + // so don't need to worry about grid and block cases + let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use); + #[allow(unreachable_patterns)] + let cg_op = match op { + BinaryOperator::Add => "plus", + BinaryOperator::Or => "bit_or", + BinaryOperator::And => "bit_and", + BinaryOperator::Xor => "bit_xor", + _ => unreachable!(), + }; + let id_type_name = self.get_type(id_type, false); + write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, cg_tile, non_reduce_val, cg_op, id_type_name)?; + // Setup binop between reduce's init and reduced reduct. Since it's associative, + // we can change binop ordering + left_val = define_variable.clone(); + right_val = reduce_val; + } + match (op, &self.types[id_type.idx()]) { + (BinaryOperator::Or, Type::Boolean) => write!( + w, + "{}{} = {} || {};\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::And, Type::Boolean) => write!( + w, + "{}{} = {} && {};\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::Rem, Type::Float32) => write!( + w, + "{}{} = fmodf({}, {});\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::Rem, Type::Float64) => write!( + w, + "{}{} = fmod({}, {});\n", + tabs, define_variable, left_val, right_val, + )?, + (op, _) => write!( + w, + "{}{} = {} {} {};\n", + tabs, + define_variable, + left_val, + match op { + BinaryOperator::Add => "+", + BinaryOperator::Sub => "-", + BinaryOperator::Mul => "*", + BinaryOperator::Div => "/", + BinaryOperator::Rem => "%", + BinaryOperator::LT => "<", + BinaryOperator::LTE => "<=", + BinaryOperator::GT => ">", + BinaryOperator::GTE => ">=", + BinaryOperator::EQ => "==", + BinaryOperator::NE => "!=", + BinaryOperator::Or => "|", + BinaryOperator::And => "&", + BinaryOperator::Xor => "^", + BinaryOperator::LSh => "<<", + BinaryOperator::RSh => ">>", + }, + right_val, + )?, + }; + } + Node::Ternary { + op, + first, + second, + third, + } => match op { + TernaryOperator::Select => { + write!( + w, + "{}{} = {} ? 
{} : {};\n", + tabs, + define_variable, + self.get_value(*first, false, false), + self.get_value(*second, false, false), + self.get_value(*third, false, false), + )?; + } + }, + Node::IntrinsicCall { intrinsic, args } => { + let id_type = self.typing[id.idx()]; + if matches!(intrinsic, Intrinsic::Max | Intrinsic::Min) && is_special_reduct { + // Similar to associative Binops + let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[args[0].idx()] { + self.get_value(args[1], false, false) + } else { + self.get_value(args[0], false, false) + }; + let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use); + #[allow(unreachable_patterns)] + let cg_op = match intrinsic { + Intrinsic::Max => "max", + Intrinsic::Min => "min", + _ => unreachable!(), + }; + let id_type_name = self.get_type(id_type, false); + write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, non_reduce_arg, cg_tile, cg_op, id_type_name)?; + } else { + let ty = &self.types[id_type.idx()]; + let intrinsic = self.codegen_intrinsic(intrinsic, ty); + let args = args.iter() + .map(|arg| self.get_value(*arg, false, false)) + .collect::<Vec<_>>() + .join(", "); + write!( + w, + "{}{} = {}({});\n", + tabs, + define_variable, + intrinsic, + args, + )?; + } + } + // Read of primitive requires load after pointer math. + Node::Read { collect, indices } => { + let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects); + let data_type_id = self.typing[id.idx()]; + if self.types[data_type_id.idx()].is_primitive() { + let type_name = self.get_type(data_type_id, true); + write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?; + } else { + write!(w, "{}{} = {};\n", tabs, define_variable, collect_with_indices)?; + } + } + // Write of primitive needs a thread rank gate for safety. Write of + // collection is memcpy that we distribute among threads. 
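+            // E.g. (illustrative) a collection write of `size` bytes within tile cg
+            // is emitted roughly as:
+            //   for (int i = cg.thread_rank(); i < size; i += cg.size()) { *(dst + i) = *(src + i); }
+            //   cg.sync();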
+ Node::Write { + collect, + data, + indices, + } => { + let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects); + let data_variable = self.get_value(*data, false, false); + let data_type_id = self.typing[data.idx()]; + let cg_tile = match state { + KernelState::OutBlock | KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), + }; + if self.types[data_type_id.idx()].is_primitive() { + write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; + let type_name = self.get_type(data_type_id, true); + write!(w, "{}\t*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; + write!(w, "{}}}\n", tabs)?; + } else { + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; + write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, collect_with_indices, data_variable)?; + write!(w, "{}}}\n", tabs)?; + write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; + write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?; + write!(w, "{}}}\n", tabs)?; + } + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + let collect_variable = self.get_value(*collect, false, false); + write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; + } + _ => { + panic!("Unsupported data node type: {:?}", self.function.nodes[id.idx()]) + } + } + // Since reducts are responsible for updating Reduce nodes, we check and + // emit those for each data node. 
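+        // E.g. if data node v is the reduct of reduce r, this appends `r = v;`.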
+ if let Some(reduces) = self.reduct_reduce_map.get(&id) { + let val = self.get_value(id, false, false); + for reduce in reduces { + let reduce_val = self.get_value(*reduce, false, false); + write!(w, "{}{} = {};\n", tabs, reduce_val, val)?; + } + } + Ok(()) + } + + fn codegen_control_node( + &self, + id: NodeID, + available_thread_quota: Option<usize>, + use_thread_quota: Option<usize>, + parallel_factor: Option<usize>, + w_init: &mut String, + w_post_init: &mut String, + w_term: &mut String, + ) -> Result<usize, Error> { + for (data, phi) in self.control_data_phi_map.get(&id).unwrap_or(&vec![]).iter() { + let data = self.get_value(*data, false, false); + let phi = self.get_value(*phi, false, false); + write!(w_term, "\t{}_tmp = {};\n", phi, data)?; + } + let tabs = match &self.function.nodes[id.idx()] { + Node::Start + | Node::Region { preds: _ } + | Node::Projection { control: _, selection: _ } => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; + 1 + } + Node::If { control: _, cond } => { + let mut succs = self.control_subgraph.succs(id); + let succ1 = succs.next().unwrap(); + let succ2 = succs.next().unwrap(); + let succ1_is_true = self.function.nodes[succ1.idx()].try_projection(1).is_some(); + let succ1_block_name = self.get_block_name(succ1, false); + let succ2_block_name = self.get_block_name(succ2, false); + write!( + w_term, + "\tif ({}) {{\n", + self.get_value(*cond, false, false) + )?; + write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ1_block_name.clone() } else { succ2_block_name.clone() })?; + write!(w_term, "\t}} else {{\n")?; + write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ2_block_name } else { succ1_block_name })?; + write!(w_term, "\t}}\n")?; + 1 + } + Node::Fork { control: _, factors: _ } => { + // We create a cooperative group tile for each of: used threads per + // thread ID- for reads and writes-, used threads across all thread + // IDs- for parallelized reductions-, and available threads- to + // synchronize between used and unused threads. We want to create + // these only once, so we create two goto sections for each fork- + // one run only once for creating groups, and other may be ran + // multiple times if the Fork is serialized and Join jumps back + // to it. + let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + if use_thread_quota.is_some() { + let use_thread_quota = use_thread_quota.unwrap(); + let use_thread_per_id = if parallel_factor.is_some() { + use_thread_quota / parallel_factor.unwrap() + } else { + use_thread_quota + }; + write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id)?; + let cg_tile_use = self.get_cg_tile(id, CGType::Use); + write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota)?; + let available_thread_quota = available_thread_quota.unwrap(); + let cg_tile_available = self.get_cg_tile(id, CGType::Available); + write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota)?; + if parallel_factor.is_none() { + write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?; + write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; + } + } + // Fork nodes gate the used vs unused threads out of all available + // threads. 
If unused, we jump straight to the Join, and if used, + // we jump to successor like normal. + let succ = self.control_subgraph.succs(id).next().unwrap(); + if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { + let w_target = if parallel_factor.is_none() { w_post_init } else { w_init }; + write!(w_target, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?; + write!(w_term, "\t}}\n")?; + write!(w_term, "\telse {{\n")?; + let join = self.fork_join_map.get(&id).unwrap(); + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*join, false))?; + write!(w_term, "\t}}\n")?; + 2 + } else { + // Make sure post-init isn't empty so it goto header generated + if use_thread_quota.is_some() && parallel_factor.is_none() { + write!(w_post_init, " ")?; + } + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; + 1 + } + } + Node::Join { control: _ } => { + // Join nodes also gate the used vs unused threads with a tile + // sync after the body. + let succ = self.control_subgraph.succs(id).next().unwrap(); + let has_thread_quota = available_thread_quota.is_some(); + let mut tabs = 1; + if has_thread_quota { + let available_thread_quota = available_thread_quota.unwrap(); + let use_thread_quota = use_thread_quota.unwrap(); + if use_thread_quota < available_thread_quota { + write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + write!(w_term, "\t}}\n")?; + tabs += 1; + } + let fork = self.join_fork_map.get(&id).unwrap(); + let cg_tile_available = self.get_cg_tile(*fork, CGType::Available); + write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + } + // If the Fork was parallelized, each thread or UsedPerId tile of + // threads only runs one ThreadID, so we can jump straight to the + // successor. Else, we jump back to the Fork until all ThreadIDs + // or equivalently the Fork's full factor number of iterations have + // been completed. + if parallel_factor.is_some() { + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; + } else { + let fork = self.join_fork_map.get(&id).unwrap(); + let Node::Fork { factors, .. } = &self.function.nodes[fork.idx()] else { + panic!("Expected join_fork_map to point to a fork node"); + }; + let fork_size = multiply_dcs(factors); + let fork_iter = self.get_fork_iter(*fork, false); + write!(w_term, "\t{} += 1;\n", fork_iter)?; + write!(w_term, "\tif ({} == {}) {{\n", fork_iter, fork_size)?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?; + write!(w_term, "\t}}\n")?; + write!(w_term, "\telse {{\n")?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*fork, true))?; + write!(w_term, "\t}}\n")?; + } + tabs + } + Node::Return { control: _, data } => { + if self.return_parameter.is_none() { + // Since we lift return into a kernel argument, we write to that + // argument upon return. 
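+                    // E.g. (illustrative) emitted as:
+                    //   if (grid.thread_rank() == 0) { *(reinterpret_cast<float*>(ret)) = v; }
+                    //   return;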
+ let return_val = self.get_value(*data, false, false); + let return_type_ptr = self.get_type(self.function.return_type, true); + write!(w_term, "\tif (grid.thread_rank() == 0) {{\n")?; + write!(w_term, "\t\t*(reinterpret_cast<{}>(ret)) = {};\n", return_type_ptr, return_val)?; + write!(w_term, "\t}}\n")?; + } + write!(w_term, "\treturn;\n")?; + 1 + } + _ => { + panic!("Unsupported control node type") + } + }; + Ok(tabs) + } + + /* + * This function emits collection name + pointer math for the provided indices. + * All collection types use char pointers. + */ + fn codegen_collect(&self, collect: NodeID, indices: &[Index], extra_dim_collects: &HashSet<TypeID>) -> String { + let mut index_ptr = "0".to_string(); + let type_id = self.typing[collect.idx()]; + for index in indices { + match index { + Index::Field(field) => { + self.get_size(type_id, Some(*field), Some(extra_dim_collects)); + } + // Variants of summations have zero offset + Index::Variant(_) => {} + // Convert multi-d array index to 1-d index, and optionally + // convert to single-byte index by multiplying by element size + Index::Position(array_indices) => { + let has_extra_dim = extra_dim_collects.contains(&self.typing[collect.idx()]); + if has_extra_dim { + continue; + } + let Type::Array(element_type, extents) = + &self.types[self.typing[collect.idx()].idx()] + else { + panic!("Expected array type") + }; + let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); + let max_left_array_index = array_indices.len() - 1; + for (i, index) in array_indices.iter().rev().enumerate() { + cumulative_offset = format!( + "{} * ({}{}", + cumulative_offset, + self.get_value(*index, false, false), + if i != max_left_array_index { + format!(" + dc{}", extents[max_left_array_index - i].idx()) + } else { + "".to_string() + } + ); + } + index_ptr.push_str(&format!( + " + {}{}", + cumulative_offset, + ")".repeat(array_indices.len()) + )); + let element_size = self.get_size(*element_type, None, Some(extra_dim_collects)); + index_ptr.push_str(&format!(" * {}", element_size)); + } + } + } + let name = self.get_value(collect, false, false); + format!("{} + {}", name, index_ptr) + } + + /* + * The outlined codegen for constants allows us to handle recursive initialization + * for collections. We perform "allocation" by atomically incrementing dynamic + * shared memory and CUDA's support for dynamic is limited to a single extern + * array. Dynamic is required here because not all dynamic constants and therefore + * array sizes are known. This approach will need further work, as currently + * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate` + * prevents unnecessary shared memory allocations for nested product and summation + * collections, since the outermost allocates everything for the full collection. + * Since not initialized, array collections don't need to be recursed into. 
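+ * For example (illustrative), a scalar float field at byte offset off of a
+ * product p is initialized in place via `*reinterpret_cast<float*>(p+off) = 1.0f;`.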
+ */ + fn codegen_constant( + &self, + name: String, + cons_id: ConstantID, + allow_allocate: bool, + extra_dim_collects: Option<&HashSet<TypeID>>, + dynamic_shared_offset: &mut String, + w: &mut String, + num_tabs: usize, + ) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); + match &self.constants[cons_id.idx()] { + Constant::Boolean(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Integer8(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::UnsignedInteger8(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Integer16(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::UnsignedInteger16(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Integer32(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::UnsignedInteger32(val) => write!(w, "{}{} = {}ul;\n", tabs, name, val)?, + Constant::Integer64(val) => write!(w, "{}{} = {}ll;\n", tabs, name, val)?, + Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?, + Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, format_float(**val as f64))?, + Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, format_float(**val))?, + // All three following collections involve align then allocate from the + // single dynamic shared memory buffer by using and updating the offset. + Constant::Product(type_id, constant_fields) => { + if allow_allocate { + let alignment = self.get_alignment(*type_id); + let size = self.get_size(*type_id, None, extra_dim_collects); + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; + write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); + } + let Type::Product(type_fields) = &self.types[type_id.idx()] else { + panic!("Product constant should have product type") + }; + for i in 0..constant_fields.len() { + // For each field update offset and issue recursive call + let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects); + let field_constant = &self.constants[constant_fields[i].idx()]; + if field_constant.is_scalar() { + let field_type = self.get_type(type_fields[i], true); + self.codegen_constant( + format!("*reinterpret_cast<{}>({}+{})", field_type, name, offset), + constant_fields[i], + false, + None, + dynamic_shared_offset, + w, + num_tabs, + )?; + } else if !field_constant.is_array() { + self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, dynamic_shared_offset, w, num_tabs)?; + } + } + } + Constant::Summation(type_id, variant, field) => { + if allow_allocate { + let alignment = self.get_alignment(*type_id); + let size = self.get_size(*type_id, None, extra_dim_collects); + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; + write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); + } + // No offset updating needed since all variants start at 0 + let Type::Summation(variants) = &self.types[type_id.idx()] else { + panic!("Summation constant should have summation type") + }; + let variant_constant = &self.constants[field.idx()]; + if 
variant_constant.is_scalar() { + let variant_type = self.get_type(self.typing[variants[*variant as usize].idx()], true); + self.codegen_constant( + format!("*reinterpret_cast<{}>({})", variant_type, name), + *field, + false, + extra_dim_collects, + dynamic_shared_offset, + w, + num_tabs, + )?; + } else if !variant_constant.is_array() { + self.codegen_constant(name, *field, false, extra_dim_collects, dynamic_shared_offset, w, num_tabs)?; + }; + } + Constant::Array(type_id) => { + if !allow_allocate { + panic!("Nested array constant should not be re-allocated"); + } + let alignment = self.get_alignment(*type_id); + let size = self.get_size(*type_id, None, extra_dim_collects); + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; + write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); + } + } + Ok(()) + } + + /* + * Emit code to calculate data size. For Product types, setting `num_fields` + * gives data size up to but not including that field, so = 2 gives 1st field + * and offset to 2nd field. This is useful for constant initialization and read/write + * index math. + */ + fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>) -> String { + match &self.types[type_id.idx()] { + Type::Array(element_type, extents) => { + let array_size = if extra_dim_collects.is_some() && extra_dim_collects.unwrap().contains(&type_id) { "1".to_string() } else { multiply_dcs(extents) }; + format!("{} * {}", self.get_alignment(*element_type), array_size) + } + Type::Product(fields) => { + let num_fields = &num_fields.unwrap_or(fields.len()); + let with_field = fields + .iter() + .enumerate() + .filter(|(i, _)| i < num_fields) + .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects), self.get_alignment(*id))) + .fold(String::from("0"), |acc, (size, align)| { + if acc == "0" { + size + } else { + format!( + "({} + {} - 1) / {} * {} + {}", + acc, align, align, align, size + ) + } + }); + if num_fields < &fields.len() { + format!( + "{} - {}", + with_field, + self.get_size(fields[*num_fields], None, extra_dim_collects) + ) + } else { + with_field + } + } + Type::Summation(variants) => { + // The argmax variant by size is not guaranteed to be same as + // argmax variant by alignment, eg product of 3 4-byte primitives + // vs 1 8-byte primitive, so we need to calculate both. 
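+ // For example (purely illustrative numbers): a summation whose variants are
+ // a product of three 4-byte ints (size 12, alignment 4) and a single 8-byte
+ // int (size 8, alignment 8) has max size 12 but max alignment 8, so the
+ // formula below rounds 12 up to 16.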
+ let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects)).fold( + String::from("0"), + |acc, x| { + if acc == "0" { + x + } else { + format!("umax({}, {})", acc, x) + } + }, + ); + let max_alignment = variants + .iter() + .map(|id| self.get_alignment(*id)) + .max() + .unwrap_or(0); + format!( + "({} + {} - 1) / {} * {}", + max_size, max_alignment, max_alignment, max_alignment + ) + } + _ => format!("{}", self.get_alignment(type_id)), + } + } + + fn get_alignment(&self, type_id: TypeID) -> usize { + match &self.types[type_id.idx()] { + Type::Array(element_type, _) => self.get_alignment(*element_type), + Type::Product(fields) | Type::Summation(fields) => fields + .iter() + .map(|field| self.get_alignment(*field)) + .max() + .unwrap_or(0), + Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, + Type::Integer16 | Type::UnsignedInteger16 => 2, + Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, + Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8, + _ => panic!("Unsupported type for alignment"), + } + } + + fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { + let func_name = match intrinsic { + Intrinsic::Abs => match ty { + Type::Float32 => "__fabsf", + Type::Float64 => "__fabs", + ty if ty.is_signed() => "abs", + ty if ty.is_unsigned() => "uabs", + _ => panic!("Unsupported type for Abs"), + }, + Intrinsic::ACos => match ty { + ty if ty.is_float() => "__acosf", + _ => "acos", + }, + Intrinsic::ASin => match ty { + ty if ty.is_float() => "__asinf", + _ => "asin", + }, + Intrinsic::ATan => match ty { + ty if ty.is_float() => "__atanf", + _ => "atan", + }, + Intrinsic::ATan2 => match ty { + ty if ty.is_float() => "__atan2f", + _ => "atan2", + }, + Intrinsic::Ceil => match ty { + ty if ty.is_float() => "__ceilf", + _ => "ceil", + }, + Intrinsic::Cos => match ty { + ty if ty.is_float() => "__cosf", + _ => "cos", + }, + Intrinsic::Cosh => match ty { + ty if ty.is_float() => "coshf", + _ => "cosh", + }, + Intrinsic::Exp => match ty { + ty if ty.is_float() => "__expf", + _ => "exp", + }, + Intrinsic::Exp2 => match ty { + ty if ty.is_float() => "__exp2f", + _ => "exp2", + }, + Intrinsic::Floor => match ty { + ty if ty.is_float() => "__floorf", + _ => "floor", + }, + Intrinsic::Ln => match ty { + ty if ty.is_float() => "__logf", + _ => "log", + }, + Intrinsic::Log10 => match ty { + ty if ty.is_float() => "__log10f", + _ => "log10", + }, + Intrinsic::Log2 => match ty { + ty if ty.is_float() => "__log2f", + _ => "log2", + }, + Intrinsic::Max => match ty { + Type::Float32 => "fmaxf", + Type::Float64 => "fmax", + ty if ty.is_signed() => "smax", + ty if ty.is_unsigned() => "umax", + _ => "max", + }, + Intrinsic::Min => match ty { + Type::Float32 => "fminf", + Type::Float64 => "fmin", + ty if ty.is_signed() => "smin", + ty if ty.is_unsigned() => "umin", + _ => "min", + }, + Intrinsic::Pow | Intrinsic::Powf => match ty { + Type::Float32 => "__powf", + Type::Float64 => "pow", + _ => panic!("Unsupported type for Pow"), + }, + Intrinsic::Powi => match ty { + ty if ty.is_signed() || ty.is_unsigned() => "powi", + _ => panic!("Unsupported type for Powi"), + }, + Intrinsic::Round => match ty { + ty if ty.is_float() => "__roundf", + ty if ty.is_signed() || ty.is_unsigned() => "roundi", + _ => "round", + }, + Intrinsic::Sin => match ty { + ty if ty.is_float() => "__sinf", + _ => "sin", + }, + Intrinsic::Sinh => match ty { + ty if ty.is_float() => "sinhf", + _ => "sinh", + }, + Intrinsic::Sqrt => match ty { + Type::Float32 => 
"sqrtf", + ty if ty.is_signed() || ty.is_unsigned() => "isqrt", + _ => "sqrt", + }, + Intrinsic::Tan => match ty { + ty if ty.is_float() => "__tanf", + _ => "tan", + }, + Intrinsic::Tanh => match ty { + ty if ty.is_float() => "tanhf", + _ => "tanh", + }, + _ => panic!("Unsupported intrinsic {:?}", intrinsic), + }; + func_name.to_string() + } + + fn get_cg_tile(&self, fork: NodeID, cg_type: CGType) -> String { + format!("cg_{}{}", self.get_value(fork, false, false), if cg_type == CGType::Use { "_use" } else if cg_type == CGType::Available { "_available" } else { "" }) + } + + fn get_fork_iter(&self, fork: NodeID, ty: bool) -> String { + if ty { + format!("unsigned int iter_{}", self.get_value(fork, false, false)) + } else { + format!("iter_{}", self.get_value(fork, false, false)) + } + } + + fn get_block_name(&self, id: NodeID, post: bool) -> String { + format!("bb_{}{}", self.get_value(id, false, false), if post { "_post" } else { "" }) + } + + /* + * Setting `ty = true` will return with type in declaration format. `make_pointer` + * is only considered if `ty = true` and only relevant for primitive types- + * otherwise it makes no difference because collections are already pointers. + */ + fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { + if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { + if ty { + panic!("Dynamic constants shouldn't be re-initialized") + } + format!("dc{}", dc_id.idx()) + } else if let Node::Parameter { index } = &self.function.nodes[id.idx()] { + if ty { + panic!("Parameters shouldn't be re-initialized") + } + format!("p{}", index) + } else if ty { + format!( + "{} {}", + self.get_type(self.typing[id.idx()], make_pointer), + self.get_value(id, false, false) + ) + } else { + format!( + "{}{}", + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + ) + } + } + + fn get_type(&self, id: TypeID, make_pointer: bool) -> String { + let ty = &self.types[id.idx()]; + if ty.is_primitive() { + convert_type(ty, make_pointer) + } else { + format!("char*{}", if make_pointer { "*" } else { "" }) + } + } + + fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Option<usize> { + factors.iter().try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, self.dynamic_constants) + .map(|val| acc.saturating_mul(val)) + }) + } +} + +fn multiply_dcs(dcs: &[DynamicConstantID]) -> String { + if dcs.is_empty() { + "1".to_string() + } else { + dcs.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * ") + } +} + +fn convert_type(ty: &Type, make_pointer: bool) -> String { + let mut result = match ty { + Type::Boolean => "bool".to_string(), + Type::Integer8 => "int8_t".to_string(), + Type::UnsignedInteger8 => "uint8_t".to_string(), + Type::Integer16 => "short".to_string(), + Type::UnsignedInteger16 => "unsigned short".to_string(), + Type::Integer32 => "int".to_string(), + Type::UnsignedInteger32 => "unsigned int".to_string(), + Type::Integer64 => "long long".to_string(), + Type::UnsignedInteger64 => "unsigned long long".to_string(), + Type::Float32 => "float".to_string(), + Type::Float64 => "double".to_string(), + _ => panic!("Unsupported type"), + }; + if make_pointer { + result.push('*'); + } + result +} + +fn format_float(val: f64) -> String { + let mut s = val.to_string(); + if !s.contains('.') && !s.contains('e') && !s.contains('E') { + s.push_str(".0"); + } + s +} diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs index 
6a12901f262c295985f9309b6798d4caa142af37..dab4dbacd31e1f7603466ef8941f07cbdd81e6df 100644 --- a/hercules_cg/src/lib.rs +++ b/hercules_cg/src/lib.rs @@ -1,11 +1,17 @@ #![feature(if_let_guard, let_chains)] pub mod cpu; +pub mod gpu; pub mod rt; +pub mod fork_tree; + pub use crate::cpu::*; +pub use crate::gpu::*; pub use crate::rt::*; +pub use crate::fork_tree::*; + use std::collections::BTreeMap; use hercules_ir::*; diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 9f9188af9708c0d82ab49a8330b8127554b3d8e5..187b3f986b6ae7a62ec69e388b1627761880959f 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -893,6 +893,14 @@ impl Type { } } + pub fn is_summation(&self) -> bool { + if let Type::Summation(_) = self { + true + } else { + false + } + } + pub fn is_array(&self) -> bool { if let Type::Array(_, _) = self { true diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml index 1db5dc2f555709341ef331149602c29a6bf917c3..0b250d28d9460bd0b19f574c27916e82802664c8 100644 --- a/hercules_opt/Cargo.toml +++ b/hercules_opt/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" +[features] +cuda = ["hercules_cg/cuda"] + [dependencies] ordered-float = "*" bitvec = "*" diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs index d9c689b42b161dcb5db29f8fca91cad4fbea11bb..8f82e86cab73b28f44222a8a3d8eaa4c49f04218 100644 --- a/hercules_rt/build.rs +++ b/hercules_rt/build.rs @@ -11,13 +11,14 @@ fn main() { .status() .expect("PANIC: NVCC failed when building runtime. Is NVCC installed?"); Command::new("ar") - .args(&["crus", "librtdefs.a", "rtdefs.o"]) .current_dir(&Path::new(&out_dir)) + .args(&["crus", "librtdefs.a", "rtdefs.o"]) .status() .unwrap(); println!("cargo::rustc-link-search=native={}", out_dir); println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); + println!("cargo::rustc-link-search=native=/usr/local/cuda/lib64"); println!("cargo::rustc-link-search=native=/opt/cuda/lib/"); println!("cargo::rustc-link-lib=static=rtdefs"); println!("cargo::rustc-link-lib=cudart"); diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index c0da1096cf65cb643c4ee21b2713e148cbb73ef1..ed5dca1d817f7d738a29e7c4be4a6f589a7dad41 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -152,6 +152,19 @@ impl<'a> HerculesCPURefMut<'a> { #[cfg(feature = "cuda")] impl<'a> HerculesCUDARef<'a> { + pub fn to_cpu_ref<'b, T>(self, dst: &'b mut [T]) -> HerculesCPURefMut<'b> { + unsafe { + let size = self.size; + let ptr = NonNull::new(dst.as_ptr() as *mut u8).unwrap(); + __copy_cuda_to_cpu(ptr.as_ptr(), self.ptr.as_ptr(), size); + HerculesCPURefMut { + ptr, + size, + _phantom: PhantomData, + } + } + } + pub unsafe fn __ptr(&self) -> *mut u8 { self.ptr.as_ptr() } @@ -179,6 +192,19 @@ impl<'a> HerculesCUDARefMut<'a> { } } + pub fn to_cpu_ref<'b, T>(self, dst: &mut [T]) -> HerculesCPURefMut<'b> { + unsafe { + let size = self.size; + let ptr = NonNull::new(dst.as_ptr() as *mut u8).unwrap(); + __copy_cuda_to_cpu(ptr.as_ptr(), self.ptr.as_ptr(), size); + HerculesCPURefMut { + ptr, + size, + _phantom: PhantomData, + } + } + } + pub unsafe fn __ptr(&self) -> *mut u8 { self.ptr.as_ptr() } diff --git a/hercules_rt/src/rtdefs.cu b/hercules_rt/src/rtdefs.cu index 24bbf2d572e00f99216da9be878d68e86406ba1d..534b297db5fc5b850453e865086af781c16995f0 100644 --- a/hercules_rt/src/rtdefs.cu +++ b/hercules_rt/src/rtdefs.cu @@ -7,7 +7,7 @@ extern "C" { } return ptr; } - + void 
__cuda_dealloc(void *ptr, size_t size) { (void) size; cudaFree(ptr); @@ -16,15 +16,15 @@ extern "C" { void __cuda_zero_mem(void *ptr, size_t size) { cudaMemset(ptr, 0, size); } - + void __copy_cpu_to_cuda(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); } - + void __copy_cuda_to_cpu(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost); } - + void __copy_cuda_to_cuda(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice); } diff --git a/hercules_samples/call/Cargo.toml b/hercules_samples/call/Cargo.toml index c8b570affa63657422f3857f96f6bbc06a52ef1e..38a3d1b6cf4a247b4078e62d380ea942d7fb6e09 100644 --- a/hercules_samples/call/Cargo.toml +++ b/hercules_samples/call/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" +[features] +cuda = ["juno_build/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/hercules_samples/call/build.rs b/hercules_samples/call/build.rs index af48fe64f7c2b778c9841e45ecf13b8a6a5740d2..e7b6dee9041ff6c3e862dec173f9d1c0784c8653 100644 --- a/hercules_samples/call/build.rs +++ b/hercules_samples/call/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .ir_in_src("call.hir") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .ir_in_src("call.hir") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .ir_in_src("call.hir") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/hercules_samples/call/src/gpu.sch b/hercules_samples/call/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..cc4ef88fc71754e38d8d0b1b03bebe5fc806ea94 --- /dev/null +++ b/hercules_samples/call/src/gpu.sch @@ -0,0 +1,17 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.add); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/hercules_samples/ccp/Cargo.toml b/hercules_samples/ccp/Cargo.toml index 97d4b2ef8efc96af059284df345282213dddab68..c10aced18c8a2261852e49f5f569bf13e689f72b 100644 --- a/hercules_samples/ccp/Cargo.toml +++ b/hercules_samples/ccp/Cargo.toml @@ -4,8 +4,8 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" -[build-dependencies] -juno_build = { path = "../../juno_build" } +[features] +cuda = ["juno_build/cuda"] [dependencies] juno_build = { path = "../../juno_build" } @@ -13,3 +13,6 @@ hercules_rt = { path = "../../hercules_rt" } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" + +[build-dependencies] +juno_build = { path = "../../juno_build" } diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs index f04d48c7d0ea6df8b16d70b05cedabfc04c1f6f2..d74547ad222b243e8d809ac0ec1a94131f30ba98 100644 --- a/hercules_samples/ccp/build.rs +++ b/hercules_samples/ccp/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .ir_in_src("ccp.hir") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .ir_in_src("ccp.hir") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .ir_in_src("ccp.hir") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/hercules_samples/ccp/src/gpu.sch 
b/hercules_samples/ccp/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..d49af8f5a3ec3d41ea23f74fe6ffd319bf1e53d5 --- /dev/null +++ b/hercules_samples/ccp/src/gpu.sch @@ -0,0 +1,17 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.tricky); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/hercules_samples/dot/Cargo.toml b/hercules_samples/dot/Cargo.toml index 99a48115197ce853941223ed360079ec5376583e..9b11ddc10b5d185e020d639e677f5b56a2c3b8d0 100644 --- a/hercules_samples/dot/Cargo.toml +++ b/hercules_samples/dot/Cargo.toml @@ -5,7 +5,7 @@ authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" [features] -cuda = ["hercules_rt/cuda"] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs index 4cfd2a87fba14d3c542bb54806a65da2d1a9b8f5..8657fdc166fe68ad2565a8a0736984c7991be0a7 100644 --- a/hercules_samples/dot/build.rs +++ b/hercules_samples/dot/build.rs @@ -4,8 +4,7 @@ fn main() { JunoCompiler::new() .ir_in_src("dot.hir") .unwrap() - //.schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .schedule_in_src("cpu.sch") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch index 956eb99628a03a3efb3d77e97d93a8cb677bbd6a..c65827fd8bb218a53aaad5cf69b8ab4c77a38059 100644 --- a/hercules_samples/dot/src/gpu.sch +++ b/hercules_samples/dot/src/gpu.sch @@ -2,12 +2,16 @@ gvn(*); phi-elim(*); dce(*); -auto-outline(*); -gpu(*); -host(dot); +let out = auto-outline(*); +gpu(out.dot); ip-sroa(*); sroa(*); dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); gcm(*); diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs index 335e8909375d061438920fed66d58e5bf87daa11..8862c11a9273f9808f2148f1067dcf3f5953c11f 100644 --- a/hercules_samples/dot/src/main.rs +++ b/hercules_samples/dot/src/main.rs @@ -1,19 +1,37 @@ #![feature(concat_idents)] use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("dot"); fn main() { async_std::task::block_on(async { - let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; - let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURef::from_slice(&b); - let mut r = runner!(dot); - let c = r.run(8, a, b).await; - println!("{}", c); - assert_eq!(c, 70.0); + #[cfg(not(feature = "cuda"))] + { + let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; + let a = HerculesCPURef::from_slice(&a); + let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; + let b = HerculesCPURef::from_slice(&b); + let mut r = runner!(dot); + let c = r.run(8, a, b).await; + println!("{}", c); + assert_eq!(c, 70.0); + } + #[cfg(feature = "cuda")] + { + let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; + let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); + let a = a_box.get_ref(); + let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; + let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); + let b = b_box.get_ref(); + let mut r = runner!(dot); + let c = r.run(8, a, b).await; + println!("{}", c); + assert_eq!(c, 70.0); + } }); } diff --git 
a/hercules_samples/fac/Cargo.toml b/hercules_samples/fac/Cargo.toml index 9082a4fc4194ac3fa9c694a5a5973f605772a384..72f82672b18e1bf9529075c233a3efab742fda81 100644 --- a/hercules_samples/fac/Cargo.toml +++ b/hercules_samples/fac/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" +[features] +cuda = ["juno_build/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs index 4d8226f11183d9500e6affec4c46110e8626ee69..aa1f73a92d06e3e74c800f9d657ff911e9f85157 100644 --- a/hercules_samples/fac/build.rs +++ b/hercules_samples/fac/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .ir_in_src("fac.hir") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .ir_in_src("fac.hir") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .ir_in_src("fac.hir") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/hercules_samples/fac/src/gpu.sch b/hercules_samples/fac/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..ac1f60261c5b99d2c8a61e0d503621b1db693ecd --- /dev/null +++ b/hercules_samples/fac/src/gpu.sch @@ -0,0 +1,17 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.fac); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/hercules_samples/matmul/Cargo.toml b/hercules_samples/matmul/Cargo.toml index 89e46dd682024012942e6a5014cc5f2f6ec12b83..49f05f29af907a72df7cb188c62f65544070f232 100644 --- a/hercules_samples/matmul/Cargo.toml +++ b/hercules_samples/matmul/Cargo.toml @@ -5,7 +5,7 @@ authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" [features] -cuda = ["hercules_rt/cuda"] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs index c15ca97fa4b0730622f28e6cf16f7ab24de7310a..735458c0c8be76bdae6cd7b3b308e38ccae78edd 100644 --- a/hercules_samples/matmul/build.rs +++ b/hercules_samples/matmul/build.rs @@ -4,8 +4,7 @@ fn main() { JunoCompiler::new() .ir_in_src("matmul.hir") .unwrap() - // .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .schedule_in_src("cpu.sch") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/hercules_samples/matmul/src/cpu.sch b/hercules_samples/matmul/src/cpu.sch index c00a33144f012fb4d3e06a42d70a6f06840f32d5..aeed7e10583a9161fa261cd6d87d1a9a0d1623a7 100644 --- a/hercules_samples/matmul/src/cpu.sch +++ b/hercules_samples/matmul/src/cpu.sch @@ -16,4 +16,6 @@ gvn(*); phi-elim(*); dce(*); +infer-schedules(*); + gcm(*); diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index 99ac21a60db89d44aff27331c9a5832bc50e8c12..c0a1a5cebd46a47378132d357b7c0fc83a4446a6 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -2,17 +2,20 @@ gvn(*); phi-elim(*); dce(*); -auto-outline(*); -gpu(*); -host(matmul); +let out = auto-outline(*); +gpu(out.matmul); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); +forkify(*); +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); gcm(*); -xdot[true](*); diff --git 
a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs index 8757a0fd6c8905d5dee8a71312876265f0dabee2..abd25ec9cddbd4be508b3f484cffc1df1365dc4d 100644 --- a/hercules_samples/matmul/src/main.rs +++ b/hercules_samples/matmul/src/main.rs @@ -3,13 +3,15 @@ use rand::random; use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("matmul"); fn main() { async_std::task::block_on(async { const I: usize = 256; - const J: usize = 64; + const J: usize = 8; // hardcoded constant in matmul.hir const K: usize = 128; let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); @@ -21,11 +23,24 @@ fn main() { } } } - let a = HerculesCPURef::from_slice(&mut a); - let b = HerculesCPURef::from_slice(&mut b); - let mut r = runner!(matmul); - let c = r.run(I as u64, J as u64, K as u64, a, b).await; - assert_eq!(c.as_slice::<i32>(), &*correct_c); + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&mut a); + let b = HerculesCPURef::from_slice(&mut b); + let mut r = runner!(matmul); + let c = r.run(I as u64, J as u64, K as u64, a, b).await; + assert_eq!(c.as_slice::<i32>(), &*correct_c); + } + #[cfg(feature = "cuda")] + { + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); + let mut r = runner!(matmul); + let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await; + let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice(); + c.to_cpu_ref(&mut c_cpu); + assert_eq!(&*c_cpu, &*correct_c); + } }); } diff --git a/hercules_samples/matmul/src/matmul.hir b/hercules_samples/matmul/src/matmul.hir index ab0f384a563ccb6144e59b811745fe5aa76f08dd..f9d37afcbb9e74038aafd35dfacf91f533bc4b99 100644 --- a/hercules_samples/matmul/src/matmul.hir +++ b/hercules_samples/matmul/src/matmul.hir @@ -1,9 +1,9 @@ -fn matmul<3>(a: array(i32, #0, #1), b: array(i32, #1, #2)) -> array(i32, #0, #2) +fn matmul<3>(a: array(i32, #0, 8), b: array(i32, 8, #2)) -> array(i32, #0, #2) c = constant(array(i32, #0, #2), []) i_j_ctrl = fork(start, #0, #2) i_idx = thread_id(i_j_ctrl, 0) j_idx = thread_id(i_j_ctrl, 1) - k_ctrl = fork(i_j_ctrl, #1) + k_ctrl = fork(i_j_ctrl, 8) k_idx = thread_id(k_ctrl, 0) k_join_ctrl = join(k_ctrl) i_j_join_ctrl = join(k_join_ctrl) @@ -15,4 +15,4 @@ fn matmul<3>(a: array(i32, #0, #1), b: array(i32, #1, #2)) -> array(i32, #0, #2) add = add(mul, dot) dot = reduce(k_join_ctrl, zero, add) update_c = write(update_i_j_c, dot, position(i_idx, j_idx)) - update_i_j_c = reduce(i_j_join_ctrl, c, update_c) \ No newline at end of file + update_i_j_c = reduce(i_j_join_ctrl, c, update_c) diff --git a/juno_build/Cargo.toml b/juno_build/Cargo.toml index 72faf4bd14da65b482f2e379c1b51ce3ede8dcf0..11ef85db46ba2c7958fa5becc57623a3c5875868 100644 --- a/juno_build/Cargo.toml +++ b/juno_build/Cargo.toml @@ -4,7 +4,10 @@ version = "0.1.0" authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" +[features] +cuda = ["juno_frontend/cuda"] + [dependencies] -juno_frontend = { path = "../juno_frontend" } +juno_frontend = { path = "../juno_frontend", default-features = false } hercules_ir = { path = "../hercules_ir" } with_builtin_macros = "0.1.0" diff --git a/juno_build/build.rs b/juno_build/build.rs new file mode 100644 index 0000000000000000000000000000000000000000..7ba34c8c9395926bd16a92dcf831eadda3f3af26 --- /dev/null +++ 
b/juno_build/build.rs @@ -0,0 +1,8 @@ +fn main() { + #[cfg(feature = "cuda")] + println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-lib=cudart"); +} diff --git a/juno_frontend/Cargo.toml b/juno_frontend/Cargo.toml index ad35b84c85ee18ed0d390c8005fc8d07a1c2916b..648daf5f4757f055259c1053d10c5849259941f8 100644 --- a/juno_frontend/Cargo.toml +++ b/juno_frontend/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" +[features] +cuda = ["hercules_opt/cuda", "juno_scheduler/cuda"] +default = [] + [[bin]] name = "juno" path = "src/main.rs" @@ -18,6 +22,7 @@ lrlex = "0.13" lrpar = "0.13" [dependencies] +hercules_opt = { path = "../hercules_opt", default-features = false } cfgrammar = "0.13" clap = { version = "*", features = ["derive"] } lrlex = "0.13" @@ -27,6 +32,5 @@ num-traits = "*" ordered-float = "*" phf = { version = "0.11", features = ["macros"] } hercules_ir = { path = "../hercules_ir" } -hercules_opt = { path = "../hercules_opt" } juno_scheduler = { path = "../juno_scheduler" } juno_utils = { path = "../juno_utils" } diff --git a/juno_samples/antideps/Cargo.toml b/juno_samples/antideps/Cargo.toml index 9bd1d5a0d484e257ee3ad1e425a09f503da4b503..e6f38e095f492b2d7934b6ff969fa468137d1d2c 100644 --- a/juno_samples/antideps/Cargo.toml +++ b/juno_samples/antideps/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_antideps" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs index 7ed716a444460d7a90965f5b7f5faf3a7aadcb14..d74d1a49054ef2c8d22a99a1b60eae4160a1d778 100644 --- a/juno_samples/antideps/build.rs +++ b/juno_samples/antideps/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("antideps.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("antideps.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("antideps.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..e166515dc5562f2e142792229fea92309a42e526 --- /dev/null +++ b/juno_samples/antideps/src/gpu.sch @@ -0,0 +1,20 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.simple_antideps, out.loop_antideps, out.complex_antideps1, out.complex_antideps2, out.very_complex_antideps, out.read_chains, out.array_of_structs); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/casts_and_intrinsics/Cargo.toml b/juno_samples/casts_and_intrinsics/Cargo.toml index f49797969012f5195a0338b7b14fa04414dddb03..b2e7c815b0e93ba1741503cb29fe7528b78b5996 100644 --- a/juno_samples/casts_and_intrinsics/Cargo.toml +++ b/juno_samples/casts_and_intrinsics/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_casts_and_intrinsics" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git 
a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs index 16d5c7a4f7fcb00344fc7669b67103a27f71a7c6..342c4a053349716874d988499fb6570c5c2d971f 100644 --- a/juno_samples/casts_and_intrinsics/build.rs +++ b/juno_samples/casts_and_intrinsics/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("casts_and_intrinsics.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("casts_and_intrinsics.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("casts_and_intrinsics.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..c997cca91ae606eda36af2fba3721cf8975ac929 --- /dev/null +++ b/juno_samples/casts_and_intrinsics/src/gpu.sch @@ -0,0 +1,17 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.casts_and_intrinsics); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml index ff375d80d637e2ac9baa1377e168f912454b5fbc..63b6b2ac98dcc022a45f3c4084930cbfb24956ff 100644 --- a/juno_samples/cava/Cargo.toml +++ b/juno_samples/cava/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_cava" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/cava/build.rs b/juno_samples/cava/build.rs index 7f60f8019f105dda64cf12a2664668efbd637662..1b6dddf48053d5fb31c38947a651282867b7bcf3 100644 --- a/juno_samples/cava/build.rs +++ b/juno_samples/cava/build.rs @@ -1,6 +1,15 @@ use juno_build::JunoCompiler; fn main() { + #[cfg(feature = "cuda")] + JunoCompiler::new() + .file_in_src("cava.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + #[cfg(not(feature = "cuda"))] JunoCompiler::new() .file_in_src("cava.jn") .unwrap() diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn index f3096ec34230ad838ca803e8d8de79c176aa9bd2..359a83ed20db37117a8c5eef5b3e7cf31986e8cd 100644 --- a/juno_samples/cava/src/cava.jn +++ b/juno_samples/cava/src/cava.jn @@ -116,7 +116,8 @@ fn denoise<row : usize, col : usize>(input : f32[CHAN, row, col]) -> f32[CHAN, r filter[i, j] = input[chan, r + i - 1, c + j - 1]; } } - res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter); + + res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter); } else { res[chan, r, c] = input[chan, r, c]; } diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..a5570b8d98324410aea9902cf29bce386126ce12 --- /dev/null +++ b/juno_samples/cava/src/gpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +inline(*); +let out = auto-outline(*); +gpu(out.cava); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +// forkify(*); +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index 8ad6824f01d4d94a76f088ba1540ba44d2ce7b71..482bbf8deb0af323255b004a7e33e70202acb886 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -9,11 +9,15 @@ use self::cava_rust::CHAN; use self::image_proc::*; 
use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; use image::ImageError; use clap::Parser; +use std::mem; + juno_build::juno!("cava"); fn run_cava( @@ -27,42 +31,67 @@ fn run_cava( coefs: &[f32], tonemap: &[f32], ) -> Box<[u8]> { - assert_eq!(image.len(), CHAN * rows * cols); - let image = HerculesCPURef::from_slice(image); + assert_eq!(image.len(), CHAN * rows * cols); assert_eq!(tstw.len(), CHAN * CHAN); - let tstw = HerculesCPURef::from_slice(tstw); - assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN); - let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts); - assert_eq!(weights.len(), num_ctrl_pts * CHAN); - let weights = HerculesCPURef::from_slice(weights); - assert_eq!(coefs.len(), 4 * CHAN); - let coefs = HerculesCPURef::from_slice(coefs); - assert_eq!(tonemap.len(), 256 * CHAN); - let tonemap = HerculesCPURef::from_slice(tonemap); - - let mut r = runner!(cava); - async_std::task::block_on(async { - r.run( - rows as u64, - cols as u64, - num_ctrl_pts as u64, - image, - tstw, - ctrl_pts, - weights, - coefs, - tonemap, - ) - .await - }) - .as_slice::<u8>() - .to_vec() - .into_boxed_slice() + + #[cfg(not(feature = "cuda"))] + { + let image = HerculesCPURef::from_slice(image); + let tstw = HerculesCPURef::from_slice(tstw); + let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts); + let weights = HerculesCPURef::from_slice(weights); + let coefs = HerculesCPURef::from_slice(coefs); + let tonemap = HerculesCPURef::from_slice(tonemap); + let mut r = runner!(cava); + async_std::task::block_on(async { + r.run( + rows as u64, + cols as u64, + num_ctrl_pts as u64, + image, + tstw, + ctrl_pts, + weights, + coefs, + tonemap, + ) + .await + }).as_slice::<u8>().to_vec().into_boxed_slice() + } + + #[cfg(feature = "cuda")] + { + let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image)); + let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw)); + let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts)); + let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights)); + let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs)); + let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap)); + let mut r = runner!(cava); + let res = async_std::task::block_on(async { + r.run( + rows as u64, + cols as u64, + num_ctrl_pts as u64, + image.get_ref(), + tstw.get_ref(), + ctrl_pts.get_ref(), + weights.get_ref(), + coefs.get_ref(), + tonemap.get_ref(), + ) + .await + }); + let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() }; + let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice(); + res.to_cpu_ref(&mut res_cpu); + res_cpu + } } enum Error { @@ -175,7 +204,17 @@ fn cava_harness(args: CavaInputs) { .expect("Error saving verification image"); } - assert_eq!(result, cpu_result.into(), "Verification failed, mismatch"); + let max_diff = result.iter() + .zip(cpu_result.iter()) + .map(|(a, b)| (*a as i16 - *b as i16).abs()) + .max() + .unwrap_or(0); + + assert!( + max_diff <= 3, + "Verification failed: maximum pixel difference of {} exceeds threshold of 3", + max_diff + ); } } diff --git a/juno_samples/concat/Cargo.toml b/juno_samples/concat/Cargo.toml index 24ba1acff56dc3997162c1f3afc2334fd3428155..f2f902379ae4c8844c02593a93d71c33996a40e3 100644 --- a/juno_samples/concat/Cargo.toml +++ b/juno_samples/concat/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_concat" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build 
= { path = "../../juno_build" } diff --git a/juno_samples/concat/build.rs b/juno_samples/concat/build.rs index f7784b999492955289fc83ad1297907c2d0ce996..c9ef720dad8e05c4fb133decc667d1522d0c4c9b 100644 --- a/juno_samples/concat/build.rs +++ b/juno_samples/concat/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("concat.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("concat.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("concat.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/concat/src/concat.jn b/juno_samples/concat/src/concat.jn index 70c741b6c36ed780cbc4107028850c198d943483..01a549690336be138f41ce4ba5f5ca2e569e399f 100644 --- a/juno_samples/concat/src/concat.jn +++ b/juno_samples/concat/src/concat.jn @@ -18,17 +18,9 @@ fn sum<t : number, c : usize>(arr : t[c]) -> t { } #[entry] -fn concat_entry(a : i32) -> i32 { - let arr1 : i32[3]; - let arr2 : i32[6]; - arr1[0] = a; - arr1[1] = a; - arr2[0] = a; - arr2[1] = a; - arr2[4] = a; - arr2[5] = a; - let arr3 = concat::<i32, 3, 6>(arr1, arr2); - return sum::<i32, 9>(arr3); +fn concat_entry<a : usize, b: usize>(arr1 : i32[a], arr2 : i32[b]) -> i32 { + let arr3 = concat::<i32, a, b>(arr1, arr2); + return sum::<i32, a + b>(arr3); } #[entry] diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..084f020c5a97a3432675253f66c9755954a3aaef --- /dev/null +++ b/juno_samples/concat/src/gpu.sch @@ -0,0 +1,21 @@ +gvn(*); +phi-elim(*); +dce(*); + +inline(*); +let out = auto-outline(*); +gpu(out.concat_entry); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs index 8bcd7ba54a7b61fe15c97ba74d37fc0e867aa277..9674c2c54b328aefcb4e670dc7e9ec482f8b2508 100644 --- a/juno_samples/concat/src/main.rs +++ b/juno_samples/concat/src/main.rs @@ -2,30 +2,47 @@ use hercules_rt::runner; use hercules_rt::HerculesCPURef; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("concat"); fn main() { async_std::task::block_on(async { let mut r = runner!(concat_entry); - let output = r.run(7).await; - println!("{}", output); - assert_eq!(output, 42); + let mut a_data = [7, 7, 0]; + let mut b_data = [7, 7, 0, 0, 7, 7]; + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&mut a_data); + let b = HerculesCPURef::from_slice(&mut b_data); + let output = r.run(3, 6, a, b).await; + assert_eq!(output, 42); - const N: usize = 3; - let arr : Box<[i32]> = (2..=4).collect(); - let arr = HerculesCPURef::from_slice(&arr); + const N: usize = 3; + let arr : Box<[i32]> = (2..=4).collect(); + let arr = HerculesCPURef::from_slice(&arr); - let mut r = runner!(concat_switch); - let output = r.run(N as u64, 50, arr.clone()).await; - let result = output.as_slice::<i32>(); - println!("{:?}", result); - assert_eq!(result, [0, 1, 2, 3, 4]); + let mut r = runner!(concat_switch); + let output = r.run(N as u64, 50, arr.clone()).await; + let result = output.as_slice::<i32>(); + println!("{:?}", result); + assert_eq!(result, [0, 1, 2, 3, 4]); - let output = r.run(N as u64, 30, arr).await; - let result = output.as_slice::<i32>(); - println!("{:?}", result); - 
assert_eq!(result, [2, 3, 4, 0, 1]); + let output = r.run(N as u64, 30, arr).await; + let result = output.as_slice::<i32>(); + println!("{:?}", result); + assert_eq!(result, [2, 3, 4, 0, 1]); + } + #[cfg(feature = "cuda")] + { + let mut a_data = [7, 7, 0]; + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data)); + let mut b_data = [7, 7, 0, 0, 7, 7]; + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data)); + let output = r.run(3, 6, a.get_ref(), b.get_ref()).await; + assert_eq!(output, 42); + } }); } diff --git a/juno_samples/cpu.sch b/juno_samples/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..4c684da2f176dd3675dc79fa506d2d60fe6e0577 --- /dev/null +++ b/juno_samples/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +fork-split(*); +unforkify(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/gpu.sch b/juno_samples/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..9a714789ac768405a2782cac6e68338c2f0697bb --- /dev/null +++ b/juno_samples/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.matmul); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/implicit_clone/Cargo.toml b/juno_samples/implicit_clone/Cargo.toml index b312f5def295a6c31a7d2c9eab5c23f4e16ccd2f..9612937190366bea144ade4b9ae334b228233b49 100644 --- a/juno_samples/implicit_clone/Cargo.toml +++ b/juno_samples/implicit_clone/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_implicit_clone" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs index 75c1afc41a75b2006b26042323df3bdc3fcf5a17..8e465874b9904a785a85dbee885086585dbf40c0 100644 --- a/juno_samples/implicit_clone/build.rs +++ b/juno_samples/implicit_clone/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("implicit_clone.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("implicit_clone.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("implicit_clone.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/implicit_clone/src/gpu.sch b/juno_samples/implicit_clone/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..0f7c80213d20481ce51628736f0ed07be21da399 --- /dev/null +++ b/juno_samples/implicit_clone/src/gpu.sch @@ -0,0 +1,20 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.simple_implicit_clone, out.loop_implicit_clone, out.double_loop_implicit_clone, out.tricky_loop_implicit_clone, out.tricky2_loop_implicit_clone, out.tricky3_loop_implicit_clone, out.no_implicit_clone, out.mirage_implicit_clone); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs index 1e94ff8977a6a5e340b6180ceff777351dda088b..c1f82528de40598007291b9258ec1cf59acbcc07 100644 --- 
a/juno_samples/implicit_clone/src/main.rs +++ b/juno_samples/implicit_clone/src/main.rs @@ -8,42 +8,34 @@ fn main() { async_std::task::block_on(async { let mut r = runner!(simple_implicit_clone); let output = r.run(3).await; - println!("{}", output); assert_eq!(output, 11); let mut r = runner!(loop_implicit_clone); let output = r.run(100).await; - println!("{}", output); assert_eq!(output, 7); let mut r = runner!(double_loop_implicit_clone); let output = r.run(3).await; - println!("{}", output); assert_eq!(output, 42); let mut r = runner!(tricky_loop_implicit_clone); let output = r.run(2, 2).await; - println!("{}", output); assert_eq!(output, 130); let mut r = runner!(tricky2_loop_implicit_clone); let output = r.run(2, 3).await; - println!("{}", output); assert_eq!(output, 39); let mut r = runner!(tricky3_loop_implicit_clone); let output = r.run(5, 7).await; - println!("{}", output); assert_eq!(output, 7); let mut r = runner!(no_implicit_clone); let output = r.run(4).await; - println!("{}", output); assert_eq!(output, 13); let mut r = runner!(mirage_implicit_clone); let output = r.run(73).await; - println!("{}", output); assert_eq!(output, 843); }); } diff --git a/juno_samples/matmul/Cargo.toml b/juno_samples/matmul/Cargo.toml index 8ad95853d25509d91713a080c0b63b01c0469110..eac83d15f430a7752d14a1ec3105c88209143477 100644 --- a/juno_samples/matmul/Cargo.toml +++ b/juno_samples/matmul/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_matmul" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index 926fbc33ecfa5ab31b40a92f778bb4d3b7f6a77e..0be838c620761e8726590e2dbaf7bfdb7a82e3df 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("matmul.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("matmul.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("matmul.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..3d3f919cd26e4eba480540df06f839a8b86976b0 --- /dev/null +++ b/juno_samples/matmul/src/gpu.sch @@ -0,0 +1,20 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.matmul, out.tiled_64_matmul); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs index 624ee5652a78d9c2ab7bc84d3974bf2df5b02838..50fe1760eeeedc946d510a6d5285d76e1346f3cc 100644 --- a/juno_samples/matmul/src/main.rs +++ b/juno_samples/matmul/src/main.rs @@ -3,6 +3,8 @@ use rand::random; use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("matmul"); @@ -11,8 +13,8 @@ fn main() { const I: usize = 256; const J: usize = 64; const K: usize = 128; - let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); - let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); + let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); + let mut b: Box<[i32]> = 
(0..J * K).map(|_| random::<i32>() % 100).collect(); let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect(); for i in 0..I { for k in 0..K { @@ -21,18 +23,32 @@ fn main() { } } } - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURef::from_slice(&b); - let mut r = runner!(matmul); - let c = r - .run(I as u64, J as u64, K as u64, a.clone(), b.clone()) - .await; - assert_eq!(c.as_slice::<i32>(), &*correct_c); - let mut r = runner!(tiled_64_matmul); - let tiled_c = r - .run(I as u64, J as u64, K as u64, a.clone(), b.clone()) - .await; - assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c); + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&a); + let b = HerculesCPURef::from_slice(&b); + let mut r = runner!(matmul); + let c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await; + assert_eq!(c.as_slice::<i32>(), &*correct_c); + let mut r = runner!(tiled_64_matmul); + let tiled_c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await; + assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c); + } + #[cfg(feature = "cuda")] + { + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); + let mut r = runner!(matmul); + let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await; + let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice(); + c.to_cpu_ref(&mut c_cpu); + assert_eq!(&*c_cpu, &*correct_c); + let mut r = runner!(tiled_64_matmul); + let tiled_c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await; + let mut tiled_c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice(); + tiled_c.to_cpu_ref(&mut tiled_c_cpu); + assert_eq!(&*tiled_c_cpu, &*correct_c); + } }); } diff --git a/juno_samples/matmul/src/matmul.jn b/juno_samples/matmul/src/matmul.jn index ca9be73a86144ebe57048ed35adc2f36bfcd905b..fb6de5bd89e0c48b3a0ac4aa66d4764894f2fcff 100644 --- a/juno_samples/matmul/src/matmul.jn +++ b/juno_samples/matmul/src/matmul.jn @@ -20,7 +20,7 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l] let atile : i32[64, 64]; let btile : i32[64, 64]; let ctile : i32[64, 64]; - + for bi = 0 to n / 64 { for bk = 0 to l / 64 { for ti = 0 to 64 { diff --git a/juno_samples/nested_ccp/Cargo.toml b/juno_samples/nested_ccp/Cargo.toml index 8c9b969d23019b8bbd3bf28b3506e2e497ae8ec7..5ee3f747477c901806642818553e94b046b50242 100644 --- a/juno_samples/nested_ccp/Cargo.toml +++ b/juno_samples/nested_ccp/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_nested_ccp" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs index c5c7ca6a1b9ab5decf6a8cf0b8e8f13ff7122834..074937e7b0a0ce50032928f30c72feb39b5ecd79 100644 --- a/juno_samples/nested_ccp/build.rs +++ b/juno_samples/nested_ccp/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("nested_ccp.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("nested_ccp.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("nested_ccp.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch new file 
mode 100644 index 0000000000000000000000000000000000000000..c56d046a686ea8bddd8c9e9f91d88eb39a2eaf31 --- /dev/null +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -0,0 +1,17 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.ccp_example, out.median_array, out.no_underflow); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs index 423b66fb8c84fd1af6b0267c7c63aa204ab1dc6c..bc99a4bdd071ff19c70977e29241b76e3e249014 100644 --- a/juno_samples/nested_ccp/src/main.rs +++ b/juno_samples/nested_ccp/src/main.rs @@ -1,6 +1,8 @@ #![feature(concat_idents)] use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("nested_ccp"); @@ -8,19 +10,30 @@ fn main() { async_std::task::block_on(async { let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]); - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURefMut::from_slice(&mut b); - let mut r = runner!(ccp_example); - let output_example = r.run(a).await; - let mut r = runner!(median_array); - let output_median = r.run(9, b).await; + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&a); + let b = HerculesCPURefMut::from_slice(&mut b); + let mut r = runner!(ccp_example); + let output_example = r.run(a).await; + let mut r = runner!(median_array); + let output_median = r.run(9, b).await; + assert_eq!(output_example, 1.0); + assert_eq!(output_median, 18); + } + #[cfg(feature = "cuda")] + { + let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b)); + let mut r = runner!(ccp_example); + let output_example = r.run(a.get_ref_mut()).await; + let mut r = runner!(median_array); + let output_median = r.run(9, b.get_ref_mut()).await; + assert_eq!(output_example, 1.0); + assert_eq!(output_median, 18); + } let mut r = runner!(no_underflow); let out_no_underflow = r.run().await; - println!("{}", output_example); - println!("{}", output_median); - println!("{}", out_no_underflow); - assert_eq!(output_example, 1.0); - assert_eq!(output_median, 18); assert_eq!(out_no_underflow, 7); }); } diff --git a/juno_samples/patterns/Cargo.toml b/juno_samples/patterns/Cargo.toml index a8dda157ff331ae9b1c5e1cb2a120db9bab3bb82..bedaf7ca01c0d7cfadbba5e11b8e94203ccda2b4 100644 --- a/juno_samples/patterns/Cargo.toml +++ b/juno_samples/patterns/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_patterns" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/patterns/build.rs b/juno_samples/patterns/build.rs index 8ac92f003e549a9aeb289001b8792ee4dcb51284..625da0a5fed548984b724aa89085c456ef22f12c 100644 --- a/juno_samples/patterns/build.rs +++ b/juno_samples/patterns/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("patterns.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("patterns.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("patterns.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/patterns/src/gpu.sch 
b/juno_samples/patterns/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..3d9c8c9e6ddb588fbaa96e3c27f78cf21abd8f1a --- /dev/null +++ b/juno_samples/patterns/src/gpu.sch @@ -0,0 +1,17 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.entry); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/schedule_test/Cargo.toml b/juno_samples/schedule_test/Cargo.toml index be5d949bf1959d48c3463717f28b9fd186e05170..c783217a816960c764083c24aed1fdc5f0d1fb77 100644 --- a/juno_samples/schedule_test/Cargo.toml +++ b/juno_samples/schedule_test/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_schedule_test" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/schedule_test/build.rs b/juno_samples/schedule_test/build.rs index 4a4282473e87d6c24e12b5e3d59521ee8c99141e..749a660c551e8b231f63287898adb2863aef826e 100644 --- a/juno_samples/schedule_test/build.rs +++ b/juno_samples/schedule_test/build.rs @@ -4,7 +4,7 @@ fn main() { JunoCompiler::new() .file_in_src("code.jn") .unwrap() - .schedule_in_src("sched.sch") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/schedule_test/src/sched.sch b/juno_samples/schedule_test/src/cpu.sch similarity index 100% rename from juno_samples/schedule_test/src/sched.sch rename to juno_samples/schedule_test/src/cpu.sch diff --git a/juno_samples/schedule_test/src/gpu.sch b/juno_samples/schedule_test/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..edca678ee103ec053f54a8e2caf0ee3e9e721e18 --- /dev/null +++ b/juno_samples/schedule_test/src/gpu.sch @@ -0,0 +1,47 @@ +macro juno-setup!(X) { + //gvn(X); + phi-elim(X); + dce(X); + lift-dc-math(X); +} +macro codegen-prep!(X) { + infer-schedules(X); + dce(X); + gcm(X); + dce(X); + phi-elim(X); + float-collections(X); + gcm(X); +} + + +juno-setup!(*); + +let first = outline(test@outer); +let second = outline(test@row); + +// We can use the functions produced by outlining in our schedules +gvn(first, second, test); + +ip-sroa(*); +sroa(*); + +// We can evaluate expressions using labels and save them for later use +let inner = first@inner; + +// A fixpoint can run a (series) of passes until no more changes are made +// (though some passes seem to make edits even if there are no real changes, +// so this is fragile). 
+// We could just let it run until it converges but can also tell it to panic +// if it hasn't converged after a number of iterations (like here) tell it to +// just stop after a certain number of iterations (stop after #) or to print +// the iteration number (print iter) +fixpoint panic after 2 { + phi-elim(*); +} + +host(*); +gpu(first, second); + +codegen-prep!(*); +//xdot[true](*); diff --git a/juno_samples/schedule_test/src/main.rs b/juno_samples/schedule_test/src/main.rs index 2e63babf29e84bc74a7649306dc28af88225cf39..1505d4e5ff620a53d1095cdc4185a5a6d665e71e 100644 --- a/juno_samples/schedule_test/src/main.rs +++ b/juno_samples/schedule_test/src/main.rs @@ -3,6 +3,8 @@ use rand::random; use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("code"); @@ -26,12 +28,26 @@ fn main() { } } - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURef::from_slice(&b); - let c = HerculesCPURef::from_slice(&c); - let mut r = runner!(test); - let res = r.run(N as u64, M as u64, K as u64, a, b, c).await; - assert_eq!(res.as_slice::<i32>(), &*correct_res); + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&a); + let b = HerculesCPURef::from_slice(&b); + let c = HerculesCPURef::from_slice(&c); + let mut r = runner!(test); + let res = r.run(N as u64, M as u64, K as u64, a, b, c).await; + assert_eq!(res.as_slice::<i32>(), &*correct_res); + } + #[cfg(feature = "cuda")] + { + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b)); + let c = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&c)); + let mut r = runner!(test); + let res = r.run(N as u64, M as u64, K as u64, a.get_ref(), b.get_ref(), c.get_ref()).await; + let mut res_cpu: Box<[i32]> = vec![0; correct_res.len()].into_boxed_slice(); + res.to_cpu_ref(&mut res_cpu); + assert_eq!(&*res_cpu, &*correct_res); + } }); } diff --git a/juno_samples/simple3/Cargo.toml b/juno_samples/simple3/Cargo.toml index 8060c5b3472ad898cb48e011332a852cd7b6705e..36d50dbd75c410b88142bb5ca5d680fbac0be3ce 100644 --- a/juno_samples/simple3/Cargo.toml +++ b/juno_samples/simple3/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_simple3" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs index 94760025d53abe7e10914052e1a7783386b316b0..58c2c5aab14e65facfe5154db13f453be411c55b 100644 --- a/juno_samples/simple3/build.rs +++ b/juno_samples/simple3/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("simple3.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("simple3.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("simple3.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/simple3/src/cpu.sch b/juno_samples/simple3/src/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..7e6be7eefb05b7660d3c27ae9d937ce00cf79a0e --- /dev/null +++ b/juno_samples/simple3/src/cpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +dce(*); +gcm(*); diff --git 
a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..d6c2a9d666c11d4ee07ce89a573ac73d8ea25bff --- /dev/null +++ b/juno_samples/simple3/src/gpu.sch @@ -0,0 +1,17 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.simple3); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/simple3/src/main.rs b/juno_samples/simple3/src/main.rs index 4f9fe6a708ec50fe65b1c31a2823580a117985ce..8eb78f7c93b0f195ffee1be120376dbe3f9a2a62 100644 --- a/juno_samples/simple3/src/main.rs +++ b/juno_samples/simple3/src/main.rs @@ -1,6 +1,8 @@ #![feature(concat_idents)] use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("simple3"); @@ -8,12 +10,22 @@ fn main() { async_std::task::block_on(async { let a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]); let b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]); - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURef::from_slice(&b); - let mut r = runner!(simple3); - let c = r.run(8, a, b).await; - println!("{}", c); - assert_eq!(c, 120); + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&a); + let b = HerculesCPURef::from_slice(&b); + let mut r = runner!(simple3); + let c = r.run(8, a, b).await; + assert_eq!(c, 120); + } + #[cfg(feature = "cuda")] + { + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b)); + let mut r = runner!(simple3); + let c = r.run(8, a.get_ref(), b.get_ref()).await; + assert_eq!(c, 120); + } }); } diff --git a/juno_scheduler/Cargo.toml b/juno_scheduler/Cargo.toml index 04ab156c543fab4112eb0179eb1821a67c817d7a..26055b03fc6fd94fe6440f88a95faf1e2371b404 100644 --- a/juno_scheduler/Cargo.toml +++ b/juno_scheduler/Cargo.toml @@ -4,6 +4,9 @@ version = "0.0.1" authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" +[features] +cuda = [] + [build-dependencies] cfgrammar = "0.13" lrlex = "0.13" diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 584867e5ebc4dcf1771295717315e45af52ff141..2371e0f20e776e97d39596f656594fc679065264 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -178,6 +178,8 @@ pub struct PassManager { pub postdoms: Option<Vec<DomTree>>, pub fork_join_maps: Option<Vec<HashMap<NodeID, NodeID>>>, pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>, + pub fork_control_maps: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, + pub fork_trees: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub loops: Option<Vec<LoopTree>>, pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, @@ -214,6 +216,8 @@ impl PassManager { postdoms: None, fork_join_maps: None, fork_join_nests: None, + fork_control_maps: None, + fork_trees: None, loops: None, reduce_cycles: None, data_nodes_in_fork_joins: None, @@ -337,6 +341,29 @@ impl PassManager { } } + pub fn make_fork_control_maps(&mut self) { + if self.fork_control_maps.is_none() { + self.make_fork_join_nests(); + self.fork_control_maps = Some( + self.fork_join_nests.as_ref().unwrap().iter().map(fork_control_map).collect(), + ); + } + } + + pub fn make_fork_trees(&mut self) { + if self.fork_trees.is_none() { + self.make_fork_join_nests(); + self.fork_trees = Some( + zip( + self.functions.iter(), + 
self.fork_join_nests.as_ref().unwrap().iter(), + ) + .map(|(function, fork_join_nesting)| fork_tree(function, fork_join_nesting)) + .collect(), + ); + } + } + pub fn make_loops(&mut self) { if self.loops.is_none() { self.make_control_subgraphs(); @@ -577,6 +604,10 @@ impl PassManager { self.make_control_subgraphs(); self.make_collection_objects(); self.make_callgraph(); + self.make_def_uses(); + self.make_fork_join_maps(); + self.make_fork_control_maps(); + self.make_fork_trees(); self.make_devices(); let PassManager { @@ -589,6 +620,10 @@ impl PassManager { control_subgraphs: Some(control_subgraphs), collection_objects: Some(collection_objects), callgraph: Some(callgraph), + def_uses: Some(def_uses), + fork_join_maps: Some(fork_join_maps), + fork_control_maps: Some(fork_control_maps), + fork_trees: Some(fork_trees), devices: Some(devices), bbs: Some(bbs), node_colors: Some(node_colors), @@ -612,6 +647,7 @@ impl PassManager { let mut rust_rt = String::new(); let mut llvm_ir = String::new(); + let mut cuda_ir = String::new(); for idx in 0..module.functions.len() { match devices[idx] { Device::LLVM => cpu_codegen( @@ -629,6 +665,26 @@ impl PassManager { pass: "cpu codegen".to_string(), error: format!("{}", e), })?, + Device::CUDA => gpu_codegen( + &module.functions[idx], + &module.types, + &module.constants, + &module.dynamic_constants, + &typing[idx], + &control_subgraphs[idx], + &bbs[idx], + &backing_allocations[&FunctionID::new(idx)], + &collection_objects[&FunctionID::new(idx)], + &def_uses[idx], + &fork_join_maps[idx], + &fork_control_maps[idx], + &fork_trees[idx], + &mut cuda_ir, + ) + .map_err(|e| SchedulerError::PassError { + pass: "cuda codegen".to_string(), + error: format!("{}", e), + })?, Device::AsyncRust => rt_codegen( FunctionID::new(idx), &module, @@ -646,41 +702,76 @@ impl PassManager { pass: "rust codegen".to_string(), error: format!("{}", e), })?, - _ => todo!(), } } println!("{}", llvm_ir); + println!("{}", cuda_ir); println!("{}", rust_rt); + let output_archive = format!("{}/lib{}.a", output_dir, module_name); + println!("{}", output_archive); + // Write the LLVM IR into a temporary file. let tmp_dir = TempDir::new().unwrap(); - let mut tmp_path = tmp_dir.path().to_path_buf(); - tmp_path.push(format!("{}.ll", module_name)); - println!("{}", tmp_path.display()); - let mut file = File::create(&tmp_path).expect("PANIC: Unable to open output LLVM IR file."); + let mut llvm_path = tmp_dir.path().to_path_buf(); + llvm_path.push(format!("{}.ll", module_name)); + println!("{}", llvm_path.display()); + let mut file = File::create(&llvm_path) + .expect("PANIC: Unable to open output LLVM IR file."); file.write_all(llvm_ir.as_bytes()) .expect("PANIC: Unable to write output LLVM IR file contents."); // Compile LLVM IR into an ELF object file. - let output_archive = format!("{}/lib{}.a", output_dir, module_name); + let llvm_object = format!("{}/{}_cpu.o", tmp_dir.path().to_str().unwrap(), module_name); let mut clang_process = Command::new("clang") - .arg(&tmp_path) - .arg("--emit-static-lib") + .arg(&llvm_path) + .arg("-c") .arg("-O3") .arg("-march=native") .arg("-o") - .arg(&output_archive) + .arg(&llvm_object) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .spawn() .expect("Error running clang. 
Is it installed?"); assert!(clang_process.wait().unwrap().success()); + let mut ar_args = vec!["crus", &output_archive, &llvm_object]; + + let cuda_object = format!("{}/{}_cuda.o", tmp_dir.path().to_str().unwrap(), module_name); + if cfg!(feature = "cuda") { + // Write the CUDA IR into a temporary file. + let mut cuda_path = tmp_dir.path().to_path_buf(); + cuda_path.push(format!("{}.cu", module_name)); + let mut file = File::create(&cuda_path) + .expect("PANIC: Unable to open output CUDA IR file."); + file.write_all(cuda_ir.as_bytes()) + .expect("PANIC: Unable to write output CUDA IR file contents."); + + let mut nvcc_process = Command::new("nvcc") + .arg("-c") + .arg("-O3") + .arg("-o") + .arg(&cuda_object) + .arg(&cuda_path) + .spawn() + .expect("Error running nvcc. Is it installed?"); + assert!(nvcc_process.wait().unwrap().success()); + + ar_args.push(&cuda_object); + } + + let mut ar_process = Command::new("ar") + .args(&ar_args) + .spawn() + .expect("Error running ar. Is it installed?"); + assert!(ar_process.wait().unwrap().success()); + // Write the Rust runtime into a file. let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name); println!("{}", output_rt); - let mut file = - File::create(&output_rt).expect("PANIC: Unable to open output Rust runtime file."); + let mut file = File::create(&output_rt) + .expect("PANIC: Unable to open output Rust runtime file."); file.write_all(rust_rt.as_bytes()) .expect("PANIC: Unable to write output Rust runtime file contents.");
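
A note on the link step in juno_scheduler/src/pm.rs above: clang no longer emits the static library directly; the driver now produces one object file per backend and archives them itself, so the CUDA object can be bundled in when the cuda feature is on. The sketch below condenses that sequence; link_module is a hypothetical helper name, the tool flags are the ones the driver passes above, and the .ll/.cu sources are assumed to already exist under tmp. It is a sketch of the sequence, not the driver's actual code.

use std::process::Command;

// lib<module>.a is assembled from <module>_cpu.o, plus <module>_cuda.o when
// CUDA codegen ran. This mirrors the clang / nvcc / ar sequence above.
fn link_module(module_name: &str, output_dir: &str, tmp: &str, cuda: bool) {
    let archive = format!("{}/lib{}.a", output_dir, module_name);
    let cpu_object = format!("{}/{}_cpu.o", tmp, module_name);

    // LLVM IR -> ELF object.
    let clang = Command::new("clang")
        .arg(format!("{}/{}.ll", tmp, module_name))
        .arg("-c")
        .arg("-O3")
        .arg("-march=native")
        .arg("-o")
        .arg(&cpu_object)
        .status()
        .expect("Error running clang. Is it installed?");
    assert!(clang.success());

    let mut objects = vec![cpu_object];
    if cuda {
        // CUDA source -> object, only when the cuda feature is enabled.
        let cuda_object = format!("{}/{}_cuda.o", tmp, module_name);
        let nvcc = Command::new("nvcc")
            .arg("-c")
            .arg("-O3")
            .arg("-o")
            .arg(&cuda_object)
            .arg(format!("{}/{}.cu", tmp, module_name))
            .status()
            .expect("Error running nvcc. Is it installed?");
        assert!(nvcc.success());
        objects.push(cuda_object);
    }

    // Bundle every backend object into the single archive the samples link against.
    let ar = Command::new("ar")
        .arg("crus")
        .arg(&archive)
        .args(&objects)
        .status()
        .expect("Error running ar. Is it installed?");
    assert!(ar.success());
}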
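
The new fork_control_maps and fork_trees fields in PassManager follow the same lazy-analysis convention as the existing ones: each make_* method first forces the analysis it depends on (here make_fork_join_nests), fills its Option exactly once, and codegen later destructures the manager expecting every required analysis to be Some. A stripped-down sketch of that convention, with hypothetical names (Manager, compute_base, derive) standing in for the real fork-join analyses:

use std::collections::HashMap;

struct Manager {
    // Each cached analysis starts as None and is computed on demand.
    base: Option<Vec<HashMap<u32, Vec<u32>>>>,
    derived: Option<Vec<HashMap<u32, u32>>>,
}

impl Manager {
    fn make_base(&mut self) {
        if self.base.is_none() {
            self.base = Some(compute_base());
        }
    }

    fn make_derived(&mut self) {
        if self.derived.is_none() {
            // Force the prerequisite first, then derive from the cached result.
            self.make_base();
            self.derived = Some(self.base.as_ref().unwrap().iter().map(derive).collect());
        }
    }
}

// Placeholder analyses standing in for the real per-function computations.
fn compute_base() -> Vec<HashMap<u32, Vec<u32>>> { Vec::new() }
fn derive(_per_function: &HashMap<u32, Vec<u32>>) -> HashMap<u32, u32> { HashMap::new() }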
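
Every sample main above is gated the same way by the cuda cargo feature: the CPU path hands the runner borrowed host slices, while the CUDA path copies inputs into device memory with CUDABox, passes device references to the runner, and (for collection results) copies the output back with to_cpu_ref; scalar results come back by value on either path. A minimal sketch of that shape, assuming a Juno module and entry point both named demo that take one dynamic constant and one i32 array (those names and the signature are placeholders; the hercules_rt calls are the ones used in the samples above):

#![feature(concat_idents)]

use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;

juno_build::juno!("demo"); // hypothetical module name

fn main() {
    async_std::task::block_on(async {
        let input: Box<[i32]> = (0..16).collect();

        #[cfg(not(feature = "cuda"))]
        {
            // CPU path: borrow host memory directly.
            let input = HerculesCPURef::from_slice(&input);
            let mut r = runner!(demo);
            let out = r.run(16u64, input).await;
            let _result = out.as_slice::<i32>();
        }

        #[cfg(feature = "cuda")]
        {
            // GPU path: copy to the device, run, copy the collection result back.
            let input = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&input));
            let mut r = runner!(demo);
            let out = r.run(16u64, input.get_ref()).await;
            let mut result: Box<[i32]> = vec![0; 16].into_boxed_slice();
            out.to_cpu_ref(&mut result);
        }
    });
}

Building a sample with --features cuda selects the GPU block and, through the [features] entries above, enables juno_build/cuda and hercules_rt/cuda for that sample.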
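
The build scripts are gated the same way: without the cuda feature a sample compiles its .jn file with no explicit schedule, and with it the matching gpu.sch is attached via schedule_in_src; schedule_test shows a more compact form that picks the schedule file with cfg! inside a single builder chain. A sketch of that single-chain style for a hypothetical sample (file names are placeholders, and it assumes, as the chained calls above imply, that each JunoCompiler method consumes and returns the builder):

use juno_build::JunoCompiler;

fn main() {
    // Start the chain once; only the schedule differs between the two builds.
    let compiler = JunoCompiler::new()
        .file_in_src("sample.jn")
        .unwrap();
    let compiler = if cfg!(feature = "cuda") {
        // Attach the GPU schedule only for cuda builds.
        compiler.schedule_in_src("gpu.sch").unwrap()
    } else {
        compiler
    };
    compiler.build().unwrap();
}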