diff --git a/hercules_cg/src/device.rs b/hercules_cg/src/device.rs
index 09a5bc2689ecf3ed1a7186f2ada52758c7b19fbb..866fa6adeeb41c2c8106a1b7de5ce1ffdc45db40 100644
--- a/hercules_cg/src/device.rs
+++ b/hercules_cg/src/device.rs
@@ -9,8 +9,6 @@ pub fn device_placement(functions: &Vec<Function>, callgraph: &CallGraph) -> Vec
     let mut devices = vec![];
 
     for (idx, function) in functions.into_iter().enumerate() {
-        devices.push(Device::CUDA);
-        continue;
         if let Some(device) = function.device {
             devices.push(device);
         } else if function.entry || callgraph.num_callees(FunctionID::new(idx)) != 0 {
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 499ecce8f8da47aaec0d7f72b6f5805c7a6135bf..ab6e8f41f1c2262e8ea60c89c0a596caafc18f0d 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -31,24 +31,24 @@ pub fn gpu_codegen<W: Write>(
      * one of multiple parameters.
      *
      * We don't assert but assume the following:
-     * - max_num_blocks in KernelParams is within constraint of 1D grid size. This 
+     * - max_num_blocks in KernelParams is within constraint of 1D grid size. This
      *   can be relaxed if we want to support larger grids.
      * - Product types are packed with padding inserted for each element to
     *   be aligned for its type and for full product to be aligned to its
      *   largest element
      * - Summation types must be aligned to their largest element
-     * 
+     *
      * Notes on GPU parallelization strategy and tips for IR transformations:
      * - The top level block fork and any lower thread forks require a known Fork
-     *   size. Thus for an otherwise parallelizable Fork with unknown size, 
+     *   size. Thus for an otherwise parallelizable Fork with unknown size,
      *   consider splitting it into two Forks with one of known size. For block
      *   level, the known fork has to be the (only) top-most fork.
-     * - The thread-level strategy is determined by starting at the most nested 
+     * - The thread-level strategy is determined by starting at the most nested
      *   Forks and working outwards in a greedy manner, with caps by GPU spec.
      *   Thus, to ensure some outer Fork is parallelized, ensure the inner
      *   parallelizable Forks aren't too large or consider removing schedule
      *   annotations.
-     * - Tight-Associative reductions can only be efficiently implemented if 
+     * - Tight-Associative reductions can only be efficiently implemented if
      *   different Hercules ThreadIDs correspond to consecutive CUDA threads. But
      *   this prevents nested parallelization since each parallel group must always
      *   be a contiguous tile of threads. We use a heuristic of choosing the larger
@@ -59,10 +59,10 @@ pub fn gpu_codegen<W: Write>(
      *   Fork contains expensive parallelizable operations, ensure all reductions
      *   are parallelizable or if not try pulling those out into a different Fork.
      * - We do nothing to mitigate intra-warp divergence. To mitigate this, the
-     *   IR, for example, should ensure the innermost parallelizable Forks either 
+     *   IR, for example, should ensure the innermost parallelizable Forks either
      *   have factor >= warp size (32) or remove Fork/Reduce node schedule
      *   annotations.
-     * 
+     *
      * Main TODOs:
      * - Fix dynamic shared memory allocation to reuse old shmem. The main case
      *   for improvement is when we have serialized forks with unused intermediate
@@ -135,7 +135,7 @@ pub fn gpu_codegen<W: Write>(
     }
 
     // Obtain the Return node and if it's a collection, use the collection objects
-    // analysis to determine the origin. Also save the return node id for later 
+    // analysis to determine the origin. Also save the return node id for later
     // conversion of primitive Return into Parameter.
     let (return_node_id, data_node_id) = {
         let pos = function
@@ -248,11 +248,11 @@ struct GPUContext<'a> {
 }
 
 /*
- * For all control nodes besides forks, Init, Body, and Term compose the main basic 
- * block, with Init and Term populated by control flow (Init used only by Fork and 
- * Join) and Body populated by data flow. 
+ * For all control nodes besides forks, Init, Body, and Term compose the main basic
+ * block, with Init and Term populated by control flow (Init used only by Fork and
+ * Join) and Body populated by data flow.
  * For serialized Fork nodes which may be jumped back to by corresponding Join node,
- * init and post_init separate one-time code (currently just cooperative group 
+ * init and post_init separate one-time code (currently just cooperative group
  * creation) from repeated code.
  */
 #[derive(Default, Debug)]
@@ -279,9 +279,9 @@ enum KernelState {
 }
 
 /*
- * CGType is used to track cooperative group types. UsePerId is the group of (CUDA) 
+ * CGType is used to track cooperative group types. UsePerId is the group of (CUDA)
  * threads for a current ThreadID, Use is the union of such threads for all ThreadIDs
- * in the current innermost Fork, and Available is Use plus additional threads not 
+ * in the current innermost Fork, and Available is Use plus additional threads not
  * used in the current Fork.
  */
 #[derive(Clone, Copy, PartialEq, Debug)]
@@ -348,7 +348,7 @@ impl GPUContext<'_> {
 
         // Emit host launch code
         let mut host_launch = String::new();
-        self.codegen_launch_code(true, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?;
+        self.codegen_launch_code(false, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?;
         write!(w, "{}", host_launch)?;
 
         Ok(())
@@ -375,7 +375,7 @@ namespace cg = cooperative_groups;
 #define roundi(a) (a)
 #define isqrt(a) ((int)sqrtf((float)(a)))
 
-", 
+",
         )?;
 
         write!(
@@ -463,12 +463,12 @@ namespace cg = cooperative_groups;
         Ok(())
     }
 
-    // To abide by c++ reassignment restrictions, we declare all data values 
+    // To abide by c++ reassignment restrictions, we declare all data values
     // upfront.
     fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> {
         for id in (0..self.function.nodes.len()).map(NodeID::new) {
-            if !self.function.nodes[id.idx()].is_control() && 
-                !self.function.nodes[id.idx()].is_dynamic_constant() && 
+            if !self.function.nodes[id.idx()].is_control() &&
+                !self.function.nodes[id.idx()].is_dynamic_constant() &&
                 !self.function.nodes[id.idx()].is_parameter() {
                 write!(w, "\t{};\n", self.get_value(id, true, false))?;
             }
@@ -477,8 +477,8 @@ namespace cg = cooperative_groups;
     }
 
     /*
-     * Emit helper registers that are used throughout the kernel. grid and block 
-     * are from CUDA's cooperative groups API and are used specifically for reads 
+     * Emit helper registers that are used throughout the kernel. grid and block
+     * are from CUDA's cooperative groups API and are used specifically for reads
      * and writes.
      */
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
@@ -517,7 +517,7 @@ namespace cg = cooperative_groups;
         write!(w, "
 int main() {{
 ")?;
-        // The following steps are for host-side C function arguments, but we also 
+        // The following steps are for host-side C function arguments, but we also
         // need to pass arguments to kernel, so we keep track of the arguments here.
         let mut pass_args = String::new();
         if run_debug {
@@ -586,8 +586,8 @@ int main() {{
             }
             write!(w, "\treturn 0;\n");
             write!(w, "}}\n");
-        } 
-        
+        }
+
         else {
             // The first set of parameters are dynamic constants.
             let mut first_param = true;
@@ -633,9 +633,9 @@ int main() {{
      * a) domination by F
      * b) no domination by F's join
      * c) no domination by any other fork that's also dominated by F, where we don't count self-domination
-     * Note that the fork_tree also includes the start node, to include all controls 
+     * Note that the fork_tree also includes the start node, to include all controls
      *   outside any fork.
-     * 
+     *
      * Second, fork_control_map is a map from fork node to all control nodes (including itself) satisfying:
      * a) domination by F
      * b) no domination by F's join
@@ -665,8 +665,8 @@ int main() {{
 
     /*
      * If tree has a single root fork of known size s <= max_num_blocks
-     * with parallel-fork schedule, then set num_blocks to s, else set num_blocks 
-     * to 1. Also return the root fork(s) for parallelization strategy within 
+     * with parallel-fork schedule, then set num_blocks to s, else set num_blocks
+     * to 1. Also return the root fork(s) for parallelization strategy within
      * threadblocks for threads and their eventual generation.
      */
     fn get_root_forks_and_num_blocks(
@@ -720,10 +720,10 @@ int main() {{
      * maximum over its descendants (leafs have base 1). We traverse up (details
      * in helper) and pass the factor and a map from fork node to a tuple of
      * (max quota of its siblings (including itself), its quota, its fork factor)
-     * from each node to its parents. The parent then compares 
+     * from each node to its parents. The parent then compares
      * - all three are needed for codegen. A node is in the map IFF it will be parallelized.
-     * If not, the fork will use the parent's quota and serialize over the Fork's 
-     * ThreadIDs. Nodes may be removed from the map when traversing up the tree 
+     * If not, the fork will use the parent's quota and serialize over the Fork's
+     * ThreadIDs. Nodes may be removed from the map when traversing up the tree
      * due to an ancestor having a larger factor that conflicts.
      */
     fn get_thread_quotas(
@@ -743,9 +743,9 @@ int main() {{
         is_root: bool,
     ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) {
         // Subsubtree map is the union of all keys for grandchildren and lower
-        // nodes. children_quota_map is a constructed map from parallelized children 
+        // nodes. children_quota_map is a constructed map from parallelized children
         // to their quota to update the subsubtree map at grandchildren level to
-        // subtreemap at children level. subtree_quota is cumulative factor of 
+        // subtreemap at children level. subtree_quota is cumulative factor of
         // subtree and is then compared to this fork's factor.
         let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree
             .get(&curr_fork)
@@ -762,7 +762,7 @@ int main() {{
                     (subsubtree_map, children_quota_map, subtree_quota.max(curr_quota))
                 },
             );
-        // First update children_quota_map items with full information and add 
+        // First update children_quota_map items with full information and add
         // to subsubtree_map
         for (&child, quota) in children_quota_map.iter() {
             let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else {
@@ -780,7 +780,7 @@ int main() {{
         // b) the known size is less than or equal to the max_num_threads
         // c) the known size is a power of 2
         // d) all reduces are parallel-reduce or associative
-        // 
+        //
        // If not, just take the max cumulative factor of its subtree
         let reduces = &self.fork_reduce_map[&curr_fork];
         if let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()]
@@ -792,12 +792,12 @@ int main() {{
                     || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative)
             })
         {
-            // If there's an associative Reduce, parallelize the larger factor 
+            // If there's an associative Reduce, parallelize the larger factor
             // between the Fork and subtree
-            // Else, all Reduces must be only parallel-reduce, so parallelize 
+            // Else, all Reduces must be only parallel-reduce, so parallelize
             // both if they fit and the larger if not.
             // The reason for this distinction is that we only perform Reduces over
-            // ThreadID-based values over consecutive CUDA threads, so there's no 
+            // ThreadID-based values over consecutive CUDA threads, so there's no
             // opportunity for further nested parallelization. In contrast, this
             // restriction doesn't help for parallel Writes, so nested parallelization
             // is possible.
@@ -817,10 +817,10 @@ int main() {{
         }
     }
 
-    /* 
+    /*
      * All non reduced-over collections used in fork joins have an extra dimension.
-     * However, this is only useful if ThreadIDs run in parallel not serially, 
-     * otherwise it's unnecessarily consuming shared memory. This function returns 
+     * However, this is only useful if ThreadIDs run in parallel not serially,
+     * otherwise it's unnecessarily consuming shared memory. This function returns
      * the set of collections that have an unnecessary extra dimension.
      */
     fn get_extra_dim_collects(
@@ -1036,8 +1036,8 @@ int main() {{
                     let fork_iter = self.get_fork_iter(*control, false);
                     write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?;
                 } else {
-                    // We can directly use use_thread_quota and not worry about available 
-                    // because Fork basic block's init section already does gating 
+                    // We can directly use use_thread_quota and not worry about available
+                    // because Fork basic block's init section already does gating
                     write!(w, "{}{} = (threadIdx.x % {}) / {};\n", tabs, define_variable, use_thread_quota.unwrap(), use_thread_quota.unwrap() / parallel_factor.unwrap())?;
                 }
             }
@@ -1046,8 +1046,8 @@ int main() {{
                     }
                 }
             }
-            // The Reduce node only generates it's initialization, as reduct will 
-            // perform the update. If serialized, add gate to prevent re-assignment 
+            // The Reduce node only generates it's initialization, as reduct will
+            // perform the update. If serialized, add gate to prevent re-assignment
             // when we hit this reduce again due to the control flow loop between
             // the Fork and Join.
             Node::Reduce {
@@ -1076,7 +1076,7 @@ int main() {{
             Node::Constant { id: cons_id } => {
                 let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
                 if (!is_primitive) {
-                    let cg_tile = { 
+                    let cg_tile = {
                         let KernelState::OutBlock = state
                         else {
                             panic!("Expected constant to be in start basic block outside any fork");
@@ -1153,9 +1153,9 @@ int main() {{
                 let left_val = self.get_value(*left, false, false);
                 let right_val = self.get_value(*right, false, false);
                 let id_type = self.typing[id.idx()];
-                if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And 
+                if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And
                     | BinaryOperator::Xor) && is_special_reduct {
-                    // For parallelized associative Reduces, use the cooperative 
+                    // For parallelized associative Reduces, use the cooperative
                     // groups reduce API. Associative multiplication is not
                     // supported. We need to use CGType::Use not CGType::UsePerId
                     // because for parallelized reduction we only have one thread
@@ -1349,7 +1349,7 @@ int main() {{
                 panic!("Unsupported data node type")
             }
         }
-        // Since the data uses and reducts are responsible for updating Phi and 
+        // Since the data uses and reducts are responsible for updating Phi and
         // Reduce nodes, respectively, we check and emit those for each data node.
         if let Some(phis) = self.label_data_for_phi.get(&id) {
             let val = self.get_value(id, false, false);
@@ -1403,12 +1403,12 @@ int main() {{
             }
             Node::Fork { control: _, factors: _ } => {
                 // We create a cooperative group tile for each of: used threads per
-                // thread ID- for reads and writes-, used threads across all thread 
+                // thread ID- for reads and writes-, used threads across all thread
                 // IDs- for parallelized reductions-, and available threads- to
                 // synchronize between used and unused threads. We want to create
                 // these only once, so we create two goto sections for each fork-
                 // one run only once for creating groups, and other may be ran
-                // multiple times if the Fork is serialized and Join jumps back 
+                // multiple times if the Fork is serialized and Join jumps back
                 // to it.
                 let cg_tile = self.get_cg_tile(id, CGType::UsePerId);
                 if use_thread_quota.is_some() {
@@ -1453,7 +1453,7 @@ int main() {{
                 }
             }
             Node::Join { control: _ } => {
-                // Join nodes also gate the used vs unused threads with a tile 
+                // Join nodes also gate the used vs unused threads with a tile
                 // sync after the body.
                 let succ = self.control_subgraph.succs(id).next().unwrap();
                 let has_thread_quota = available_thread_quota.is_some();
@@ -1516,7 +1516,7 @@ int main() {{
     /*
      * This function emits collection name + pointer math for the provided indices.
      * One nuance is whether the collection is represented as char pointer or
-     * the original primitive pointer. For Field, it's always char, for Variant, 
+     * the original primitive pointer. For Field, it's always char, for Variant,
      * it doesn't matter here, and for Array, it depends- so we may need to tack
      * on the element size to the index math.
      */
@@ -1571,12 +1571,12 @@ int main() {{
     /*
      * The outlined codegen for constants allows us to handle recursive initialization
      * for collections. We perform "allocation" by atomically incrementing dynamic
-     * shared memory and CUDA's support for dynamic is limited to a single extern 
+     * shared memory and CUDA's support for dynamic is limited to a single extern
      * array. Dynamic is required here because not all dynamic constants and therefore
-     * array sizes are known. This approach will need further work, as currently 
-     * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate` 
-     * prevents unnecessary shared memory allocations for nested product and summation 
-     * collections, since the outermost allocates everything for the full collection. 
+     * array sizes are known. This approach will need further work, as currently
+     * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate`
+     * prevents unnecessary shared memory allocations for nested product and summation
+     * collections, since the outermost allocates everything for the full collection.
      * Since not initialized, array collections don't need to be recursed into.
      */
     fn codegen_constant(
@@ -1911,7 +1911,7 @@ int main() {{
 
     /*
      * Setting `ty = true` will return with type in declaration format. `make_pointer`
-     * is only considered if `ty = true` and only relevant for primitive types- 
+     * is only considered if `ty = true` and only relevant for primitive types-
      * otherwise it makes no difference because collections are already pointers.
      */
     fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String {