From f0b044a3ade131982860a35b3086f72de9ce5c56 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Wed, 25 Dec 2024 08:37:33 -0800 Subject: [PATCH 001/109] manual rebase --- hercules_cg/src/gpu.rs | 181 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 hercules_cg/src/gpu.rs diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs new file mode 100644 index 00000000..c54e5810 --- /dev/null +++ b/hercules_cg/src/gpu.rs @@ -0,0 +1,181 @@ +extern crate bitvec; +extern crate hercules_ir; + +use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::fmt::{Error, Write}; +use std::iter::{zip, FromIterator}; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use self::bitvec::prelude::*; + +use self::hercules_ir::*; + +use crate::*; + +static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0); + +/* + * The top level function to compile a Hercules IR function into NVVM IR kernel for + * execution on the GPU. We generate NVVM IR textually, copying from the CPU LLVM approach. + */ +pub fn gpu_codegen<W: Write>( + function: &Function, + types: &Vec<Type>, + constants: &Vec<Constant>, + dynamic_constants: &Vec<DynamicConstant>, + reverse_postorder: &Vec<NodeID>, + typing: &Vec<TypeID>, + control_subgraph: &Subgraph, + data_nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>, + bbs: &Vec<NodeID>, + w: &mut W, +) -> Result<(), Error> { + // temporary hardcoded values + let kernel = GPUKernel { + max_num_blocks: 1024, + max_num_threads: 1024, + threads_per_warp: 32, + }; + let ctx = GPUContext { + function, + types, + constants, + dynamic_constants, + reverse_postorder, + typing, + control_subgraph, + bbs, + structs: HashSet::new(), + w, + kernel, + }; + ctx.codegen_function() +} + +struct GPUContext<'a, W: Write> { + function: &'a Function, + types: &'a Vec<Type>, + constants: &'a Vec<Constant>, + dynamic_constants: &'a Vec<DynamicConstant>, + reverse_postorder: &'a Vec<NodeID>, + typing: &'a Vec<TypeID>, + control_subgraph: &'a Subgraph, + bbs: &'a Vec<NodeID>, + structs: HashSet<usize>, + w: &'a mut W, + kernel: GPUKernel, +} + +struct GPUKernel { + max_num_blocks: usize, + max_num_threads: usize, + threads_per_warp: usize, +} + +#[derive(Default, Debug)] +struct CudaBlock { + label: String, + body: String, +} + +impl<'a, W: Write> GPUContext<'a, W> { + fn codegen_function(&self) -> Result<(), Error> { + // Static content and function signature + write!( + self.w, + " +#include <assert.h> +#include <stdio.h> +#include <cuda.h> +#include <cuda_runtime.h> +#include <mma.h> +#include <helper_cuda.h> +", + )?; + + let mut function_signature = String::new(); + write!(&mut function_signature, "template <")?; + // The dynamic constants become template parameters. + let mut first_template_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(&mut function_signature, ", ")?; + } + write!(&mut function_signature, "long long int dc_p{}", idx)?; + } + write!(&mut function_signature, ">\n")?; + + write!(&mut function_signature, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_blocks, self.function.name)?; + // The second set of parameters are normal arguments. 
+ let mut first_param = true; + for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(&mut function_signature, ", ")?; + } + write!(&mut function_signature, "{} p{}", self.get_type(*ty)?, idx)?; + } + write!(&mut function_signature, ") {\n")?; + + // do actual stuff + // step 1. determine number of outermost fork joins at block level. we greedily accept while: a) total number of blocks < max_num_blocks, b) each fork join is strictly nested meaning no other neighbor fork joins, and c) each fork join's + + // finish kernel + write!(&mut function_signature, "}\n")?; + } + + // matmul detection- only called if einsum detected + fn matmul_detection(&self, w: &mut W) -> Result<(), Error> {} + + // convolution detection- only called if einsum detected + fn convolution_detection(&self, w: &mut W) -> Result<(), Error> {} + + fn get_type(&self, id: TypeID) -> Result<String, Error> { + match self.types[id.idx()] { + Type::Product(ref product_ty_ids) => { + if !self.structs.contains(&id.idx()) { + write!(self.w, "\nstruct Product_{} {{\n", id.idx())?; + for (i, product_ty_id) in product_ty_ids.iter().enumerate() { + write!(self.w, "\t{} field_{};\n", self.get_type(*product_ty_id)?, i)?; + } + write!(self.w, "}};\n")?; + self.structs.insert(id.idx()); + } + Ok(format!("Product_{}", id.idx())) + } + Type::Summation(ref summation_ty_ids) => { + if !self.structs.contains(&id.idx()) { + write!(self.w, "\nstruct Summation_{} {{\n\t union {{\n", id.idx())?; + for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { + write!(self.w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id)?, i)?; + } + write!(self.w, "\t}};\n\tuint8_t tag;\n}};\n")?; + self.structs.insert(id.idx()); + } + Ok(format!("Summation_{}", id.idx())) + } + _ => Ok(convert_type(&self.types[id.idx()])), + } + } + + // TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 + fn convert_type(ty: &Type) -> String { + match ty { + Type::Boolean => "bool".to_string(), + Type::Integer8 => "int8_t".to_string(), + Type::UnsignedInteger8 => "uint8_t".to_string(), + Type::Integer16 => "short".to_string(), + Type::UnsignedInteger16 => "unsigned short".to_string(), + Type::Integer32 => "int".to_string(), + Type::UnsignedInteger32 => "unsigned int".to_string(), + Type::Integer64 => "long long".to_string(), + Type::UnsignedInteger64 => "unsigned long long".to_string(), + Type::Float32 => "float".to_string(), + Type::Float64 => "double".to_string(), + _ => panic!("Unsupported type"), + } + } +} \ No newline at end of file -- GitLab From 61bb84dc895cd32891688ca7825999e024e71d8e Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Thu, 26 Dec 2024 18:15:54 -0800 Subject: [PATCH 002/109] before goto test --- .gitignore | 1 + hercules_cg/src/gpu.rs | 221 ++++++++++++++++++++++++++++++++++------- 2 files changed, 187 insertions(+), 35 deletions(-) diff --git a/.gitignore b/.gitignore index f8a684ce..22c9343e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ *.hrt .*.swp .vscode +*_env diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index c54e5810..e177f420 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -31,11 +31,13 @@ pub fn gpu_codegen<W: Write>( w: &mut W, ) -> Result<(), Error> { // temporary hardcoded values - let kernel = GPUKernel { + let kernel_params = GPUKernelParams { max_num_blocks: 1024, max_num_threads: 1024, threads_per_warp: 32, + num_smps: 60, // ¯\_(ツ)_/¯ 
}; + let mut kernel_attrs = GPUKernelAttrs::default(); let ctx = GPUContext { function, types, @@ -45,13 +47,27 @@ pub fn gpu_codegen<W: Write>( typing, control_subgraph, bbs, - structs: HashSet::new(), + cuda_structs: HashSet::new(), w, - kernel, + kernel_params, + &mut kernel_attrs, }; ctx.codegen_function() } +struct GPUKernelParams { + max_num_blocks: usize, + max_num_threads: usize, + threads_per_warp: usize, + num_smps: usize, +} + +#[derive(Default)] +struct GPUKernelAttrs { + num_blocks: usize, + num_threads: usize, +} + struct GPUContext<'a, W: Write> { function: &'a Function, types: &'a Vec<Type>, @@ -61,20 +77,15 @@ struct GPUContext<'a, W: Write> { typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, bbs: &'a Vec<NodeID>, - structs: HashSet<usize>, + cuda_structs: HashSet<usize>, w: &'a mut W, - kernel: GPUKernel, -} - -struct GPUKernel { - max_num_blocks: usize, - max_num_threads: usize, - threads_per_warp: usize, + kernel_params: GPUKernelParams, + kernel_attrs: &'a mut GPUKernelAttrs, } #[derive(Default, Debug)] -struct CudaBlock { - label: String, +struct CudaGoto { + header: String, body: String, } @@ -93,67 +104,79 @@ impl<'a, W: Write> GPUContext<'a, W> { ", )?; - let mut function_signature = String::new(); - write!(&mut function_signature, "template <")?; + let mut kernel_begin = String::new(); + write!(&mut kernel_begin, "template <")?; // The dynamic constants become template parameters. let mut first_template_param = true; for idx in 0..self.function.num_dynamic_constants { if first_param { first_param = false; } else { - write!(&mut function_signature, ", ")?; + write!(&mut kernel_begin, ", ")?; } - write!(&mut function_signature, "long long int dc_p{}", idx)?; + write!(&mut kernel_begin, "long long int dc_p{}", idx)?; } - write!(&mut function_signature, ">\n")?; + write!(&mut kernel_begin, ">\n")?; - write!(&mut function_signature, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_blocks, self.function.name)?; + write!(&mut kernel_begin, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_threads, self.function.name)?; // The second set of parameters are normal arguments. let mut first_param = true; for (idx, ty) in self.function.param_types.iter().enumerate() { if first_param { first_param = false; } else { - write!(&mut function_signature, ", ")?; + write!(&mut kernel_begin, ", ")?; } - write!(&mut function_signature, "{} p{}", self.get_type(*ty)?, idx)?; + write!(&mut kernel_begin, "{} p{}", self.get_type(*ty)?, idx)?; } - write!(&mut function_signature, ") {\n")?; + write!(&mut kernel_begin, ") {\n")?; - // do actual stuff - // step 1. determine number of outermost fork joins at block level. we greedily accept while: a) total number of blocks < max_num_blocks, b) each fork join is strictly nested meaning no other neighbor fork joins, and c) each fork join's + // Uses CUDA's goto structure; we will not gen for all control nodes, eg block and thread fork joins. + let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_control()) + .map(|idx| (NodeID::new(idx), CudaGoto::default())) + .collect(); - // finish kernel - write!(&mut function_signature, "}\n")?; - } + // step 1. Assign outermost fork joins to block level + let (block_fork_ids, fork_sizes) = self.codegen_block_creation(&mut kernel_begin)?; + // step 2. Sink logic from outer block fork joins. If it's a write, add necessary block-id based qualifier. For now, it's done naively at the top of the kernel. 
+ let mut block_stride = self.kernel_attrs.num_blocks; + gotos[NodeID::new(0)].header = "start_sink".to_string(); + self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut gotos)?; + for (i, fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { + block_stride = block_stride.saturating_div(fork_sizes[i]); + gotos[fork_id].header = format!("block_sink_{}", fork_id.idx()); + self.codegen_block_sink(fork_id, block_fork_ids[i + 1], block_stride, &mut gotos)?; + } - // matmul detection- only called if einsum detected - fn matmul_detection(&self, w: &mut W) -> Result<(), Error> {} - // convolution detection- only called if einsum detected - fn convolution_detection(&self, w: &mut W) -> Result<(), Error> {} + // step 3. determine fork joins at warp/thread level. a) rather than strict nest, neighbors are allowed and we get sequence of edge (aka innermost at thread level) fork joins rather than single. b) if innermost is = threads_per_warp, we can use warp-level features. c) for now punt on implementation but can likely run einsum -> matmul/conv detector on hierarhical fork joins between block edge and given thread edge. + + // finish kernel + write!(&mut kernel_begin, "}\n")?; + } fn get_type(&self, id: TypeID) -> Result<String, Error> { match self.types[id.idx()] { Type::Product(ref product_ty_ids) => { - if !self.structs.contains(&id.idx()) { + if !self.cuda_structs.contains(&id.idx()) { write!(self.w, "\nstruct Product_{} {{\n", id.idx())?; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { write!(self.w, "\t{} field_{};\n", self.get_type(*product_ty_id)?, i)?; } write!(self.w, "}};\n")?; - self.structs.insert(id.idx()); + self.cuda_structs.insert(id.idx()); } Ok(format!("Product_{}", id.idx())) } Type::Summation(ref summation_ty_ids) => { - if !self.structs.contains(&id.idx()) { + if !self.cuda_structs.contains(&id.idx()) { write!(self.w, "\nstruct Summation_{} {{\n\t union {{\n", id.idx())?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { write!(self.w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id)?, i)?; } write!(self.w, "\t}};\n\tuint8_t tag;\n}};\n")?; - self.structs.insert(id.idx()); + self.cuda_structs.insert(id.idx()); } Ok(format!("Summation_{}", id.idx())) } @@ -178,4 +201,132 @@ impl<'a, W: Write> GPUContext<'a, W> { _ => panic!("Unsupported type"), } } + + // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. 
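Editor's note: the greedy policy described in the comment above reduces to a running product over the strictly nested fork sizes that stops as soon as the block budget would be exceeded. A minimal standalone sketch of that cutoff (not the compiler's own code; the fork sizes here stand in for already-evaluated dynamic-constant products):

// Accept nested forks greedily until the block budget would be exceeded.
// Returns how many forks were accepted and the resulting block count.
fn take_block_forks(fork_sizes: &[usize], max_num_blocks: usize) -> (usize, usize) {
    let mut accepted = 0;
    let mut cumulative_blocks = 1usize;
    for &size in fork_sizes {
        let next = cumulative_blocks.saturating_mul(size);
        if next > max_num_blocks {
            break;
        }
        cumulative_blocks = next;
        accepted += 1;
    }
    (accepted, cumulative_blocks)
}

fn main() {
    // Fork sizes 8, 16, 32 with a 1024-block budget: 8 and 16 are accepted
    // (128 blocks total); also taking 32 would require 4096 blocks.
    assert_eq!(take_block_forks(&[8, 16, 32], 1024), (2, 128));
}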
+ fn codegen_block_creation(&self, kernel_body: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { + // a) + let mut fork_forward_adjacency: HashMap::<NodeID, Vec<NodeID>> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_fork()) + .map(|idx| (NodeID::new(idx), vec![])) + .collect(); + let mut unhandled_fork_nodes = VecDeque::new(); + + for (fork_node, children) in fork_forward_adjacency.iter_mut() { + unhandled_fork_nodes.push_back(*fork_node); + while !unhandled_fork_nodes.is_empty() { + let fork_node = unhandled_fork_nodes.pop_front().unwrap(); + let fork_node_children = self.function.nodes[fork_node.idx()].children(); + for child in fork_node_children { + if self.function.nodes[child.idx()].is_fork() { + children.push(child); + } else if !self.function.nodes[child.idx()].is_join() { + unhandled_fork_nodes.push_back(child); + } + } + } + } + + let mut root_forks: HashSet<NodeID> = fork_forward_adjacency.keys().copied().collect(); + for (fork_node, children) in fork_forward_adjacency.iter() { + for child in children { + root_forks.remove(child); + } + } + let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); + if root_forks.len() != 1 { + return vec![]; + } + + let mut strict_forks = vec![root_forks[0]]; + let mut curr_fork = root_forks[0]; + while fork_join_map.get(&curr_fork).is_some() { + let children = &fork_forward_adjacency[&curr_fork]; + if children.len() != 1 { + break; + } + curr_fork = children[0]; + strict_forks.push(curr_fork); + } + + // b, (stronger version of) c, and d + let mut valid_forks = 0; + let mut cumulative_blocks = 1usize; + let mut fork_sizes = Vec::new(); + + for fork in strict_forks.iter() { + if !self.function.schedules[fork.idx()].contains(&Schedule::Vectorizable) { + break; + } + let factors = match &self.function.nodes[fork.idx()] { + Node::Fork { factors, .. } => factors, + _ => return Err(Error::new("Expected Fork node in strict_forks")) + }; + let fork_size = factors.iter() + .try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, &self.dynamic_constants) + .ok_or_else(|| Error::new("Fork factors must be evaluatable to constants")) + .map(|val| acc.saturating_mul(val)) + })?; + let new_blocks = cumulative_blocks.saturating_mul(fork_size); + if new_blocks > self.kernel_params.max_num_blocks { + break; + } + cumulative_blocks = new_blocks; + fork_sizes.push(fork_size); + valid_forks += 1; + } + + self.kernel_attrs.num_blocks = cumulative_blocks; + let valid_forks = strict_forks.into_iter() + .take(valid_forks) + .collect::<Vec<_>>(); + + // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected by this. For now, keep it simple. + if valid_forks.len() != 0 { + write!(&mut kernel_body, "\tconst int block_x = blockIdx.x;\n")?; + } + Ok((valid_forks, fork_sizes)) + } + + fn codegen_control_node(&self, id: NodeID, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} + + fn codegen_data_node(&self, id: NodeID, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} + + fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} + + fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { + // 1. 
Get control nodes including fork_id that are dominated by fork_id + // and that dominate next_fork_id + let dom = dominator(self.control_subgraph, fork_id); + assert!(dom.does_dom(fork_id, next_fork_id)); + let mut control_nodes_between = vec![fork_id]; + for node_id in self.control_subgraph.iter() { + if dom.does_dom(fork_id, node_id) && dom.does_dom(node_id, next_fork_id) { + control_nodes_between.push(node_id); + } + } + // 2. Call regular data codegen for blocks corresponding to + // control nodes, with extra if surrounding index-dependent write + // (TODO: consider shared memory optimization) + for node_id in control_nodes_between.iter() { + self.codegen_data_node(node_id, &mut gotos[node_id].body)?; + } + // 3. call regular control codegen using goto structure + } + + fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> usize { + factors.iter() + .map(|&factor_id| { + evaluate_dynamic_constant(factor_id, &self.dynamic_constants) + .unwrap_or_else(|| panic!("Fork factors must be evaluatable to constants")) + }) + .product() + } + + // matmul detection- only called if einsum detected + fn matmul_detection(&self, w: &mut W) -> Result<(), Error> {} + + // convolution detection- only called if einsum detected + fn convolution_detection(&self, w: &mut W) -> Result<(), Error> {} + } \ No newline at end of file -- GitLab From ca75f386f84c30fc82ac994c523cc7aba41d58c0 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 27 Dec 2024 14:31:46 -0800 Subject: [PATCH 003/109] post goto & lint --- hercules_cg/src/gpu.rs | 417 ++++++++++++++++++++++++++--------------- hercules_cg/src/lib.rs | 2 + 2 files changed, 272 insertions(+), 147 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index e177f420..3e86db22 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -3,16 +3,15 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; -use std::iter::{zip, FromIterator}; -use std::sync::atomic::{AtomicUsize, Ordering}; -use self::bitvec::prelude::*; +// use std::iter::{zip, FromIterator}; +// use std::sync::atomic::{AtomicUsize, Ordering}; -use self::hercules_ir::*; +// use self::bitvec::prelude::*; -use crate::*; +use self::hercules_ir::*; -static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0); +// use crate::*; /* * The top level function to compile a Hercules IR function into NVVM IR kernel for @@ -26,19 +25,48 @@ pub fn gpu_codegen<W: Write>( reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, - data_nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>, bbs: &Vec<NodeID>, w: &mut W, ) -> Result<(), Error> { - // temporary hardcoded values + // Temporary hardcoded values let kernel_params = GPUKernelParams { max_num_blocks: 1024, max_num_threads: 1024, threads_per_warp: 32, - num_smps: 60, // ¯\_(ツ)_/¯ + num_smps: 60, }; let mut kernel_attrs = GPUKernelAttrs::default(); - let ctx = GPUContext { + + // Create fork forward adjacency and join map upfront as part of context + let make_fork_structures = || -> (HashMap::<NodeID, Vec<NodeID>>, HashMap::<NodeID, NodeID>) { + let mut fork_forward_adjacency: HashMap::<NodeID, Vec<NodeID>> = (0..function.nodes.len()) + .filter(|idx| function.nodes[*idx].is_fork()) + .map(|idx| (NodeID::new(idx), vec![])) + .collect(); + let mut fork_join_map = HashMap::new(); + let mut queued_nodes = VecDeque::new(); + + for (fork_node, children) in fork_forward_adjacency.iter_mut() { + 
queued_nodes.push_back(*fork_node); + while !queued_nodes.is_empty() { + let node = queued_nodes.pop_front().unwrap(); + for child in control_subgraph.succs(node) { + if function.nodes[child.idx()].is_fork() { + children.push(child); + } else if function.nodes[child.idx()].is_join() { + fork_join_map.insert(*fork_node, child); + } else { + queued_nodes.push_back(child); + } + } + } + } + (fork_forward_adjacency, fork_join_map) + }; + + let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); + + let mut ctx = GPUContext { function, types, constants, @@ -48,11 +76,12 @@ pub fn gpu_codegen<W: Write>( control_subgraph, bbs, cuda_structs: HashSet::new(), - w, kernel_params, - &mut kernel_attrs, + kernel_attrs: &mut kernel_attrs, + fork_forward_adjacency, + fork_join_map, }; - ctx.codegen_function() + ctx.codegen_function(w) } struct GPUKernelParams { @@ -68,7 +97,7 @@ struct GPUKernelAttrs { num_threads: usize, } -struct GPUContext<'a, W: Write> { +struct GPUContext<'a> { function: &'a Function, types: &'a Vec<Type>, constants: &'a Vec<Constant>, @@ -78,22 +107,25 @@ struct GPUContext<'a, W: Write> { control_subgraph: &'a Subgraph, bbs: &'a Vec<NodeID>, cuda_structs: HashSet<usize>, - w: &'a mut W, kernel_params: GPUKernelParams, kernel_attrs: &'a mut GPUKernelAttrs, + fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, + fork_join_map: HashMap<NodeID, NodeID>, } #[derive(Default, Debug)] struct CudaGoto { header: String, body: String, + term: String, + handled: bool, } -impl<'a, W: Write> GPUContext<'a, W> { - fn codegen_function(&self) -> Result<(), Error> { - // Static content and function signature +impl<'a> GPUContext<'a> { + fn codegen_function<W: Write>(&mut self, w: &mut W) -> Result<(), Error> { + // Include all possible imports write!( - self.w, + w, " #include <assert.h> #include <stdio.h> @@ -104,143 +136,137 @@ impl<'a, W: Write> GPUContext<'a, W> { ", )?; - let mut kernel_begin = String::new(); - write!(&mut kernel_begin, "template <")?; - // The dynamic constants become template parameters. - let mut first_template_param = true; - for idx in 0..self.function.num_dynamic_constants { - if first_param { - first_param = false; - } else { - write!(&mut kernel_begin, ", ")?; - } - write!(&mut kernel_begin, "long long int dc_p{}", idx)?; - } - write!(&mut kernel_begin, ">\n")?; + let mut top = String::new(); - write!(&mut kernel_begin, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_threads, self.function.name)?; - // The second set of parameters are normal arguments. - let mut first_param = true; - for (idx, ty) in self.function.param_types.iter().enumerate() { - if first_param { - first_param = false; - } else { - write!(&mut kernel_begin, ", ")?; - } - write!(&mut kernel_begin, "{} p{}", self.get_type(*ty)?, idx)?; - } - write!(&mut kernel_begin, ") {\n")?; + // Create all possible structs + self.codegen_structs(&mut top)?; + // Kernel template, signature, and arguments + self.codegen_kernel_begin(&mut top)?; - // Uses CUDA's goto structure; we will not gen for all control nodes, eg block and thread fork joins. + // Uses CUDA's goto structure; some control nodes' gen may be moved, eg + // block and thread fork joins. 
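Editor's note: each control node gets a CudaGoto record (label header, straight-line body, terminator). The patches in this series stop short of showing how those records are finally printed into the kernel; the sketch below is one plausible shape, purely for illustration, with plain usize keys standing in for NodeID:

use std::collections::BTreeMap;

// Simplified stand-in for the per-control-node record in the patch.
#[derive(Default)]
struct CudaGoto {
    header: String, // label for this control node
    body: String,   // statements for data nodes placed in its basic block
    term: String,   // terminator, e.g. a goto to the successor or a return
}

// Illustrative only: stitch the records into "label: { body terminator }" text.
fn emit(gotos: &BTreeMap<usize, CudaGoto>) -> String {
    let mut out = String::new();
    for g in gotos.values() {
        out.push_str(&format!("{}:\n{{\n{}{}\n}}\n", g.header, g.body, g.term));
    }
    out
}

fn main() {
    let mut gotos = BTreeMap::new();
    gotos.insert(
        0,
        CudaGoto {
            header: "start0".to_string(),
            body: "\tint x = p0 + 1;\n".to_string(),
            term: "\tgoto return1;".to_string(),
        },
    );
    gotos.insert(
        1,
        CudaGoto {
            header: "return1".to_string(),
            body: String::new(),
            term: "\treturn;".to_string(),
        },
    );
    print!("{}", emit(&gotos));
}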
let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) - .map(|idx| (NodeID::new(idx), CudaGoto::default())) + .map(|idx| { + let node_id = NodeID::new(idx); + let mut goto = CudaGoto::default(); + goto.header = format!("{}{}", self.function.nodes[idx].upper_case_name(), idx); + (node_id, goto) + }) .collect(); - // step 1. Assign outermost fork joins to block level - let (block_fork_ids, fork_sizes) = self.codegen_block_creation(&mut kernel_begin)?; - // step 2. Sink logic from outer block fork joins. If it's a write, add necessary block-id based qualifier. For now, it's done naively at the top of the kernel. + // Generate phi registers at top, later can consider smarter scoping + self.codegen_phi_registers(&mut top)?; + + // Assign outermost fork joins to block level + let (block_fork_ids, block_fork_sizes) = self.codegen_block_creation(&mut top)?; + // Sink logic from outer block fork joins. If it's a write, add + // necessary block-id based condition. For now, it's done naively at the + // top of the kernel. let mut block_stride = self.kernel_attrs.num_blocks; - gotos[NodeID::new(0)].header = "start_sink".to_string(); - self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut gotos)?; - for (i, fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { - block_stride = block_stride.saturating_div(fork_sizes[i]); - gotos[fork_id].header = format!("block_sink_{}", fork_id.idx()); - self.codegen_block_sink(fork_id, block_fork_ids[i + 1], block_stride, &mut gotos)?; + gotos.get_mut(&NodeID::new(0)).unwrap().header = "start_sink".to_string(); + self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut top, &mut gotos)?; + for (i, &fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { + block_stride = block_stride.saturating_div(block_fork_sizes[i]); + self.codegen_block_sink(fork_id, block_fork_ids[i + 1], block_stride, &mut top, &mut gotos)?; } - - // step 3. determine fork joins at warp/thread level. a) rather than strict nest, neighbors are allowed and we get sequence of edge (aka innermost at thread level) fork joins rather than single. b) if innermost is = threads_per_warp, we can use warp-level features. c) for now punt on implementation but can likely run einsum -> matmul/conv detector on hierarhical fork joins between block edge and given thread edge. + // Assign inner fork joins to thread level, with labels for warp + let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1], &mut top)?; + // Punting on implementation but can likely run einsum -> matmul/conv + // detector on hierarhical fork joins between block edge and given + // thread edge. 
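Editor's note: block_stride above starts at the total block count and is divided by each accepted fork's size in turn, presumably so that a fork's own index can be recovered from the flat blockIdx.x value. A standalone sketch of that decomposition, assuming the outermost fork varies slowest:

// Decompose a flat 1D block index into one index per nested fork. The stride
// updates mirror the successive saturating_div of block_stride above.
fn unflatten_block_index(block_x: usize, fork_sizes: &[usize]) -> Vec<usize> {
    let total: usize = fork_sizes.iter().product();
    let mut stride = total;
    let mut indices = Vec::with_capacity(fork_sizes.len());
    for &size in fork_sizes {
        stride /= size;
        indices.push((block_x / stride) % size);
    }
    indices
}

fn main() {
    // Two nested forks of sizes 4 and 8 -> 32 blocks total.
    assert_eq!(unflatten_block_index(0, &[4, 8]), vec![0, 0]);
    assert_eq!(unflatten_block_index(9, &[4, 8]), vec![1, 1]);
    assert_eq!(unflatten_block_index(31, &[4, 8]), vec![3, 7]);
}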
// finish kernel - write!(&mut kernel_begin, "}\n")?; + write!(w, "{}", top)?; + write!(w, "}}\n")?; + + Ok(()) } - fn get_type(&self, id: TypeID) -> Result<String, Error> { - match self.types[id.idx()] { - Type::Product(ref product_ty_ids) => { - if !self.cuda_structs.contains(&id.idx()) { - write!(self.w, "\nstruct Product_{} {{\n", id.idx())?; + fn codegen_structs(&self, w: &mut String) -> Result<(), Error> { + for (id, ty) in self.types.iter().enumerate() { + match ty { + Type::Product(ref product_ty_ids) => { + write!(w, "\nstruct Product_{} {{\n", id)?; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { - write!(self.w, "\t{} field_{};\n", self.get_type(*product_ty_id)?, i)?; + write!(w, "\t{} field_{};\n", self.get_type(*product_ty_id), i)?; } - write!(self.w, "}};\n")?; - self.cuda_structs.insert(id.idx()); + write!(w, "}};\n")?; } - Ok(format!("Product_{}", id.idx())) - } - Type::Summation(ref summation_ty_ids) => { - if !self.cuda_structs.contains(&id.idx()) { - write!(self.w, "\nstruct Summation_{} {{\n\t union {{\n", id.idx())?; + Type::Summation(ref summation_ty_ids) => { + write!(w, "\nstruct Summation_{} {{\n\t union {{\n", id)?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { - write!(self.w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id)?, i)?; + write!(w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id), i)?; } - write!(self.w, "\t}};\n\tuint8_t tag;\n}};\n")?; - self.cuda_structs.insert(id.idx()); + write!(w, "\t}};\n\tuint8_t tag;\n}};\n")?; } - Ok(format!("Summation_{}", id.idx())) + _ => {} } - _ => Ok(convert_type(&self.types[id.idx()])), - } + } + Ok(()) } - // TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 - fn convert_type(ty: &Type) -> String { - match ty { - Type::Boolean => "bool".to_string(), - Type::Integer8 => "int8_t".to_string(), - Type::UnsignedInteger8 => "uint8_t".to_string(), - Type::Integer16 => "short".to_string(), - Type::UnsignedInteger16 => "unsigned short".to_string(), - Type::Integer32 => "int".to_string(), - Type::UnsignedInteger32 => "unsigned int".to_string(), - Type::Integer64 => "long long".to_string(), - Type::UnsignedInteger64 => "unsigned long long".to_string(), - Type::Float32 => "float".to_string(), - Type::Float64 => "double".to_string(), - _ => panic!("Unsupported type"), + + fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { + write!(w, "template <")?; + // The dynamic constants become template parameters. + let mut first_template_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_template_param { + first_template_param = false; + } else { + write!(w, ", ")?; + } + write!(w, "long long int dc_p{}", idx)?; } + write!(w, ">\n")?; + + write!(w, "__global__ void __launch_bounds__({}) {}(", self.kernel_params.max_num_threads, self.function.name)?; + // The second set of parameters are normal arguments. + let mut first_param = true; + for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + } + write!(w, "{} p{}", self.get_type(*ty), idx)?; + } + write!(w, ") {{\n")?; + + Ok(()) } - // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. 
smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. - fn codegen_block_creation(&self, kernel_body: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { - // a) - let mut fork_forward_adjacency: HashMap::<NodeID, Vec<NodeID>> = (0..self.function.nodes.len()) - .filter(|idx| self.function.nodes[*idx].is_fork()) - .map(|idx| (NodeID::new(idx), vec![])) - .collect(); - let mut unhandled_fork_nodes = VecDeque::new(); - for (fork_node, children) in fork_forward_adjacency.iter_mut() { - unhandled_fork_nodes.push_back(*fork_node); - while !unhandled_fork_nodes.is_empty() { - let fork_node = unhandled_fork_nodes.pop_front().unwrap(); - let fork_node_children = self.function.nodes[fork_node.idx()].children(); - for child in fork_node_children { - if self.function.nodes[child.idx()].is_fork() { - children.push(child); - } else if !self.function.nodes[child.idx()].is_join() { - unhandled_fork_nodes.push_back(child); - } - } + fn codegen_phi_registers(&self, kernel_body: &mut String) -> Result<(), Error> { + for id in (0..self.function.nodes.len()).map(NodeID::new) { + if let Node::Phi { control: _, data } = &self.function.nodes[id.idx()] { + let ty = self.get_type(self.typing[data[0].idx()]); + write!(kernel_body, "\t{} {}{}_value;\n", ty, self.function.nodes[id.idx()].upper_case_name(), id.idx())?; } } - - let mut root_forks: HashSet<NodeID> = fork_forward_adjacency.keys().copied().collect(); - for (fork_node, children) in fork_forward_adjacency.iter() { + Ok(()) + } + + // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. + fn codegen_block_creation(&mut self, kernel_body: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { + // a) + let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); + for (_, children) in self.fork_forward_adjacency.iter() { for child in children { root_forks.remove(child); } } let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); if root_forks.len() != 1 { - return vec![]; + return Err(Error); } let mut strict_forks = vec![root_forks[0]]; let mut curr_fork = root_forks[0]; - while fork_join_map.get(&curr_fork).is_some() { - let children = &fork_forward_adjacency[&curr_fork]; + while self.fork_join_map.get(&curr_fork).is_some() { + let children = &self.fork_forward_adjacency[&curr_fork]; if children.len() != 1 { break; } @@ -249,9 +275,9 @@ impl<'a, W: Write> GPUContext<'a, W> { } // b, (stronger version of) c, and d - let mut valid_forks = 0; + let mut valid_block_forks = 0; let mut cumulative_blocks = 1usize; - let mut fork_sizes = Vec::new(); + let mut block_fork_sizes = Vec::new(); for fork in strict_forks.iter() { if !self.function.schedules[fork.idx()].contains(&Schedule::Vectorizable) { @@ -259,12 +285,12 @@ impl<'a, W: Write> GPUContext<'a, W> { } let factors = match &self.function.nodes[fork.idx()] { Node::Fork { factors, .. 
} => factors, - _ => return Err(Error::new("Expected Fork node in strict_forks")) + _ => return Err(Error) }; let fork_size = factors.iter() .try_fold(1usize, |acc, &factor_id| { evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .ok_or_else(|| Error::new("Fork factors must be evaluatable to constants")) + .ok_or_else(|| Error) .map(|val| acc.saturating_mul(val)) })?; let new_blocks = cumulative_blocks.saturating_mul(fork_size); @@ -272,46 +298,121 @@ impl<'a, W: Write> GPUContext<'a, W> { break; } cumulative_blocks = new_blocks; - fork_sizes.push(fork_size); - valid_forks += 1; + block_fork_sizes.push(fork_size); + valid_block_forks += 1; } self.kernel_attrs.num_blocks = cumulative_blocks; - let valid_forks = strict_forks.into_iter() - .take(valid_forks) + let valid_block_forks = strict_forks.into_iter() + .take(valid_block_forks) .collect::<Vec<_>>(); - // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected by this. For now, keep it simple. - if valid_forks.len() != 0 { - write!(&mut kernel_body, "\tconst int block_x = blockIdx.x;\n")?; - } - Ok((valid_forks, fork_sizes)) + // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. + write!(kernel_body, "\tconst int block_x = blockIdx.x;\n")?; + Ok((valid_block_forks, block_fork_sizes)) } - fn codegen_control_node(&self, id: NodeID, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} - - fn codegen_data_node(&self, id: NodeID, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} - - fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} - - fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { + fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, kernel_body: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { // 1. Get control nodes including fork_id that are dominated by fork_id - // and that dominate next_fork_id + // and not dominated by next_fork_id and not dominated by fork_id's join let dom = dominator(self.control_subgraph, fork_id); assert!(dom.does_dom(fork_id, next_fork_id)); - let mut control_nodes_between = vec![fork_id]; + let mut control_nodes_between = vec![]; for node_id in self.control_subgraph.iter() { - if dom.does_dom(fork_id, node_id) && dom.does_dom(node_id, next_fork_id) { - control_nodes_between.push(node_id); + if dom.does_dom(fork_id, *node_id) && !dom.does_dom(next_fork_id, *node_id) && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) { + control_nodes_between.push(*node_id); } } // 2. Call regular data codegen for blocks corresponding to // control nodes, with extra if surrounding index-dependent write // (TODO: consider shared memory optimization) - for node_id in control_nodes_between.iter() { - self.codegen_data_node(node_id, &mut gotos[node_id].body)?; + for &node_id in control_nodes_between.iter() { + self.codegen_data_node(node_id, &mut gotos.get_mut(&node_id).unwrap().body)?; } // 3. 
call regular control codegen using goto structure + + Ok(()) + } + + // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. + fn codegen_thread_creation(&mut self, inner_block_fork: NodeID, kernel_body: &mut String) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { + let fork_forward_adjacency = self.fork_forward_adjacency.as_ref().unwrap(); + + let mut thread_fork_parents = HashMap::new(); + let mut thread_fork_sizes = HashMap::new(); + let mut thread_fork_cumulative_sizes = HashMap::new(); + thread_fork_cumulative_sizes.insert(inner_block_fork, 1); + let mut thread_fork_edges = vec![]; + let mut max_thread_size = 1; + let mut stack = vec![inner_block_fork]; + let mut visited = HashSet::new(); + visited.insert(inner_block_fork); + while let Some(pop) = stack.pop() { + let children = &fork_forward_adjacency[&pop]; + + // Reverse child order due to use of stack for DFS + for &child in children.iter().rev() { + if !visited.contains(&child) { + visited.insert(child); + thread_fork_parents.insert(child, pop); + let fork_size = match &self.function.nodes[child.idx()] { + Node::Fork { factors, .. } => factors.iter() + .try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, &self.dynamic_constants) + .ok_or_else(|| Error) + .map(|val| acc.saturating_mul(val)) + })?, + _ => return Err(Error) + }; + thread_fork_sizes.insert(child, fork_size); + + let new_cumulative_size = (thread_fork_cumulative_sizes[&pop] as usize).saturating_mul(fork_size as usize); + if new_cumulative_size > self.kernel_params.max_num_threads { + // Expanding to child fork exceeds thread limit, so + // current fork is an edge fork + thread_fork_edges.push(pop); + max_thread_size = max_thread_size.max(thread_fork_cumulative_sizes[&pop]); + } else { + // Recurse into child fork + thread_fork_cumulative_sizes.insert(child, new_cumulative_size); + stack.push(child); + } + } else { + panic!("Fork child shouldn't have multiple fork parents"); + } + } + } + + // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. + self.kernel_attrs.num_threads = max_thread_size; + // If limit on number of threads in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. 
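Editor's note: codegen_thread_creation above is a depth-first walk that stops descending once the cumulative product of fork sizes would exceed the thread budget, recording "edge" forks and taking the largest cumulative size seen as the kernel's thread count. A self-contained rendition of that walk over a plain tree (fork sizes are already-evaluated constants here, and leaf forks are counted as edges):

use std::collections::HashMap;

// Walk a fork tree depth-first, stopping wherever the cumulative product of
// fork sizes would exceed the thread budget. Forks at which the walk stops
// (or bottoms out) are the "edge" forks.
fn thread_edges(
    root: usize,
    children: &HashMap<usize, Vec<usize>>,
    sizes: &HashMap<usize, usize>,
    max_threads: usize,
) -> (Vec<usize>, usize) {
    let mut edges = Vec::new();
    let mut num_threads = 1usize;
    // (fork, cumulative size down to and including this fork)
    let mut stack = vec![(root, 1usize)];
    while let Some((fork, cumulative)) = stack.pop() {
        let kids: &[usize] = children.get(&fork).map_or(&[], |v| v.as_slice());
        if kids.is_empty() {
            edges.push(fork);
            num_threads = num_threads.max(cumulative);
            continue;
        }
        for &child in kids.iter().rev() {
            let next = cumulative.saturating_mul(sizes[&child]);
            if next > max_threads {
                // Descending further would blow the budget: stop at this fork.
                edges.push(fork);
                num_threads = num_threads.max(cumulative);
            } else {
                stack.push((child, next));
            }
        }
    }
    (edges, num_threads)
}

fn main() {
    // Root fork 0 has child forks 1 (size 64) and 2 (size 2048); with a
    // 1024-thread budget, fork 2 is rejected (0 becomes an edge there) and
    // fork 1 is accepted as a leaf edge, giving 64 threads.
    let children: HashMap<usize, Vec<usize>> =
        HashMap::from([(0, vec![1, 2]), (1, vec![]), (2, vec![])]);
    let sizes: HashMap<usize, usize> = HashMap::from([(1, 64), (2, 2048)]);
    assert_eq!(thread_edges(0, &children, &sizes, 1024), (vec![0, 1], 64));
}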
+ write!(kernel_body, "\tconst int thread_x = threadIdx.x;\n")?; + + Ok((thread_fork_parents, thread_fork_sizes, thread_fork_edges)) + } + + fn codegen_control_node(&self, id: NodeID, kernel_body: &mut String) -> Result<(), Error> { + Ok(()) + } + + fn codegen_data_node(&self, id: NodeID, kernel_body: &mut String) -> Result<(), Error> { + Ok(()) + } + + fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, kernel_body: &mut String) -> Result<(), Error> { + Ok(()) + } + + fn get_type(&self, id: TypeID) -> String { + match self.types[id.idx()] { + Type::Product(ref product_ty_ids) => { + format!("Product_{} *", id.idx()) + } + Type::Summation(ref summation_ty_ids) => { + format!("Summation_{} *", id.idx()) + } + _ => convert_type(&self.types[id.idx()]), + } } fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> usize { @@ -324,9 +425,31 @@ impl<'a, W: Write> GPUContext<'a, W> { } // matmul detection- only called if einsum detected - fn matmul_detection(&self, w: &mut W) -> Result<(), Error> {} + fn matmul_detection(&self, w: &mut W) -> Result<(), Error> { + Ok(()) + } // convolution detection- only called if einsum detected - fn convolution_detection(&self, w: &mut W) -> Result<(), Error> {} + fn convolution_detection(&self, w: &mut W) -> Result<(), Error> { + Ok(()) + } + +} -} \ No newline at end of file +// TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 +fn convert_type(ty: &Type) -> String { + match ty { + Type::Boolean => "bool".to_string(), + Type::Integer8 => "int8_t".to_string(), + Type::UnsignedInteger8 => "uint8_t".to_string(), + Type::Integer16 => "short".to_string(), + Type::UnsignedInteger16 => "unsigned short".to_string(), + Type::Integer32 => "int".to_string(), + Type::UnsignedInteger32 => "unsigned int".to_string(), + Type::Integer64 => "long long".to_string(), + Type::UnsignedInteger64 => "unsigned long long".to_string(), + Type::Float32 => "float".to_string(), + Type::Float64 => "double".to_string(), + _ => panic!("Unsupported type"), + } +} diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs index c579b7e9..2c1d3fc0 100644 --- a/hercules_cg/src/lib.rs +++ b/hercules_cg/src/lib.rs @@ -1,10 +1,12 @@ #![feature(if_let_guard, let_chains)] pub mod cpu; +pub mod gpu; pub mod device; pub mod rt; pub use crate::cpu::*; +pub use crate::gpu::*; pub use crate::device::*; pub use crate::rt::*; -- GitLab From ddc5231e82215bc073a2d2b4173547bd615ee9cc Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 27 Dec 2024 17:25:29 -0800 Subject: [PATCH 004/109] consts --- hercules_cg/src/gpu.rs | 232 ++++++++++++++++++++++++++++++----------- 1 file changed, 171 insertions(+), 61 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 3e86db22..7c56fb41 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -4,7 +4,7 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; -// use std::iter::{zip, FromIterator}; +use std::iter::FromIterator; // zip // use std::sync::atomic::{AtomicUsize, Ordering}; // use self::bitvec::prelude::*; @@ -63,9 +63,21 @@ pub fn gpu_codegen<W: Write>( } (fork_forward_adjacency, fork_join_map) }; - let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); + let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { + let mut label_data_for_phi = HashMap::new(); + for (idx, node) in function.nodes.iter().enumerate() { + if let 
Node::Phi { control: _, data } = node { + for &data_id in data.iter() { + label_data_for_phi.entry(data_id).or_insert(vec![]).push(NodeID::new(idx)); + } + } + } + label_data_for_phi + }; + let label_data_for_phi = label_data_for_phi(); + let mut ctx = GPUContext { function, types, @@ -80,6 +92,7 @@ pub fn gpu_codegen<W: Write>( kernel_attrs: &mut kernel_attrs, fork_forward_adjacency, fork_join_map, + label_data_for_phi, }; ctx.codegen_function(w) } @@ -111,6 +124,7 @@ struct GPUContext<'a> { kernel_attrs: &'a mut GPUKernelAttrs, fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, fork_join_map: HashMap<NodeID, NodeID>, + label_data_for_phi: HashMap<NodeID, Vec<NodeID>>, } #[derive(Default, Debug)] @@ -150,7 +164,7 @@ impl<'a> GPUContext<'a> { .map(|idx| { let node_id = NodeID::new(idx); let mut goto = CudaGoto::default(); - goto.header = format!("{}{}", self.function.nodes[idx].upper_case_name(), idx); + goto.header = self.get_value(node_id, false); (node_id, goto) }) .collect(); @@ -161,10 +175,8 @@ impl<'a> GPUContext<'a> { // Assign outermost fork joins to block level let (block_fork_ids, block_fork_sizes) = self.codegen_block_creation(&mut top)?; // Sink logic from outer block fork joins. If it's a write, add - // necessary block-id based condition. For now, it's done naively at the - // top of the kernel. + // necessary block-id based condition. let mut block_stride = self.kernel_attrs.num_blocks; - gotos.get_mut(&NodeID::new(0)).unwrap().header = "start_sink".to_string(); self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut top, &mut gotos)?; for (i, &fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { block_stride = block_stride.saturating_div(block_fork_sizes[i]); @@ -239,18 +251,17 @@ impl<'a> GPUContext<'a> { } - fn codegen_phi_registers(&self, kernel_body: &mut String) -> Result<(), Error> { + fn codegen_phi_registers(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if let Node::Phi { control: _, data } = &self.function.nodes[id.idx()] { - let ty = self.get_type(self.typing[data[0].idx()]); - write!(kernel_body, "\t{} {}{}_value;\n", ty, self.function.nodes[id.idx()].upper_case_name(), id.idx())?; + if let Node::Phi { control: _, data: _ } = &self.function.nodes[id.idx()] { + write!(w, "\t{};\n", self.get_value(id, true))?; } } Ok(()) } // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. 
- fn codegen_block_creation(&mut self, kernel_body: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { + fn codegen_block_creation(&mut self, w: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { // a) let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); for (_, children) in self.fork_forward_adjacency.iter() { @@ -260,7 +271,7 @@ impl<'a> GPUContext<'a> { } let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); if root_forks.len() != 1 { - return Err(Error); + panic!("Exactly one root fork is required for outermost GPU block fork"); } let mut strict_forks = vec![root_forks[0]]; @@ -285,14 +296,9 @@ impl<'a> GPUContext<'a> { } let factors = match &self.function.nodes[fork.idx()] { Node::Fork { factors, .. } => factors, - _ => return Err(Error) + _ => panic!("Expected fork node") }; - let fork_size = factors.iter() - .try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .ok_or_else(|| Error) - .map(|val| acc.saturating_mul(val)) - })?; + let fork_size = self.multiply_fork_factors(factors)?; let new_blocks = cumulative_blocks.saturating_mul(fork_size); if new_blocks > self.kernel_params.max_num_blocks { break; @@ -308,36 +314,61 @@ impl<'a> GPUContext<'a> { .collect::<Vec<_>>(); // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. - write!(kernel_body, "\tconst int block_x = blockIdx.x;\n")?; + write!(w, "\tconst int block_x = blockIdx.x;\n")?; Ok((valid_block_forks, block_fork_sizes)) } - fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, kernel_body: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { + fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, w: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { // 1. Get control nodes including fork_id that are dominated by fork_id // and not dominated by next_fork_id and not dominated by fork_id's join let dom = dominator(self.control_subgraph, fork_id); assert!(dom.does_dom(fork_id, next_fork_id)); - let mut control_nodes_between = vec![]; + let mut control_nodes_between = HashSet::new(); for node_id in self.control_subgraph.iter() { if dom.does_dom(fork_id, *node_id) && !dom.does_dom(next_fork_id, *node_id) && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) { - control_nodes_between.push(*node_id); + control_nodes_between.insert(*node_id); + } + } + + // 2. Emit data flow for nodes assigned to those basic blocks. Phi + // registers were already emitted at top. + // TEMPORARY: ignoring the special write case for now + let mut worklist = VecDeque::from_iter( + self.reverse_postorder + .into_iter() + .filter(|id| !self.function.nodes[id.idx()].is_control() + && control_nodes_between.contains(&self.bbs[id.idx()]) + && !self.function.nodes[id.idx()].is_phi() + ), + ); + let mut visited = HashSet::new(); + while let Some(id) = worklist.pop_front() { + let node = &self.function.nodes[id.idx()]; + if node.is_reduce() { + panic!("Reduce nodes should not be in block sink"); + } + if get_uses(node) + .as_ref() + .into_iter() + .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) + { + self.codegen_data_node(*id, w)?; + visited.insert(id); + } else { + worklist.push_back(id); } } - // 2. 
Call regular data codegen for blocks corresponding to - // control nodes, with extra if surrounding index-dependent write - // (TODO: consider shared memory optimization) - for &node_id in control_nodes_between.iter() { - self.codegen_data_node(node_id, &mut gotos.get_mut(&node_id).unwrap().body)?; + + // 3. Emit control flow + for control_node in control_nodes_between { + self.codegen_control_node(control_node, w)?; } - // 3. call regular control codegen using goto structure Ok(()) } // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. - fn codegen_thread_creation(&mut self, inner_block_fork: NodeID, kernel_body: &mut String) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { - let fork_forward_adjacency = self.fork_forward_adjacency.as_ref().unwrap(); - + fn codegen_thread_creation(&mut self, inner_block_fork: NodeID, w: &mut String) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { let mut thread_fork_parents = HashMap::new(); let mut thread_fork_sizes = HashMap::new(); let mut thread_fork_cumulative_sizes = HashMap::new(); @@ -348,7 +379,7 @@ impl<'a> GPUContext<'a> { let mut visited = HashSet::new(); visited.insert(inner_block_fork); while let Some(pop) = stack.pop() { - let children = &fork_forward_adjacency[&pop]; + let children = &self.fork_forward_adjacency[&pop]; // Reverse child order due to use of stack for DFS for &child in children.iter().rev() { @@ -356,13 +387,8 @@ impl<'a> GPUContext<'a> { visited.insert(child); thread_fork_parents.insert(child, pop); let fork_size = match &self.function.nodes[child.idx()] { - Node::Fork { factors, .. } => factors.iter() - .try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .ok_or_else(|| Error) - .map(|val| acc.saturating_mul(val)) - })?, - _ => return Err(Error) + Node::Fork { factors, .. } => self.multiply_fork_factors(factors)?, + _ => panic!("Expected fork node") }; thread_fork_sizes.insert(child, fork_size); @@ -386,52 +412,136 @@ impl<'a> GPUContext<'a> { // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. self.kernel_attrs.num_threads = max_thread_size; // If limit on number of threads in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. 
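Editor's note: the worklist in the new codegen_block_sink above is an emit-when-ready scheduler: a popped data node is emitted only when each node it uses is either a control node or already emitted, and is otherwise pushed back. A stripped-down sketch of that pattern over a plain dependence map (acyclic, as the filtered node set is once phis are excluded):

use std::collections::{HashSet, VecDeque};

// Emit nodes so that every node appears after all of its data inputs.
// `deps[i]` lists the data inputs of node i; control inputs are omitted,
// mirroring the "control uses are always ready" rule in the patch.
fn schedule(deps: &[Vec<usize>]) -> Vec<usize> {
    let mut order = Vec::new();
    let mut emitted = HashSet::new();
    let mut worklist: VecDeque<usize> = (0..deps.len()).collect();
    while let Some(id) = worklist.pop_front() {
        if deps[id].iter().all(|d| emitted.contains(d)) {
            emitted.insert(id);
            order.push(id);
        } else {
            worklist.push_back(id);
        }
    }
    order
}

fn main() {
    // Node 2 depends on 0 and 1; node 1 depends on 0.
    let deps = vec![vec![], vec![0], vec![0, 1]];
    assert_eq!(schedule(&deps), vec![0, 1, 2]);
}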
- write!(kernel_body, "\tconst int thread_x = threadIdx.x;\n")?; + write!(w, "\tconst int thread_x = threadIdx.x;\n")?; Ok((thread_fork_parents, thread_fork_sizes, thread_fork_edges)) } - fn codegen_control_node(&self, id: NodeID, kernel_body: &mut String) -> Result<(), Error> { + fn codegen_control_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { + Ok(()) + } + + fn codegen_data_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { + match &self.function.nodes[id.idx()] { + Node::Phi { control: _, data: _ } => {} + Node::Parameter { index } => { + write!(w, "\t{} = p{};\n", self.get_value(id, true), index)?; + } + Node::Constant { id: cons_id } => { + write_constant() + + } + Node::DynamicConstant { id: _ } => {} + Node::Unary { op: _, input: _ } => {} + Node::Binary { op: _, left: _, right: _ } => {} + Node::Ternary { op: _, first: _, second: _, third: _ } => {} + Node::IntrinsicCall { intrinsic: _, args: _ } => {} + Node::Read { collect: _, indices: _ } => {} + Node::Write { collect: _, data: _, indices: _ } => {} + Node::Projection { control: _, selection: _ } => {} + Node::Undef { ty: _ } => {} + _ => {} + } Ok(()) } - fn codegen_data_node(&self, id: NodeID, kernel_body: &mut String) -> Result<(), Error> { + fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, w: &mut String) -> Result<(), Error> { Ok(()) } - fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, kernel_body: &mut String) -> Result<(), Error> { + // matmul detection- only called if einsum detected + fn matmul_detection(&self) -> Result<(), Error> { Ok(()) } - fn get_type(&self, id: TypeID) -> String { - match self.types[id.idx()] { - Type::Product(ref product_ty_ids) => { - format!("Product_{} *", id.idx()) + // convolution detection- only called if einsum detected + fn convolution_detection(&self) -> Result<(), Error> { + Ok(()) + } + + fn write_constant(&self, name: String, type_name: String, cons_id: ConstantID, w: &mut String) -> Result<(), Error> { + write!(w, "\t{} {}", type_name, name)?; + match self.constants[cons_id.idx()] { + Constant::Boolean(val) => write!(w, " = {}\n", val)?, + Constant::Integer8(val) => write!(w, " = {}\n", val)?, + Constant::UnsignedInteger8(val) => write!(w, " = {}\n", val)?, + Constant::Integer16(val) => write!(w, " = {}\n", val)?, + Constant::UnsignedInteger16(val) => write!(w, " = {}\n", val)?, + Constant::Integer32(val) => write!(w, " = {}\n", val)?, + Constant::UnsignedInteger32(val) => write!(w, " = {}ul\n", val)?, + Constant::Integer64(val) => write!(w, " = {}ll\n", val)?, + Constant::UnsignedInteger64(val) => write!(w, " = {}ull\n", val)?, + Constant::Float32(val) => write!(w, " = {}f\n", val)?, + Constant::Float64(val) => write!(w, " = {}\n", val)?, + Constant::Product(_, fields) => { + write!(w, ";\n")?; + for (i, field) in fields.iter().enumerate() { + self.write_constant(format!("{}_field_{}", name, i), self.constant_to_type_name(*field), *field, w)?; + write!(w, "\t{}.field_{} = {}_field_{};\n", name, i, name, i)?; + } } - Type::Summation(ref summation_ty_ids) => { - format!("Summation_{} *", id.idx()) + Constant::Summation(_, variant, field) => { + write!(w, ";\n")?; + self.write_constant(format!("{}_field_{}", name, variant), self.constant_to_type_name(field), field, w)?; + write!(w, "\t{}.tag = {};\n\t{}.field_{} = {}_field_{};\n", name, variant, name, variant, name, variant)?; } - _ => convert_type(&self.types[id.idx()]), + Constant::Array(_) => { + write!(w, ";\n")?; + for (i, element) in 
elements.iter().enumerate() { + self.write_constant(format!("{}_element_{}", name, i), self.constant_to_type_name(*element), *element, w)?; + write!(w, "\t{}[{}] = {}_element_{};\n", name, i, name, i)?; + } + } + } + Ok(()) + } + + fn constant_to_type_name(&self, cons_id: ConstantID) -> String { + match self.constants[cons_id.idx()] { + Constant::Boolean(_) => "bool".to_string(), + Constant::Integer8(_) => "int8_t".to_string(), + Constant::UnsignedInteger8(_) => "uint8_t".to_string(), + Constant::Integer16(_) => "short".to_string(), + Constant::UnsignedInteger16(_) => "unsigned short".to_string(), + Constant::Integer32(_) => "int".to_string(), + Constant::UnsignedInteger32(_) => "unsigned int".to_string(), + Constant::Integer64(_) => "long long".to_string(), + Constant::UnsignedInteger64(_) => "unsigned long long".to_string(), + Constant::Float32(_) => "float".to_string(), + Constant::Float64(_) => "double".to_string(), + Constant::Product(type_id, _) => self.get_type(type_id), + Constant::Summation(type_id, _, _) => self.get_type(type_id), + Constant::Array(type_id) => self.get_type(type_id), } } - fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> usize { + fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Result<usize, Error> { factors.iter() - .map(|&factor_id| { + .try_fold(1usize, |acc, &factor_id| { evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .unwrap_or_else(|| panic!("Fork factors must be evaluatable to constants")) + .ok_or_else(|| Error) + .map(|val| acc.saturating_mul(val)) }) - .product() } - // matmul detection- only called if einsum detected - fn matmul_detection(&self, w: &mut W) -> Result<(), Error> { - Ok(()) + fn get_value(&self, id: NodeID, ty: bool) -> String { + if ty { + format!("{} {}{}", self.get_type(self.typing[id.idx()]), self.function.nodes[id.idx()].lower_case_name(), id.idx()) + } else { + format!("{}{}", self.function.nodes[id.idx()].lower_case_name(), id.idx()) + } } - // convolution detection- only called if einsum detected - fn convolution_detection(&self, w: &mut W) -> Result<(), Error> { - Ok(()) + fn get_type(&self, id: TypeID) -> String { + match self.types[id.idx()] { + Type::Product(_) => { + format!("Product_{}", id.idx()) + } + Type::Summation(_) => { + format!("Summation_{}", id.idx()) + } + _ => convert_type(&self.types[id.idx()]), + } } } -- GitLab From 10d54b200f4eb8df76f9a9c822842962d9a1435e Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 30 Dec 2024 02:41:57 -0800 Subject: [PATCH 005/109] backward and forward --- hercules_cg/src/gpu.rs | 977 ++++++++++++++++++++++++++++++++--------- hercules_ir/src/ir.rs | 233 +++++++++- 2 files changed, 1003 insertions(+), 207 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 7c56fb41..aa5908a2 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -3,15 +3,35 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; - -use std::iter::FromIterator; // zip -// use std::sync::atomic::{AtomicUsize, Ordering}; - -// use self::bitvec::prelude::*; +use std::hash::{Hash, Hasher}; +use std::iter::FromIterator; use self::hercules_ir::*; -// use crate::*; +#[derive(Debug, Clone)] +struct HashableIndex<'a>(Vec<&'a str>); +impl<'a> FromIterator<&'a String> for HashableIndex<'a> { + fn from_iter<I: IntoIterator<Item = &'a String>>(iter: I) -> Self { + HashableIndex(iter.into_iter().map(|s| s.as_str()).collect()) + } +} +impl<'a> 
PartialEq for HashableIndex<'a> { + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 + } +} +impl<'a> Eq for HashableIndex<'a> {} +impl<'a> Hash for HashableIndex<'a> { + fn hash<H: Hasher>(&self, state: &mut H) { + self.0.hash(state); + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +enum MemoryType { + Shared, + Register, +} /* * The top level function to compile a Hercules IR function into NVVM IR kernel for @@ -33,13 +53,13 @@ pub fn gpu_codegen<W: Write>( max_num_blocks: 1024, max_num_threads: 1024, threads_per_warp: 32, - num_smps: 60, + num_smps: 60, }; - let mut kernel_attrs = GPUKernelAttrs::default(); + let kernel_attrs = GPUKernelAttrs::default(); // Create fork forward adjacency and join map upfront as part of context - let make_fork_structures = || -> (HashMap::<NodeID, Vec<NodeID>>, HashMap::<NodeID, NodeID>) { - let mut fork_forward_adjacency: HashMap::<NodeID, Vec<NodeID>> = (0..function.nodes.len()) + let make_fork_structures = || -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, NodeID>) { + let mut fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>> = (0..function.nodes.len()) .filter(|idx| function.nodes[*idx].is_fork()) .map(|idx| (NodeID::new(idx), vec![])) .collect(); @@ -65,12 +85,40 @@ pub fn gpu_codegen<W: Write>( }; let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); + let map_join_reduce = || -> HashMap<NodeID, Vec<NodeID>> { + let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) + .filter(|idx| function.nodes[*idx].is_reduce()) + .map(NodeID::new) + .collect(); + let mut map_join_reduce = HashMap::new(); + for (_, join) in fork_join_map.iter() { + let reduce_nodes_for_join = reduce_nodes + .iter() + .filter(|reduce_node| match &function.nodes[reduce_node.idx()] { + Node::Reduce { + control, + init: _, + reduct: _, + } => control.idx() == join.idx(), + _ => false, + }) + .copied() + .collect(); + map_join_reduce.insert(*join, reduce_nodes_for_join); + } + map_join_reduce + }; + let join_reduce_map = map_join_reduce(); + let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { let mut label_data_for_phi = HashMap::new(); for (idx, node) in function.nodes.iter().enumerate() { if let Node::Phi { control: _, data } = node { for &data_id in data.iter() { - label_data_for_phi.entry(data_id).or_insert(vec![]).push(NodeID::new(idx)); + label_data_for_phi + .entry(data_id) + .or_insert(vec![]) + .push(NodeID::new(idx)); } } } @@ -78,6 +126,27 @@ pub fn gpu_codegen<W: Write>( }; let label_data_for_phi = label_data_for_phi(); + // Since global memory traffic is expensive, we use shared memory and + // registers basically as write-back caches, but we write back due to + // end of scope rather than due to synchronization (which is solved by + // shmem). + // param_cache tracks cache for each parameter by accessed index and + // memory type. Note that indexing is hierarchical, so [a, b] contains + // [a, b, c] and will give a hit upon query of the latter. param_cache is + // only added to for copies from global -> shared or global -> register. + // Writes update the cache, but we track specific indices written in + // param_cache_writes to know what to write back (and avoid redundant + // writes). + let param_cache = vec![ + HashMap::<(HashableIndex<'static>, MemoryType), String>::new(); + function.param_types.len() + ]; + let param_cache_writes = + vec![HashSet::<(HashableIndex<'static>, MemoryType)>::new(); function.param_types.len()]; + // Statically unknown shared memory buffers need to use dynamic offsets from. 
+ // the dynamic shared memory buffer + let mut dynamic_shared_offset = "0".to_string(); + let mut ctx = GPUContext { function, types, @@ -87,16 +156,19 @@ pub fn gpu_codegen<W: Write>( typing, control_subgraph, bbs, - cuda_structs: HashSet::new(), kernel_params, - kernel_attrs: &mut kernel_attrs, + kernel_attrs, fork_forward_adjacency, fork_join_map, label_data_for_phi, + join_reduce_map, + param_cache, + param_cache_writes, }; ctx.codegen_function(w) } +// Fixed prior to codegen struct GPUKernelParams { max_num_blocks: usize, max_num_threads: usize, @@ -104,10 +176,12 @@ struct GPUKernelParams { num_smps: usize, } +// Set during codegen #[derive(Default)] struct GPUKernelAttrs { num_blocks: usize, num_threads: usize, + extern_shmem_offset: String, } struct GPUContext<'a> { @@ -119,12 +193,14 @@ struct GPUContext<'a> { typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, bbs: &'a Vec<NodeID>, - cuda_structs: HashSet<usize>, kernel_params: GPUKernelParams, - kernel_attrs: &'a mut GPUKernelAttrs, + kernel_attrs: GPUKernelAttrs, fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, fork_join_map: HashMap<NodeID, NodeID>, label_data_for_phi: HashMap<NodeID, Vec<NodeID>>, + join_reduce_map: HashMap<NodeID, Vec<NodeID>>, + param_cache: Vec<HashMap<(HashableIndex, MemoryType), String>>, + param_cache_writes: Vec<HashSet<(HashableIndex, MemoryType)>>, } #[derive(Default, Debug)] @@ -135,11 +211,11 @@ struct CudaGoto { handled: bool, } -impl<'a> GPUContext<'a> { +impl GPUContext<'_> { fn codegen_function<W: Write>(&mut self, w: &mut W) -> Result<(), Error> { - // Include all possible imports + // Include all possible includes then macros write!( - w, + w, " #include <assert.h> #include <stdio.h> @@ -147,46 +223,63 @@ impl<'a> GPUContext<'a> { #include <cuda_runtime.h> #include <mma.h> #include <helper_cuda.h> + +#define uabs(a) (a) +#define umin(a, b) ((a) < (b) ? (a) : (b)) +#define umax(a, b) ((a) > (b) ? (a) : (b)) +#define powi(a, b) ({{ int res = 1; for(int i = 0; i < b; ++i) res *= a; res; }}) +#define roundi(a) (a) +#define isqrt(a) ((int)sqrtf((float)(a))) + ", )?; let mut top = String::new(); - // Create all possible structs + // Emit all possible structs self.codegen_structs(&mut top)?; - // Kernel template, signature, and arguments + // Emit kernel template, signature, and arguments self.codegen_kernel_begin(&mut top)?; + // Need to emit dynamic offsets for extern shmem, we do this by strings. + self.kernel_attrs.extern_shmem_offset = "0".to_string(); + + // Emit calculation of all dynamic constants + self.codegen_dynamic_constants(&mut top)?; - // Uses CUDA's goto structure; some control nodes' gen may be moved, eg + // Uses CUDA's goto structure; some control nodes' gen may be moved, eg // block and thread fork joins. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) .map(|idx| { let node_id = NodeID::new(idx); - let mut goto = CudaGoto::default(); - goto.header = self.get_value(node_id, false); + let goto = CudaGoto { + header: self.get_value(node_id, false, false), + ..Default::default() + }; (node_id, goto) }) .collect(); - // Generate phi registers at top, later can consider smarter scoping - self.codegen_phi_registers(&mut top)?; - - // Assign outermost fork joins to block level - let (block_fork_ids, block_fork_sizes) = self.codegen_block_creation(&mut top)?; + // Assign outermost fork joins to block level. 
TODO: remove block_sizes + // if still not needed later + let (block_fork_ids, _) = self.codegen_block_creation()?; + // Assign inner fork joins to thread level. We do this before block sink + // because we need thread size for shared memory optimizations + let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = + self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1])?; // Sink logic from outer block fork joins. If it's a write, add // necessary block-id based condition. - let mut block_stride = self.kernel_attrs.num_blocks; - self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut top, &mut gotos)?; - for (i, &fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { - block_stride = block_stride.saturating_div(block_fork_sizes[i]); - self.codegen_block_sink(fork_id, block_fork_ids[i + 1], block_stride, &mut top, &mut gotos)?; + self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], &mut top, &mut gotos)?; + for (i, &fork_id) in block_fork_ids + .iter() + .enumerate() + .take(block_fork_ids.len() - 1) + { + self.codegen_block_sink(fork_id, block_fork_ids[i + 1], &mut top, &mut gotos)?; } - // Assign inner fork joins to thread level, with labels for warp - let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1], &mut top)?; - // Punting on implementation but can likely run einsum -> matmul/conv - // detector on hierarhical fork joins between block edge and given + // Punting on implementation but can likely run einsum -> matmul/conv + // detector on hierarhical fork joins between block edge and given // thread edge. // finish kernel @@ -202,67 +295,137 @@ impl<'a> GPUContext<'a> { Type::Product(ref product_ty_ids) => { write!(w, "\nstruct Product_{} {{\n", id)?; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { - write!(w, "\t{} field_{};\n", self.get_type(*product_ty_id), i)?; + write!( + w, + "\t{} field_{};\n", + self.get_type(*product_ty_id, false), + i + )?; } write!(w, "}};\n")?; } Type::Summation(ref summation_ty_ids) => { write!(w, "\nstruct Summation_{} {{\n\t union {{\n", id)?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { - write!(w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id), i)?; + write!( + w, + "\t\t{} field_{};\n", + self.get_type(*summation_ty_id, false), + i + )?; } write!(w, "\t}};\n\tuint8_t tag;\n}};\n")?; } _ => {} } - } + } Ok(()) } - fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { - write!(w, "template <")?; - // The dynamic constants become template parameters. - let mut first_template_param = true; + write!( + w, + "__global__ void __launch_bounds__({}) {}(", + self.kernel_params.max_num_threads, self.function.name + )?; + // The first set of parameters are dynamic constants. + let mut first_param = true; for idx in 0..self.function.num_dynamic_constants { - if first_template_param { - first_template_param = false; + if first_param { + first_param = false; } else { write!(w, ", ")?; } - write!(w, "long long int dc_p{}", idx)?; + write!(w, "unsigned long long dc_p{}", idx)?; } - write!(w, ">\n")?; - - write!(w, "__global__ void __launch_bounds__({}) {}(", self.kernel_params.max_num_threads, self.function.name)?; // The second set of parameters are normal arguments. 
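+ // Illustrative sketch (hypothetical function, not from the test suite): a function
+ // named foo with one dynamic constant and parameters (i32, array of f32) would get a
+ // signature roughly like
+ //   __global__ void __launch_bounds__(1024) foo(unsigned long long dc_p0, int p0, float* p1)
+ // where 1024 is kernel_params.max_num_threads and the array is passed as a raw
+ // element pointer (get_type with make_pointer = true).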
- let mut first_param = true; for (idx, ty) in self.function.param_types.iter().enumerate() { if first_param { first_param = false; } else { write!(w, ", ")?; } - write!(w, "{} p{}", self.get_type(*ty), idx)?; + write!(w, "{} p{}", self.get_type(*ty, true), idx)?; } - write!(w, ") {{\n")?; + // We convert originally non-void functions to void functions by adding a + // return parameter. For now we ignore the case where return was derived + // from a parameter through reads and writes, and instead always memcpy. + let return_index = self.function.nodes.iter().position(|node| node.is_return()); + if let Some(return_index) = return_index { + if let Node::Return { + control: _, + data: return_data, + } = &self.function.nodes[return_index] + { + write!( + w, + ", {} return_val", + self.get_type(self.typing[return_data.idx()], true) + )?; + } else { + panic!("Expected return node"); + } + } + + // Type is char since it's simplest to use single bytes for indexing, + // casting will be needed for use with different types. + write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n")?; Ok(()) } + fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> { + for dc in dynamic_constants_bottom_up(self.dynamic_constants) { + let dc_val = format!("unsigned long long dc{}", dc.idx()); + match self.dynamic_constants[dc.idx()] { + DynamicConstant::Constant(val) => write!(w, "\t{} = {}ull;\n", dc_val, val)?, + DynamicConstant::Parameter(idx) => { + if idx < self.function.num_dynamic_constants as usize { + write!(w, "\t{} = dc_p{};\n", dc_val, idx)? + } else { + write!(w, "\t{} = 0;\n", dc_val)? + } + } + DynamicConstant::Add(left, right) => { + write!(w, "\t{} = dc{} + dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Sub(left, right) => { + write!(w, "\t{} = dc{} - dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Mul(left, right) => { + write!(w, "\t{} = dc{} * dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Div(left, right) => { + write!(w, "\t{} = dc{} / dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Rem(left, right) => { + write!(w, "\t{} = dc{} % dc{};\n", dc_val, left.idx(), right.idx())? + } + } + } + Ok(()) + } - fn codegen_phi_registers(&self, w: &mut String) -> Result<(), Error> { + fn codegen_phi_registers<F>(&self, w: &mut String, should_process: F) -> Result<(), Error> + where + F: Fn(NodeID) -> bool, + { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if let Node::Phi { control: _, data: _ } = &self.function.nodes[id.idx()] { - write!(w, "\t{};\n", self.get_value(id, true))?; + if let Node::Phi { + control: _, + data: _, + } = &self.function.nodes[id.idx()] + { + if should_process(id) { + write!(w, "\t{};\n", self.get_value(id, true, true))?; + } } } Ok(()) } - // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. 
- fn codegen_block_creation(&mut self, w: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { - // a) + // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the join has no user reduce nodes, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. + fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> { let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); for (_, children) in self.fork_forward_adjacency.iter() { for child in children { @@ -273,19 +436,20 @@ impl<'a> GPUContext<'a> { if root_forks.len() != 1 { panic!("Exactly one root fork is required for outermost GPU block fork"); } - + + // a and b let mut strict_forks = vec![root_forks[0]]; let mut curr_fork = root_forks[0]; - while self.fork_join_map.get(&curr_fork).is_some() { + while let Some(join) = self.fork_join_map.get(&curr_fork) { let children = &self.fork_forward_adjacency[&curr_fork]; - if children.len() != 1 { + if children.len() != 1 || !self.join_reduce_map.contains_key(join) { break; } curr_fork = children[0]; strict_forks.push(curr_fork); } - // b, (stronger version of) c, and d + // c, (stronger version of) d, and e let mut valid_block_forks = 0; let mut cumulative_blocks = 1usize; let mut block_fork_sizes = Vec::new(); @@ -296,7 +460,7 @@ impl<'a> GPUContext<'a> { } let factors = match &self.function.nodes[fork.idx()] { Node::Fork { factors, .. } => factors, - _ => panic!("Expected fork node") + _ => panic!("Expected fork node"), }; let fork_size = self.multiply_fork_factors(factors)?; let new_blocks = cumulative_blocks.saturating_mul(fork_size); @@ -308,67 +472,21 @@ impl<'a> GPUContext<'a> { valid_block_forks += 1; } + // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. self.kernel_attrs.num_blocks = cumulative_blocks; - let valid_block_forks = strict_forks.into_iter() + let valid_block_forks = strict_forks + .into_iter() .take(valid_block_forks) .collect::<Vec<_>>(); - // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. - write!(w, "\tconst int block_x = blockIdx.x;\n")?; Ok((valid_block_forks, block_fork_sizes)) } - fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, w: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { - // 1. Get control nodes including fork_id that are dominated by fork_id - // and not dominated by next_fork_id and not dominated by fork_id's join - let dom = dominator(self.control_subgraph, fork_id); - assert!(dom.does_dom(fork_id, next_fork_id)); - let mut control_nodes_between = HashSet::new(); - for node_id in self.control_subgraph.iter() { - if dom.does_dom(fork_id, *node_id) && !dom.does_dom(next_fork_id, *node_id) && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) { - control_nodes_between.insert(*node_id); - } - } - - // 2. Emit data flow for nodes assigned to those basic blocks. Phi - // registers were already emitted at top. 
- // TEMPORARY: ignoring the special write case for now - let mut worklist = VecDeque::from_iter( - self.reverse_postorder - .into_iter() - .filter(|id| !self.function.nodes[id.idx()].is_control() - && control_nodes_between.contains(&self.bbs[id.idx()]) - && !self.function.nodes[id.idx()].is_phi() - ), - ); - let mut visited = HashSet::new(); - while let Some(id) = worklist.pop_front() { - let node = &self.function.nodes[id.idx()]; - if node.is_reduce() { - panic!("Reduce nodes should not be in block sink"); - } - if get_uses(node) - .as_ref() - .into_iter() - .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) - { - self.codegen_data_node(*id, w)?; - visited.insert(id); - } else { - worklist.push_back(id); - } - } - - // 3. Emit control flow - for control_node in control_nodes_between { - self.codegen_control_node(control_node, w)?; - } - - Ok(()) - } - - // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. - fn codegen_thread_creation(&mut self, inner_block_fork: NodeID, w: &mut String) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { + // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. + fn codegen_thread_creation( + &mut self, + inner_block_fork: NodeID, + ) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { let mut thread_fork_parents = HashMap::new(); let mut thread_fork_sizes = HashMap::new(); let mut thread_fork_cumulative_sizes = HashMap::new(); @@ -381,18 +499,19 @@ impl<'a> GPUContext<'a> { while let Some(pop) = stack.pop() { let children = &self.fork_forward_adjacency[&pop]; - // Reverse child order due to use of stack for DFS + // Reverse child order due to use of stack for DFS for &child in children.iter().rev() { if !visited.contains(&child) { visited.insert(child); thread_fork_parents.insert(child, pop); let fork_size = match &self.function.nodes[child.idx()] { Node::Fork { factors, .. } => self.multiply_fork_factors(factors)?, - _ => panic!("Expected fork node") + _ => panic!("Expected fork node"), }; thread_fork_sizes.insert(child, fork_size); - let new_cumulative_size = (thread_fork_cumulative_sizes[&pop] as usize).saturating_mul(fork_size as usize); + let new_cumulative_size = (thread_fork_cumulative_sizes[&pop] as usize) + .saturating_mul(fork_size as usize); if new_cumulative_size > self.kernel_params.max_num_threads { // Expanding to child fork exceeds thread limit, so // current fork is an edge fork @@ -409,43 +528,260 @@ impl<'a> GPUContext<'a> { } } - // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. - self.kernel_attrs.num_threads = max_thread_size; + // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. // If limit on number of threads in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. 
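+ // Illustrative numbers (hypothetical forks): with max_num_threads = 1024, a chain of
+ // thread-level forks with factors 2 and 8 reaches cumulative sizes 2 and 16, so the
+ // traversal keeps descending; if a further nested fork had factor 128, expanding to it
+ // would give 2048 > 1024, so that child is not taken and the fork with cumulative
+ // size 16 becomes an edge fork.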
- write!(w, "\tconst int thread_x = threadIdx.x;\n")?; + self.kernel_attrs.num_threads = max_thread_size; Ok((thread_fork_parents, thread_fork_sizes, thread_fork_edges)) } + fn codegen_block_sink( + &self, + fork_id: NodeID, + next_fork_id: NodeID, + w: &mut String, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + // 1. Get control nodes including fork_id that are dominated by fork_id + // and not dominated by next_fork_id and not dominated by fork_id's join + let dom = dominator(self.control_subgraph, fork_id); + assert!(dom.does_dom(fork_id, next_fork_id)); + let mut control_nodes_between = HashSet::new(); + for node_id in self.control_subgraph.iter() { + if dom.does_dom(fork_id, *node_id) + && !dom.does_dom(next_fork_id, *node_id) + && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) + { + control_nodes_between.insert(*node_id); + } + } + + // 2. Emit data flow for nodes assigned to those basic blocks + // 2a. All phi registers first + self.codegen_phi_registers(w, |id| control_nodes_between.contains(&self.bbs[id.idx()]))?; + // 2b. All other data nodes + let mut worklist = VecDeque::from_iter(self.reverse_postorder.iter().filter(|id| { + !self.function.nodes[id.idx()].is_control() + && control_nodes_between.contains(&self.bbs[id.idx()]) + && !self.function.nodes[id.idx()].is_phi() + })); + let mut visited = HashSet::new(); + while let Some(id) = worklist.pop_front() { + let node = &self.function.nodes[id.idx()]; + if node.is_reduce() { + panic!("Reduce nodes should not be in block sink"); + } + if get_uses(node) + .as_ref() + .iter() + .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) + { + self.codegen_data_node( + *id, + &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body, + )?; + visited.insert(id); + } else { + worklist.push_back(id); + } + } + + // 3. Emit control flow + for control_node in control_nodes_between { + self.codegen_control_node(control_node, w)?; + } + + Ok(()) + } + fn codegen_control_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { Ok(()) } - fn codegen_data_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { + fn codegen_data_node( + &self, + id: NodeID, + w: &mut String, + ) -> Result<(), Error> { + // For now only used shared memory when creating an array + let declare_variable = self.get_value(id, true, false).to_string(); match &self.function.nodes[id.idx()] { - Node::Phi { control: _, data: _ } => {} - Node::Parameter { index } => { - write!(w, "\t{} = p{};\n", self.get_value(id, true), index)?; - } + // Phi registers were already emitted. 
+ Node::Phi { + control: _, + data: _, + } => {} + // No SSA requirement for CUDA + Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { - write_constant() - + self.codegen_constant( + declare_variable, + self.get_value(id, false, false), + *cons_id, + w, + )?; } + // No SSA requirement for CUDA Node::DynamicConstant { id: _ } => {} - Node::Unary { op: _, input: _ } => {} - Node::Binary { op: _, left: _, right: _ } => {} - Node::Ternary { op: _, first: _, second: _, third: _ } => {} - Node::IntrinsicCall { intrinsic: _, args: _ } => {} - Node::Read { collect: _, indices: _ } => {} - Node::Write { collect: _, data: _, indices: _ } => {} - Node::Projection { control: _, selection: _ } => {} - Node::Undef { ty: _ } => {} - _ => {} + Node::Unary { op, input } => match op { + UnaryOperator::Not => match &self.types[self.typing[input.idx()].idx()] { + Type::Boolean => { + write!( + w, + "\t{} = !{};\n", + declare_variable, + self.get_value(*input, false, false), + )?; + } + ty if ty.is_fixed() => { + write!( + w, + "\t{} = ~{};\n", + declare_variable, + self.get_value(*input, false, false), + )?; + } + _ => panic!("Unsupported type for not operator"), + }, + UnaryOperator::Neg => match &self.types[self.typing[input.idx()].idx()] { + ty if ty.is_signed() || ty.is_float() => { + write!( + w, + "\t{} = -{};\n", + declare_variable, + self.get_value(*input, false, false), + )?; + } + _ => { + panic!("Unsupported type for neg operator") + } + }, + UnaryOperator::Cast(dst_ty_id) => { + write!( + w, + "\t{} = static_cast<{}>({});\n", + declare_variable, + self.get_type(*dst_ty_id, false), + self.get_value(*input, false, false), + )?; + } + }, + Node::Binary { op, left, right } => { + let left_val = self.get_value(*left, false, false); + let right_val = self.get_value(*right, false, false); + match (op, &self.types[self.typing[left.idx()].idx()]) { + (BinaryOperator::Rem, Type::Float32) => write!( + w, + "\t{} = fmodf({}, {});\n", + declare_variable, + left_val, + right_val, + )?, + (BinaryOperator::Rem, Type::Float64) => write!( + w, + "\t{} = fmod({}, {});\n", + declare_variable, + left_val, + right_val, + )?, + // Doesn't need special syntax but bool type + (BinaryOperator::Or, Type::Boolean) => write!( + w, + "\t{} = {} || {};\n", + declare_variable, + left_val, + right_val, + )?, + (BinaryOperator::And, Type::Boolean) => write!( + w, + "\t{} = {} && {};\n", + declare_variable, + left_val, + right_val, + )?, + (op, _) => write!( + w, + "\t{} = {} {} {};\n", + declare_variable, + left_val, + match op { + BinaryOperator::Add => "+", + BinaryOperator::Sub => "-", + BinaryOperator::Mul => "*", + BinaryOperator::Div => "/", + BinaryOperator::Rem => "%", + BinaryOperator::LT => "<", + BinaryOperator::LTE => "<=", + BinaryOperator::GT => ">", + BinaryOperator::GTE => ">=", + BinaryOperator::EQ => "==", + BinaryOperator::NE => "!=", + BinaryOperator::Or => "|", + BinaryOperator::And => "&", + BinaryOperator::Xor => "^", + BinaryOperator::LSh => "<<", + BinaryOperator::RSh => ">>", + }, + right_val, + )?, + }; + } + Node::Ternary {op, first, second, third} => match op { + TernaryOperator::Select => { + write!( + w, + "\t{} = {} ? 
{} : {};\n", + declare_variable, + self.get_value(*first, false, false), + self.get_value(*second, false, false), + self.get_value(*third, false, false), + )?; + } + }, + Node::IntrinsicCall { intrinsic, args } => { + let ty = &self.types[self.typing[args[0].idx()].idx()]; + let func_name = self.codegen_intrinsic(intrinsic, ty); + write!( + w, + "\t{} = {}({});\n", + declare_variable, + func_name, + self.get_value(args[0], false, false), + )?; + } + Node::Read { collect, indices } => { + let index_ptr_name = self.codegen_indices(*collect, indices); + // If it's a parameter node then copy from global memory, else + // reference from shared memory or registers. + if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { + // We parallelize copies from global memory across threads for + // array types, either immediate or nested in the collection. + if self.types[self.typing[id.idx()].idx()].is_primitive() { + write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?; + } else { + self.codegen_global_to_shared(id, declare_variable, index_ptr_name, indices.len(), true, w)?; + } + } else { + write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?; + } + } + Node::Write {collect: _, data: _, indices: _} => { + // TODO + } + _ => { + panic!("Unsupported node type") + } + } + if let Some(phis) = self.label_data_for_phi.get(&id) { + for phi in phis { + write!( + w, + "\t{} = {};\n", + self.get_value(*phi, false, false), + self.get_value(id, false, false) + )?; + } } - Ok(()) - } - - fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, w: &mut String) -> Result<(), Error> { Ok(()) } @@ -459,96 +795,329 @@ impl<'a> GPUContext<'a> { Ok(()) } - fn write_constant(&self, name: String, type_name: String, cons_id: ConstantID, w: &mut String) -> Result<(), Error> { - write!(w, "\t{} {}", type_name, name)?; - match self.constants[cons_id.idx()] { - Constant::Boolean(val) => write!(w, " = {}\n", val)?, - Constant::Integer8(val) => write!(w, " = {}\n", val)?, - Constant::UnsignedInteger8(val) => write!(w, " = {}\n", val)?, - Constant::Integer16(val) => write!(w, " = {}\n", val)?, - Constant::UnsignedInteger16(val) => write!(w, " = {}\n", val)?, - Constant::Integer32(val) => write!(w, " = {}\n", val)?, - Constant::UnsignedInteger32(val) => write!(w, " = {}ul\n", val)?, - Constant::Integer64(val) => write!(w, " = {}ll\n", val)?, - Constant::UnsignedInteger64(val) => write!(w, " = {}ull\n", val)?, - Constant::Float32(val) => write!(w, " = {}f\n", val)?, - Constant::Float64(val) => write!(w, " = {}\n", val)?, + // Standalone function allows us to handle recursive initialization for + // product and summation collections + fn codegen_constant( + &self, + declare_variable: String, + name: String, + cons_id: ConstantID, + w: &mut String, + ) -> Result<(), Error> { + write!(w, "\t{}", declare_variable)?; + match &self.constants[cons_id.idx()] { + Constant::Boolean(val) => write!(w, " = {};\n", val)?, + Constant::Integer8(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger8(val) => write!(w, " = {};\n", val)?, + Constant::Integer16(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger16(val) => write!(w, " = {};\n", val)?, + Constant::Integer32(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger32(val) => write!(w, " = {}ul;\n", val)?, + Constant::Integer64(val) => write!(w, " = {}ll;\n", val)?, + Constant::UnsignedInteger64(val) => write!(w, " = {}ull;\n", val)?, + Constant::Float32(val) => write!(w, " = {}f;\n", val)?, + 
Constant::Float64(val) => write!(w, " = {};\n", val)?, Constant::Product(_, fields) => { write!(w, ";\n")?; for (i, field) in fields.iter().enumerate() { - self.write_constant(format!("{}_field_{}", name, i), self.constant_to_type_name(*field), *field, w)?; - write!(w, "\t{}.field_{} = {}_field_{};\n", name, i, name, i)?; + // We don't emit array fields and size was set by struct definition + if !self.constants[field.idx()].is_array() { + // Don't need type declaration for the fields + self.codegen_constant( + format!("{}.field_{}", name, i), + format!("{}.field_{}", name, i), + *field, + w, + )?; + } } } Constant::Summation(_, variant, field) => { - write!(w, ";\n")?; - self.write_constant(format!("{}_field_{}", name, variant), self.constant_to_type_name(field), field, w)?; - write!(w, "\t{}.tag = {};\n\t{}.field_{} = {}_field_{};\n", name, variant, name, variant, name, variant)?; + write!(w, ";\n\t{}.tag = {};\n", name, variant)?; + // See two comments in Constant::Product + if !self.constants[field.idx()].is_array() { + self.codegen_constant( + format!("\t{}.field_{}", name, variant), + format!("\t{}.field_{}", name, variant), + *field, + w, + )?; + } } - Constant::Array(_) => { - write!(w, ";\n")?; - for (i, element) in elements.iter().enumerate() { - self.write_constant(format!("{}_element_{}", name, i), self.constant_to_type_name(*element), *element, w)?; - write!(w, "\t{}[{}] = {}_element_{};\n", name, i, name, i)?; + Constant::Array(type_id) => { + let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { + panic!("Expected array type") + }; + // For now we do element-wise alignment, later could consider (n-1)d array + // alignment. Then we "allocate" from the single dynamic shared memory buffer + // by using and updating the offset. 
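+ // Illustrative sketch (hypothetical extents): a Float32 array constant with extents
+ // [dc1, dc2] yields element_size = "sizeof(float)" and array_size = "dc1*dc2"; the
+ // emitted CUDA rounds dynamic_shared_offset up to a multiple of the alignment, points
+ // the array at &dynamic_shared[dynamic_shared_offset], and then advances the offset
+ // by array_size.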
+ let element_size = format!("sizeof({})", self.get_type(*element_type, false)); + let array_size = extents + .iter() + .map(|id| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join("*"); + write!(w, ";\n\talignment = {};\n\tdynamic_shared_offset = + (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = + reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t + dynamic_shared_offset += {}", element_size, name, self.get_type(*element_type, false), array_size)?; + } + } + Ok(()) + } + + fn codegen_global_to_shared(&self, id: NodeID, declare_variable: String, index_ptr_name: String, array_depth: Option<usize>, outermost: bool, w: &mut String) -> Result<(), Error> { + match &self.types[self.typing[id.idx()].idx()] { + Type::Array(_, extents) => { + let array_depth = array_depth.unwrap(); + let rem_array_size = extents + .iter() + .enumerate() + .filter(|(i, _)| *i >= array_depth) + .map(|(_, id)| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join("*"); + let mut running_div_factor = "1".to_string(); + write!(w, "\tfor (int i = threadIdx.x; i < {}; i += {}) {{\n", rem_array_size, self.kernel_attrs.num_threads)?; + let mut indices = vec![]; + for i in (array_depth..extents.len()).rev() { + indices.push(format!("[(({}) / ({})) % dc{}]", rem_array_size, running_div_factor, extents[i].idx())); + running_div_factor = format!("{} * {}", running_div_factor, format!("dc{}", extents[i].idx())); } + let indices_str = indices.join(""); + // TODO: condition by primitive vs collection, if latter then recurse + // with outermost = false + write!(w, "\t\t{}{} = {}{};\n", declare_variable, indices_str, index_ptr_name, indices_str)?; } + // TODO: handle product and summation collections } Ok(()) } - fn constant_to_type_name(&self, cons_id: ConstantID) -> String { - match self.constants[cons_id.idx()] { - Constant::Boolean(_) => "bool".to_string(), - Constant::Integer8(_) => "int8_t".to_string(), - Constant::UnsignedInteger8(_) => "uint8_t".to_string(), - Constant::Integer16(_) => "short".to_string(), - Constant::UnsignedInteger16(_) => "unsigned short".to_string(), - Constant::Integer32(_) => "int".to_string(), - Constant::UnsignedInteger32(_) => "unsigned int".to_string(), - Constant::Integer64(_) => "long long".to_string(), - Constant::UnsignedInteger64(_) => "unsigned long long".to_string(), - Constant::Float32(_) => "float".to_string(), - Constant::Float64(_) => "double".to_string(), - Constant::Product(type_id, _) => self.get_type(type_id), - Constant::Summation(type_id, _, _) => self.get_type(type_id), - Constant::Array(type_id) => self.get_type(type_id), + fn codegen_indices(&self, collect: NodeID, indices: &[Index]) -> String { + let mut index_ptr_name = format!("{}", self.get_value(collect, false, false)); + for index in indices { + match index { + Index::Field(field) => { + index_ptr_name.push_str(&format!(".field_{}", field)); + } + Index::Variant(variant) => { + index_ptr_name.push_str(&format!(".field_{}", variant)); + } + Index::Position(indices) => { + index_ptr_name.push_str(&indices + .iter() + .map(|index| format!("[{}]", self.get_value(*index, false, false))) + .collect::<Vec<_>>() + .join("")); + } + } } + index_ptr_name + } + + fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { + let func_name = match intrinsic { + Intrinsic::Abs => match ty { + Type::Float32 => "__fabsf", + Type::Float64 => "__fabs", + ty if ty.is_signed() => "abs", + ty if ty.is_unsigned() => "uabs", + _ => panic!("Unsupported type for Abs"), + }, + 
Intrinsic::ACos => match ty { + ty if ty.is_float() => "__acosf", + _ => "acos", + }, + Intrinsic::ASin => match ty { + ty if ty.is_float() => "__asinf", + _ => "asin", + }, + Intrinsic::ATan => match ty { + ty if ty.is_float() => "__atanf", + _ => "atan", + }, + Intrinsic::ATan2 => match ty { + ty if ty.is_float() => "__atan2f", + _ => "atan2", + }, + Intrinsic::Ceil => match ty { + ty if ty.is_float() => "__ceilf", + _ => "ceil", + }, + Intrinsic::Cos => match ty { + ty if ty.is_float() => "__cosf", + _ => "cos", + }, + Intrinsic::Cosh => match ty { + ty if ty.is_float() => "coshf", + _ => "cosh", + }, + Intrinsic::Exp => match ty { + ty if ty.is_float() => "__expf", + _ => "exp", + }, + Intrinsic::Exp2 => match ty { + ty if ty.is_float() => "__exp2f", + _ => "exp2", + }, + Intrinsic::Floor => match ty { + ty if ty.is_float() => "__floorf", + _ => "floor", + }, + Intrinsic::Ln => match ty { + ty if ty.is_float() => "__logf", + _ => "log", + }, + Intrinsic::Log10 => match ty { + ty if ty.is_float() => "__log10f", + _ => "log10", + }, + Intrinsic::Log2 => match ty { + ty if ty.is_float() => "__log2f", + _ => "log2", + }, + Intrinsic::Max => match ty { + Type::Float32 => "fmaxf", + Type::Float64 => "fmax", + ty if ty.is_signed() => "smax", + ty if ty.is_unsigned() => "umax", + _ => "max", + }, + Intrinsic::Min => match ty { + Type::Float32 => "__fminf", + Type::Float64 => "__fmin", + ty if ty.is_signed() => "smin", + ty if ty.is_unsigned() => "umin", + _ => "min", + }, + Intrinsic::Pow | Intrinsic::Powf => match ty { + Type::Float32 => "__powf", + Type::Float64 => "pow", + _ => panic!("Unsupported type for Pow"), + }, + Intrinsic::Powi => match ty { + ty if ty.is_signed() || ty.is_unsigned() => "powi", + _ => panic!("Unsupported type for Powi"), + }, + Intrinsic::Round => match ty { + ty if ty.is_float() => "__roundf", + ty if ty.is_signed() || ty.is_unsigned() => "roundi", + _ => "round", + }, + Intrinsic::Sin => match ty { + ty if ty.is_float() => "__sinf", + _ => "sin", + }, + Intrinsic::Sinh => match ty { + ty if ty.is_float() => "sinhf", + _ => "sinh", + }, + Intrinsic::Sqrt => match ty { + ty if ty.is_float() => "__sqrtf", + ty if ty.is_signed() || ty.is_unsigned() => "isqrt", + _ => "sqrt", + }, + Intrinsic::Tan => match ty { + ty if ty.is_float() => "__tanf", + _ => "tan", + }, + Intrinsic::Tanh => match ty { + ty if ty.is_float() => "tanhf", + _ => "tanh", + }, + _ => panic!("Unsupported intrinsic {:?}", intrinsic), + }; + func_name.to_string() } fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Result<usize, Error> { - factors.iter() - .try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .ok_or_else(|| Error) - .map(|val| acc.saturating_mul(val)) - }) + factors.iter().try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, self.dynamic_constants) + .ok_or(Error) + .map(|val| acc.saturating_mul(val)) + }) } - fn get_value(&self, id: NodeID, ty: bool) -> String { - if ty { - format!("{} {}{}", self.get_type(self.typing[id.idx()]), self.function.nodes[id.idx()].lower_case_name(), id.idx()) + fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { + if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { + if ty { + panic!("Dynamic constants shouldn't be re-initialized") + } + format!("dc{}", dc_id.idx()) + } else if let Node::Parameter { index } = &self.function.nodes[id.idx()] { + if ty { + panic!("Parameters shouldn't be re-initialized") + } + 
format!("p{}", index) + } else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { + let mut declare_array = format!( + "{} (*{}{})", + self.get_type(*element_type, false), + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + ); + for extent in extents.iter().skip(1) { + declare_array.push_str(&format!("[dc{}]", extent.idx())); + } + declare_array + } else if ty { + format!( + "{} {}{}", + self.get_type(self.typing[id.idx()], make_pointer), + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + ) } else { - format!("{}{}", self.function.nodes[id.idx()].lower_case_name(), id.idx()) + format!( + "{}{}", + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + ) } } - fn get_type(&self, id: TypeID) -> String { - match self.types[id.idx()] { + // make_pointer enforces static pointer and not recursive or array pointer: + // multi-d arrays are single pointers with custom indexing. + fn get_type(&self, id: TypeID, make_pointer: bool) -> String { + match &self.types[id.idx()] { Type::Product(_) => { - format!("Product_{}", id.idx()) + format!( + "Product_{}{}", + id.idx(), + if make_pointer { "*" } else { "" } + ) } Type::Summation(_) => { - format!("Summation_{}", id.idx()) + format!( + "Summation_{}{}", + id.idx(), + if make_pointer { "*" } else { "" } + ) } - _ => convert_type(&self.types[id.idx()]), + Type::Array(element_type, extents) => { + // This suffix lets us work with references of dynamic shared memory + // and use n-d array indexing. + let mut suffix = "(*)".to_string(); + if extents.len() > 1 { + for extent in extents.iter().skip(1) { + suffix.push_str(&format!("[dc{}]", extent.idx())); + } + } + format!( + "{}{}", + self.get_type(*element_type, false), + if make_pointer { "*" } else { &suffix } + ) + } + _ => convert_type(&self.types[id.idx()], make_pointer), } } - } // TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 -fn convert_type(ty: &Type) -> String { - match ty { +fn convert_type(ty: &Type, make_pointer: bool) -> String { + let mut result = match ty { Type::Boolean => "bool".to_string(), Type::Integer8 => "int8_t".to_string(), Type::UnsignedInteger8 => "uint8_t".to_string(), @@ -561,5 +1130,9 @@ fn convert_type(ty: &Type) -> String { Type::Float32 => "float".to_string(), Type::Float64 => "double".to_string(), _ => panic!("Unsupported type"), + }; + if make_pointer { + result.push('*'); } + result } diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 11d23e61..86edf743 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -2,14 +2,17 @@ extern crate bitvec; extern crate ordered_float; extern crate serde; -use std::fmt::Write; -use std::ops::Coroutine; -use std::ops::CoroutineState; -use std::pin::Pin; - use self::bitvec::prelude::*; use self::serde::Deserialize; use self::serde::Serialize; +use std::cmp::Ordering; +use std::cmp::{max, min}; +use std::collections::HashMap; +use std::convert::TryInto; +use std::fmt::{Error, Write}; +use std::ops::Coroutine; +use std::ops::CoroutineState; +use std::pin::Pin; use crate::*; @@ -832,6 +835,14 @@ impl Type { } } + pub fn is_summation(&self) -> bool { + if let Type::Summation(_) = self { + true + } else { + false + } + } + pub fn is_array(&self) -> bool { if let Type::Array(_, _) = self { true @@ -995,6 +1006,218 @@ impl DynamicConstant { } } +#[derive(Default, Clone)] +struct DynamicConstantRange { + min: isize, + max: isize, +} + +// The ith element is the exponent of the ith parameter, 
all together giving a +// unique key for each combination of parameters aka term. +#[derive(Eq, PartialEq, Hash)] +struct ParamKey(Vec<isize>); + +pub fn dynamic_constant_cmp( + a: DynamicConstantID, + b: DynamicConstantID, + dcs: &Vec<DynamicConstant>, + num_params: usize, +) -> Result<Option<Ordering>, Error> { + fn dynamic_constant_evaluation_iter( + a: DynamicConstantID, + dcs: &Vec<DynamicConstant>, + num_params: usize, + ) -> Result<HashMap<ParamKey, DynamicConstantRange>, Error> { + // We evaluate each dynamic constant by constructing range for each "term", + // aka unique combination of parameter exponents (eg param1^0 * param2^0 + // aka scalar represented by [0, 0] or param1^1 * param2^2 by [1, 2]). + // Range instead of single value is needed due to use of modulo. + let mut ranges = HashMap::new(); + match dcs[a.idx()] { + DynamicConstant::Parameter(idx) => { + let mut param_vec = vec![0; num_params]; + param_vec[idx] = 1; + ranges.insert(ParamKey(param_vec), DynamicConstantRange { min: 1, max: 1 }); + } + DynamicConstant::Constant(cons) => { + let param_vec = vec![0; num_params]; + ranges.insert( + ParamKey(param_vec), + DynamicConstantRange { + min: cons.try_into().map_err(|_| Error)?, + max: cons.try_into().map_err(|_| Error)?, + }, + ); + } + DynamicConstant::Add(left, right) => { + // Add same-form terms by adding their values. + let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + ranges.extend(left_ranges); + for r in right_ranges { + if let Some(l) = ranges.get_mut(&r.0) { + l.min += r.1.min; + l.max += r.1.max; + } else { + ranges.insert(r.0, r.1); + } + } + } + DynamicConstant::Sub(left, right) => { + // Subtract same-form terms by subtracting their values. + let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + ranges.extend(left_ranges); + for r in right_ranges { + if let Some(l) = ranges.get_mut(&r.0) { + l.min -= r.1.max; + l.max -= r.1.min; + } else { + ranges.insert( + r.0, + DynamicConstantRange { + min: -r.1.max, + max: -r.1.min, + }, + ); + } + } + } + DynamicConstant::Mul(left, right) => { + // Pairwise multiply each term by elementwise adding the two + // exponent keys and multiplying the values. + let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + for l in left_ranges { + for r in right_ranges.iter() { + let mut param_vec = l.0 .0.clone(); + for (idx, r_val) in r.0 .0.iter().enumerate() { + param_vec[idx] += r_val; + } + ranges.insert( + ParamKey(param_vec), + DynamicConstantRange { + min: l.1.min * r.1.min, + max: l.1.max * r.1.max, + }, + ); + } + } + } + DynamicConstant::Div(left, right) => { + // Pairwise divide each term by elementwise subtracting the two + // exponent keys and dividing the values. 
+ let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + for l in left_ranges { + for r in right_ranges.iter() { + let mut param_vec = l.0 .0.clone(); + for (idx, r_val) in r.0 .0.iter().enumerate() { + param_vec[idx] -= r_val; + } + ranges.insert( + ParamKey(param_vec), + DynamicConstantRange { + min: l.1.min / r.1.min, + max: l.1.max / r.1.max, + }, + ); + } + } + } + DynamicConstant::Rem(left, right) => { + // We do simplest check for 0 or scalar multiple, and ignore all + // other cases of pure multiple. If check fails, the remainder is + // somewhere between 0 and the right value. + let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let mut is_zero = true; + for l in left_ranges.iter() { + if l.1.min != 0 || l.1.max != 0 { + is_zero = false; + break; + } + } + if is_zero { + return Ok(ranges); + } + + // Scalar multiple requires both that all right terms have left + // term with same positive multiplier, and there are no + // outstanding left terms after matching. + let mut is_scalar_multiple = true; + let mut scalar_factor = 0; + let mut remaining_left_terms = left_ranges.len(); + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + for r in right_ranges.iter() { + if let Some(l_range) = left_ranges.get(r.0) { + if l_range.min != l_range.max || r.1.min != r.1.max || l_range.min % r.1.min != 0 || (scalar_factor != 0 && l_range.min / r.1.min != scalar_factor) { + is_scalar_multiple = false; + break; + } + scalar_factor = l_range.min / r.1.min; + remaining_left_terms -= 1; + } + } + if is_scalar_multiple && scalar_factor >= 0 && remaining_left_terms == 0 { + return Ok(ranges); + } + + for r in right_ranges { + ranges.insert( + r.0, + DynamicConstantRange { + min: min(0, r.1.min), + max: max(0, r.1.max), + }, + ); + } + } + } + Ok(ranges) + } + + let a_ranges = dynamic_constant_evaluation_iter(a, dcs, num_params)?; + let b_ranges = dynamic_constant_evaluation_iter(b, dcs, num_params)?; + // a >= b iff a's min >= b's max. >= requires all terms in b to satisfy: + // if also in a, then a's coef >= b's coef; if not in a, have b's coef <= 0. + let mut a_is_greater = true; + for b in b_ranges.iter() { + if let Some(a) = a_ranges.get(b.0) { + if a.min < b.1.max { + a_is_greater = false; + break; + } + } else if b.1.min > 0 { + a_is_greater = false; + break; + } + } + + // Now check if b >= a. 
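+ // Worked example (hypothetical dynamic constants): with parameters (p0, p1),
+ // a = p0*p1 + 2 evaluates to terms {[1,1]: (1,1), [0,0]: (2,2)} and b = p0*p1 to
+ // {[1,1]: (1,1)}. Every term of b appears in a with a's min >= b's max, so a >= b;
+ // the extra [0,0] term of a has min 2 > 0, so b >= a fails and the result is
+ // Ordering::Greater.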
+ let mut b_is_greater = true; + for a in a_ranges.iter() { + if let Some(b) = b_ranges.get(a.0) { + if b.min < a.1.max { + b_is_greater = false; + break; + } + } else if a.1.min > 0 { + b_is_greater = false; + break; + } + } + + if a_is_greater && b_is_greater { + Ok(Some(Ordering::Equal)) + } else if a_is_greater { + Ok(Some(Ordering::Greater)) + } else if b_is_greater { + Ok(Some(Ordering::Less)) + } else { + Ok(None) + } +} + pub fn evaluate_dynamic_constant( cons: DynamicConstantID, dcs: &Vec<DynamicConstant>, -- GitLab From 3552345f55cfcb7d70d7384ae1ed580a3f77c4bf Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 30 Dec 2024 15:45:04 -0800 Subject: [PATCH 006/109] indexing --- hercules_cg/src/gpu.rs | 310 +++++++++++++++++++++++++++++++---------- 1 file changed, 240 insertions(+), 70 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index aa5908a2..29135195 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -85,6 +85,7 @@ pub fn gpu_codegen<W: Write>( }; let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); + // Maybe can delete let map_join_reduce = || -> HashMap<NodeID, Vec<NodeID>> { let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) .filter(|idx| function.nodes[*idx].is_reduce()) @@ -219,6 +220,7 @@ impl GPUContext<'_> { " #include <assert.h> #include <stdio.h> +#include <stddef.h> #include <cuda.h> #include <cuda_runtime.h> #include <mma.h> @@ -293,7 +295,7 @@ impl GPUContext<'_> { for (id, ty) in self.types.iter().enumerate() { match ty { Type::Product(ref product_ty_ids) => { - write!(w, "\nstruct Product_{} {{\n", id)?; + write!(w, "\ntypedef struct Product_{} {{\n", id)?; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { write!( w, @@ -302,10 +304,10 @@ impl GPUContext<'_> { i )?; } - write!(w, "}};\n")?; + write!(w, "}} Product_{};\n", id)?; } Type::Summation(ref summation_ty_ids) => { - write!(w, "\nstruct Summation_{} {{\n\t union {{\n", id)?; + write!(w, "\ntypedef struct Summation_{} {{\n\t union {{\n", id)?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { write!( w, @@ -314,7 +316,7 @@ impl GPUContext<'_> { i )?; } - write!(w, "\t}};\n\tuint8_t tag;\n}};\n")?; + write!(w, "\t}};\n\tuint8_t tag;\n}} Summation_{};\n", id)?; } _ => {} } @@ -348,7 +350,7 @@ impl GPUContext<'_> { write!(w, "{} p{}", self.get_type(*ty, true), idx)?; } // We convert originally non-void functions to void functions by adding a - // return parameter. For now we ignore the case where return was derived + // return parameter. For now we ignore the case where return was derived // from a parameter through reads and writes, and instead always memcpy. let return_index = self.function.nodes.iter().position(|node| node.is_return()); if let Some(return_index) = return_index { @@ -424,7 +426,7 @@ impl GPUContext<'_> { Ok(()) } - // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the join has no user reduce nodes, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. 
+ // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the forks are parallel reduce forks, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> { let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); for (_, children) in self.fork_forward_adjacency.iter() { @@ -442,7 +444,7 @@ impl GPUContext<'_> { let mut curr_fork = root_forks[0]; while let Some(join) = self.fork_join_map.get(&curr_fork) { let children = &self.fork_forward_adjacency[&curr_fork]; - if children.len() != 1 || !self.join_reduce_map.contains_key(join) { + if children.len() != 1 || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) { break; } curr_fork = children[0]; @@ -556,7 +558,7 @@ impl GPUContext<'_> { } } - // 2. Emit data flow for nodes assigned to those basic blocks + // 2. Emit data flow for nodes assigned to basic blocks in block sink // 2a. All phi registers first self.codegen_phi_registers(w, |id| control_nodes_between.contains(&self.bbs[id.idx()]))?; // 2b. All other data nodes @@ -578,6 +580,7 @@ impl GPUContext<'_> { { self.codegen_data_node( *id, + 1, &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body, )?; visited.insert(id); @@ -598,13 +601,10 @@ impl GPUContext<'_> { Ok(()) } - fn codegen_data_node( - &self, - id: NodeID, - w: &mut String, - ) -> Result<(), Error> { + fn codegen_data_node(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> { // For now only used shared memory when creating an array let declare_variable = self.get_value(id, true, false).to_string(); + let tabs = "\t".repeat(num_tabs); match &self.function.nodes[id.idx()] { // Phi registers were already emitted. 
Node::Phi { @@ -628,7 +628,8 @@ impl GPUContext<'_> { Type::Boolean => { write!( w, - "\t{} = !{};\n", + "{}{} = !{};\n", + tabs, declare_variable, self.get_value(*input, false, false), )?; @@ -636,7 +637,8 @@ impl GPUContext<'_> { ty if ty.is_fixed() => { write!( w, - "\t{} = ~{};\n", + "{}{} = ~{};\n", + tabs, declare_variable, self.get_value(*input, false, false), )?; @@ -647,7 +649,8 @@ impl GPUContext<'_> { ty if ty.is_signed() || ty.is_float() => { write!( w, - "\t{} = -{};\n", + "{}{} = -{};\n", + tabs, declare_variable, self.get_value(*input, false, false), )?; @@ -659,7 +662,8 @@ impl GPUContext<'_> { UnaryOperator::Cast(dst_ty_id) => { write!( w, - "\t{} = static_cast<{}>({});\n", + "{}{} = static_cast<{}>({});\n", + tabs, declare_variable, self.get_type(*dst_ty_id, false), self.get_value(*input, false, false), @@ -672,36 +676,29 @@ impl GPUContext<'_> { match (op, &self.types[self.typing[left.idx()].idx()]) { (BinaryOperator::Rem, Type::Float32) => write!( w, - "\t{} = fmodf({}, {});\n", - declare_variable, - left_val, - right_val, + "{}{} = fmodf({}, {});\n", + tabs, declare_variable, left_val, right_val, )?, (BinaryOperator::Rem, Type::Float64) => write!( w, - "\t{} = fmod({}, {});\n", - declare_variable, - left_val, - right_val, + "{}{} = fmod({}, {});\n", + tabs, declare_variable, left_val, right_val, )?, // Doesn't need special syntax but bool type (BinaryOperator::Or, Type::Boolean) => write!( w, - "\t{} = {} || {};\n", - declare_variable, - left_val, - right_val, + "{}{} = {} || {};\n", + tabs, declare_variable, left_val, right_val, )?, (BinaryOperator::And, Type::Boolean) => write!( w, - "\t{} = {} && {};\n", - declare_variable, - left_val, - right_val, + "{}{} = {} && {};\n", + tabs, declare_variable, left_val, right_val, )?, (op, _) => write!( w, - "\t{} = {} {} {};\n", + "{}{} = {} {} {};\n", + tabs, declare_variable, left_val, match op { @@ -726,11 +723,17 @@ impl GPUContext<'_> { )?, }; } - Node::Ternary {op, first, second, third} => match op { + Node::Ternary { + op, + first, + second, + third, + } => match op { TernaryOperator::Select => { write!( w, - "\t{} = {} ? {} : {};\n", + "{}{} = {} ? {} : {};\n", + tabs, declare_variable, self.get_value(*first, false, false), self.get_value(*second, false, false), @@ -743,30 +746,53 @@ impl GPUContext<'_> { let func_name = self.codegen_intrinsic(intrinsic, ty); write!( w, - "\t{} = {}({});\n", + "{}{} = {}({});\n", + tabs, declare_variable, func_name, self.get_value(args[0], false, false), )?; } Node::Read { collect, indices } => { - let index_ptr_name = self.codegen_indices(*collect, indices); - // If it's a parameter node then copy from global memory, else - // reference from shared memory or registers. + // If it's a parameter node then copy from global memory, else + // from shared memory or registers. if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { - // We parallelize copies from global memory across threads for - // array types, either immediate or nested in the collection. 
- if self.types[self.typing[id.idx()].idx()].is_primitive() { - write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?; - } else { - self.codegen_global_to_shared(id, declare_variable, index_ptr_name, indices.len(), true, w)?; - } + let index_ptr_name = self.codegen_indices(*collect, indices, true); + self.codegen_copy_from_global( + true, + self.typing[id.idx()], + &declare_variable, + &index_ptr_name, + Some(indices.len()), + true, + num_tabs, + w, + )?; } else { - write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?; + let index_ptr_name = self.codegen_indices(*collect, indices,false); + write!(w, "{}{} = {};\n", tabs, declare_variable, index_ptr_name)?; } } - Node::Write {collect: _, data: _, indices: _} => { - // TODO + Node::Write {collect, data, indices} => { + let data_variable = self.get_value(*data, false, false); + // If it's a parameter node then copy to global memory, else + // to shared memory or registers + if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { + let index_ptr_name = self.codegen_indices(*collect, indices, true); + self.codegen_copy_to_from_global( + false, + self.typing[id.idx()], + &data_variable, + &index_ptr_name, + Some(indices.len()), + true, + num_tabs, + w, + )?; + } else { + let index_ptr_name = self.codegen_indices(*collect, indices, false); + write!(w, "{}{} = {};\n", tabs, index_ptr_name, data_variable)?; + } } _ => { panic!("Unsupported node type") @@ -848,7 +874,7 @@ impl GPUContext<'_> { let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { panic!("Expected array type") }; - // For now we do element-wise alignment, later could consider (n-1)d array + // For now we do element-wise alignment, later could consider (n-1)d array // alignment. Then we "allocate" from the single dynamic shared memory buffer // by using and updating the offset. let element_size = format!("sizeof({})", self.get_type(*element_type, false)); @@ -857,18 +883,41 @@ impl GPUContext<'_> { .map(|id| format!("dc{}", id.idx())) .collect::<Vec<_>>() .join("*"); - write!(w, ";\n\talignment = {};\n\tdynamic_shared_offset = + write!( + w, + ";\n\talignment = {};\n\tdynamic_shared_offset = (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t - dynamic_shared_offset += {}", element_size, name, self.get_type(*element_type, false), array_size)?; + dynamic_shared_offset += {}", + element_size, + name, + self.get_type(*element_type, false), + array_size + )?; } } Ok(()) } - fn codegen_global_to_shared(&self, id: NodeID, declare_variable: String, index_ptr_name: String, array_depth: Option<usize>, outermost: bool, w: &mut String) -> Result<(), Error> { - match &self.types[self.typing[id.idx()].idx()] { - Type::Array(_, extents) => { + // Used for reads and writes due to identical logic. data_variable is the + // resulting reference for reads, and is the source for writes. Writes don't + // emit a new reference. 
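+ // Illustrative output (hypothetical 2-d array parameter): with parallelize = true the
+ // array case emits a strided copy of roughly the form
+ //   for (int i = threadIdx.x; i < dc0*dc1; i += <num_threads>) { dst[...] = src[...]; }
+ //   __syncthreads();
+ // while product fields are copied one by one and summations copy the tag and then
+ // switch on it to copy only the active field.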
+ fn codegen_copy_from_global( + &self, + is_read: bool, + type_id: TypeID, + data_variable: &String, + index_ptr_name: &String, + array_depth: Option<usize>, + parallelize: bool, + num_tabs: usize, + w: &mut String, + ) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); + let lhs = if is_read { data_variable } else { index_ptr_name }; + let rhs = if is_read { index_ptr_name } else { data_variable }; + match &self.types[type_id.idx()] { + Type::Array(element_type_id, extents) => { let array_depth = array_depth.unwrap(); let rem_array_size = extents .iter() @@ -878,23 +927,95 @@ impl GPUContext<'_> { .collect::<Vec<_>>() .join("*"); let mut running_div_factor = "1".to_string(); - write!(w, "\tfor (int i = threadIdx.x; i < {}; i += {}) {{\n", rem_array_size, self.kernel_attrs.num_threads)?; let mut indices = vec![]; for i in (array_depth..extents.len()).rev() { - indices.push(format!("[(({}) / ({})) % dc{}]", rem_array_size, running_div_factor, extents[i].idx())); - running_div_factor = format!("{} * {}", running_div_factor, format!("dc{}", extents[i].idx())); + indices.push(format!( + "[(({}) / ({})) % dc{}]", + rem_array_size, + running_div_factor, + extents[i].idx() + )); + running_div_factor = format!( + "{} * {}", + running_div_factor, + format!("dc{}", extents[i].idx()) + ); } let indices_str = indices.join(""); - // TODO: condition by primitive vs collection, if latter then recurse - // with outermost = false - write!(w, "\t\t{}{} = {}{};\n", declare_variable, indices_str, index_ptr_name, indices_str)?; + // Parallelizing only affects loop bounds + let begin_copy = if parallelize { + format!( + "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", + tabs, rem_array_size, self.kernel_attrs.num_threads + ) + } else { + format!("{}for (int i = 0; i < {}; i++) {{\n", tabs, rem_array_size) + }; + write!(w, "{}", begin_copy)?; + self.codegen_copy_to_from_global( + is_read, + *element_type_id, + &format!("{}{}", data_variable, indices_str), + &format!("{}{}", index_ptr_name, indices_str), + None, + false, + num_tabs + 1, + w, + )?; + let end_copy = if parallelize { + format!("{}}}\n{}__syncthreads();\n", tabs, tabs) + } else { + format!("{}}}\n", tabs) + }; + write!(w, "{}", end_copy)?; + } + Type::Product(fields) => { + for field in fields { + self.codegen_copy_to_from_global( + is_read, + *field, + &format!("{}{}", data_variable, field.idx()), + &format!("{}{}", index_ptr_name, field.idx()), + None, + false, + num_tabs + 1, + w, + )?; + } + } + Type::Summation(fields) => { + // First copy the tag + write!(w, "{}{}.tag = {}.tag;\n", tabs, lhs, rhs)?; + // Then copy the active field based on the tag + write!(w, "{}switch({}.tag) {{\n", tabs, rhs)?; + for (variant_idx, field) in fields.iter().enumerate() { + write!(w, "{}\tcase {}: {{\n", tabs, variant_idx)?; + // Recursively copy the field's contents + self.codegen_copy_to_from_global( + is_read, + *field, + &format!("{}.field_{}", data_variable, variant_idx), + &format!("{}.field_{}", index_ptr_name, variant_idx), + None, + false, + num_tabs + 2, + w + )?; + write!(w, "{}\t\tbreak;\n", tabs)?; + write!(w, "{}\t}}\n", tabs)?; + } + write!(w, "{}}}\n", tabs)?; + } + // Primitive types + _ => { + write!(w, "{}{} = {};\n", tabs, lhs, rhs)?; } - // TODO: handle product and summation collections } Ok(()) } - fn codegen_indices(&self, collect: NodeID, indices: &[Index]) -> String { + // Use normal indexing for local collections + fn codegen_indices_local(&self, collect: NodeID, indices: &[Index]) -> String { let mut index_ptr_name = 
format!("{}", self.get_value(collect, false, false)); for index in indices { match index { @@ -905,15 +1026,54 @@ impl GPUContext<'_> { index_ptr_name.push_str(&format!(".field_{}", variant)); } Index::Position(indices) => { - index_ptr_name.push_str(&indices + index_ptr_name.push_str( + &indices + .iter() + .map(|index| format!("[{}]", self.get_value(*index, false, false))) + .collect::<Vec<_>>() + .join(""), + ); + } + } + } + index_ptr_name + } + + // Use arithmetic for global collections as they're accessed as pointers + fn codegen_indices_global(&self, collect: NodeID, indices: &[Index]) -> String { + let mut index_ptr_name = format!("{}[0", self.get_value(collect, false, false)); + let type_id = self.typing[collect.idx()]; + for index in indices { + match index { + Index::Field(field) => { + let offset = (0..*field) + .map(|i| format!("offsetof({}, field_{})", self.get_type(type_id, false), i)) + .collect::<Vec<_>>() + .join(" + "); + index_ptr_name.push_str(&format!(" + {}", offset)); + } + // Variants of summations have zero offset + Index::Variant(_) => {} + Index::Position(array_indices) => { + let Type::Array(_, extents) = &self.types[self.typing[collect.idx()].idx()] else { + panic!("Expected array type") + }; + let mut cumulative_offset = "1 * ".to_string() + extents .iter() - .map(|index| format!("[{}]", self.get_value(*index, false, false))) + .enumerate() + .filter(|(i, _)| *i >= array_indices.len()) + .map(|(_, id)| format!("dc{}", id.idx())) .collect::<Vec<_>>() - .join("")); + .join(" * ") + .as_str(); + for index in array_indices.iter().rev() { + cumulative_offset = format!("{} * ({} + ", cumulative_offset, self.get_value(*index, false, false)); + } + index_ptr_name.push_str(&format!(" + {}{}", cumulative_offset, ")".repeat(array_indices.len()))); } } } - index_ptr_name + format!("{}]", index_ptr_name) } fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { @@ -1050,7 +1210,17 @@ impl GPUContext<'_> { panic!("Parameters shouldn't be re-initialized") } format!("p{}", index) - } else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { + } else if let Node::Write { collect, data: _, indices: _ } = &self.function.nodes[id.idx()] { + if ty { + panic!("Writes shouldn't be initialized, they're replaced with the referenced collection") + } + if make_pointer { + panic!("Writes shouldn't be called as pointer") + } + self.get_value(*collect, false, false) + } else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] + { + // Shmem/register arrays have special formatting let mut declare_array = format!( "{} (*{}{})", self.get_type(*element_type, false), @@ -1077,7 +1247,7 @@ impl GPUContext<'_> { } } - // make_pointer enforces static pointer and not recursive or array pointer: + // make_pointer enforces static pointer and not recursive or array pointer: // multi-d arrays are single pointers with custom indexing. 
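+    // For example (informally, with dc numbering depending on the program), a
+    // 2-d float array with extents dc0 x dc1 is printed as "float(*)[dc1]",
+    // while make_pointer collapses it to a flat "float*".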
fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { -- GitLab From 53c811d48d9f88ce85744ad7edae94c80ccc545d Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Tue, 31 Dec 2024 12:44:10 -0800 Subject: [PATCH 007/109] rw finish --- hercules_cg/src/gpu.rs | 965 +++++++++++++++++++++++++---------------- 1 file changed, 594 insertions(+), 371 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 29135195..3ad9297b 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -3,39 +3,14 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; -use std::hash::{Hash, Hasher}; use std::iter::FromIterator; use self::hercules_ir::*; -#[derive(Debug, Clone)] -struct HashableIndex<'a>(Vec<&'a str>); -impl<'a> FromIterator<&'a String> for HashableIndex<'a> { - fn from_iter<I: IntoIterator<Item = &'a String>>(iter: I) -> Self { - HashableIndex(iter.into_iter().map(|s| s.as_str()).collect()) - } -} -impl<'a> PartialEq for HashableIndex<'a> { - fn eq(&self, other: &Self) -> bool { - self.0 == other.0 - } -} -impl<'a> Eq for HashableIndex<'a> {} -impl<'a> Hash for HashableIndex<'a> { - fn hash<H: Hasher>(&self, state: &mut H) { - self.0.hash(state); - } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -enum MemoryType { - Shared, - Register, -} - /* - * The top level function to compile a Hercules IR function into NVVM IR kernel for - * execution on the GPU. We generate NVVM IR textually, copying from the CPU LLVM approach. + * The top level function to compile a Hercules IR function into CUDA kernel for + * execution on the GPU. We generate CUDA C textually, based on the CPU LLVM + * approach. */ pub fn gpu_codegen<W: Write>( function: &Function, @@ -57,6 +32,37 @@ pub fn gpu_codegen<W: Write>( }; let kernel_attrs = GPUKernelAttrs::default(); + // GPU backend assertions + for ty in types.iter() { + if let Type::Array(type_id, _) = ty { + if let Type::Array(..) 
= types[type_id.idx()] { + panic!("Array element type can't be another array"); + } + } + } + + let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) + .filter(|idx| function.nodes[*idx].is_reduce()) + .map(NodeID::new) + .collect(); + for idx in 0..function.nodes.len() { + if function.nodes[idx].is_join() && reduce_nodes + .iter() + .filter(|reduce_node| match &function.nodes[reduce_node.idx()] { + Node::Reduce { + control, + init: _, + reduct: _, + } => control.idx() == idx, + _ => false, + }) + .count() + == 0 + { + panic!("Join node {} has no reduce nodes", idx); + } + } + // Create fork forward adjacency and join map upfront as part of context let make_fork_structures = || -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, NodeID>) { let mut fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>> = (0..function.nodes.len()) @@ -85,32 +91,6 @@ pub fn gpu_codegen<W: Write>( }; let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); - // Maybe can delete - let map_join_reduce = || -> HashMap<NodeID, Vec<NodeID>> { - let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) - .filter(|idx| function.nodes[*idx].is_reduce()) - .map(NodeID::new) - .collect(); - let mut map_join_reduce = HashMap::new(); - for (_, join) in fork_join_map.iter() { - let reduce_nodes_for_join = reduce_nodes - .iter() - .filter(|reduce_node| match &function.nodes[reduce_node.idx()] { - Node::Reduce { - control, - init: _, - reduct: _, - } => control.idx() == join.idx(), - _ => false, - }) - .copied() - .collect(); - map_join_reduce.insert(*join, reduce_nodes_for_join); - } - map_join_reduce - }; - let join_reduce_map = map_join_reduce(); - let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { let mut label_data_for_phi = HashMap::new(); for (idx, node) in function.nodes.iter().enumerate() { @@ -127,27 +107,6 @@ pub fn gpu_codegen<W: Write>( }; let label_data_for_phi = label_data_for_phi(); - // Since global memory traffic is expensive, we use shared memory and - // registers basically as write-back caches, but we write back due to - // end of scope rather than due to synchronization (which is solved by - // shmem). - // param_cache tracks cache for each parameter by accessed index and - // memory type. Note that indexing is hierarchical, so [a, b] contains - // [a, b, c] and will give a hit upon query of the latter. param_cache is - // only added to for copies from global -> shared or global -> register. - // Writes update the cache, but we track specific indices written in - // param_cache_writes to know what to write back (and avoid redundant - // writes). - let param_cache = vec![ - HashMap::<(HashableIndex<'static>, MemoryType), String>::new(); - function.param_types.len() - ]; - let param_cache_writes = - vec![HashSet::<(HashableIndex<'static>, MemoryType)>::new(); function.param_types.len()]; - // Statically unknown shared memory buffers need to use dynamic offsets from. 
- // the dynamic shared memory buffer - let mut dynamic_shared_offset = "0".to_string(); - let mut ctx = GPUContext { function, types, @@ -162,9 +121,6 @@ pub fn gpu_codegen<W: Write>( fork_forward_adjacency, fork_join_map, label_data_for_phi, - join_reduce_map, - param_cache, - param_cache_writes, }; ctx.codegen_function(w) } @@ -182,7 +138,6 @@ struct GPUKernelParams { struct GPUKernelAttrs { num_blocks: usize, num_threads: usize, - extern_shmem_offset: String, } struct GPUContext<'a> { @@ -199,9 +154,6 @@ struct GPUContext<'a> { fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, fork_join_map: HashMap<NodeID, NodeID>, label_data_for_phi: HashMap<NodeID, Vec<NodeID>>, - join_reduce_map: HashMap<NodeID, Vec<NodeID>>, - param_cache: Vec<HashMap<(HashableIndex, MemoryType), String>>, - param_cache_writes: Vec<HashSet<(HashableIndex, MemoryType)>>, } #[derive(Default, Debug)] @@ -214,7 +166,8 @@ struct CudaGoto { impl GPUContext<'_> { fn codegen_function<W: Write>(&mut self, w: &mut W) -> Result<(), Error> { - // Include all possible includes then macros + // All possible includes followed by macros for intrinsic calls on + // types with no library support write!( w, " @@ -238,18 +191,16 @@ impl GPUContext<'_> { let mut top = String::new(); - // Emit all possible structs - self.codegen_structs(&mut top)?; - // Emit kernel template, signature, and arguments + // Emit kernel signature, arguments, and dynamic shared memory declaration self.codegen_kernel_begin(&mut top)?; - // Need to emit dynamic offsets for extern shmem, we do this by strings. - self.kernel_attrs.extern_shmem_offset = "0".to_string(); - // Emit calculation of all dynamic constants self.codegen_dynamic_constants(&mut top)?; + // Emit all possible struct definitions and dummy pointers for each type. + // These may depend on dynamic constants, for example an array field with + // dynamic constant dims. + self.codegen_type_init(&mut top)?; - // Uses CUDA's goto structure; some control nodes' gen may be moved, eg - // block and thread fork joins. + // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) .map(|idx| { @@ -262,15 +213,14 @@ impl GPUContext<'_> { }) .collect(); - // Assign outermost fork joins to block level. TODO: remove block_sizes + // Assign outermost valid fork joins to block level. TODO: remove block_sizes // if still not needed later let (block_fork_ids, _) = self.codegen_block_creation()?; // Assign inner fork joins to thread level. We do this before block sink // because we need thread size for shared memory optimizations let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1])?; - // Sink logic from outer block fork joins. If it's a write, add - // necessary block-id based condition. + // Sink logic from outer block fork joins. 
self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], &mut top, &mut gotos)?; for (i, &fork_id) in block_fork_ids .iter() @@ -291,39 +241,6 @@ impl GPUContext<'_> { Ok(()) } - fn codegen_structs(&self, w: &mut String) -> Result<(), Error> { - for (id, ty) in self.types.iter().enumerate() { - match ty { - Type::Product(ref product_ty_ids) => { - write!(w, "\ntypedef struct Product_{} {{\n", id)?; - for (i, product_ty_id) in product_ty_ids.iter().enumerate() { - write!( - w, - "\t{} field_{};\n", - self.get_type(*product_ty_id, false), - i - )?; - } - write!(w, "}} Product_{};\n", id)?; - } - Type::Summation(ref summation_ty_ids) => { - write!(w, "\ntypedef struct Summation_{} {{\n\t union {{\n", id)?; - for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { - write!( - w, - "\t\t{} field_{};\n", - self.get_type(*summation_ty_id, false), - i - )?; - } - write!(w, "\t}};\n\tuint8_t tag;\n}} Summation_{};\n", id)?; - } - _ => {} - } - } - Ok(()) - } - fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { write!( w, @@ -347,26 +264,7 @@ impl GPUContext<'_> { } else { write!(w, ", ")?; } - write!(w, "{} p{}", self.get_type(*ty, true), idx)?; - } - // We convert originally non-void functions to void functions by adding a - // return parameter. For now we ignore the case where return was derived - // from a parameter through reads and writes, and instead always memcpy. - let return_index = self.function.nodes.iter().position(|node| node.is_return()); - if let Some(return_index) = return_index { - if let Node::Return { - control: _, - data: return_data, - } = &self.function.nodes[return_index] - { - write!( - w, - ", {} return_val", - self.get_type(self.typing[return_data.idx()], true) - )?; - } else { - panic!("Expected return node"); - } + write!(w, "{} p{}", self.get_type(*ty, true, true), idx)?; } // Type is char since it's simplest to use single bytes for indexing, @@ -408,6 +306,78 @@ impl GPUContext<'_> { Ok(()) } + // Emit struct definitions for each typeid of product or summation type. If + // multiple typeids have the same type, they're separately emitted. Might + // not be most elegant, but using typeid is more convenient when instantiating + // than eg searching for index of type in types vector. 
Also emit dummy pointers + // for struct and primitive type ids for possible future use when moving to/from + // global memory + fn codegen_type_init(&self, w: &mut String) -> Result<(), Error> { + for type_id in self.typing.iter() { + let type_id_idx = type_id.idx(); + let ty = &self.types[type_id_idx]; + match ty { + Type::Product(ref product_ty_ids) => { + write!(w, "\ttypedef struct Product_{} {{\n", type_id_idx)?; + for (i, product_ty_id) in product_ty_ids.iter().enumerate() { + write!( + w, + "\t\t{} field_{};\n", + self.get_type(*product_ty_id, false, false), + i + )?; + } + write!(w, "}} Product_{};\n", type_id_idx)?; + write!( + w, + "\tProduct_{}* product_{}_dummy;\n", + type_id_idx, type_id_idx + )?; + } + Type::Summation(ref summation_ty_ids) => { + write!( + w, + "\ttypedef struct Summation_{} {{\n\t\t union {{\n", + type_id_idx + )?; + for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { + write!( + w, + "\t\t\t{} field_{};\n", + self.get_type(*summation_ty_id, false, false), + i + )?; + } + write!( + w, + "\t\t}};\n\t\tuint8_t tag;\n\t}} Summation_{};\n", + type_id_idx + )?; + write!( + w, + "\tSummation_{}* summation_{}_dummy;\n", + type_id_idx, type_id_idx + )?; + } + // Arrays are decomposed into their element type during transfer + // so no need to emit dummy pointers + Type::Array(_, _) => {} + // Primitive types + _ => { + write!( + w, + "\t{} {}_{}_dummy;\n", + convert_type(ty, true), + convert_type(ty, false), + type_id_idx + )?; + } + } + } + + Ok(()) + } + fn codegen_phi_registers<F>(&self, w: &mut String, should_process: F) -> Result<(), Error> where F: Fn(NodeID) -> bool, @@ -426,7 +396,12 @@ impl GPUContext<'_> { Ok(()) } - // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the forks are parallel reduce forks, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. + /* + Construct block forks by greedily accepting while: a) each fork join is strictly + nested meaning no other neighbor fork joins, b) the forks are parallel forks, + c) total number of blocks < max_num_blocks, and d) each fork's factor is statically + known. 
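+    As a rough sense of scale: two strictly nested parallel forks with static
+    factors 8 and 4 would map to 8 * 4 = 32 blocks, assuming that product stays
+    under max_num_blocks.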
+ */ fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> { let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); for (_, children) in self.fork_forward_adjacency.iter() { @@ -442,16 +417,18 @@ impl GPUContext<'_> { // a and b let mut strict_forks = vec![root_forks[0]]; let mut curr_fork = root_forks[0]; - while let Some(join) = self.fork_join_map.get(&curr_fork) { + while self.fork_join_map.contains_key(&curr_fork) { let children = &self.fork_forward_adjacency[&curr_fork]; - if children.len() != 1 || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) { + if children.len() != 1 + || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) + { break; } curr_fork = children[0]; strict_forks.push(curr_fork); } - // c, (stronger version of) d, and e + // c and d let mut valid_block_forks = 0; let mut cumulative_blocks = 1usize; let mut block_fork_sizes = Vec::new(); @@ -474,7 +451,8 @@ impl GPUContext<'_> { valid_block_forks += 1; } - // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. + // If limit on number of blocks in 1D grid is reached, we could consider 2D + // or 3D grids. Performance is not affected so for now keep it simple with 1D. self.kernel_attrs.num_blocks = cumulative_blocks; let valid_block_forks = strict_forks .into_iter() @@ -578,11 +556,8 @@ impl GPUContext<'_> { .iter() .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) { - self.codegen_data_node( - *id, - 1, - &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body, - )?; + let body = &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body; + self.codegen_data_node(*id, body, 1)?; visited.insert(id); } else { worklist.push_back(id); @@ -591,17 +566,64 @@ impl GPUContext<'_> { // 3. 
Emit control flow for control_node in control_nodes_between { - self.codegen_control_node(control_node, w)?; + let term = &mut gotos.get_mut(&self.bbs[control_node.idx()]).unwrap().term; + self.codegen_control_node(control_node, term, 1)?; } Ok(()) } - fn codegen_control_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { + fn codegen_control_node( + &self, + id: NodeID, + w: &mut String, + num_tabs: usize, + ) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); + match &self.function.nodes[id.idx()] { + Node::Start + | Node::Region { preds: _ } + | Node::Projection { + control: _, + selection: _, + } => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + write!(w, "{}goto {}\n", tabs, self.get_block_name(succ))?; + } + Node::If { control: _, cond } => { + let mut succs = self.control_subgraph.succs(id); + let succ1 = succs.next().unwrap(); + let succ2 = succs.next().unwrap(); + write!( + w, + "{}if ({}) {{\n", + tabs, + self.get_value(*cond, false, false) + )?; + write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; + write!(w, "{}}} else {{\n", tabs)?; + write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ2))?; + write!(w, "{}}}\n", tabs)?; + } + Node::Fork { + control: _, + factors: _, + } => {} + Node::Join { control: _ } => {} + Node::Return { + control: _, + data: _, + } => { + write!(w, "{}return;\n", tabs)?; + } + _ => { + panic!("Unsupported control node type") + } + } Ok(()) } - fn codegen_data_node(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> { + fn codegen_data_node(&self, id: NodeID, w: &mut String, num_tabs: usize) -> Result<(), Error> { // For now only used shared memory when creating an array let declare_variable = self.get_value(id, true, false).to_string(); let tabs = "\t".repeat(num_tabs); @@ -665,7 +687,7 @@ impl GPUContext<'_> { "{}{} = static_cast<{}>({});\n", tabs, declare_variable, - self.get_type(*dst_ty_id, false), + self.get_type(*dst_ty_id, false, false), self.get_value(*input, false, false), )?; } @@ -757,45 +779,54 @@ impl GPUContext<'_> { // If it's a parameter node then copy from global memory, else // from shared memory or registers. 
if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { - let index_ptr_name = self.codegen_indices(*collect, indices, true); - self.codegen_copy_from_global( - true, + write!(w, "{};\n", declare_variable); + let is_char = self.is_parameter_char(self.typing[collect.idx()]); + let global_collect = self.codegen_global_collect(*collect, indices, is_char); + self.codegen_copy_from_to_global( + false, self.typing[id.idx()], &declare_variable, - &index_ptr_name, - Some(indices.len()), + &global_collect, + indices, true, - num_tabs, + is_char, w, + num_tabs, )?; } else { - let index_ptr_name = self.codegen_indices(*collect, indices,false); - write!(w, "{}{} = {};\n", tabs, declare_variable, index_ptr_name)?; + let local_collect = self.codegen_local_collect(*collect, indices); + write!(w, "{}{} = {};\n", tabs, declare_variable, local_collect)?; } } - Node::Write {collect, data, indices} => { + Node::Write { + collect, + data, + indices, + } => { let data_variable = self.get_value(*data, false, false); // If it's a parameter node then copy to global memory, else // to shared memory or registers if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { - let index_ptr_name = self.codegen_indices(*collect, indices, true); - self.codegen_copy_to_from_global( - false, + let is_char = self.is_parameter_char(self.typing[collect.idx()]); + let global_collect = self.codegen_global_collect(*collect, indices, is_char); + self.codegen_copy_from_to_global( + true, self.typing[id.idx()], &data_variable, - &index_ptr_name, - Some(indices.len()), + &global_collect, + &indices, true, - num_tabs, + is_char, w, + num_tabs, )?; } else { - let index_ptr_name = self.codegen_indices(*collect, indices, false); - write!(w, "{}{} = {};\n", tabs, index_ptr_name, data_variable)?; + let local_collect = self.codegen_local_collect(*collect, indices); + write!(w, "{}{} = {};\n", tabs, local_collect, data_variable)?; } } _ => { - panic!("Unsupported node type") + panic!("Unsupported data node type") } } if let Some(phis) = self.label_data_for_phi.get(&id) { @@ -811,125 +842,50 @@ impl GPUContext<'_> { Ok(()) } - // matmul detection- only called if einsum detected - fn matmul_detection(&self) -> Result<(), Error> { - Ok(()) - } - - // convolution detection- only called if einsum detected - fn convolution_detection(&self) -> Result<(), Error> { - Ok(()) - } - - // Standalone function allows us to handle recursive initialization for - // product and summation collections - fn codegen_constant( + // Handles reads/writes from global memory aka parameter node. We tack local + // (shmem + reg) array indexing and struct field access onto data, and tack + // global pointer offset onto global. Thread parallelization is used only for + // shared memory arrays. is_char indicates the global is a char type and we + // need to multiply the global index by the element size. 
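+    // As an informal sketch of the char-addressed case, reading one float at
+    // byte offset off out of a char* parameter p0 is emitted along the lines of
+    //   float_7_dummy = reinterpret_cast<float*>(p0 + off);
+    //   x = *float_7_dummy;
+    // where the dummy name, the type id 7, and the offset are only illustrative.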
+ fn codegen_copy_from_to_global( &self, - declare_variable: String, - name: String, - cons_id: ConstantID, - w: &mut String, - ) -> Result<(), Error> { - write!(w, "\t{}", declare_variable)?; - match &self.constants[cons_id.idx()] { - Constant::Boolean(val) => write!(w, " = {};\n", val)?, - Constant::Integer8(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger8(val) => write!(w, " = {};\n", val)?, - Constant::Integer16(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger16(val) => write!(w, " = {};\n", val)?, - Constant::Integer32(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger32(val) => write!(w, " = {}ul;\n", val)?, - Constant::Integer64(val) => write!(w, " = {}ll;\n", val)?, - Constant::UnsignedInteger64(val) => write!(w, " = {}ull;\n", val)?, - Constant::Float32(val) => write!(w, " = {}f;\n", val)?, - Constant::Float64(val) => write!(w, " = {};\n", val)?, - Constant::Product(_, fields) => { - write!(w, ";\n")?; - for (i, field) in fields.iter().enumerate() { - // We don't emit array fields and size was set by struct definition - if !self.constants[field.idx()].is_array() { - // Don't need type declaration for the fields - self.codegen_constant( - format!("{}.field_{}", name, i), - format!("{}.field_{}", name, i), - *field, - w, - )?; - } - } - } - Constant::Summation(_, variant, field) => { - write!(w, ";\n\t{}.tag = {};\n", name, variant)?; - // See two comments in Constant::Product - if !self.constants[field.idx()].is_array() { - self.codegen_constant( - format!("\t{}.field_{}", name, variant), - format!("\t{}.field_{}", name, variant), - *field, - w, - )?; - } - } - Constant::Array(type_id) => { - let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { - panic!("Expected array type") - }; - // For now we do element-wise alignment, later could consider (n-1)d array - // alignment. Then we "allocate" from the single dynamic shared memory buffer - // by using and updating the offset. - let element_size = format!("sizeof({})", self.get_type(*element_type, false)); - let array_size = extents - .iter() - .map(|id| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join("*"); - write!( - w, - ";\n\talignment = {};\n\tdynamic_shared_offset = - (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = - reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t - dynamic_shared_offset += {}", - element_size, - name, - self.get_type(*element_type, false), - array_size - )?; - } - } - Ok(()) - } - - // Used for reads and writes due to identical logic. data_variable is the - // resulting reference for reads, and is the source for writes. Writes don't - // emit a new reference. 
- fn codegen_copy_from_global( - &self, - is_read: bool, + is_write: bool, type_id: TypeID, - data_variable: &String, - index_ptr_name: &String, - array_depth: Option<usize>, + data: &String, + global: &String, + indices: &[Index], parallelize: bool, - num_tabs: usize, + is_char: bool, w: &mut String, + num_tabs: usize, ) -> Result<(), Error> { let tabs = "\t".repeat(num_tabs); - let lhs = if is_read { data_variable } else { index_ptr_name }; - let rhs = if is_read { index_ptr_name } else { data_variable }; match &self.types[type_id.idx()] { Type::Array(element_type_id, extents) => { - let array_depth = array_depth.unwrap(); - let rem_array_size = extents - .iter() - .enumerate() - .filter(|(i, _)| *i >= array_depth) - .map(|(_, id)| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join("*"); + let Index::Position(array_indices) = &indices[0] else { + panic!("Expected position index for array access") + }; + if matches!(self.types[element_type_id.idx()], Type::Array(..)) { + panic!("Nested arrays are not supported"); + } + let rem_array_size = { + let s = extents + .iter() + .enumerate() + .filter(|(i, _)| *i >= array_indices.len()) + .map(|(_, id)| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join(" * "); + if s.is_empty() { + "1".to_string() + } else { + s + } + }; let mut running_div_factor = "1".to_string(); - let mut indices = vec![]; - for i in (array_depth..extents.len()).rev() { - indices.push(format!( + let mut level_indices_str = "".to_string(); + for i in (array_indices.len()..extents.len()).rev() { + level_indices_str.push_str(&format!( "[(({}) / ({})) % dc{}]", rem_array_size, running_div_factor, @@ -941,7 +897,6 @@ impl GPUContext<'_> { format!("dc{}", extents[i].idx()) ); } - let indices_str = indices.join(""); // Parallelizing only affects loop bounds let begin_copy = if parallelize { format!( @@ -952,15 +907,25 @@ impl GPUContext<'_> { format!("{}for (int i = 0; i < {}; i++) {{\n", tabs, rem_array_size) }; write!(w, "{}", begin_copy)?; - self.codegen_copy_to_from_global( - is_read, + let new_global = if is_char { + format!( + "{} + i * sizeof({})", + global, + self.get_type(*element_type_id, false, false) + ) + } else { + format!("{} + i", global) + }; + self.codegen_copy_from_to_global( + is_write, *element_type_id, - &format!("{}{}", data_variable, indices_str), - &format!("{}{}", index_ptr_name, indices_str), - None, + &format!("{}{}", data, level_indices_str), + &new_global, + &indices[1..], false, - num_tabs + 1, + is_char, w, + num_tabs + 1, )?; let end_copy = if parallelize { format!("{}}}\n{}__syncthreads();\n", tabs, tabs) @@ -970,53 +935,124 @@ impl GPUContext<'_> { write!(w, "{}", end_copy)?; } Type::Product(fields) => { - for field in fields { - self.codegen_copy_to_from_global( - is_read, - *field, - &format!("{}{}", data_variable, field.idx()), - &format!("{}{}", index_ptr_name, field.idx()), - None, + if !is_char { + panic!("Product type must be char addressed") + } + if indices.is_empty() { + let dummy_var = format!("product_{}_dummy", type_id.idx()); + let type_name = self.get_type(type_id, false, false); + write!( + w, + "{}{} = reinterpret_cast<{}*>({});\n", + tabs, dummy_var, type_name, global + )?; + let dummy_ptr = format!("*{}", dummy_var); + write!( + w, + "{}{} = {};\n", + tabs, + if is_write { &dummy_ptr } else { data }, + if is_write { data } else { &dummy_ptr } + )?; + } else { + let Index::Field(field_index) = &indices[0] else { + panic!("Expected field index for product access") + }; + let offset = (0..*field_index) + .map(|i| 
self.get_size(fields[i])) + .sum::<usize>(); + let new_global = format!("{} + {}", global, offset); + let new_data = format!("{}.field_{}", data, *field_index); + self.codegen_copy_from_to_global( + is_write, + fields[*field_index], + &new_data, + &new_global, + &indices[1..], false, - num_tabs + 1, + is_char, w, + num_tabs + 1, )?; } } Type::Summation(fields) => { - // First copy the tag - write!(w, "{}{}.tag = {}.tag;\n", tabs, lhs, rhs)?; - // Then copy the active field based on the tag - write!(w, "{}switch({}.tag) {{\n", tabs, rhs)?; - for (variant_idx, field) in fields.iter().enumerate() { - write!(w, "{}\tcase {}: {{\n", tabs, variant_idx)?; - // Recursively copy the field's contents - self.codegen_copy_to_from_global( - is_read, - *field, - &format!("{}.field_{}", data_variable, variant_idx), - &format!("{}.field_{}", index_ptr_name, variant_idx), - None, + if !is_char { + panic!("Summation type must be char addressed") + } + if indices.is_empty() { + let dummy_var = format!("summation_{}_dummy", type_id.idx()); + let type_name = self.get_type(type_id, false, false); + write!( + w, + "{}{} = reinterpret_cast<{}*>({});\n", + tabs, dummy_var, type_name, global + )?; + let dummy_ptr = format!("*{}", dummy_var); + write!( + w, + "{}{} = {};\n", + tabs, + if is_write { &dummy_ptr } else { data }, + if is_write { data } else { &dummy_ptr } + )?; + } else { + // Since all variants are 0-byte offset, the global index + // remains unchanged. + let Index::Variant(variant_index) = &indices[0] else { + panic!("Expected variant index for summation access") + }; + let new_data = format!("{}.field_{}", data, *variant_index); + self.codegen_copy_from_to_global( + is_write, + fields[*variant_index], + &new_data, + &global, + &indices[1..], false, - num_tabs + 2, - w + is_char, + w, + num_tabs + 1, )?; - write!(w, "{}\t\tbreak;\n", tabs)?; - write!(w, "{}\t}}\n", tabs)?; } - write!(w, "{}}}\n", tabs)?; } // Primitive types _ => { - write!(w, "{}{} = {};\n", tabs, lhs, rhs)?; + if is_char { + let type_name = self.get_type(type_id, false, false); + let dummy_var = format!("{}_{}_dummy", type_name, type_id.idx()); + write!( + w, + "{}{} = reinterpret_cast<{}*>({});\n", + tabs, dummy_var, type_name, global + )?; + let dummy_ptr = format!("*{}", dummy_var); + write!( + w, + "{}{} = {};\n", + tabs, + if is_write { &dummy_ptr } else { data }, + if is_write { data } else { &dummy_ptr } + )?; + } else { + let global_ptr = format!("*({})", global); + write!( + w, + "{}{} = {};\n", + tabs, + if is_write { &global_ptr } else { data }, + if is_write { data } else { &global_ptr } + )?; + } } } Ok(()) } - // Use normal indexing for local collections - fn codegen_indices_local(&self, collect: NodeID, indices: &[Index]) -> String { - let mut index_ptr_name = format!("{}", self.get_value(collect, false, false)); + // Read/writes to local collections consist of local name + array indexing + // and struct field access. 
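+    // e.g. indices [Field(1), Position(i)] build up the index string
+    // ".field_1[i]", which is then attached to the collection's local name.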
+ fn codegen_local_collect(&self, collect: NodeID, indices: &[Index]) -> String { + let mut index_ptr_name = "".to_string(); for index in indices { match index { Index::Field(field) => { @@ -1036,44 +1072,152 @@ impl GPUContext<'_> { } } } - index_ptr_name + let name = self.get_value(collect, false, false); + format!("{} + {}", name, index_ptr_name) } - // Use arithmetic for global collections as they're accessed as pointers - fn codegen_indices_global(&self, collect: NodeID, indices: &[Index]) -> String { - let mut index_ptr_name = format!("{}[0", self.get_value(collect, false, false)); + // Read/writes to global collections consist of global name + pointer offset. + fn codegen_global_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { + let mut index_ptr_name = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { match index { - Index::Field(field) => { + // Sum the offset of prior fields in bytes + Index::Field(field) => { let offset = (0..*field) - .map(|i| format!("offsetof({}, field_{})", self.get_type(type_id, false), i)) + .map(|i| { + format!( + "offsetof({}, field_{})", + self.get_type(type_id, false, false), + i + ) + }) .collect::<Vec<_>>() .join(" + "); - index_ptr_name.push_str(&format!(" + {}", offset)); + if *field > 0 { + index_ptr_name.push_str(&format!(" + {}", offset)); + } } // Variants of summations have zero offset Index::Variant(_) => {} + // Convert multi-d array index to 1-d index, and optionally + // convert to single-byte index by multiplying by element size Index::Position(array_indices) => { - let Type::Array(_, extents) = &self.types[self.typing[collect.idx()].idx()] else { + let Type::Array(element_type, extents) = + &self.types[self.typing[collect.idx()].idx()] + else { panic!("Expected array type") }; - let mut cumulative_offset = "1 * ".to_string() + extents - .iter() - .enumerate() - .filter(|(i, _)| *i >= array_indices.len()) - .map(|(_, id)| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join(" * ") - .as_str(); + let mut cumulative_offset = "1 * ".to_string() + + extents + .iter() + .enumerate() + .filter(|(i, _)| *i >= array_indices.len()) + .map(|(_, id)| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join(" * ") + .as_str(); for index in array_indices.iter().rev() { - cumulative_offset = format!("{} * ({} + ", cumulative_offset, self.get_value(*index, false, false)); + cumulative_offset = format!( + "{} * ({} + ", + cumulative_offset, + self.get_value(*index, false, false) + ); + } + index_ptr_name.push_str(&format!( + " + {}{}", + cumulative_offset, + ")".repeat(array_indices.len()) + )); + if is_char { + let element_size = + format!("sizeof({})", self.get_type(*element_type, false, false)); + index_ptr_name.push_str(&format!(" * {}", element_size)); } - index_ptr_name.push_str(&format!(" + {}{}", cumulative_offset, ")".repeat(array_indices.len()))); } } } - format!("{}]", index_ptr_name) + let name = self.get_value(collect, false, false); + format!("{} + {}", name, index_ptr_name) + } + + // Standalone function allows us to handle recursive initialization for + // product and summation collections + fn codegen_constant( + &self, + declare_variable: String, + name: String, + cons_id: ConstantID, + w: &mut String, + ) -> Result<(), Error> { + write!(w, "\t{}", declare_variable)?; + match &self.constants[cons_id.idx()] { + Constant::Boolean(val) => write!(w, " = {};\n", val)?, + Constant::Integer8(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger8(val) => write!(w, " = 
{};\n", val)?, + Constant::Integer16(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger16(val) => write!(w, " = {};\n", val)?, + Constant::Integer32(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger32(val) => write!(w, " = {}ul;\n", val)?, + Constant::Integer64(val) => write!(w, " = {}ll;\n", val)?, + Constant::UnsignedInteger64(val) => write!(w, " = {}ull;\n", val)?, + Constant::Float32(val) => write!(w, " = {}f;\n", val)?, + Constant::Float64(val) => write!(w, " = {};\n", val)?, + Constant::Product(_, fields) => { + write!(w, ";\n")?; + for (i, field) in fields.iter().enumerate() { + // We don't emit array fields and size was set by struct definition + if !self.constants[field.idx()].is_array() { + // Don't need type declaration for the fields + self.codegen_constant( + format!("{}.field_{}", name, i), + format!("{}.field_{}", name, i), + *field, + w, + )?; + } + } + } + Constant::Summation(_, variant, field) => { + write!(w, ";\n\t{}.tag = {};\n", name, variant)?; + // See two comments in Constant::Product + if !self.constants[field.idx()].is_array() { + self.codegen_constant( + format!("\t{}.field_{}", name, variant), + format!("\t{}.field_{}", name, variant), + *field, + w, + )?; + } + } + Constant::Array(type_id) => { + let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { + panic!("Expected array type") + }; + // For now we do element-wise alignment, later could consider (n-1)d array + // alignment. Then we "allocate" from the single dynamic shared memory buffer + // by using and updating the offset. + let element_size = + format!("sizeof({})", self.get_type(*element_type, false, false)); + let array_size = extents + .iter() + .map(|id| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join("*"); + write!( + w, + ";\n\talignment = {};\n\tdynamic_shared_offset = + (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = + reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t + dynamic_shared_offset += {}", + element_size, + name, + self.get_type(*element_type, false, false), + array_size + )?; + } + } + Ok(()) } fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { @@ -1191,6 +1335,66 @@ impl GPUContext<'_> { func_name.to_string() } + // Check if a parameter should be represented as char*. Must be a product, + // summation, or array of product/summation types. This should only be + // called on parameters. 
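+    // Informally: a struct parameter, or an array of structs, is passed as
+    // char*, while e.g. an array of floats keeps a float element pointer.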
+ fn is_parameter_char(&self, type_id: TypeID) -> bool { + match &self.types[type_id.idx()] { + Type::Product(_) | Type::Summation(_) => true, + Type::Array(element_type, _) => self.is_parameter_char(*element_type), + _ => false, + } + } + + // matmul detection- only called if einsum detected + fn matmul_detection(&self) -> Result<(), Error> { + Ok(()) + } + + // convolution detection- only called if einsum detected + fn convolution_detection(&self) -> Result<(), Error> { + Ok(()) + } + + fn get_size(&self, type_id: TypeID) -> usize { + match &self.types[type_id.idx()] { + Type::Array(element_type, extents) => { + let element_alignment = self.get_alignment(*element_type); + extents + .iter() + .try_fold(element_alignment, |acc, &extent| { + evaluate_dynamic_constant(extent, self.dynamic_constants) + .map(|val| acc.saturating_mul(val)) + }) + .unwrap_or(0) + } + _ => self.get_alignment(type_id), + } + } + + fn get_alignment(&self, type_id: TypeID) -> usize { + match &self.types[type_id.idx()] { + Type::Array(element_type, _) => self.get_alignment(*element_type), + Type::Product(fields) => fields + .iter() + .map(|field| self.get_alignment(*field)) + .sum::<usize>(), + Type::Summation(fields) => { + fields + .iter() + .map(|field| self.get_alignment(*field)) + .max() + .unwrap_or(0) + + 1 + } + Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, + Type::Integer16 | Type::UnsignedInteger16 => 2, + Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, + Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8, + _ => panic!("Unsupported type for alignment"), + } + } + fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Result<usize, Error> { factors.iter().try_fold(1usize, |acc, &factor_id| { evaluate_dynamic_constant(factor_id, self.dynamic_constants) @@ -1199,6 +1403,10 @@ impl GPUContext<'_> { }) } + fn get_block_name(&self, id: NodeID) -> String { + format!("bb_{}", id.idx()) + } + fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { if ty { @@ -1210,7 +1418,12 @@ impl GPUContext<'_> { panic!("Parameters shouldn't be re-initialized") } format!("p{}", index) - } else if let Node::Write { collect, data: _, indices: _ } = &self.function.nodes[id.idx()] { + } else if let Node::Write { + collect, + data: _, + indices: _, + } = &self.function.nodes[id.idx()] + { if ty { panic!("Writes shouldn't be initialized, they're replaced with the referenced collection") } @@ -1218,12 +1431,13 @@ impl GPUContext<'_> { panic!("Writes shouldn't be called as pointer") } self.get_value(*collect, false, false) - } else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] + } else if ty + && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { // Shmem/register arrays have special formatting let mut declare_array = format!( "{} (*{}{})", - self.get_type(*element_type, false), + self.get_type(*element_type, false, false), self.function.nodes[id.idx()].lower_case_name(), id.idx() ); @@ -1234,7 +1448,7 @@ impl GPUContext<'_> { } else if ty { format!( "{} {}{}", - self.get_type(self.typing[id.idx()], make_pointer), + self.get_type(self.typing[id.idx()], make_pointer, false), self.function.nodes[id.idx()].lower_case_name(), id.idx() ) @@ -1247,37 +1461,46 @@ impl GPUContext<'_> { } } - // make_pointer enforces static pointer and not recursive or array pointer: - // multi-d arrays are single pointers with custom 
indexing. - fn get_type(&self, id: TypeID, make_pointer: bool) -> String { + fn get_type(&self, id: TypeID, make_pointer: bool, is_global: bool) -> String { match &self.types[id.idx()] { + // Product and summation collections are char* for byte-addressability + // since we can have variable type fields Type::Product(_) => { - format!( - "Product_{}{}", - id.idx(), - if make_pointer { "*" } else { "" } - ) + if make_pointer { + "char*".to_string() + } else if is_global { + "char".to_string() + } else { + format!("Product_{}", id.idx()) + } } Type::Summation(_) => { - format!( - "Summation_{}{}", - id.idx(), - if make_pointer { "*" } else { "" } - ) + if make_pointer { + "char*".to_string() + } else if is_global { + "char".to_string() + } else { + format!("Summation_{}", id.idx()) + } } Type::Array(element_type, extents) => { // This suffix lets us work with references of dynamic shared memory // and use n-d array indexing. - let mut suffix = "(*)".to_string(); - if extents.len() > 1 { - for extent in extents.iter().skip(1) { - suffix.push_str(&format!("[dc{}]", extent.idx())); - } - } format!( "{}{}", - self.get_type(*element_type, false), - if make_pointer { "*" } else { &suffix } + self.get_type(*element_type, false, is_global), + if make_pointer { + "*".to_string() + } else { + format!( + "(*){}", + extents + .iter() + .skip(1) + .map(|extent| format!("[dc{}]", extent.idx())) + .collect::<String>() + ) + } ) } _ => convert_type(&self.types[id.idx()], make_pointer), @@ -1285,7 +1508,7 @@ impl GPUContext<'_> { } } -// TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 +// TODO: Add float8, float16, bfloat16 dtypes if they come fn convert_type(ty: &Type, make_pointer: bool) -> String { let mut result = match ty { Type::Boolean => "bool".to_string(), -- GitLab From cf40395d8b8842b8dc1b94437054eff2246be691 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 3 Jan 2025 13:55:43 -0800 Subject: [PATCH 008/109] before i blow it up --- .gitignore | 2 + hercules_cg/src/gpu.rs | 1427 ++++++++++++++++----------- hercules_ir/src/ir.rs | 1 - juno_samples/matmul/src/matmul.hbin | Bin 0 -> 1323 bytes juno_samples/matmul/src/matmul.pdf | Bin 0 -> 88675 bytes 5 files changed, 856 insertions(+), 574 deletions(-) create mode 100644 juno_samples/matmul/src/matmul.hbin create mode 100644 juno_samples/matmul/src/matmul.pdf diff --git a/.gitignore b/.gitignore index 22c9343e..45f2e61b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ .*.swp .vscode *_env + +juno_samples/matmul/src/matmul_indented.jn diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 3ad9297b..768324ca 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -4,13 +4,16 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; use std::iter::FromIterator; +use std::os::unix::thread; + +use bitvec::field; use self::hercules_ir::*; /* - * The top level function to compile a Hercules IR function into CUDA kernel for - * execution on the GPU. We generate CUDA C textually, based on the CPU LLVM - * approach. + * The top level function to compile a Hercules IR function into CUDA + * kernel for execution on the GPU. We generate CUDA C textually, based + * on the CPU LLVM approach. 
*/ pub fn gpu_codegen<W: Write>( function: &Function, @@ -20,24 +23,41 @@ pub fn gpu_codegen<W: Write>( reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, + antideps: &Vec<(NodeID, NodeID)>, bbs: &Vec<NodeID>, + collection_objects: &FunctionCollectionObjects, w: &mut W, ) -> Result<(), Error> { - // Temporary hardcoded values - let kernel_params = GPUKernelParams { - max_num_blocks: 1024, - max_num_threads: 1024, - threads_per_warp: 32, - num_smps: 60, - }; - let kernel_attrs = GPUKernelAttrs::default(); + /* + * We assert the following: + * - Array element type can't be another array + * - Any array field in a struct must have known size + * - Fork node must have >= 1 reduce nodes + * - If the returned data type is a collection, it must have + * originated from a parameter. Technically could extend to + * multiple parameters but we aren't going to. + * + * We don't assert but assume the following: + * - Global memory can't be used in a phi or select node + * - max_num_blocks is within constraint of 1D grid size. This can be + * relaxed if we want to support larger grids. + */ - // GPU backend assertions for ty in types.iter() { - if let Type::Array(type_id, _) = ty { - if let Type::Array(..) = types[type_id.idx()] { - panic!("Array element type can't be another array"); + match ty { + Type::Array(type_id, _) => { + if let Type::Array(..) = types[type_id.idx()] { + panic!("Array element type can't be another array"); + } + } + Type::Product(type_ids) | Type::Summation(type_ids) => { + for type_id in type_ids.iter() { + if let Type::Array(_, extents) = &types[type_id.idx()] && multiply_dynamic_constants(dynamic_constants, &extents).is_none() { + panic!("Array field in product msut have known size") + } + } } + _ => {} } } @@ -45,51 +65,91 @@ pub fn gpu_codegen<W: Write>( .filter(|idx| function.nodes[*idx].is_reduce()) .map(NodeID::new) .collect(); + + let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); + for reduce_node in &reduce_nodes { + if let Node::Reduce { + control, + init: _, + reduct: _, + } = &function.nodes[reduce_node.idx()] + { + match function.nodes[control.idx()] { + Node::Join { + control: fork_node, .. 
+ } => { + fork_reduce_map + .entry(fork_node) + .or_default() + .push(*reduce_node); + } + Node::Region { preds: _ } => { + // TODO: map region node to fork node + } + _ => { + panic!("Reduce's control must be a join or region node"); + } + } + } + } for idx in 0..function.nodes.len() { - if function.nodes[idx].is_join() && reduce_nodes - .iter() - .filter(|reduce_node| match &function.nodes[reduce_node.idx()] { - Node::Reduce { - control, - init: _, - reduct: _, - } => control.idx() == idx, - _ => false, - }) - .count() - == 0 + if function.nodes[idx].is_fork() + && fork_reduce_map + .get(&NodeID::new(idx)) + .map_or(true, |reduces| reduces.is_empty()) { panic!("Join node {} has no reduce nodes", idx); } } - // Create fork forward adjacency and join map upfront as part of context - let make_fork_structures = || -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, NodeID>) { - let mut fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>> = (0..function.nodes.len()) - .filter(|idx| function.nodes[*idx].is_fork()) - .map(|idx| (NodeID::new(idx), vec![])) - .collect(); - let mut fork_join_map = HashMap::new(); - let mut queued_nodes = VecDeque::new(); - - for (fork_node, children) in fork_forward_adjacency.iter_mut() { - queued_nodes.push_back(*fork_node); - while !queued_nodes.is_empty() { - let node = queued_nodes.pop_front().unwrap(); - for child in control_subgraph.succs(node) { - if function.nodes[child.idx()].is_fork() { - children.push(child); - } else if function.nodes[child.idx()].is_join() { - fork_join_map.insert(*fork_node, child); - } else { - queued_nodes.push_back(child); + let (return_node_id, data_node_id) = { + let pos = function + .nodes + .iter() + .position(|node| { + matches!( + node, + Node::Return { + control: _, + data: _ } - } + ) + }) + .expect("Function must have a return node"); + let Node::Return { control: _, data } = &function.nodes[pos] else { + panic!("Return node must be a return node"); + }; + (NodeID::new(pos), *data) + }; + + let return_type_id = &typing[return_node_id.idx()]; + let return_type = &types[return_type_id.idx()]; + if return_type.is_array() || return_type.is_product() || return_type.is_summation() { + let objects = &collection_objects.objects(data_node_id); + if objects.len() > 1 { + let origin = collection_objects.origin(objects[0]); + if !objects + .iter() + .all(|obj| collection_objects.origin(*obj) == origin) + { + panic!( + "Returned data node {} has multiple collection objects with different origins", + data_node_id.idx() + ); + } + if !matches!(origin, CollectionObjectOrigin::Parameter(..)) { + panic!("Returns collection object that did not originate from a parameter"); } } - (fork_forward_adjacency, fork_join_map) + } + + // Temporary hardcoded values + let kernel_params = &GPUKernelParams { + max_num_blocks: 1024, + max_num_threads: 1024, + threads_per_warp: 32, + greedy_associative_thresh: 32, }; - let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { let mut label_data_for_phi = HashMap::new(); @@ -105,9 +165,9 @@ pub fn gpu_codegen<W: Write>( } label_data_for_phi }; - let label_data_for_phi = label_data_for_phi(); + let label_data_for_phi = &label_data_for_phi(); - let mut ctx = GPUContext { + let ctx = GPUContext { function, types, constants, @@ -115,29 +175,23 @@ pub fn gpu_codegen<W: Write>( reverse_postorder, typing, control_subgraph, + antideps, bbs, kernel_params, - kernel_attrs, - fork_forward_adjacency, - fork_join_map, + fork_reduce_map, 
label_data_for_phi, + return_type_id, }; ctx.codegen_function(w) } -// Fixed prior to codegen +// Kernel parameters that are fixed prior to codegen. See description of +// greedy_associative_thresh in codegen_function. struct GPUKernelParams { max_num_blocks: usize, max_num_threads: usize, threads_per_warp: usize, - num_smps: usize, -} - -// Set during codegen -#[derive(Default)] -struct GPUKernelAttrs { - num_blocks: usize, - num_threads: usize, + greedy_associative_thresh: usize, } struct GPUContext<'a> { @@ -148,24 +202,23 @@ struct GPUContext<'a> { reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, + antideps: &'a Vec<(NodeID, NodeID)>, bbs: &'a Vec<NodeID>, - kernel_params: GPUKernelParams, - kernel_attrs: GPUKernelAttrs, - fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, - fork_join_map: HashMap<NodeID, NodeID>, - label_data_for_phi: HashMap<NodeID, Vec<NodeID>>, + kernel_params: &'a GPUKernelParams, + fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, + label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>, + return_type_id: &'a TypeID, } #[derive(Default, Debug)] struct CudaGoto { - header: String, + init: String, body: String, term: String, - handled: bool, } impl GPUContext<'_> { - fn codegen_function<W: Write>(&mut self, w: &mut W) -> Result<(), Error> { + fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { // All possible includes followed by macros for intrinsic calls on // types with no library support write!( @@ -191,44 +244,45 @@ impl GPUContext<'_> { let mut top = String::new(); - // Emit kernel signature, arguments, and dynamic shared memory declaration self.codegen_kernel_begin(&mut top)?; - // Emit calculation of all dynamic constants self.codegen_dynamic_constants(&mut top)?; - // Emit all possible struct definitions and dummy pointers for each type. - // These may depend on dynamic constants, for example an array field with - // dynamic constant dims. - self.codegen_type_init(&mut top)?; + self.codegen_struct_def(&mut top)?; + self.codegen_reused_locals(&mut top)?; + let (fork_tree, fork_control_map) = self.make_fork_structures(); + let (root_forks, num_blocks) = + self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); + let (cumul_factors, num_threads) = self.get_cumulative_factors(&fork_tree, &root_forks); + let start = NodeID::new(0); + let ret = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_return()) + .map(NodeID::new) + .next() + .unwrap(); + let (begin_control, end_control) = self.get_begin_end_control(start, ret); + let global_refs = self.get_global_refs(); // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) .map(|idx| { let node_id = NodeID::new(idx); - let goto = CudaGoto { - header: self.get_value(node_id, false, false), - ..Default::default() - }; + let goto = CudaGoto::default(); (node_id, goto) }) .collect(); - // Assign outermost valid fork joins to block level. TODO: remove block_sizes - // if still not needed later - let (block_fork_ids, _) = self.codegen_block_creation()?; - // Assign inner fork joins to thread level. We do this before block sink - // because we need thread size for shared memory optimizations - let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = - self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1])?; - // Sink logic from outer block fork joins. 
- self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], &mut top, &mut gotos)?; - for (i, &fork_id) in block_fork_ids - .iter() - .enumerate() - .take(block_fork_ids.len() - 1) - { - self.codegen_block_sink(fork_id, block_fork_ids[i + 1], &mut top, &mut gotos)?; - } + self.codegen_data_control( + &root_forks, + &fork_tree, + &fork_control_map, + &begin_control, + &end_control, + &global_refs, + &cumul_factors, + num_threads, + num_blocks, + &mut gotos, + )?; // Punting on implementation but can likely run einsum -> matmul/conv // detector on hierarhical fork joins between block edge and given @@ -241,6 +295,7 @@ impl GPUContext<'_> { Ok(()) } + // Emit kernel signature, arguments, and dynamic shared memory declaration fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { write!( w, @@ -264,16 +319,31 @@ impl GPUContext<'_> { } else { write!(w, ", ")?; } - write!(w, "{} p{}", self.get_type(*ty, true, true), idx)?; + let param_type = if self.types[ty.idx()].is_primitive() { + self.get_type(*ty, false, false) + } else { + format!("{} __restrict__", self.get_type(*ty, true, true)) + }; + write!(w, "{} p{}", param_type, idx)?; + } + // Pull primitive return to a pointer parameter + if self.types[self.return_type_id.idx()].is_primitive() { + write!(w, ", ")?; + write!( + w, + "{} __restrict__ ret", + self.get_type(*self.return_type_id, true, true) + )?; } // Type is char since it's simplest to use single bytes for indexing, // casting will be needed for use with different types. - write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n")?; + write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n\tsize_t alignment;\n")?; Ok(()) } + // Emit calculation of all dynamic constants fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> { for dc in dynamic_constants_bottom_up(self.dynamic_constants) { let dc_val = format!("unsigned long long dc{}", dc.idx()); @@ -307,37 +377,44 @@ impl GPUContext<'_> { } // Emit struct definitions for each typeid of product or summation type. If - // multiple typeids have the same type, they're separately emitted. Might - // not be most elegant, but using typeid is more convenient when instantiating - // than eg searching for index of type in types vector. Also emit dummy pointers - // for struct and primitive type ids for possible future use when moving to/from - // global memory - fn codegen_type_init(&self, w: &mut String) -> Result<(), Error> { + // multiple typeids have the same type, they're separately emitted. Lastly emit + // dummy alignment for later use in dynamic shared memory slices. 
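+ // As an illustrative sketch (not emitted verbatim), a hypothetical product
+ // type with ID 3 holding an i32 field and an f32 field (both 4 bytes, so no
+ // pad fields are needed) would be written out roughly as:
+ //     typedef struct alignas(8) Product_3 {
+ //         int field_0;
+ //         float field_1;
+ //     } __attribute__((packed)) Product_3;
+ // The alignas value and any char pad fields come from get_size and
+ // get_alignment below; the type ID here is made up.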
+ fn codegen_struct_def(&self, w: &mut String) -> Result<(), Error> { for type_id in self.typing.iter() { let type_id_idx = type_id.idx(); - let ty = &self.types[type_id_idx]; - match ty { + match &self.types[type_id_idx] { Type::Product(ref product_ty_ids) => { - write!(w, "\ttypedef struct Product_{} {{\n", type_id_idx)?; + let product_size = self.get_size(*type_id); + write!(w, "\ttypedef struct alignas({}) Product_{} {{\n", product_size, type_id_idx)?; + let mut cumul_size = 0; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { + let field_alignment = self.get_alignment(*product_ty_id); + if (cumul_size % field_alignment) != 0 { + let padding = field_alignment - cumul_size % field_alignment; + cumul_size += padding; + write!( + w, + "\t\tchar[{}] pad{};\n", + padding, + i, + )?; + } write!( w, "\t\t{} field_{};\n", self.get_type(*product_ty_id, false, false), i )?; + cumul_size += self.get_size(*product_ty_id); } - write!(w, "}} Product_{};\n", type_id_idx)?; - write!( - w, - "\tProduct_{}* product_{}_dummy;\n", - type_id_idx, type_id_idx - )?; + write!(w, "\t}} __attribute__((packed)) Product_{};\n", type_id_idx)?; } Type::Summation(ref summation_ty_ids) => { + let summation_size = self.get_size(*type_id); write!( w, - "\ttypedef struct Summation_{} {{\n\t\t union {{\n", + "\ttypedef struct alignas({}) Summation_{} {{\n\t\t union {{\n", + summation_size, type_id_idx )?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { @@ -350,300 +427,394 @@ impl GPUContext<'_> { } write!( w, - "\t\t}};\n\t\tuint8_t tag;\n\t}} Summation_{};\n", - type_id_idx - )?; - write!( - w, - "\tSummation_{}* summation_{}_dummy;\n", - type_id_idx, type_id_idx - )?; - } - // Arrays are decomposed into their element type during transfer - // so no need to emit dummy pointers - Type::Array(_, _) => {} - // Primitive types - _ => { - write!( - w, - "\t{} {}_{}_dummy;\n", - convert_type(ty, true), - convert_type(ty, false), + "\t\t}};\n\t}} __attribute__((packed)) Summation_{};\n", type_id_idx )?; } + _ => {} } } Ok(()) } - fn codegen_phi_registers<F>(&self, w: &mut String, should_process: F) -> Result<(), Error> - where - F: Fn(NodeID) -> bool, - { + // We generate all phi values and all flags for phi and select upfront that + // indicate if collection, whether their current value is global + fn codegen_reused_locals(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if let Node::Phi { - control: _, - data: _, - } = &self.function.nodes[id.idx()] - { - if should_process(id) { - write!(w, "\t{};\n", self.get_value(id, true, true))?; - } + match &self.function.nodes[id.idx()] { + Node::Phi {..} => { + write!(w, "\t{};\n", self.get_value(id, true, true, false))?; + } + _ => {} + } + let global_flag = self.get_global_flag(id, true); + if global_flag.is_some() { + write!(w, "\t{};\n", global_flag.unwrap())?; } } Ok(()) } - /* - Construct block forks by greedily accepting while: a) each fork join is strictly - nested meaning no other neighbor fork joins, b) the forks are parallel forks, - c) total number of blocks < max_num_blocks, and d) each fork's factor is statically - known. 
- */ - fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> { - let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); - for (_, children) in self.fork_forward_adjacency.iter() { - for child in children { - root_forks.remove(child); + /* Create two fork structures: + * First, fork_forward_adjacency is a map from each fork node F to all forks satisfying: + * a) domination by F + * b) no domination by F's join + * c) no domination by any other fork that's also dominated by F, where we don't count self-domination + * Second, fork_control_map is a map from fork node to all control nodes (including itself) satisfying: + * a) domination by F + * b) no domination by F's join + * c) no domination by any other fork that's also dominated by F, where we do count self-domination + */ + fn make_fork_structures(&self) -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, Vec<NodeID>>) { + let mut fork_tree: HashMap<NodeID, Vec<NodeID>> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_fork()) + .map(|idx| (NodeID::new(idx), vec![])) + .collect(); + let mut fork_control_map = HashMap::new(); + let mut queued_nodes = VecDeque::new(); + + for (fork_node, fork_children) in fork_tree.iter_mut() { + let mut control_vec = vec![]; + queued_nodes.push_back(*fork_node); + while !queued_nodes.is_empty() { + let node = queued_nodes.pop_front().unwrap(); + control_vec.push(node); + for child in self.control_subgraph.succs(node) { + if self.function.nodes[child.idx()].is_fork() { + fork_children.push(child); + } else if self.function.nodes[child.idx()].is_join() { + control_vec.push(child); + } else { + queued_nodes.push_back(child); + } + } } + fork_control_map.insert(*fork_node, control_vec); } - let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); - if root_forks.len() != 1 { - panic!("Exactly one root fork is required for outermost GPU block fork"); - } + (fork_tree, fork_control_map) + } - // a and b - let mut strict_forks = vec![root_forks[0]]; - let mut curr_fork = root_forks[0]; - while self.fork_join_map.contains_key(&curr_fork) { - let children = &self.fork_forward_adjacency[&curr_fork]; - if children.len() != 1 - || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) - { - break; + // Get control nodes succeeding the start and preceding all forks, and + // control nodes preceding the return and succeeding all joins + fn get_begin_end_control( + &self, + start: NodeID, + ret: NodeID, + ) -> (HashSet<NodeID>, HashSet<NodeID>) { + let mut begin_visited = HashSet::new(); + let mut begin_worklist = VecDeque::new(); + begin_worklist.push_back(start); + + while let Some(node) = begin_worklist.pop_front() { + if begin_visited.contains(&node) { + continue; + } + if self.function.nodes[node.idx()].is_fork() { + continue; + } + begin_visited.insert(node); + for pred in self.control_subgraph.preds(node) { + begin_worklist.push_back(pred); } - curr_fork = children[0]; - strict_forks.push(curr_fork); } - // c and d - let mut valid_block_forks = 0; - let mut cumulative_blocks = 1usize; - let mut block_fork_sizes = Vec::new(); + let mut end_visited = HashSet::new(); + let mut end_worklist = VecDeque::new(); + end_worklist.push_back(ret); - for fork in strict_forks.iter() { - if !self.function.schedules[fork.idx()].contains(&Schedule::Vectorizable) { - break; + while let Some(node) = end_worklist.pop_front() { + if end_visited.contains(&node) { + continue; } - let factors = match 
&self.function.nodes[fork.idx()] { - Node::Fork { factors, .. } => factors, - _ => panic!("Expected fork node"), - }; - let fork_size = self.multiply_fork_factors(factors)?; - let new_blocks = cumulative_blocks.saturating_mul(fork_size); - if new_blocks > self.kernel_params.max_num_blocks { - break; + if self.function.nodes[node.idx()].is_join() { + continue; + } + end_visited.insert(node); + for succ in self.control_subgraph.preds(node) { + end_worklist.push_back(succ); } - cumulative_blocks = new_blocks; - block_fork_sizes.push(fork_size); - valid_block_forks += 1; } - // If limit on number of blocks in 1D grid is reached, we could consider 2D - // or 3D grids. Performance is not affected so for now keep it simple with 1D. - self.kernel_attrs.num_blocks = cumulative_blocks; - let valid_block_forks = strict_forks - .into_iter() - .take(valid_block_forks) - .collect::<Vec<_>>(); - - Ok((valid_block_forks, block_fork_sizes)) + (begin_visited, end_visited) } - // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. - fn codegen_thread_creation( - &mut self, - inner_block_fork: NodeID, - ) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { - let mut thread_fork_parents = HashMap::new(); - let mut thread_fork_sizes = HashMap::new(); - let mut thread_fork_cumulative_sizes = HashMap::new(); - thread_fork_cumulative_sizes.insert(inner_block_fork, 1); - let mut thread_fork_edges = vec![]; - let mut max_thread_size = 1; - let mut stack = vec![inner_block_fork]; - let mut visited = HashSet::new(); - visited.insert(inner_block_fork); - while let Some(pop) = stack.pop() { - let children = &self.fork_forward_adjacency[&pop]; - - // Reverse child order due to use of stack for DFS - for &child in children.iter().rev() { - if !visited.contains(&child) { - visited.insert(child); - thread_fork_parents.insert(child, pop); - let fork_size = match &self.function.nodes[child.idx()] { - Node::Fork { factors, .. } => self.multiply_fork_factors(factors)?, - _ => panic!("Expected fork node"), - }; - thread_fork_sizes.insert(child, fork_size); - - let new_cumulative_size = (thread_fork_cumulative_sizes[&pop] as usize) - .saturating_mul(fork_size as usize); - if new_cumulative_size > self.kernel_params.max_num_threads { - // Expanding to child fork exceeds thread limit, so - // current fork is an edge fork - thread_fork_edges.push(pop); - max_thread_size = max_thread_size.max(thread_fork_cumulative_sizes[&pop]); - } else { - // Recurse into child fork - thread_fork_cumulative_sizes.insert(child, new_cumulative_size); - stack.push(child); - } - } else { - panic!("Fork child shouldn't have multiple fork parents"); + // Get all globals and global references, where for GPU purposes global = + // collection parameter + fn get_global_refs(&self) -> HashSet<NodeID> { + // We start with collection parameters, and follow any reduce or write users. 
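+ // E.g. if an array parameter p1 feeds a write whose result is then used by a
+ // reduce, the parameter, the write, and the reduce all land in the returned
+ // set, since each of those nodes names the same underlying collection.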
+ let mut queued_nodes: VecDeque<NodeID> = (0..self.function.nodes.len()) + .filter(|idx| { + self.function.nodes[*idx].is_parameter() + && !self.types[self.typing[*idx].idx()].is_primitive() + }) + .map(NodeID::new) + .collect(); + + let def_use = def_use(&self.function); + let mut global_nodes = HashSet::new(); + + while !queued_nodes.is_empty() { + let node_id = queued_nodes.pop_front().unwrap(); + global_nodes.insert(node_id); + let node_users = def_use.get_users(node_id); + for user in node_users { + match self.function.nodes[user.idx()] { + Node::Write { .. } | Node::Reduce { .. } => queued_nodes.push_back(*user), + _ => {} } } } - // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. - // If limit on number of threads in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. - self.kernel_attrs.num_threads = max_thread_size; - - Ok((thread_fork_parents, thread_fork_sizes, thread_fork_edges)) + global_nodes } - fn codegen_block_sink( + /* + * If tree has a single root fork of known size s <= max_num_blocks + * with parallel-fork schedule, then set num_blocks to s, else set num_blocks to 1. + */ + fn get_root_forks_and_num_blocks( &self, - fork_id: NodeID, - next_fork_id: NodeID, - w: &mut String, - gotos: &mut BTreeMap<NodeID, CudaGoto>, - ) -> Result<(), Error> { - // 1. Get control nodes including fork_id that are dominated by fork_id - // and not dominated by next_fork_id and not dominated by fork_id's join - let dom = dominator(self.control_subgraph, fork_id); - assert!(dom.does_dom(fork_id, next_fork_id)); - let mut control_nodes_between = HashSet::new(); - for node_id in self.control_subgraph.iter() { - if dom.does_dom(fork_id, *node_id) - && !dom.does_dom(next_fork_id, *node_id) - && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) - { - control_nodes_between.insert(*node_id); + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + max_num_blocks: usize, + ) -> (Vec<NodeID>, usize) { + let mut root_forks: HashSet<NodeID> = fork_tree.keys().copied().collect(); + for (_, children) in fork_tree.iter() { + for child in children { + root_forks.remove(child); } } - - // 2. Emit data flow for nodes assigned to basic blocks in block sink - // 2a. All phi registers first - self.codegen_phi_registers(w, |id| control_nodes_between.contains(&self.bbs[id.idx()]))?; - // 2b. All other data nodes - let mut worklist = VecDeque::from_iter(self.reverse_postorder.iter().filter(|id| { - !self.function.nodes[id.idx()].is_control() - && control_nodes_between.contains(&self.bbs[id.idx()]) - && !self.function.nodes[id.idx()].is_phi() - })); - let mut visited = HashSet::new(); - while let Some(id) = worklist.pop_front() { - let node = &self.function.nodes[id.idx()]; - if node.is_reduce() { - panic!("Reduce nodes should not be in block sink"); - } - if get_uses(node) - .as_ref() - .iter() - .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) - { - let body = &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body; - self.codegen_data_node(*id, body, 1)?; - visited.insert(id); - } else { - worklist.push_back(id); - } + let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); + if root_forks.len() != 1 { + return (root_forks, 1); } - // 3. 
Emit control flow - for control_node in control_nodes_between { - let term = &mut gotos.get_mut(&self.bbs[control_node.idx()]).unwrap().term; - self.codegen_control_node(control_node, term, 1)?; + let root_fork = root_forks[0]; + let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else { + panic!("Expected fork node"); + }; + let fork_size = multiply_dynamic_constants(self.dynamic_constants, factors); + if let Some(fork_size) = fork_size + && fork_size <= max_num_blocks + && self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) + { + (root_forks, fork_size) + } else { + (root_forks, 1) } + } - Ok(()) + /* + * Once inside the block-level forks, we initiate a cumul_factor at 1. If + * encountering a child fork with known size s < max_num_threads / cumul_factor, + * with all reduces being parallel or associative, then we parallelize along + * s, else we serialize. Then step into child and update cumul_factor if needed. + * One exception is if fork factor is a multiple of greedy_associative_thresh + * and at least one reduce is associative, in which case we use warp reduction + * and disable cumul_factor change for its subtree. At end, we've mapped + * each fork to its cumulative factor, and if not present fork uses it's parent's + * factor. + */ + fn get_cumulative_factors( + &self, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + root_forks: &Vec<NodeID>, + ) -> (HashMap<NodeID, usize>, usize) { + let mut cumul_factors = HashMap::new(); + for root_fork in root_forks { + cumul_factors.insert(*root_fork, 1); + self.recurse_cumul_factors(*root_fork, fork_tree, 1, &mut cumul_factors); + } + let num_threads = *cumul_factors.values().max().unwrap(); + (cumul_factors, num_threads) } - fn codegen_control_node( + fn recurse_cumul_factors( &self, - id: NodeID, - w: &mut String, - num_tabs: usize, - ) -> Result<(), Error> { - let tabs = "\t".repeat(num_tabs); - match &self.function.nodes[id.idx()] { - Node::Start - | Node::Region { preds: _ } - | Node::Projection { - control: _, - selection: _, - } => { - let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w, "{}goto {}\n", tabs, self.get_block_name(succ))?; + curr_fork: NodeID, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + cumul_factor: usize, + cumul_factors: &mut HashMap<NodeID, usize>, + ) { + let reduces = &self.fork_reduce_map[&curr_fork]; + if reduces.iter().all(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) + || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) + }) && let Node::Fork { factors, .. 
} = &self.function.nodes[curr_fork.idx()] + && let Some(fork_size) = multiply_dynamic_constants(self.dynamic_constants, factors) + && fork_size <= self.kernel_params.max_num_threads / cumul_factor + { + if fork_size % self.kernel_params.greedy_associative_thresh == 0 + && reduces.iter().any(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::Associative) + }) { + cumul_factors.insert(curr_fork, cumul_factor * fork_size); + } else { + let mut max_factor = cumul_factor * fork_size; + for child in fork_tree[&curr_fork].iter() { + self.recurse_cumul_factors(*child, fork_tree, cumul_factor * fork_size, cumul_factors); + max_factor = max_factor.max(cumul_factors[child]); + } + cumul_factors.insert(curr_fork, max_factor); } - Node::If { control: _, cond } => { - let mut succs = self.control_subgraph.succs(id); - let succ1 = succs.next().unwrap(); - let succ2 = succs.next().unwrap(); - write!( - w, - "{}if ({}) {{\n", - tabs, - self.get_value(*cond, false, false) - )?; - write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; - write!(w, "{}}} else {{\n", tabs)?; - write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ2))?; - write!(w, "{}}}\n", tabs)?; + } else { + let mut max_factor = cumul_factor; + for child in fork_tree[&curr_fork].iter() { + self.recurse_cumul_factors(*child, fork_tree, cumul_factor, cumul_factors); + max_factor = max_factor.max(cumul_factors[child]); } - Node::Fork { - control: _, - factors: _, - } => {} - Node::Join { control: _ } => {} - Node::Return { - control: _, - data: _, - } => { - write!(w, "{}return;\n", tabs)?; + cumul_factors.insert(curr_fork, max_factor); + } + } + + // /* + // * For each parallel reduce with a reduct write, meaning it's at the end of + // * a potential parallel reduction chain, we walk back to beginning of chain + // * and update the write's collect to be the beginning's init. + // */ + // fn update_write_collects(&self) -> HashMap<NodeID, NodeID> { + // let mut write_collect_map = HashMap::new(); + // let mut parallel_reduces: HashSet<NodeID> = (0..self.function.nodes.len()) + // .map(NodeID::new) + // .filter(|&node_id| { + // self.function.schedules[node_id.idx()].contains(&Schedule::ParallelReduce) + // }) + // .collect(); + // for reduce in parallel_reduces.clone() { + // if let Node::Reduce { + // control: _, + // init, + // reduct, + // } = &self.function.nodes[reduce.idx()] + // && let Node::Write { .. 
} = &self.function.nodes[reduct.idx()] + // { + // parallel_reduces.remove(&reduce); + // while parallel_reduces.contains(&init) { + // let Node::Reduce { + // control: _, + // init, + // reduct: _, + // } = &self.function.nodes[init.idx()] + // else { + // panic!("Expected reduce node"); + // }; + // parallel_reduces.remove(&init); + // } + // write_collect_map.insert(*reduct, *init); + // } + // } + // write_collect_map + // } + + fn codegen_data_control( + &self, + root_forks: &Vec<NodeID>, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + fork_control_map: &HashMap<NodeID, Vec<NodeID>>, + begin_control: &HashSet<NodeID>, + end_control: &HashSet<NodeID>, + global_refs: &HashSet<NodeID>, + cumul_factors: &HashMap<NodeID, usize>, + num_threads: usize, + num_blocks: usize, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + let control_to_data = + (0..self.bbs.len()).fold(HashMap::<NodeID, Vec<NodeID>>::new(), |mut map, id| { + if let Some(control) = self.bbs.get(id) { + map.entry(*control).or_default().push(NodeID::new(id)); + }; + map + }); + + // Define the following states: + // 0 is above block fork, 1 is in block fork above any thread fork, 2 is + // in any thread fork, 3 is below block fork + + // If num_blocks > 1, initialize state to 0, else 1 + let mut state = if num_blocks > 1 { 0 } else { 1 }; + // Then generate data and control for each control in begin_control + for control in begin_control { + let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; + for data in control_to_data.get(control).unwrap() { + self.codegen_data_node(*data, state, num_threads, body, &mut 1, global_refs)?; } - _ => { - panic!("Unsupported control node type") + let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; + self.codegen_control_node(*control, term, 1)?; + } + // Then if num_blocks > 1, set state to 1 and generate data and control + // for the single root fork + if num_blocks > 1 { + state = 1; + for control in fork_control_map.get(&root_forks[0]).unwrap() { + for data in control_to_data.get(control).unwrap() { + let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; + self.codegen_data_node(*data, state, num_threads, body, &mut 1, global_refs)?; + } + let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; + self.codegen_control_node(*control, term, 1)?; } } + // Set state to 2 and begin DFS through fork_tree (after root_fork if + // visited in previous step), updating thread_quota + // If num_blocks > 1, set state to 3, else 1 + // Then generate data and control for each control in end_control Ok(()) } - fn codegen_data_node(&self, id: NodeID, w: &mut String, num_tabs: usize) -> Result<(), Error> { - // For now only used shared memory when creating an array - let declare_variable = self.get_value(id, true, false).to_string(); - let tabs = "\t".repeat(num_tabs); + fn codegen_data_node( + &self, + id: NodeID, + state: usize, + thread_quota: usize, + w: &mut String, + num_tabs: &mut usize, + global_refs: &HashSet<NodeID>, + ) -> Result<(), Error> { + let declare_variable = self.get_value(id, true, false, false).to_string(); + let tabs = "\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { // Phi registers were already emitted. Node::Phi { control: _, data: _, } => {} - // No SSA requirement for CUDA + Node::ThreadID { + control, + dimension, + } => { + let Node::Fork { factors, .. 
} = &self.function.nodes[control.idx()] else { + panic!("Expected ThreadID's control to be a fork node"); + }; + match state { + 1 => { + // Violating DRY with the naming but unsure how to map + // DynamicConstantID to NodeID to use `get_value` + let divide = factors.iter().skip(dimension + 1).map(|f| format!("dc{}", f.idx())).collect::<Vec<_>>().join(" * "); + let modulo = format!("dc{}", factors[*dimension].idx()); + write!(w, "{}{} = (blockIdx.x / ({})) % {};\n", tabs, declare_variable, divide, modulo)?; + } + 2 => {} + _ => { panic!("Unsupported state for ThreadID") } + } + } + Node::Reduce { control: _, init, reduct: _ } => { + let init_val = self.get_value(*init, false, false, false); + write!(w, "{}{} = {};\n", tabs, declare_variable, init_val)?; + } + // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { + write!(w, "{}{};\n", tabs, declare_variable)?; + let define_variable = self.get_value(id, false, false, false); self.codegen_constant( - declare_variable, - self.get_value(id, false, false), + if self.types[self.typing[id.idx()].idx()].is_primitive() { define_variable } else { format!("*{}", define_variable)}, *cons_id, w, + *num_tabs, )?; } - // No SSA requirement for CUDA + // Dynamic constants emitted at top Node::DynamicConstant { id: _ } => {} Node::Unary { op, input } => match op { UnaryOperator::Not => match &self.types[self.typing[input.idx()].idx()] { @@ -653,7 +824,7 @@ impl GPUContext<'_> { "{}{} = !{};\n", tabs, declare_variable, - self.get_value(*input, false, false), + self.get_value(*input, false, false, false), )?; } ty if ty.is_fixed() => { @@ -662,7 +833,7 @@ impl GPUContext<'_> { "{}{} = ~{};\n", tabs, declare_variable, - self.get_value(*input, false, false), + self.get_value(*input, false, false, false), )?; } _ => panic!("Unsupported type for not operator"), @@ -674,7 +845,7 @@ impl GPUContext<'_> { "{}{} = -{};\n", tabs, declare_variable, - self.get_value(*input, false, false), + self.get_value(*input, false, false, false), )?; } _ => { @@ -688,13 +859,13 @@ impl GPUContext<'_> { tabs, declare_variable, self.get_type(*dst_ty_id, false, false), - self.get_value(*input, false, false), + self.get_value(*input, false, false, false), )?; } }, Node::Binary { op, left, right } => { - let left_val = self.get_value(*left, false, false); - let right_val = self.get_value(*right, false, false); + let left_val = self.get_value(*left, false, false, false); + let right_val = self.get_value(*right, false, false, false); match (op, &self.types[self.typing[left.idx()].idx()]) { (BinaryOperator::Rem, Type::Float32) => write!( w, @@ -754,12 +925,17 @@ impl GPUContext<'_> { TernaryOperator::Select => { write!( w, - "{}{} = {} ? {} : {};\n", + "{}{} = {} ? {} : {};\n{}{} = {} ? 
{} : {};\n", tabs, declare_variable, - self.get_value(*first, false, false), - self.get_value(*second, false, false), - self.get_value(*third, false, false), + self.get_value(*first, false, false, false), + self.get_value(*second, false, false, false), + self.get_value(*third, false, false, false), + tabs, + self.get_value(id, false, false, false), + self.get_value(*first, false, false, false), + global_refs.contains(second), + global_refs.contains(third) )?; } }, @@ -772,30 +948,54 @@ impl GPUContext<'_> { tabs, declare_variable, func_name, - self.get_value(args[0], false, false), + self.get_value(args[0], false, false, false), )?; } Node::Read { collect, indices } => { - // If it's a parameter node then copy from global memory, else - // from shared memory or registers. - if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { - write!(w, "{};\n", declare_variable); + // Copy from global memory or from shared memory or registers. + // Generate if-else for phi and select where we don't statically know + // the case. + write!(w, "{}{};\n", tabs, declare_variable); + let define_variable = self.get_value(id, false, false, false); + let global_flag = self.get_global_flag(*collect, false); + let has_global_flag = global_flag.is_some(); + if has_global_flag { + write!(w, "{}if ({}) {{\n{}\t", tabs, global_flag.unwrap(), tabs); + *num_tabs += 1; + } + if global_refs.contains(collect) || has_global_flag { let is_char = self.is_parameter_char(self.typing[collect.idx()]); - let global_collect = self.codegen_global_collect(*collect, indices, is_char); + let global_collect = self.codegen_collect(*collect, indices, true, has_global_flag, is_char); + let type_id = self.typing[id.idx()]; + let is_array = self.types[type_id.idx()].is_array(); self.codegen_copy_from_to_global( false, - self.typing[id.idx()], - &declare_variable, + type_id, + &define_variable, &global_collect, indices, - true, + if is_array { + Some(thread_quota) + } else { + None + }, + !is_array, + false, is_char, w, - num_tabs, + *num_tabs, )?; - } else { - let local_collect = self.codegen_local_collect(*collect, indices); - write!(w, "{}{} = {};\n", tabs, declare_variable, local_collect)?; + } + if has_global_flag { + write!(w, "{}}} else {{\n", tabs); + } + if !global_refs.contains(collect) || has_global_flag { + let local_collect = self.codegen_collect(*collect, indices, false, has_global_flag, false); + write!(w, "{}{} = {};\n", tabs, define_variable, local_collect)?; + } + if has_global_flag { + write!(w, "{}}}\n", tabs); + *num_tabs -= 1; } } Node::Write { @@ -803,27 +1003,50 @@ impl GPUContext<'_> { data, indices, } => { - let data_variable = self.get_value(*data, false, false); - // If it's a parameter node then copy to global memory, else - // to shared memory or registers - if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { + // Only difference vs read is the LHS vs RHS, and creating write- + // labeled reference after + write!(w, "{}{};\n", tabs, declare_variable); + let global_flag = self.get_global_flag(*collect, false); + let has_global_flag = global_flag.is_some(); + if has_global_flag { + write!(w, "{}if ({}) {{\n", tabs, global_flag.unwrap()); + *num_tabs += 1; + } + let data_variable = self.get_value(*data, false, false, global_refs.contains(collect)); + if global_refs.contains(collect) || has_global_flag { let is_char = self.is_parameter_char(self.typing[collect.idx()]); - let global_collect = self.codegen_global_collect(*collect, indices, is_char); + let global_collect = 
self.codegen_collect(*collect, indices, true, has_global_flag, is_char); + let type_id = self.typing[id.idx()]; + let is_array = self.types[type_id.idx()].is_array(); self.codegen_copy_from_to_global( true, - self.typing[id.idx()], + type_id, &data_variable, &global_collect, - &indices, - true, + indices, + if is_array { + Some(thread_quota) + } else { + None + }, + !is_array, + state == 0, is_char, w, - num_tabs, + *num_tabs, )?; - } else { - let local_collect = self.codegen_local_collect(*collect, indices); + } + if has_global_flag { + write!(w, "{}}} else {{\n", tabs); + } + if !global_refs.contains(collect) || has_global_flag { + let local_collect = self.codegen_collect(*collect, indices, false, has_global_flag, false); write!(w, "{}{} = {};\n", tabs, local_collect, data_variable)?; } + if has_global_flag { + write!(w, "{}}}\n", tabs); + *num_tabs -= 1; + } } _ => { panic!("Unsupported data node type") @@ -833,10 +1056,69 @@ impl GPUContext<'_> { for phi in phis { write!( w, - "\t{} = {};\n", - self.get_value(*phi, false, false), - self.get_value(id, false, false) + "{}{} = {};\n{}{} = {};\n", + tabs, + self.get_value(*phi, false, false, false), + self.get_value(id, false, false, false), + tabs, + self.get_global_flag(*phi, false).unwrap(), + global_refs.contains(&id) + )?; + } + } + Ok(()) + } + + fn codegen_control_node( + &self, + id: NodeID, + w: &mut String, + num_tabs: usize, + ) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); + match &self.function.nodes[id.idx()] { + Node::Start + | Node::Region { preds: _ } + | Node::Projection { + control: _, + selection: _, + } => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + write!(w, "{}goto {}\n", tabs, self.get_block_name(succ))?; + } + Node::If { control: _, cond } => { + let mut succs = self.control_subgraph.succs(id); + let succ1 = succs.next().unwrap(); + let succ2 = succs.next().unwrap(); + write!( + w, + "{}if ({}) {{\n", + tabs, + self.get_value(*cond, false, false, false) )?; + write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; + write!(w, "{}}} else {{\n", tabs)?; + write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ2))?; + write!(w, "{}}}\n", tabs)?; + } + Node::Fork { + control: _, + factors: _, + } => {} + Node::Join { control: _ } => {} + Node::Return { control: _, data } => { + if self.types[self.typing[data.idx()].idx()].is_primitive() { + let return_val = self.get_value(*data, false, false, false); + write!( + w, + "{}if (threadIdx.x == 0) {{\n{}\t*ret = {};\n{}}}\n", + tabs, tabs, return_val, tabs + )?; + } + write!(w, "{}return;\n", tabs)?; + } + _ => { + panic!("Unsupported control node type") } } Ok(()) @@ -854,7 +1136,9 @@ impl GPUContext<'_> { data: &String, global: &String, indices: &[Index], - parallelize: bool, + thread_quota: Option<usize>, + thread_restrict: bool, + block_restrict: bool, is_char: bool, w: &mut String, num_tabs: usize, @@ -882,29 +1166,19 @@ impl GPUContext<'_> { s } }; - let mut running_div_factor = "1".to_string(); - let mut level_indices_str = "".to_string(); - for i in (array_indices.len()..extents.len()).rev() { - level_indices_str.push_str(&format!( - "[(({}) / ({})) % dc{}]", - rem_array_size, - running_div_factor, - extents[i].idx() - )); - running_div_factor = format!( - "{} * {}", - running_div_factor, - format!("dc{}", extents[i].idx()) - ); - } - // Parallelizing only affects loop bounds - let begin_copy = if parallelize { + // If we parallelize over threads, then we index by threadIdx.x, + // else we gate the loop by threadIdx.x == 
0 + let has_thread_quota = thread_quota.is_some(); + let begin_copy = if has_thread_quota { format!( "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", - tabs, rem_array_size, self.kernel_attrs.num_threads + tabs, rem_array_size, thread_quota.unwrap() ) } else { - format!("{}for (int i = 0; i < {}; i++) {{\n", tabs, rem_array_size) + format!( + "{}if (threadIdx.x == 0) {{\n{}\tfor (int i = 0; i < {}; i++) {{\n", + tabs, tabs, rem_array_size + ) }; write!(w, "{}", begin_copy)?; let new_global = if is_char { @@ -919,42 +1193,57 @@ impl GPUContext<'_> { self.codegen_copy_from_to_global( is_write, *element_type_id, - &format!("{}{}", data, level_indices_str), + &format!("{} + i", data), &new_global, &indices[1..], + None, + false, false, is_char, w, - num_tabs + 1, + num_tabs + if has_thread_quota { 1 } else { 2 }, )?; - let end_copy = if parallelize { - format!("{}}}\n{}__syncthreads();\n", tabs, tabs) - } else { - format!("{}}}\n", tabs) - }; - write!(w, "{}", end_copy)?; + if !has_thread_quota { + write!(w, "{}\t}}\n", tabs)?; + } + write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; } - Type::Product(fields) => { + Type::Product(fields) | Type::Summation(fields) => { if !is_char { - panic!("Product type must be char addressed") + panic!("Global product or summation must be char addressed") } + let is_product = matches!(self.types[type_id.idx()], Type::Product(..)); if indices.is_empty() { - let dummy_var = format!("product_{}_dummy", type_id.idx()); - let type_name = self.get_type(type_id, false, false); - write!( - w, - "{}{} = reinterpret_cast<{}*>({});\n", - tabs, dummy_var, type_name, global - )?; - let dummy_ptr = format!("*{}", dummy_var); + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; + extra_tab = "\t"; + } + if thread_restrict { + write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; + extra_tab2 = "\t"; + } + let reinterpret = format!("*reinterpret_cast<{}_{}*>", if is_product { "Product" } else { "Summation" }, type_id.idx()); + let reinterpret_global = format!("{}({})", reinterpret, global); + let reinterpret_data = format!("{}({})", reinterpret, data); write!( w, - "{}{} = {};\n", + "{}{}{}{} = {};\n", tabs, - if is_write { &dummy_ptr } else { data }, - if is_write { data } else { &dummy_ptr } + extra_tab, + extra_tab2, + if is_write { &reinterpret_global } else { &reinterpret_data }, + if is_write { &reinterpret_data } else { &reinterpret_global } )?; - } else { + if thread_restrict { + write!(w, "{}{}}}\n", tabs, extra_tab)?; + } + if block_restrict { + write!(w, "{}}}\n", tabs)?; + } + } else if is_product { + // Iterate over fields in product to find offset let Index::Field(field_index) = &indices[0] else { panic!("Expected field index for product access") }; @@ -962,54 +1251,34 @@ impl GPUContext<'_> { .map(|i| self.get_size(fields[i])) .sum::<usize>(); let new_global = format!("{} + {}", global, offset); - let new_data = format!("{}.field_{}", data, *field_index); + let new_data = format!("{} + {}", data, offset); self.codegen_copy_from_to_global( is_write, fields[*field_index], &new_data, &new_global, &indices[1..], - false, + None, + thread_restrict, + block_restrict, is_char, w, num_tabs + 1, )?; - } - } - Type::Summation(fields) => { - if !is_char { - panic!("Summation type must be char addressed") - } - if indices.is_empty() { - let dummy_var = format!("summation_{}_dummy", type_id.idx()); - let type_name = self.get_type(type_id, false, false); - write!( - w, - "{}{} = 
reinterpret_cast<{}*>({});\n", - tabs, dummy_var, type_name, global - )?; - let dummy_ptr = format!("*{}", dummy_var); - write!( - w, - "{}{} = {};\n", - tabs, - if is_write { &dummy_ptr } else { data }, - if is_write { data } else { &dummy_ptr } - )?; } else { - // Since all variants are 0-byte offset, the global index - // remains unchanged. + // All variants of summations have zero offset let Index::Variant(variant_index) = &indices[0] else { panic!("Expected variant index for summation access") }; - let new_data = format!("{}.field_{}", data, *variant_index); self.codegen_copy_from_to_global( is_write, fields[*variant_index], - &new_data, + &data, &global, &indices[1..], - false, + None, + thread_restrict, + block_restrict, is_char, w, num_tabs + 1, @@ -1018,66 +1287,84 @@ impl GPUContext<'_> { } // Primitive types _ => { + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; + extra_tab = "\t"; + } + if thread_restrict { + write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; + extra_tab2 = "\t"; + } if is_char { - let type_name = self.get_type(type_id, false, false); - let dummy_var = format!("{}_{}_dummy", type_name, type_id.idx()); - write!( - w, - "{}{} = reinterpret_cast<{}*>({});\n", - tabs, dummy_var, type_name, global - )?; - let dummy_ptr = format!("*{}", dummy_var); + let type_name = self.get_type(type_id, true, false); + let reinterpret = format!("*reinterpret_cast<{}>", type_name); + let reinterpret_global = format!("{}({})", reinterpret, global); + let reinterpret_data = format!("{}({})", reinterpret, data); write!( w, - "{}{} = {};\n", + "{}{}{}{} = {};\n", tabs, - if is_write { &dummy_ptr } else { data }, - if is_write { data } else { &dummy_ptr } + extra_tab, + extra_tab2, + if is_write { &reinterpret_global } else { &reinterpret_data }, + if is_write { &reinterpret_data } else { &reinterpret_global } )?; } else { - let global_ptr = format!("*({})", global); write!( w, - "{}{} = {};\n", + "{}*{} = *{};\n", tabs, - if is_write { &global_ptr } else { data }, - if is_write { data } else { &global_ptr } + if is_write { &global } else { data }, + if is_write { data } else { &global } )?; } - } - } - Ok(()) - } - - // Read/writes to local collections consist of local name + array indexing - // and struct field access. - fn codegen_local_collect(&self, collect: NodeID, indices: &[Index]) -> String { - let mut index_ptr_name = "".to_string(); - for index in indices { - match index { - Index::Field(field) => { - index_ptr_name.push_str(&format!(".field_{}", field)); + if thread_restrict { + write!(w, "{}{}}}\n", tabs, extra_tab)?; } - Index::Variant(variant) => { - index_ptr_name.push_str(&format!(".field_{}", variant)); - } - Index::Position(indices) => { - index_ptr_name.push_str( - &indices - .iter() - .map(|index| format!("[{}]", self.get_value(*index, false, false))) - .collect::<Vec<_>>() - .join(""), - ); + if block_restrict { + write!(w, "{}}}\n", tabs)?; } } } - let name = self.get_value(collect, false, false); - format!("{} + {}", name, index_ptr_name) + Ok(()) } + // // Read/writes to local collections consist of local name + array indexing + // // and struct field access. 
+ // fn codegen_local_collect(&self, collect: NodeID, indices: &[Index], has_global_flag: bool) -> String { + // let mut index_ptr_name = "".to_string(); + // for index in indices { + // match index { + // Index::Field(field) => { + // index_ptr_name.push_str(&format!(".field_{}", field)); + // } + // Index::Variant(variant) => { + // index_ptr_name.push_str(&format!(".field_{}", variant)); + // } + // Index::Position(indices) => { + // index_ptr_name.push_str( + // &indices + // .iter() + // .map(|index| format!("[{}]", self.get_value(*index, false, false, false))) + // .collect::<Vec<_>>() + // .join(""), + // ); + // } + // } + // } + // let name = self.get_value(collect, false, false, false); + // let full_name = if has_global_flag { + // format!("reinterpret_cast<{}>({})", self.get_type(self.typing[collect.idx()], false, false), name) + // } else { + // name + // }; + // format!("{} + {}", full_name, index_ptr_name) + // } + // Read/writes to global collections consist of global name + pointer offset. - fn codegen_global_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { + fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_global: bool, has_global_flag: bool, is_char: bool) -> String { let mut index_ptr_name = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { @@ -1121,7 +1408,7 @@ impl GPUContext<'_> { cumulative_offset = format!( "{} * ({} + ", cumulative_offset, - self.get_value(*index, false, false) + self.get_value(*index, false, false, false) ); } index_ptr_name.push_str(&format!( @@ -1137,20 +1424,26 @@ impl GPUContext<'_> { } } } - let name = self.get_value(collect, false, false); - format!("{} + {}", name, index_ptr_name) + let name = self.get_value(collect, false, false, false); + let full_name = if is_global && has_global_flag { + format!("reinterpret_cast<{}>({})", self.get_type(type_id, true, true), name) + } else if has_global_flag { + format!("reinterpret_cast<{}>({})", self.get_type(type_id, false, false), name) + } else { + name + }; + format!("{} + {}", full_name, index_ptr_name) } // Standalone function allows us to handle recursive initialization for // product and summation collections fn codegen_constant( &self, - declare_variable: String, name: String, cons_id: ConstantID, w: &mut String, + num_tabs: usize, ) -> Result<(), Error> { - write!(w, "\t{}", declare_variable)?; match &self.constants[cons_id.idx()] { Constant::Boolean(val) => write!(w, " = {};\n", val)?, Constant::Integer8(val) => write!(w, " = {};\n", val)?, @@ -1166,15 +1459,16 @@ impl GPUContext<'_> { Constant::Product(_, fields) => { write!(w, ";\n")?; for (i, field) in fields.iter().enumerate() { - // We don't emit array fields and size was set by struct definition + // Array size was set by struct definition and we don't emit array content if !self.constants[field.idx()].is_array() { - // Don't need type declaration for the fields - self.codegen_constant( - format!("{}.field_{}", name, i), - format!("{}.field_{}", name, i), - *field, - w, - )?; + // // Don't need type declaration for the fields + // self.codegen_constant( + // format!("{}.field_{}", name, i), + // format!("{}.field_{}", name, i), + // *field, + // w, + // )?; + } } } @@ -1207,8 +1501,8 @@ impl GPUContext<'_> { write!( w, ";\n\talignment = {};\n\tdynamic_shared_offset = - (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = - reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t + ((dynamic_shared_offset + alignment - 1) / 
alignment) * alignment; + \n\t{} = reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t dynamic_shared_offset += {}", element_size, name, @@ -1366,7 +1660,7 @@ impl GPUContext<'_> { evaluate_dynamic_constant(extent, self.dynamic_constants) .map(|val| acc.saturating_mul(val)) }) - .unwrap_or(0) + .unwrap_or_else(|| panic!("Queried size for array with unknown size")) } _ => self.get_alignment(type_id), } @@ -1375,17 +1669,24 @@ impl GPUContext<'_> { fn get_alignment(&self, type_id: TypeID) -> usize { match &self.types[type_id.idx()] { Type::Array(element_type, _) => self.get_alignment(*element_type), - Type::Product(fields) => fields - .iter() - .map(|field| self.get_alignment(*field)) - .sum::<usize>(), + Type::Product(fields) => { + let product_size = fields + .iter() + .map(|field| self.get_alignment(*field)) + .sum::<usize>(); + let field_alignment = fields + .iter() + .map(|field| self.get_alignment(*field)) + .max() + .unwrap_or(1); + field_alignment * ((product_size + (field_alignment - 1)) / field_alignment) + } , Type::Summation(fields) => { fields .iter() .map(|field| self.get_alignment(*field)) .max() .unwrap_or(0) - + 1 } Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, Type::Integer16 | Type::UnsignedInteger16 => 2, @@ -1395,19 +1696,11 @@ impl GPUContext<'_> { } } - fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Result<usize, Error> { - factors.iter().try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, self.dynamic_constants) - .ok_or(Error) - .map(|val| acc.saturating_mul(val)) - }) - } - fn get_block_name(&self, id: NodeID) -> String { format!("bb_{}", id.idx()) } - fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { + fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool, global_pointer: bool) -> String { if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { if ty { panic!("Dynamic constants shouldn't be re-initialized") @@ -1418,23 +1711,9 @@ impl GPUContext<'_> { panic!("Parameters shouldn't be re-initialized") } format!("p{}", index) - } else if let Node::Write { - collect, - data: _, - indices: _, - } = &self.function.nodes[id.idx()] - { - if ty { - panic!("Writes shouldn't be initialized, they're replaced with the referenced collection") - } - if make_pointer { - panic!("Writes shouldn't be called as pointer") - } - self.get_value(*collect, false, false) } else if ty - && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] - { - // Shmem/register arrays have special formatting + && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { + // Dynamic shared memory arrays have special formatting let mut declare_array = format!( "{} (*{}{})", self.get_type(*element_type, false, false), @@ -1448,7 +1727,7 @@ impl GPUContext<'_> { } else if ty { format!( "{} {}{}", - self.get_type(self.typing[id.idx()], make_pointer, false), + self.get_type(self.typing[id.idx()], make_pointer, global_pointer), self.function.nodes[id.idx()].lower_case_name(), id.idx() ) @@ -1461,53 +1740,55 @@ impl GPUContext<'_> { } } - fn get_type(&self, id: TypeID, make_pointer: bool, is_global: bool) -> String { + fn get_global_flag(&self, id: NodeID, ty: bool) -> Option<String> { + let node = &self.function.nodes[id.idx()]; + if (!node.is_phi() && !matches!(node, Node::Ternary { op: TernaryOperator::Select, ..})) || self.types[self.typing[id.idx()].idx()].is_primitive() { + None + } else if ty { + Some(format!( + "bool 
{}{}_is_global", + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + )) + } else { + Some(format!( + "{}{}_is_global", + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + )) + } + } + + fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { // Product and summation collections are char* for byte-addressability - // since we can have variable type fields - Type::Product(_) => { + // since we can have variable type fields. is_global can only be true + // if make_pointer is true, with the exception of recursive call + // from array match arm + Type::Product(_) | Type::Summation(_) => { if make_pointer { "char*".to_string() - } else if is_global { - "char".to_string() } else { - format!("Product_{}", id.idx()) - } - } - Type::Summation(_) => { - if make_pointer { - "char*".to_string() - } else if is_global { "char".to_string() - } else { - format!("Summation_{}", id.idx()) } } - Type::Array(element_type, extents) => { + Type::Array(element_type, _) => { // This suffix lets us work with references of dynamic shared memory // and use n-d array indexing. - format!( - "{}{}", - self.get_type(*element_type, false, is_global), - if make_pointer { - "*".to_string() - } else { - format!( - "(*){}", - extents - .iter() - .skip(1) - .map(|extent| format!("[dc{}]", extent.idx())) - .collect::<String>() - ) - } - ) + self.get_type(*element_type, true) } _ => convert_type(&self.types[id.idx()], make_pointer), } } } +fn multiply_dynamic_constants(dcs: &Vec<DynamicConstant>, factors: &[DynamicConstantID]) -> Option<usize> { + factors.iter().try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, dcs) + .map(|val| acc.saturating_mul(val)) + }) +} + // TODO: Add float8, float16, bfloat16 dtypes if they come fn convert_type(ty: &Type, make_pointer: bool) -> String { let mut result = match ty { diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 86edf743..432623c5 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -1369,7 +1369,6 @@ impl Node { selection: _ } ); - define_pattern_predicate!(is_undef, Node::Undef { ty: _ }); pub fn try_region(&self) -> Option<&[NodeID]> { diff --git a/juno_samples/matmul/src/matmul.hbin b/juno_samples/matmul/src/matmul.hbin new file mode 100644 index 0000000000000000000000000000000000000000..046898dddc96e417ac560697d7030bf9583f77d6 GIT binary patch literal 1323 zcmd6n>sJ&<5XI}(%=El_VWts$ASj|M3Jbz6WHAaFHRk2pm>9D-!3YX!A`nah8sG1e zKRML{=jh3|d`o|*>iykYy)*apuB#TWFD))FFxYY6Q$!I$*8`}{Wru`<RZKo@uFBH< z!j0KhW32V@oJ`p{KZsTA$AO*p9XqYSuBXCS^baf87;O#9*^bzn<#fae>8k57c}Bq~ z(u4a+%d>DjGBHd{3G1r%c&uXDVA@CziJl7>H>LxogA(X!m&c8*W*f{l(lyb$1x)NT zJup3#4S;!YD1&XaOkSC5L%MGd>)rj759}?~7%{5}KCn+U_~5V_V`oH4HI$FlT!d=3 z8n<fQs-vQUgB0v1;H(qez6HEv7kGPbIC`l0;hq2UuKiy@w?-t(R99?Ze`7PT$cfy@ zlid7SM;C!)7n%qrB7N&8(y4%_(53=YvEgOblzaRHn}O-O;_4D|X)`iOu-lq;di)GM zrWdA{^ldTq37Ohs`eFJ>&k3l`dOWh<48RPKo)<7E;1z~U4W>r=>Ky6DJe5%&8qIHv zNw#zI`t~gAhDg^$vhj~k<O`paQhx9HhGkjKIxJ_M_|zPMIYN3xz_frbkD22z$0@rn z#^n{gD*Tl2xA@$ghB-}H-3qH&Q`+i^{_1PizkNgb{WBZ>HSwPl|M%z31(*w@*9BY_ z@E+fqD==3iG<m$MYozZ=zqU@<Lz!G#&mXj;|L|iR+Pe{Z>J|9sx_Sw|SywOQ3YYt$ zYI!lJo=8nCFZwK5Y+C1?rX_AuJrmw#`7e`}|2k>8cUHXtFP&Ad!INbz4qH-8tGD2j zXKXD_srTUDXVg3JA2arxKWFTBsYkG=IA`VJvU*+~_q3EBmohkM2ghyT&K=+_+rhi{ zfp^Jue}Mzwmly^gJPLk|&%kf+1^78Gg5Tl0@G<^^T(<w)+Ujazf)udb$v;QJjvKP$ zDL)Y6Sc7Ma>k<+;vhz7XZhM%ZLpn+COj0MH0I^H4{CJ#TDos<JRvd~md2J{V0H$L! 
HMbrNPPRfa# literal 0 HcmV?d00001 diff --git a/juno_samples/matmul/src/matmul.pdf b/juno_samples/matmul/src/matmul.pdf new file mode 100644 index 0000000000000000000000000000000000000000..529c95335bbad3971558e34beb66f9b091600b7f GIT binary patch literal 88675 zcmXV%Q<Nq=)2`dLt*33<w%yaVt!dk~ZQHhOYo={;&-cIkAS+2FN6D(Xuga>BDT<2I zG0}6tkhNUZe!#F2F%sDuS;6q~5;4e{+L^mp5V8F;DZ>yE5iy8c+PIiH{p)QET}(wy zjqOcLVfguBoL!tu4Q*jOvX6Ug=V+yz@Sl79MnF%e;P!haUoicN@)O{P0!gtBLB8ur z;11{~*c|IjRhr?yyvDeY6SQ`8zg%8sFXL$frg#+0)XYpZ#KcE0B3_Y?e|%9-@_5HS zU*Es({qzUZw5`&Qj$S3E`BvimUc2J_-tq>=_P)|&_Q!v21^kBNeiU$i#J_)T82Ub} zJJ`HGzfp$9`aUQ@f2u5fi!*x1_xs*nCq^fD1-_n9rqYmvsYlcsGAk51`c@dYg82`} z?ECb;PxijaKgq|Jd_UUK_P%bv#`cPzhy31|S_!}J#;~ftem=3r5clT4VOv=?{oaoB zf1dY#eV6U=ziC~(EGso!W=w5O<+Awo+SES%^b7nt+55!0K<pbM`FSC9{wg@AYs|dh zSpCAS`W%(H{E%1`xRw~r<n}d{b&|mA>3Q)qveRFjdl&o+Igogx5;)Q*PGIpH!MXy( zarCKQGkKr&9ljjuBER^3XFZLb<n|puGKGDX-|R*jdQ*}49%J;fzS&2=I(4XiTwF-V zKgiTB)s6Sr{(9zYbzQ96#aT>5=63?iO7!#AofCdIguP=56Z8Fgt-{pzme0?{GHF|W z!2Hn=D9yJ|Abms>7^Rf!uGkx9a7|_9_XSjCHr77hZSCm?C;NW9ct?4p_emkWr|bVd zC2WA~9a~A&55h#b_+*Bml%yX^zN)JuEb>XHXi*E<H1Dkz+HB?Ot1a!O=tjlWN}3z* z>#db2@Z5lHP+EiadDD%3Bs<i7e!tcIuJcIyV7q51S?SYip6vS$V9<mZV|!)zW-jf6 zI$*dOG+ZV00}Ou<`n@uh5`Kf9rhB~!e3$Q0LMuJ09<O<0_e`(8#T?$@9vppM1AJJy zn2-C%9ep$4MHl*xBN$S|zY-y!JjZR0041*_?itK7U56A?)(jChO=BKklh1(Fh~<wo z6bG*ll;aO5s^(`?(MQz{@RK9lJ)_L~a1tJ1d_PX?=Oz4@6O>vDo0gbgjkg(M#uaH| z##AqICQ`m{=qXxbh2bmgk#`HE4D!~&rE^`HUNxhkx^>T{!ZKVxECtQ4*^7(j$*DmL z=)O7wFDR3P3^Hd^(Sx~zpUREd)E(XxbP;Hb!K|i&RQjXib5L}UPLv(G{Au$i?vcm= z`qGq|<lp3cCfyBCfaD;oU^juwvMxXYJ8?1|W<gEs0~vpR1_lFVfm%WMa*dX#G?lHL za8s-pa+NLGQIV8<_9~QI-rwL>^`oI$^$+HP6|6pQd9$B`G1cVQZ-mGGhc|(vx}PP5 z@LiGeqfGKiKDlNw)J9b(Z%%S+to#V7Ps)j2g#($(v!Z1ZTHrj{*c|FBsw%j3=M$N! zFKi_=;s6v#DIi&yUfuq+5gmMe;ap)*F?oM2N4-Y|-Dj@yL*H$_4=N8eB8H;KmFGEF zyViBt2%iKof{^SuTdqx0st~y&o!wT%_RU!-Z%zRLiq)p83*S<`d-RR|3-msEj&6tb zFYEbG<e<+7ecx!)31qd8@4Mu;;Iiqz?)#R0F@5Y03HtLY-Q>;B*bm6q28UjDf0q8F zF)se$2lzq{x@Lcv5VJN$uBFcncaLgm1Bels@tN}$gDXW9Q`b0CmckoFuAMoqb7!s4 z%xH=kT*|k{w(eu3%<o;3E<2Yg1!v9tpbt!Vc`QGKuWBds?gOfHlWVG5$0Lea2y-J9 zqWdJxzh(8bX&M98Xnb>KMGujB1m#oaCZb87<XVDWBrB7O52v&jUl05^y#nX|P^exQ zZ4e4@erFFr|GdgiIQcItPCdz2wDEw$1u*Z7>uXPQ@)W_PFd&5pN}(o?1%n(XL(OYt zLWjMfveS|brcp*yml*tFHAhN-I!7+Emsf*Q>TTb~Omy1)U=p~Cp2ctM%v<1kW~iB% zO9btSRLm6PtZCFmK|s0GL&nXbtdNql;!ji>(CEn5?2DF$R(4V5a=fb&KY}?)$IdrF zRx`8e9PBcDK@kbKB*7&gD>M&kGwpMjgOS1aQ6@v>-)Z~;M?mmou===Hy0r>6tO?lt zGpz5<gJycij+YoKKi$`EewE=ip>Ygar~Y}LOE*ZU6FA7DACKF<(LVDH{OLUfPv&yF z3rP5*a2y($mPCQ5Snp~WNkWd9fJ;=}2I44neHt>z-JBNbZe-IiS~yHB-Q3+3H4YXM zXoT~w4`mb&O)U7VnFot43<cc=%O!zFFXloSj)G(;0k)xNcK{H@&KU}h-f|SsMKD)~ zFSIxS!$u0=v@TnZ<ij>6L#^#dJSCnCzomr)Ksn#P@aeRMt8xM-W&6ACUs73~-(VOG zp=-He=)!X=B!BG|(SO)&uQWz6`bFL}b|OoDHl{S>B;s|%8%wViu}5^H8cHQX7i~wl ztC${^U5tX>QuAmgOzY!-b0WEweQO?9O%apwHQLG*cHuuIRjWeLQ=)H3i-A|<{34qG z2|2hkE>R4jkW%3fc0j!q?|yl3g`v7=!=#?lHII3>2*&}36!<;ZpOzTZxB+rAy3>?G z>HT1keP61RVFY(#kehhjaj8<c|JOD@&=lrRNyC`-maOzHtJe~a+!h*zI?alyfGWdA zqW8Ar00RrUr}oDRt(tCZQ4zXfiyTfGRV#WyOMUSxdD$?#8uV0qtD5B)ySnF=i6fz$ zP|i}OQ`wyJiQRv(?1W?U@YCE#>^0+=x<DoYToM4Sqr)!L|4dRdk(GujJ8P1=LShYj zF-NG|TG_Xq8xkg{KWMo?f6pDYiA5c&;n{Pa+kaz_1=8g^82K9{LXJ8q2@kK;wg8Gc z34N2yjMdmgF>2^5-z&o}yxo;_a)tdIhXlY-i07z-_nRS$vI}okL(~#K>bLq<#gzDi zt)sS={W~wkWD8DPw_eS&Y2l^QhX7*XXGSR<u{3gxK}kF);8rqPh)m(O52`H{$v6-% zH-s{TKs*aATgZd6DHmz8KTFQ4#ovPh4LGK?YK4Ry3azN39yms&SYmPWtKCaGsic%f zWb~S`F{22d7Q4w#=3Ms9Vgf7P1R*tgg?&)d7Ld)BZ88$Elh}}3@%-6B%CV)-@NSx! 
z6VqykOfK~t$h36v&gq%~rAue>&v_b;rjQBx0K8ZVq%L@P5lt?&g93eR<4I+Qk=z=~ z#>_l?k&~thdp|X}1zF%vz?iE3qJcGcGIlnxn50RH-nkLh{-0x;^8+9MvhvZSpop)l zr*o*c)O9^k)=fI4L_X-z*+D3~=Rb+|FnhnzvvAtW{{pzMM1gVBEkC1lc`_%_UaTJO zm^Bx|%z(!mVwy~<nsFJ5!Csbd6noyt$${Cm_<fZp;z53OoOs(e?W@tk`(Ar_TLMu~ z*Gd#`LwD#PBxopO&};K4ZbD^YW0GV(rYV64<n$3jO>vr@p)Kmf^*n$gkt4-C0`wwx zushr^-`x63wN|SV2#Y$g6vJ@z1ZqsOm_D0=r$&Ede#GB}H*jE>YqOcdYSJV6MTBe9 zOJUreZ(EJAz_)E}Kqr6erG6ZvB99vr9jt$T=w+PB!l<Bl5E>4Ccv0-S#0nlgZnPa+ zI2jpkd5HyX!(aEE#>a<OkcWEMKBJ{C<(xno(B#F!)i=`VDU2}Cc|2n#0s{$Uk|2q3 z{Th;89g}Jn7X?np+$rCy!(M5lM?pdoB6-YNzmc?u8AjlkR$!H@4a4x)=Bv+FqNnEm z(^6md=(!OIXg5@mzOhab5Ar3811Orz8sQj(om*H^BXMG4H8}*i;L2bSmp0F_Pg4#b zF_$Y_d2JZKT%ka4C%2g-J{2q>+|?M2x~xUSG&7V)M3mG4ir<?3`IF3?@$IKtB7mUC zBtvQ{mGdk|E6z9|IQ}#YJLCn9W<t<9^aaY^f=T|boy!IJgd)&Is?3>O1wnCzHNkns zLLqkf-b`raxdni!42loDDbY9?II+x4I3s~)bMVse<ImhFD;1br)c95W3{DGWt)_Bm zVqqY7N~9JSglM;+{QbR%)KF5!fZ#u`!iLz_KOAo%f=514$mq^q#f?^h(EJcf^ERVW zO*&VfdIsi@=V7!+KpU#5qRdnhM!&HHsr%rKx(Hx~+yimL%7}WGXvDw^D|n}Tg_!#g zeQ8Sx%Fhe=hbc`6hNb@OUF0!tKunV&w4iB?wEcmnOH=n>M%3okweQeMAl{&5-bRlh zl#LJ0Vx9tKQzNGx1`<d(jpG^!eFO#q1k9?qCk8U@ii{YahauGGeV+)XVM5eAZibe7 z;kdR3))Z38w$>q|>o_ES-nHoUyLT)t-l-WX8&(6JJL=pFdyNCcW`q!Sl_a$l=D@M) zZhZ6}x<wv!Jj{YJNC$k^BPTKuZ$um|&o)HRXp)a;ys<t7ZH1On1vD%vicK$gd{R0= z7)-jpg&yr1`AzNvABH;w@{@Xl?#pM{^{l>B_vMBP98#*xLer=~P)4M1%(vYN<Z83X zV%N+gAR8#*0D&a`B-lFfEf?)5QQ$wapx6W<f&w#)9^BN)FK#v{+k`qSg5I*cF!T;z zP<S3Jo`2U31+}GQvE`J6b>zqpHQJ&MgJ+yzBC%c1d~WT(O{S|V!Vvitwe+r^+b6_9 zGYlq>ckLJ@5w}K}Zc_IYMtuTq)mnXu;4gmu+Ia6#sahGn8Edg9KI}?Jfi1y+>2~nf z(M?;d4ew^bZk!&PTk%10)Bz_tmLicJX;$neJo*HXSvrk80P2hURHH*Ph?60S6D7-Q zrZixnAX^wPsytbUN~Auax9XqgGcM2v5UjkNs^H{T3O6&KOH-rbsM;TW(a-hhSq+?T zTFP9y`^E7TZykTT*bIr_Z-?G|HIXy(2l`bn@e=rbX^^kwI1q+o;jU;}Gqu06)npKN zfD&<D*}3-o!_?aeL|gkc>h@L}0)|9<El7i>`O1`4;1^FIN7vSeY>Ytg@D~9IEuQhk zqa5Y!)lP|oi`dB7#&6L?R5WPXpuixRfU=cWXmxZ(y^HuM&s2Ab@da7lR?=fJ!4a#7 z0Ta>zJ8?63w@H>ba)^`Vj3d2b0o5n8`SXPFxE|7EqC75^;7VL9Oh+3<mo*}f`PC<2 zVi}YTl9)XIPg*6;1GeTD>?j$w&<}sVX*Pi=51$;S&{-6BPsMYDg>*0>!J+z91jvR_ z5VQC|?lPaLhrk?ee!fjh<*YZo_ciuHQz{yy64<FA*hY$Yoq1wqFr_&hE687xc0Ria zU{p*~9!1um-ib5PEFez@3@}P&_ms*LJv`4W{Z9N%>FD8++TzB`8pS!^7h7Yw)Ku^Q z4L*kk7LOD;58R~3n-uvMqz_mx={jqwf5DlJhfPJeE<56l!%c-*GZw7oYf{bk)8p=7 z`)MUZvZ0;<{#pjc`<6_B2M^@)DR}ZHcO5md1z)0$r~(`)!v1|RVk!-qKOEZZ5NGmz z<TNs}L&dHt#xx#}42^+5uFRT#XCo_-JbUE>W^p}qp$qv7@x({iK4abs8APIFaVWq7 zbQFO9`^7WnoeV!%1TGb!mp<B_n^p*I1Vs;HK~WIL0fSa5^Yt6Ik^itzA7Nl|umG1X z9R>ovoDOBeO4a@k``~?_gY2!E+alhO7$Gc8N?$`(dA?9Pz9SaR%@1%+B1Q<f%&bya z8)_V(e8<sWSbTAu+$;sr6Mevc0Y=#!@d{OXg%P|T8724{*iygD!><)wVtU%!#or2^ zjiU5qZYz!yhfq14pjU|k8Npz%0<n$DHnXa6C!;#FS~~b|?^Q{Mv2I(&Dtu8C$jd3y zg>j1EC4Utj(3T_HX*JTNVx~HlX-Fl+krQ@V7H3e0HG~WO`!qQYJx(xDuFh_l4KP;W z`C!oE9J4yisR|1xv-j{%n<3J-Tn31;T{)be2nBS+F8$KS8S#<(vVlR$Uq-$im#1y6 z#!w(@!*SF?-{6r`>T=Q5@;hJ8nRxrQk$_bLHYe+>1Y}->_Hnk@BB?5+$)i8Mq0LeI z8^k8R))D@BI`_nIioN_Rzuc_H5Avs8Z^UEme{0$nT#loZecH)*MunK<YLz^G(TMS% zGcR&?H1jnJ^LnKByE+XL`e$Sw)Ito1zWIV5x%sB?BO=7I3@ZFg6mx5{v30dGu%n2i zpP<560?7a=9}ia!UtAvB(mA(qmj@V|Hr8!uLePLtLEuM{^B|WqxLYk<vZ%8kOFF=5 z?fWu_=Xx>iGL?B*)<l%v70@V<*+*F<C>Xj&tLf8uzkZ|1sy;(F2%fOou@9Nk=UX8% zbFAto_@*LYbt4+~zKD$j+dK)@G7tA_cIIT3wfWV^nTKhtaU;Sa`;IHs4rmgr=)1(G z(KncSlYR|HWU9BD7hR+&NY<>^bdsm;2I7P!ghW}w*3gC1AQ&!HmrAH+etEW(ucw>a zu=Y`*ks<(eWA`{Rqw_9CBP!>pdSG)~Sov>AJq<f%P`1=Rfo?3l6o5gocDsfkwq`-U zK?8vt;aOKYsoq}RU#lB`ln$>4^sZ~ih;skL3Ra11!$$q01rfnbU*Jo!8bt4}`P(hA zcG1dL`JITinp)gwPP$p+Unk|?EQuGcSp32M=0>|V4ek{3XL3038|G;sz!(H7woQPS zZlCfAE^i#KkeeIFS6~%htp_8pt0^wvo68pNSfPzyOaxgOUTC!PUKqPDcVXoLtV@p$ z63)cN*wM+s#NhXT)@%(e;n)C-fd4E3000{&7YFPA%31zL&dd9sT|iMcCvjyb11A%} 
ze+&L2GhqDQ004yr0Zag(k->M#$M>Jg|6KybY;BzWuiB>cjP#5Ej_=bd{io9R1T>{* z`WA8iuju=@@A-f9;`;yE2LM&v?Mwhbc|%KOr{Djjjf{znnUgtyiSd6A1<C=K{%enz zg@dCLfc1aYt({C906;P8Z@ooKjBJff{?ow!^xrLuMVWCo;wa)T7zh+6sLua8H2-z_ z{}~%5_WxCkiTVH1?0*&c&uINuw}4_6|8JxJ>Fa;jWCF0Tb1*Xg|I|q8gw4Vj>f+Q@ z^`72ZaNODAd7i$SyqNwDYp;?a<Cti-;>0G<i~)fgzp9|7Huwo^X>8Q_2rIeh_7|FT zH-M=IHX}{qkYd>UE^3hf)?%Ns^COJQ*zBJw8%vn`_5Huje?qRR>gvxb>fYCdAP^uY zeT@2k4M!2G;MZHN$F?B9ZdE~M@7T0!)PAh(<Ur})(|`o&)#7XT4k72l>{8Vbg8SKO z_E?;^v;5O-$LrbC1Y!Ri2?5q>y}wV7a)7M*K(#dg?nS)Ib@m4FDu&e)Xa8mY=*+K< zh`e{M>*Z~@aKu*Wmv4Xy!h_-Uxr|v^gTJfyw<of8?0&p=HWfO8_-hrSw~i2Gg3d3G z7-V_|lvfMXi9d8V^5oqQ8qDJrWOr5Z;WH3n7QV;<1}t{e5qR(p!^jBhWJlJC(W*RO znvI@~EosKxLB%ZyyHecvvY4P+UBMP1KRXl92*={0cF=1kU~roX22=Io**M8pm-IU1 z2s5S8B;)vK(GZ38Y(a^xb<q$}RL7tjN0`m)IBO^vY5x})nj&0n+8iqraYd;~y`tFL zrEBHF12|t$UbJ#?>R7Viv!FR|#BIyEe-Fm<+6?)4Z1;B|uxKIR4c_i=Zbs;XP99aQ zW?VsR>B&L))7;oXhCgeQ^dXbiXNoCcruls=^7fIq3FUM>=EPfMd(Ni#x^CW8BYAEt zw&Gf$p$NJAOYo6Uip(9l`>WX$po6K>;j6|H_e?D(U*f#Hsdxk~o8D+%FZO1PHIOru ziG8gxNvGRCV_`3zi6CT#PUSc6X2F{{mb6WqsytF0&U6ROQhv5`t8&((Zcpzl!yPg+ z&qO0Sv0}HDAY1G;bL-U1eS3bpfe1!Lr`0K9*D_g8D!G3vUSFi3)H&1-l$-mb^6uop zmaQwR=ht(93`~=0O>ID!S=d2NFx4XIvil2su<?9n?u7C!h;DVra2ZJMS?GYrdZ2F{ z7iBg)Z)*$^mqY%5_4B#ERg%O@9oBBJ>nXRIYe?<87>n(t^D@4P&;B@xTe-CF*ix?W z_2!$(#g=>cTpL4p7PQjqDc?+C?u+Lu3zK3#Ardu1W^bYjq;D<2#ls>GbUfnFNEBS5 zHsZsPqX=A69Cl*l{rR1aO1s{4v+wr&Alf&?GGy%8q9#Kr0uVW-)0D!ZTsL0ChsgOG z#MP+XV3XQC^Y>0_IV?kU*RMfOaGaXXJmtxl8#6uOcPRMf`-UQqg`*2Fn@F<`u(-7m z9nvA*j(U>PuK0X%xN6YBva{7wWs5=YtLtqT>WNYF4YA#RB2&T3xwuAeu*n-pi(XnG zdMw)5&3i8#)p%Gd^3V04?T53Li8|{37^E*DKm0$f;P8}-sOpR^62qpnf<2AFvbX%T zdHt9-130Pd)`4h_ei8+_zCH&%Vrwf~tBGcAou$SFgXWH`=`3OTL#>V5t^hXT*kF7& zK|9UMCH(4K4*1R+cdO?3rr|=a)ebeA?Ko|RK%-9`oqH(NQpp_`!QusWRp(PpJugXv z1F#?h!bVzp_y{Pu-)9!Ibk<KzPO2p-T3y^ECyyiavpGV!Rn3EVD>a=pl~(p6h{^QQ z7-rOH_L2rxz)o*KHRchXQgV&-LMszmy$@_lv$=oCQ-qOwCJ^mbRtEvt)g$?+K?<Ih zLc(b4khK<Up-J27m#$uDDiuCopCC57w1+;gIHxZVStvG-2RTeoObjvMC^rwO_b(+O zn;>5sNeIKC$+m`iR>@rM0CE|F(1$!5T#$V&X$qf>(5GAx9%6JY!WQ@2q)w0%BkLGI z+-h2q^=vIRM-e81XHFy4uVPT2qjKhD@yG6XQ9I+LgdeG7)wjO*DFnXFEf=r5VSTfc zBYfA-PEDJ?{~a{BW$=@%^zc&O^NQUrbT0;sI4+w|$)a`e>;t|P@`KJqZ6E-tx``HW zHWdJdchtx5$ve;~r2ZZi<Bm<oR>iSd(IM&^+1D`Y-iRJ7MJrvVH|g(X#=Vy1i~8n= zio+yw9x)u0Z%)2*bxAETtV=e3Q@%4w-Lz_G9G!hNH|E+@l)rkQd%AsL%6vs=bND4C z5pWpUn5{+CpZjx`cQYyj-fdefeJvxIt!N~}z|j}uP)5v%FP7?yj)JKTd(Q&}RPiqI zh%_vuU|=qux>8hLe9jI-jAEYVjWokwv&6{ZLZIvfC-}TM52$R7%Ic_y>Je<<9}nTC z<<|v<VJeepU(6G%Nj}`4uk!qoew)N9sKC6$6@Qjg@sE$*=*r!6Uqus-m@XPwHVK+Z zl+2q^wPt=3JPI|yVTV(fw&52B$uUOB+dx*ex6}s4;JHXwm|L@;g@9L|f{GuT{yq4b zDQx;j9SLtT?OPuPQ*E!%HsGBisu%HPvCs7l^b3<W3`1rQz3*svco-j6Xh4t=2AWBC zM#CkKe87R2uC#YHN;-wmx=$~H<n9nO94mBc$OY1u$J7Eks)(~37eTbekh=Y5k81Xc z4dc;`W&^!w@jxk7nvlU+H12I*kN9hv8hEo8qCpZXDd|Zfe;P1gQv1iCVZUM?Q-%sg z(mw8~6_bB}=b{yM>Q$@gG9Y;cI+6d=__SY%HH5j|-1J}0Z*1Ae36E4!W^Qii<z}be z_e>8{hV+Kr6xynsOorS!=KJ<<<Lw*|FN1r&`t^u{Y-~2TT${Fbhj)n~jk|P{;~u|m z^VH%DIIRO)Mqrft0bCsG2J^Bj2zj0JqcjRaGKa`S6yxZ#FC2j4!mnPRgxLD6Y9mf- zR<>zCrff_;vx;DtIAmcao_`JE$4gC^%W2Qc;!`!;xPzalJ7Dg?$MVz~1wonpaKTx7 zH83G)u&!O}7B*=>)eH*Gs^K+jpH4X4DE#J_L45bpg2*?js5TWI^z0zkZ_9<hH&m1y zmBeiF>y_&75W9A--oa3sWsw{K74yhoxP!Y4Hv=K9(UG)xjQW}@MA{8V34SE)o>Y*O zKXf&+hptQK1;L}&A?g#vE(7p_=_ySOPd(%7?8vQUfY6miE_CC-1i-cC?MZSlx-2$f zj2h5cF93DvwQ$>gFoQIFYtnIzsE-qzYc3msinoy7`vn3A0;8D{x$61bZWYR3w&AiR zdQEOsZrx40=U=*xNF2R#jl^8NS4os6iI`?oGJ2_0wn$iSi=1F(CCm+p(Fb^-Nff>b zV^TlCOon4nHN@Tex+5>>utpyqgZ}JqtI+!$AD&c&Qq6<Cz4TL)Xg%AWinU~8z48yk zr@?=xnJF?=x{2CtP`?$jQ>UW6+u+pA?=V>UHo*(1wF7C*vc^Y9iyG)yslzj(-2PAs zLs>EJ(@cq(NB}QSlFNC2tbih^e@T89(*&f1e@DOH*t`7=MQenA5hK>7pfe_+1P584 
z<B(iY9s&2dXPmmqGA?zTkCK>ZvFbyjlhk3x8lgkQ0hCXX;G|?{h!K6iV?=abs+ZL9 z@+3fD&R`z|;M5du8xF^%hs0`gn%qqkqi8UmFuH$G4d0mm6UvvT4B&1P{ekgj%tgeb zhc-w>3a%u`$R;LGAJlilnV+r!H5_)Z0LLgN!b7o5OdR$Npz1E{Stgv0pHXS2JGYGP zG%{c*&xnT{Y0s$x6TyNh%nFF<*UXSbE>MgO^@-KAoyA9}5Gw1zy$GEJgWe^2p)}s3 z_K=JmC$beu@%&B3&Kf2E{u=?5rx?l&=faSGRq+^J^%Ozb8^I6f3&g)w2@?ah54cRu zB=ta%yJPsM-RT3+^i~2<wk7r7awDTQI+GEom%`Bb)v6FW=tW0~v54ojU+LzrX{4B) z-`aLpdPL7Lc}8S?*D@2l@GezO+a58I?n&v>W)`!9NA%Hm6YTAZGyq-N@}}>!zqhyj z2U4Q7j8K#*O4l!#7~U1zuD4d*-~q-?)~5mz#%V2@7sE17%9%*QnaG^{T!K0;{zS6~ zR-G>oB%Ib(85Ks|$1{xA=uo#mj%HsNgdHIlDpO}6K9$D~9P3KC8i#ho%BjAjNuo9z zI?6s%Mt@iiHV2VEr1=VOQxOb$?FadCGrW;1VNvn|Mr)K<Y9rLIL+G1Y!|VVw)*X1V z8m1Ks?1O%{J1+~DoFpxAdrP8rq_pKfkL`I!XU>luP`}7(x>^hMWuSsz5gBn04-7mv zS+}s7W(I}P+<y9@Ce}Y?RFo;FWe_v<>ylk?AmNBdc}AB;XWjbU_qJF$!QER`*ramL z2to}m!oVMtV-Qq;_i@3tQwj{6|9s_2R-(CN3*Fof!qjT(;LGebmI9#Nx-ghFg&w;V z-+<)v+ux9O1av{)?EdSXofg2hyDwuehr$X28-$M!B~WWn5>cjpL#jhy86RW=pILK< zx+SR4JqKw54;hNOpmrnynGA-;fs|e8OBVn~>4L}cqV@}BX>J<sLu*Oj=tFd*3uWwr z${Nl`MS61hBsd$Hk~o0kgy0Mo-c)QtVZ7;KrK-E3a%)Q1nJS$6Kq9(Lg_&R&T5u|f zs<-&CPX9nuDjH9M9_h*nhpolO#&K|ouyBpoVj$#EKOIo<hDrH73wrA-gEL)YO7wYw z_yGQLUz|)!n+goIRX@G&AB#<+*o2ye!zKXrpo7AV<ZiIb9^QA_qWiQKRtJACL844l zPSEP+|7ZGT{AKF_dbsDb6?60FVIiI@zVLKq;W~6D_KWf}G$gig*xk?pWBb*Ml*C`r z>FgC<Gwko;2T4d`)+`s?GXb1=xeOm8oEa}LJs})>4@w`J`{AE?h^~o-AlbNX0W)*k z@v~702b{Pa52~AZ#<O8Lb_=l<%>_yvV}wubX*2I;Ch^2KK27j~?yNO%BW@~@2QKi2 zOrcm}tF-8DxAgDr-7c;}KOR}gpWDS31&dDJm|KG^rGh_BlsF688r_13m8fhL4ZPTF zBZeL(Y?F0Ul?Jl`Ruv@OcxTQQd>wcpGX6^%NO=vEuVjXlB)<axk}#%8t*w_CAJ@Ul zIrR5i>#PpU6P4&dR>7nFX8e^FQo+0z5_4?x`ZNz8YeP)b+Hp6cW5rxCV%0T``TLiv z?<&63mTUZcquEvTC$g>n@pb$2UC!TU-gSqUZ@gAME*2tlv^F$bHz#HJl_UW}4+Y;- z`k&{nyox$I)FJ=)!2ZQ8J=pB-D@M>TA1i>=74)KF?C$<U#sGnUbc+D9Mucp2r1oQk zD`JI_wqw}@4P`>#(=hh`i4uRll~!Mu2^%@KTax@@L->WxOLUFH2^$CX)(5i+uL0Y` z0Pg_`-wV<#qD!h2_0kjphe5?C_5nl%GlYfi2Ya1)@_vG9JihqB9^0)Z3G{azcvn-w z3rE@vmj^q7gqrm`T51!7AcOb|)e!CgmUu?_pyQ0bU$ed$74z+q!`~g39EWfUf{oi~ zy=xpOYMDNyR7rnuSkVm5bfWsUm45#iY{fK0aC|JwTq68Y$ZkE{xHTiJOV;HUl@5%< zzCr9nOuT8Szw+#)>r$_|jj5Q96kS>gJ3YZ6xpqb}RHBAfJWcOB)>l`rX7=`IE^uGp zcpbt+2JJ}|KNJU5ScK{m<GWU#lFwN@B{e`d;i43Ii412Zxie86KB$pqyW&2u?;zW| z$3d<pf?h0e?ySI4Yl}O=4e$LL6)+}m;BesXr-|Q&nIEMQ!jZDF%1Ke?t}t=@qZLxv zv^Y;>yPb`&e#DWMAn5L+5$#z*E9hJ@XVetED^CM9tB+yb;3ER<EDFW%pfBc+kx{Io z&yE*O8B4RjtR%`|qWzoj6_w+{dBEq{&vUSA#e03K4LNAWNc%vfh%0b{WW1}cB9dSB zEh}8H0%Ss?iSkz!rm}y=u-SX=Zz0AtPG?IaqvOR|H?0zN(@BZ<vTQ1Y5SJ?khr-5s zdrQ%5g%pg>q=`P&XLg!nYYzCjtgg7VSV;@NHQmG}^aAuqI5BuATK$sX^bGyU7`h`( zh-MTJG(J@7x;1;?-KAP3=89h*b~Btn98CF8%h*DZL)Zhl2G~|n2ra+6|L9-q$UXrW z+WLkR=>w-eCKsh%FCbeA7hlNVB|JJPf&rY4XYddN!2xqwXvI&__>V9wk~(tOh&nxe zzgZvxyeFhuYvw0bqy_pb4X7_36XOs}*xF@2;7d+vNH5Yq){CBDQf}UuJv2ZSWRQDH z1M{oQRGm+ap;YTs<5r@a_cC6+eImza*48(3k2ivt_VcSi`5s%?Smh1({@&0W3afRD z;>gZ@y|a!f`}vk=i+Q7XI=SZW6{apIGq>(+xY#sP>`w62p(~zkstuwS{?7wGV^8=l ztJh{Hs879N%4c|up<cZf>F0<WqX`Q^<9z*gNT&9}L@H2zccsjolFaC%6Az)tS<)>_ z)*zQP7k8qjpCz6${6C21FIWD-d{S{+@D{d2QbE83p@ZR(Q>1sG@>;w@>aI?&3of4m zgqz1i-HVMvjgZxtG(!iT;j#8HdF^|3jl=r}#3Edq+8p91Zz=dB)I{8bohZFTBhhVK zuI#Q{n5S7>-s@l*_rt;(_HAyx<KqV_8k+QYY3}wHOrhajP+JW%Sh=j0C%a6c(kL1Y zP31E4)1yQ}kLP*GI13{iP8WA)zkO%`RRTuQRK`*{$pg3O5EzN#kh&itbR3z4b(`W4 zVn5IkF==FTnS)PiSyBiO>j^@j-l=8tExGHj2|&TV*o0fn7c(L$y-<GN2ozAOzzaPi zy;P`opLMM#cY74q>Ykn?n=WG#CX}8iR*9nX)q#L0>(#wJJU*_ujqh#fh?w<F>P#CP z##SV4aoqHTSgl<B^09Ipy-kEB`gh<<=yTE>o_!-JcUNVh?Z%SvHoT->+cw>O?uyd= zE>8XNa2jJ0PqQBF_QOmr$B=6>RVWYFdFh5HCI3%4`8B1+RAiE<!U8;UlAp4?RyI`d zZg}$9s@R#M{mR36kCj-3t+H;(OZXiDAVkUw6J{(3QIcKOOiW|1x4l5l+|saY2=kow zKqI#1oL0m(WtiHVU*707Au1Ov!rx%D@XH@$5$qEb0|xiT3PCQSO0I(T09C13ET0?U 
z76nz5D^YHq@8N*acc10-$61Nxq%nfy_pR0ws)&71S1Iv`yMr3nSH@)~$0xje-mk*F zliTLs<=>G_IDIh@WdcFZk|hW@u>J-@bc`GlEq{kFr~yfq>F|m13I76p;)(WT>DM@N zf}?>u@`XT0#6dT6{_%6Xc%1u_(j>D1`a&@WD+Z>P4bQNE-X@Cy`Zo)qf2(6VKTaBM ze=P1QC3JSP0=l&&98(AiW0_cm;Y5tjah+psA}p79*68bqF;IR?l3V`uJqwJV>}O$x zLaEZ$0lKhBfiyZ(>7pb`fwA)v4q_RWO3>r5QWZ|@@L&;`5*;5!P#GA08zrOyqpadk z24`qp6dcsvtbV$mRZ?vO1wwI+kWSZlcB#AeLfAK}6VshIOUUt#>)kc~=DLyM=5`XS zR%ZRvr4_r1T<PmHTe?Z3+Qm+j;pYWBpXZ}0XX$&nZ>ID^-OTno<x-zs==CH4^^^6> zt-Oz@@xmCtx@n`sUy^+TJ}?w4M%mteKlUo-YO~|Xp{%FcY*l;0gElP%&DX}~RXdX> z(*EldgRZ|^RV}T8U|CCewR6Ac^sKripKTtDK12~Vr>%`zxNM#4vke1|)9R@9_Ha## zVMME33DgW3u^M|7Uz(4%G+^<AKkl*6Q!MkL`ezHqpn&F!c=5}tJFP9Z#$+2OTyUkJ z#BHk0UdF#Oeh>axp<LyaEch)@@jHv&RX(dd>;!-7z-iO13&s(bxZ*AU*U%3K0@fQo zB7X;BomNU;6)NZtLH_+}MP<4Qc%w_9yK1Nk+!fNdRm+pq<RuNSJ}M#WX&M5REk7g{ zGL&l!MV5ZV+cY+y__{+}m4@sRz$vutI_28S)5!Sn+_kg*D}>`p<MFd2|L-+E)pp$R z<n)ww`+4Ptk#rlPqVi0OPm|Tu*r)u?>jTBiK(7I3OM|uH(RA=<PY2`uD)`KYa()98 zAB*_S=D9!Yhm<s@>@1jRIxBpkrhXwRH$<4*Bq&#vBMu^EFP<!2Y`i<h8=EG|WdI-o zf%ZlWHArUmfz%7a>KLk3oW(wFC7c)NP&GH&4RdTTin7N&mA9aY(;B)q0x-q+l64$F zONr>}1$(FWq^F{f3SNPrcAFI6(`hc|3`~cad*}Z4FZtZ%i$oz58<tpxB~XTNSBug< zB3{WTfBLD=W5FwCPH0%-2gKB|S*yY`hQf~$aDBWCBG0m^^XZ27vbd@e1)PVZXHIX2 zcXJ6|9>YwI+`NG0pu>vv&Vg5ofT=)pP7A^%j`2w8iyuQW(T6l?_*&au7)M2GN;bMv ztRlGhZE2A2`%%4daT@$}?u3$Ee3s2O0gVbGLqr`U8{M_QD)7<k#eQJwq&E%6Gqj1F z2y{zsu7M)i?!a50YCU4z*McxaT&CJjq~lTpkKL}%aA1Iwjean<?wm_W-6+qnY$k(~ zy;MrhY}PCvmXZL?@I79dcqeY#+pGODd_=bC++VHQ*WXy`b7EJLahar>oGKLO5BRB< z(S5u$-Yx$l^!oFY*0<8@VY90;Bqw+%mZ<wjrEXo_9n6xfyb<UAZ5CJGXNHdBvk}vK zQ6{9#RP-Oe4G?DemcEHyr@TN<LGWc%Nu9w*MxU2!w?`nj+$Pwr9xKaWmP{xh)P!MB zU*rq_icdaq$~4lH-k6?;jtoahBXYzULv*)%{h@@rqxe=?Y3U<qg;;CGJ%Wh<<*-3j zG9O<0TtmJV{=9V0YiZeC67o^r{w%nR!-M3#(AMXVqq!7{Xd(4%1LEh%&uA$|k#2jp zJJ&Q@TL|S@B|4q$jUY@tcxJ}leoBD^>BnB8qDGgf{<Zb+KQzB}hev9hY9Fc|Y94O0 z!dU``>ye%;2bM}Ir?lz7`}}7GOaUFzol|uSO$%O;CT3@nPh(l+nGPkL<tU?&-=sKD zrV1>T5XlP6Ygw64rTJ)?kH&#(C^>H*kil&wZPAek(lXM;6k`<S0x%(QK<pnc>sW2r z?pW^cTEH=#gq|Wh$&EV~79&RT-%xtY)(YGul#PVKcX7Yf4?*|*Cj-~0Sd7Be@${sl zowKg$^vLCI{ijA-kh@_~)$U1nBZ_+%rc~U@Acaqk;($23cmR6$8DSR$n{2VhEpyh= zsAyC6l0r!05Ok26a4gEKy7Spa1$g#D5(ERopKt>iN%h^<qSxB_hFfOdJL4MSbJN&D zGf>KC%O?_{vpK9@T%X4e#u<`Uj&NFUvR-|DO$pN_BlFg8o_6fi{cE#2+F#5m+#MsK z(@{qGj^>6i(wM5eg(+uvecKjtrITsAt^N;V>Gssoxwxgx%t=u8U6#BN#*o^irsng1 z1?JHv`oa)%gXp>!Iq)5O^{u6biiZiB(h{Q;xw60wLuLdme-_w`JBk(I+mf@x(Vbeh zvxPX9JkDoeXkU>{r(@tin9xmIiUn>c88@aFA`wIQzvr>ZXt0(%GnCSXmE~wAvjx|& zVT-6@p~Q+BI-_IsnH{B<{qlKAK-80!eSdj(b^3huRKCwA_V@`(g7?V4(`50!vca>T zJknjid+5sXJA41G!o$XhqS^7fn1>-J<$S&SD#DY&=fs;(9nN&OXXo?r>;C{rK(@bI zmbb2J3jXlwI~U#ki-ISVsoI`Qm@vM;>;nmSsIHMV(l`P><g&)8H9hVQ!CHva`4L5- zdN@<49?pa-f6}Z(QVr37rx0~NhR%77LQrDdPS1;2h;@S)@d5S3H^+CxnRr5Q8yNwV zj+4kCO!|omSggkl5jWo$v6o24r1QN$Ve+~des;5NhYsoR3*0;&DHTbrRBR<<Bn~K^ z4B`*e=0Peikb9v}WAbx9B7Q%}l#U-SV+pnD@b+n&6{J{NQL9Bmn&l=|&>G>&Mi@#4 z-EkU+b6YqbSEk2;@tHtxXtdfD39{~CV!Duw(AdXdt=R3-ujHY|Lp~j9gpI%&357xm z99BjY%%UY_zcQwrP&lPO`NWl9^v(>O8mt;Er17Egvb7o`Qv~N3FNQF*_4PqVRjwcs zVlA4b7X4BWeyQ6o7F@NpeQrJyUh0q?%~_l2!fBP%>;{jXGljjOxE?x~eLwrzjHY<o zEPL|O%De?}UnCN7(m0hhdoS=)BL>1W9<V8H0oHMv;B`kp8Vm5>=qO~;*SN3tUG2Xi zF_6C3FKl(c5ILIojqf-9Uq^UPTuLWm#b_};HIYp(k6#uaNDrqCuL0=wC;eUiUwMAx z<DN{wpG4ku{W|h){GG%fBfMV=2NI$=<P+5f81e~f*b?$N)G!DrO^ty>DcloA;jrLn zOgNnmBnqMoyplKT)w}~<&O48WuS_cyKpJZ4{b{rZ6PFX1<TOE^fFz_!6{Ln1*kCbJ zeauW7&0cDKd9*n_3a@=$UA4wdW128A?4F=!iCC6CEqBkrYHN&X{@6#0@xIz(S*b9I zmr5?y1DX7hq|5J)CSpk!Av}J3ihD4riTW^^sWEpfD9&57Rg;1ZYA8IFZ3rq;fvSW6 zq?8AwZlnsq5Q%z%<rE}~oU4a8RU?_g1+~>#1BAG<CRCE8hmZMV-T8?Zb4#LjU%Wd9 z|LgG2M}G6;mZ9l|F9kN-KlheJxn6XA<(lD8Q#3jybTzYy)VdCAd*Xz7j$Z%G;br%C 
z*(j}8RGE)%!1}EXbZFuGtlzPs+l;%cNdJJ&3Lbzq2!LMqq`9FNgZr?3QPl`MUUM;0 z*@z4tIxw9>1qrk}cpg(iOb*cv_kVfh5!m|a^^abpdgpJiY>qBi_jP#tTd)mIUfDSF zlgfRszP<1E?>vOzFFJXO`3>gPJ`jSJYL|$dAiFu2%*tjLZ?y=L*%dUod8<*7+@_G; z$6F18<kNFHz*|Kf5G0SujT|<{?lL=FX5<JkcDKok96r6@r)TUC=#YbD>_DgY`Kso0 z9afQbW|xax;dk2IZj;05^mx2p2oY&?F?O3R5b*n1mU%(IN_;RF3UM4y?llXN#bPuX z45BFTW|K*;*8!K??ehV>WVPDuj*@w&>$|3%#Da<3qsXMW^gDgJoxbma5)!bIWI+&J zqAYui%q8_JNoTM$bn4UcsfjaZsLK{qJ-6qd>A#83t7rbF)}35BMU3xhHz_Rt@<*&T zE?c;evtgyqmSf0~bA<7sGGUv_*i@T(Rr#40XH5cVTd{Bjym-~Zl}{g;Jy|(jzIa9D zyDP7R3+Gmz@xvc|2`=r0KdKbT(OY@yOVt|kz@-$X(io<eP?#!ektEwF&gB>kOjZn3 zHUObcj{!>Iv%JPL{IH`s5nlu<Shf|g79~3*AQal#<U&54M(GE>y!(;Ln}550Pq*sn zx`A8U*tPEN%C*0&{G<Y}jL!Na-1zD*_uu}+Lv##HSjK$@%eX<fLAyzC8;UNs|AM?* z!zT|pS%OYyQs@-s2~P>UrmSF>iz{5q-5bTLtyjyB82-)tp!He9v*y>i*Ihq$zw3I} zeNy=c`wy4H0Ryau^Eo_Dk1OC7bS{J25XgJxcy_sVD}vjDkjv{adUzA#K^*U<t|%Le zb+8SZPN&(8CBjTbnVe>nIPY!`+~e8jL7t;b4hzm-dmbWV5NpR+A?H8nv8}Xiw(YR7 zwoxc(HjS!Yilz)JOh5J2d0vG7z?zx~YMOl|+KhIf-RNcXCVC(J8HvafI!`Mrcc1F4 z>NMqU8kj;%8YEbtG%;A^yf5t5!IyP!>acn|*q1s*WH2h3W2%Rwn)<xm^ED4qpUs^+ zC2spMGjVi~_?0rG!8wpo3jkErULo98*Np`vsI6^8@>zNIBwPiRN3UGFCl>X*`S9cK zW#>Qfk7;oArlqsJkgI$ag%`pHp1SdgYX*<J_SVSy_1`*FIXy*cX~r}H>mJ+&%%c(@ zkC1jCLR~;nO5S%iK>60#tKnChnR$`#G$VJ&m0lNN@H|Ij(K%ol+>ACyu7}s7t3p>P zTN<v5-VS#v4>UgmpNYN@`(E>8#K9}Kz&j(i#2<=04j)5LMD{hm-26`V&&`v~CK>BG zUL+?lA8P4HcVyQ^F3ac}MdbHGN62SU8$dMS17gT$R>RJa&##6x)D(?G8W6N21S8KP z1qqGG$EkVUMTRR#La)#-FvLuZ1mJttpC5%^*DS3GOmL9JY{rWw$||vbE-R2io7GbQ z>OK|qsC!kU9+KLi);7>~qK#?Gi<I|?bRI;?`@{yPgYrHHZFErH=XkWR>In3JnsKYn z_@wd>mQGSB!uK*&zL%--y&B~i$FL$7%R`wIp)ww?G>#>6N?o8_^kP0LVc~o%Md6_( z>n5DBxg{75MVrH!9Bc{VQ@Wuk2f~r8(n@6Sn#@gN12Ol*2elPPsRJC2PM$tsFD6Jh zpdEvqhg44=l8RZ0$lS0R14cp!Q>qH90ek-+OjCeZgM_W;Y%v12p>kiPkW)-S$sb!# zpzPD(C+zc2?;PL#?Ps97|MtyiFR=M_KX`S|%^hpdHUwd1%NLnw>8WeBkH#w3-?q$% z?txF<v}2ErWItR`na%tL%iL4JJkZD7uiYd&d+(1u*v5cnX$9KSxMd*%jeMH_(j5w0 zYVTRGx&4~hz>3}MZtiB+E$-cg+o#<;Yj@XeJ@>lqbwAiM${yhkx(>Qu&%fR^w&KK! 
zlPgZI@Oc$SPAb^jLMyoMi1XV@KHy~9)cHQ(nIT&xi`ir}=yf`q&8`!Nqfm~Lh+x@{ zv4mJTG36npW{?e~J<)y9m!nK{6h3-rS!!5?7|MRDnMhxG4+gZCRYt8nLreUKiu-Er zk@;{wp<(ki+%&(5=mYb6?a)37Ma{NZggZoBvEtW=1^&SqaK<Rpq8UB&^%)QB^$cS= z_Y(R!;DL_m2A%kpo)<h9!;71mEZyH{vLFkB_*?|tOjZj?S-3g7JG&>#WZlFEmZkBC z*+MbR3@?I<$hVm=c;m`X4oY@f`7QN?E+TP|dJ|R)7DYn|m>?hRbb0f;6R;;SkQhsx zNU#Yr>6xf2I|-uW$OLB+*C;EpE3_4R@tkuj$fJIPF~7pR`@Y$5HjUez-J&?5#W~=7 z)5$nTC;w8_jhsf}ysO4cI!Wx7?ZFZ(wPcxI2K6!!FcLAL;i>cl^0dTnW$IpOQh9-V zBXimE6-VK9K!y6}c46H5lmI*yO~;2OXk|P#bV^DMeo8M=L&QUq8kA09gf0(}V8g28 zX5vHQ#wlSP1EYGmj0@5qcgFhd!8g_SRaEXvVeK1ZQHr!g-;d&!A%gHVs(+61*Q12D zbuI0f6)E@wE;r<2(bkq+OTLBSr^k9?>1bnYNpz7P`lkl{psUcWfD2(s0T*y3Kj>}l z_Jc2_7AbJ1d$u1gjxY7YB})SxKHS|m6)b3(ufVSPg*FY%P%wL&#&-JQMVX8JU}5rN z1<Z2I@KeKUs;;fqPBpE)_ScxYk^Vu<iAX>tbw6qPw1i=_Aju@U`1EsfO=qv`;?=AI z%$@jfxGG@B`KslX{5xmUCBeh^AAJf|RsC8J3FObe!q<gG%a8B9x&H?#GsAO?C3WqT zV~@|A(-cy({(+xeP~Lp`BcHu;YnQ=V5LV_>Md+Bnc4oeJ!RlGL%HK2Dj<qixd?uHF z=y&j<<Tv`hdQ9VZoy)7|__+hahwZVV-Kq#I!|6-|Um9F<_tMrjw>x^FZcV5q6uuPg z+_LS_r56ru+q3+_vp43JMYEA<JLcw{PL{`fa~77Zk6^lF0Rd#JSp)obBTwVxD2Fgo zwqcY=dV$f)5~w6MNky}fMwWJCT~W~>ML{(mi0bjRu!qkb%sO(P^z)L3eKZ!6wNvHY zZj@+ToJ51-Sem^+sw^8c8mo?Cnt(&16{O>-njGlUX34`2_#NkA=d18_-Lb&CI$r)y zJ)EnX<y`8x72c`aWqH>pgtXQIOC7{}Lhv=m>t3XV;5@ND8kr@SoWk(X!$OUPC&;;% z?Pmwr5q3Yzv!56VLDP(TjL3M-H%cPyiMKP=wLh`2Yk%*>%bzm_=RFr<=Y47UvX@A1 z0bubFnj9mEKQoqn4|thYzydqd`myw}@4QQlnSGyIlG|WFj+$dA>W}H8e9UUGD<A;9 z3Uuml%`M=X%_J$%$KaF0;8K8x!>4mL<LV!ctslbp4~Y*}vtEO);kW6xnYYQ;Ij?bF z;}^?)<*MaM=a;NSA3hue5e!vCAfXWi0r(o2x3#$%2t>;@fJWfNS2k{WbH|(8)^C4f zVPWHidv03wmCNQZ`ySo7@A|XDkKgg#ul(cM>7_?+_;Ka8dw+Q5&VC|u8!;pN8-}$U zI+v4IpfSf;Dv-2PRdUhd43p%xG`R`~B1w_jp+stj5~&?ZbVkTyPAa=NsrS@bC)Un- zqKlJI?HK9q1OfVRfW9a|UlB;sX!#^HS0$^V=d^N~z9m_=ZzV~Vo}N6B1b!6B68EdF zMI+AGmR8gEnGb-3j|O~1$(@KL(kghH5XFK_2FpjEd?LM<8vk#$t@ASBh>5Fi=lVS6 z{?d$-ax}^_T4<dPpr!hLOGKo2BhsKBk><aNPU@s}(k3U3Z+1Ehc@UsI1GFJPZwBax zQa^4zLMTbn;z*BVvXDP7*naE_(Wx2Y*{xa22(<aMj)K-$5DO&SIa}y0^cMySBL%J* zqY$kR<E#Azet+Ra0qrlqe%vrtU;?5u8MKTt7R}m_Oa>$K8^mPLJU<*r2E(I_Sxbjo z;*Ha@!IqhR5N^%UFOG!67K>T$az=y^5$+eEMH~?Kh;NFlNI0r5nF~Z3L&@G`e{vwn z4kt&F`;!bvN=cNYfem(NKG~mtVh7PCQ?*F^iE3GwXGv<>xlrhuB}aC97@m!K7?&Sn zg^lyp6VixL7-QEUR&Ry~#=!rcidHkmVbSvZ#?S3@IoS2hySp|iPP3uq!pc;emeaG- zyRY41Fq4!r`|K7=sGd<a{)4V1oi|jrE)9A7kw`pd>4Dd7AH1m&C_4kbNMz1hxaje@ zUShC=VE*LqI19HCL?F^G)CGy-A;V-s_gNkce%tbG`LN{$*&qfX7Jj!g*E_CrevP@^ z`ESg9-e;K?8J&?avnVi^>0`KzC|M&uFbcUt$XD|m99ACU5`G4bqW2G3Q~M<-jWW{@ z?KbT(A=4<6(K5tCfR%XIDm}Z;3PaYC6<NJn490YwiW^$oAvbbUj^dsdUAv~LaFmCr zNA1(0!S3-hgIG+N7(7!x^I>VcZrhh$r)!}&e7sSJdSix|Gs^pP&A{luC(*+-L%qw? 
zT=%F~6^N?-0BvDvbVrzY)#5+cutI2>9H|=JF#~7+JrtVu;Wu~wdi$2~2X6Vv*3i1j z=@%>ej@*72mR|bC?nc>X_Zqm3mE4<$cU9hcf3)(Kk-;bJho1cB(X&5=i(Z`TwE41C z+xptcQ|OnN#<j5d8jXwRsBJx`5nhQPjp~3@J%tvZSBx7;dt9}UkVA6MLU>~<xo>F^ z#JELe<rD|Ea&Qv|xo8GYZKL41HVD@QVKAzAVZV34i@dS{l#XF(u3XOG5*K9>dO=7I z=EBF{IxfAH`eIH`t7?ghY@;(Mr#aNvB2-`Fk-Ip!k-MHlT(nV`8H8(tR|iorDjOgf z;a@Z_AsQA-uGK4=DgBAdq=?6Jt+hCiW7YB)iR3DiLppY>T#}B_tTH?(!nKpSCQlQR z<+NrfHYE&2x4qA}Jod13PlVG8`h-5&pBu;x=XgtQ6e`+IOj~|x`l<O?<XH4q;kP62 zHhsu`82&Kwag#wVHI<vLY~J3q8}3HCnPJDUci1=V-_^W3Z6Y2$JqBpruW$Ns!|P$u z&p7R}-x=^EeN7MQ9@Iaqd?Wmgh(S)75>4})dU7js*CnrOy3PD#cwg=#=0m?RDYgW` zODG6KFar@Bg{cGJrSvHDYSu<~(DRZn=nZ+H<W=y@k(-{EoaAPMEJwm718a%Xf(ybQ zfpn&^1pqRs-miE(ZjvErcV@`MqBmp+Wg>Y0ndnuFT{8?25Bh**#KKrcVVmZOdp+q8 z)}5O6#39Kj8IChbJR3*xqfh~@P<gJa?noehGfLJbNL<#W3d?=P4Ce9&CLyj81>n>f zlKDZSBu|~^P{T5U9#hze$zV5`40ZQfU)8@>9{PfZ4OeRxpR}Sg<*~|LwRtC!Aw{zC zLddH6A)gfeK#{P}FC;iWtYRjW!>J$bGvQO|Q|o65w%iAYs1t3O=Gg=HpgqhU!$YPK z$B1{tH{yS=;r{TRW+N6>Qbckm8i{7egfo#lnjVfk+{BgphzMg%D4wD&;VD8*Uqtxu zRYSzQG<sHFOyi~|dgzKqDJYlB3ONyTg0DymPcc#rv<p`YBQ6fxicM}?^<8rHT^6kP zXfpl@x!9!0<eAf&#e%z8ii~8!zhfdVIIYPh{5vMx2On-LJ^nQX@r!@3nv+PJhLl=S zr=hE+c@Sn1w&v;v{z&|MhannKW7l3Xdx;WSdG}9Wx@OTP)!{O!s{hf|vzD%^{I<FI z;p^MFb5=<<GW#k&{>J6=o2Mj_={ak@wf(`M-V5j4dDq3oS(lD<6qgP@;If$An1(%B zxg7Ol{vm)HHMgdNK+rKP7h{kl@G$^ooy+B=k-8d<)IHEaieW9>UXU1t8DNGPc9<Dq z&|U^IJ2`$oguSRAA><Ktqp;<9_2<i}gJqC<z0OpFjq9o6-4mT;j|W&2G0IT2st$ah z{>of9TKO$(s9Y|<o_{=o$Jtf63^8<^w`dVfw^xVybr6q~7h(`d92-N7T+%@@TnN#A zo~sU)6j~4umQ%O^k_$4-jc{l$rmj1)U?0|r@HoE*TYw4{K>+AbiB+_g6LXP7Mp`~^ zsMV1UmTh>L1$=bjan8l#{4KZf@0A{YE#(VWR-S<mfFA=FSg6JOP@n6VlhL{QJtsU2 znP8T+h%z`VYeoaxVR3{U!w$wV3L7;;$g<LcEFSkGkJT)-6J?6Vausi@X*sq+8x_lm zH?W?+N`U9e^@BP=Fhpg0OGj7Rh3j`$o@r{>-D@-H?7EIz%j~OGu79q|w^mPn#BAjj z02Kzc8IQ40?h9TP+`;YOcLeTazZMV*sGu%p7As5Djs7j%R{u_PyZ3hgx0xq(d&4Kg z780K&S!J8U=@RWGl9vGenpIKl3d1U@*XL&hH_PGHJ<ltOYC8)50o;sD!-4?(0H6<4 z6>CD(SgtvThN<Cm6#iAyYhkE``@<-V>FPfYOK7hORq_%|r)bh%2}z!Yqwrq%F*RJC zDtAlcQkldN(bSStpqf@$8m9>*&|0LC17bSGVbTUZkNeV0LvRQUDL28J&`k<Y1Hg#% zK-D9|Ms~BjHaNfy1UQnO3<XtStE$m?b6t<<!m2)Gwq8`(&<Ax7-@5dci?7<cZF4&8 zjc2;LuX*m#JFfU1WVr=TAC5oz)zOWIhvV%FTm30X%|Ex}`d@Z53&=v&g2aqJj@QBf zUO9qw$dk|8eA76R@tndHj|e%YQFkHGO#P<4rk}vq(L3-Rbkbz-0%(B1q?s7RvFs@P zhUR6Eona8mm^f`tf%^dRxZpp4c(O;~gNOGTpuuD0j-rnMgZ@)90#;&);=7mS*zcha zfw30yM7%GjDD0gfeu0!Uo+?pa(oXaCV|CU3YVK<O7VZ|FtxX@vTg6n(raE94s>Dkj z{~7vKrE>t@Te)K}yC@gn7R3JfefCvfy5CTZ)w&pe_mkWPj3aSq)N<Fy-{pSQ@NS&l zz;5NXi`#V98m}{LwOy;+A>M4$>%`qjG*#r{ZZ+=am|&Cz0(TUy0dA-rjFYtZQ8+`> zWulv-C>jMp67Xf_Fb}`;pvwhJZUSvyXn6t1lB~##JPN<8$snmEhm(wkb%?#mlSwuS zN$jDDyJ`BD^+@kYd?ANvyqd%`QH8WpEvQgBQ%#|-`R^%jZ1hD$t1%{Fr9K=Bnbg%F zU?IU@BCaWh;A-fz;!}ezdVVNjU5YKch>xqio~>R@U_u!BV9noKH#Kd#>EypB|82+a zH`ZPM+IOzK`?s%s^ZO_#U%0iq@3y|_E7M=`N6|GfvhU09y>Q@;r*41dvkxj;Z@e5G zx#^-+zq@YlqrbRz3EB0B0XJY;a64WH4~T&r+@>AvgHV?X<r?FgbK4q*4Z}tvW!)4V zj@_Pn%Kf<aJJIKj2fZ)EUW~u0f7S3SlT!eC$eWN?7k8Rm-l!>R?t*v1n@zWxp91Eo zpaXV+E;ui-60V3}lDiyS4lhINW0%J_<gSM|#J4ovkRx{QVPROj$$FE#+rHcR0DGT! zkM%zJVdoRE@5aBI8)XlR9~=H){Mh_){NvW7VA91qKoL%9<z|Y&=#8`VB)Mp`AI~+D z!e$Cg*Fg)=;T0uE7S|=XPLn`ED<Gn+?k$`wu!ZpV@C^n-M<cXo`m9THjkp+>Cw~<F zsYYnX%AcV`21`@b*d)5zuIgI1rZT|>tCJNSQI!j?1|cD^8aCM*SA(>i#f!`;t6`7? 
z{H2`Dt3k%vTwU{Xu}0NJC)!Kh97(+u&U4}bZi}|n;zx;1(pKA(plW#B_094dPd)bI zP0#EvF8K9xuWVYf6}DWbZCST&xX{wJu=i_MY`Q5n2R(Dk-X*uZd|+t7qZ_|^(YnFi zKi#_Os^!nUy=i;TW!G-$$#2M1{&V)@{Wm_mZRy<N<u(5r;SWxX4UE=j@EHQPN%u;> zl(;R@7W+==0o#L)*L|-C-jYPORkjC$jNpJfy<ZKYgvf_{08@aF&!mQ3swb2%n@z}* za5{nL@9dFb6$P`hCUf%W<ZlntjIMd%T095l0ZJGSgbANv!m5jUv|QBY=%R|G3lo=W zbUkh4sa1%7v|-hne{pBk;y9?fSx<l-#|nQvX2Tn_IHagO7PR=6K(7Oz0@lzH=(Bm2 zoENi!mHwgfVD5_^C54q8SUOVUco~3I0z_iVEr~dZ_Xs6m7GChmv#(UH`OS_cAHmkj z&rUDDD%!4I#cbN4G(~T(eE%1f-+%wD)qXe|x}XQn3{-itu`-)E!`uT}QA16awCi-K zMh09Lhw*^S+o>*TA7wr`Y^7BzsUliMq}ocWt+@Ie;Awhg>J$90lsQhcLzwy8G8@#T zK)`NA+gf2Oka@s|w~^nl*zGw0^0}J%6_nm9A1jrl_sS<uj8$pdeyMBGvX_8w^6$Vi zc^Y^pPd}%ZYFiyWqa!wUbL!qCvgOmxwQV<ZxAI7*<7CkzdUYwgH>Qio5pOIt1-8ir z-yC^^ZiD_Z&pPiK--f2^#I5?Ro@>2V`>t!+rQhXw06d_(-+N!`MQ|ejdp@kwiK$dm zW1}95SV6OS>_Ho7Y7GKeZw<;ZQSo@a*+#t`cW6qbB0ABIXAVEo=+&`$u?d$R%tFMl zO~%?6;Nvul1dleJ2^RyFJQf)}#B1f-t%vXHPwP>?en9_cJ)_@V()H+8>KNU2aTJ=h zK<aIa0xilO1u45%Ho;6&sR=cCa`~so_DEE89O^z*9y~R1rc82UCTenS_r$4G)#!9C zqEW1>T<83!cz&_=Mck+V{VYggr`TDQ{Zhog-q(j2ZIuz&Y?KbfFjP8-j3Pk(ygf~b z%FqspoDz#0;WLirX7&B!RzYk?!NzF9t@BjwXxn%3)CKKXwV2Qc=R~GgUa+Vh$(3X7 ziN*u*S(R4!&tyW@8BEbA>sHOBvsd2w)tOC=Ij3b>-yZaQC>=IRMhd^PF+ZQr?86k} zuLndH)^{#K-Y=jl#bMqVwdjO?bwEWb&1S<mt_DUY-=f)Uxbl<3#9SL_A*n7TnXOzp zR&z2OBTy_;YH?L#a~_0=o!@0z$|1jP5xbCE$S)F>`Ih;G_1qS27!0cieXlAfl#}51 zoUR?tflJ(r{VT)$?tcFk_mKZ~`7Yatb;SJyd<^XiKM!AlKN5cA`B*&V|ATS{x_LBT zUMk-ax<eTbpAHLF1%7YxBv9}Xnml;`1OQR&vyuw?)nOF@RZ<nTS0&#yqV84qt7Ga3 z^`v@QHL2?Y?^~edN6x5Dz@PWd0eg`Y+7!8nKhmJS5i-Ia<8C7|W+a-8*AE832-puu zCa#V&An^26-kZIs*9-S}p?4IEO!8@xBr5UM;2Dme(J<o(`g)aT5!+gMXmDb%d}?rL zoTda~!FZ5b4Nl25GN4}=SQEG^z<eVBNt_49Z<3cnGgHb~O(**ICCd&1$xXE4(^x^~ zB#F4?#|}tjPoA;oq+*SgVO4ez)-tU?Jz5b%WmQL^?5e%3otYnf=jKN~g7DzZ|I^Ym zHE1=2!_(GYaPc>Ht-h!|4=*|NL&(4X4m9uXj%8wwEurB2)!%&Vvl;2F6uvNNLf6r_ z#8<S<D$Ti4>8@C+mMD1CRo1m_V#tTo26xCOs|}uz52|5Z$Y)i<vK1jHx;=E)JP*y# z^04GlPlIki92QTC%p^&r>lOP&W~DeLo)DRu2ZFK}k!*KDLn}rnD_Vg3w75zcz>9uT zWwLb18M4-B)qmjG$5TxCP^vsgVm&D{qOpTjD>FGq&yP{4@)QT1DPlRsXp63!crm*u z<~Hd=P1!7()v_?=G3k|5HX4n#DBGA#>s62JruDP;(0Y}J^k9K*8+Rv2A+f%HP&1vW zW}3O&Oy|p&f|8ksqB&XHQA0Kp4~$Gu&Med5$ZDV@Pm<Ae%m<oYj6{Pj04zxh*+xj_ zMUW|tk4a;t<L9=_7<-R|*h;UFe_!xz903;k3?Q5A1d@CNze!Bi_RXN4jN0xQWEk&M z{eWiLj<0`BBYDl{=B9eW8TC@oNZz_raM~TpgN_Gd%uHscajxf9=2j#35DPQSJJb<= zMA#$l(LE|XYTe(glXyu$D;rm)kY6+(42pL*z=J_ylo7R1IJhVHau5Zrk*Eu%dL<}j z8<Vn?7X`iKgT7JtrRR4y!{$-+=>gc78ikT(N+h9du}XJaEHFZnDWC7}&(pG_qgs|q z)iTmTOU>z5^CM<RQZQGV2h3yU6K3A*X*$aAj8Ky_t4WjH<MoKXPF#FgK2?nq?(Ccx z>MTuIi{(t+TM>=how2AR7Ih~4Aa0NN;lISw(5*Yp1;vn5=WroM9DG%^g6`)+_s-)T zj+_HN;g3#RIPqRGaiQnHfn|pVFI(1;54v*mL!nq&^MArDn0R8ip(zqc%v_C@&+Xjx z{cC16PYD*(D{Quw^>1G|7o+IpKPvN?qnHoGLGhd<lblMe;;s5dQs{XCUV6990~97{ zqd;g==*D$QeLo?x6A|^z(Zop;O^##|DVa6%-uebZ2DitZj+)#0u(nA(b%1*602Bbp zjx3F5#;H?|itCxGGj41QgQs*X^MyRgBcOHi>>&bpt$Je6qsfkX{nQRTItr&Y3ZXWo z>L=vMp#G%!XuhO^dnI^t96*m555_=7Lk2*g6U$E2dj`*CKA)(iKBoxaenmf(aIc~? 
zPg)`Eva+`|!KqE9sa;Jgnl86q-gK3?)w;Fm7V&Z6WAPt4Q+DdITwi`up4FzpjL0OC zvQ0?_J-0R3NW6L&#MPd75X?k!D#5U6sST2mAc2g@<2JXpg!Cgsuo>3x!-CBp6=Wk> zkUmA}C2V+Dg@k=oSr(`Ecl@BMzJ*nFCn3T_h|7#**t^bU*fVB{q(oNTHJO575~KN; zF_w)MgjNM+O!$=3wJEU0kXHW1T|?tDFymmNImfweIw;qO*HiOz&i2o@o8hWXH9|CN zrY{7&u{pbYZog#ks{>EZZ%eegid_}O(;l}uq;Sw3g?XL%iiK;ZU3`hQESrfi#i6&i zuG)0VTjLM!a9ElvA77daMx)SaXj#jw?#sH(J1S3a4tFfOXx)*Y4_@S!Np9rtC;z}b zjd@NSzM^&AE<<@Ygh=Zt>_$)yAQ(r@wkfvjY!9IK(IgUV4Gpps^6AwEHRSWD4NS<V zQ^WR<&x=`*Y=sCl$PIQ`ZfF>V-_k7cXQ5uFgUII<WgUY?(d(K~UTC!{QdZI=Mj|dE z5=bP~o!z8D)g|ds>g1HLx|D=U5)$hLO4{t?6%JL+Ho&n4*g(au28le`Pz{5^k2ZJ` ztG-p&l*(13Y+X~r4Y>XxP136d8jp|ftl7`xA}qSe+6hu83G*JBu}n+oWRIMLB~X-m zz<hZnST1h{m&@Dahv8H3MR-X5Df}mt|BN7s3hx7hDL6>u`_SZ*&j;laBE->YD#^t9 zjxkj87fJO%t?<#}u&0PQH>tiu?D4WwE+WZ6!vSqXkTMhzNu8&~-w)Y~Nc-IQUbm|N zYHnv#Sl8cbyf*x=QTCJ;`rrU_fj`=tf$wCoP8T_QlP}hjtt2u_BhIN8Obtxs7Mv9r z^Y&0<G@4h2KHJ64Jo{4pirF`-$)>C4-7x=`y9>0T-8Cv0m_p+pB)wHnX`bFq0~_4l z+9o7_uWv$fvbuGjn}#d6V|4Rh(hyVgg?KYmSrVb3(s!uMG_s+Y&`r|XTn}Yvu0sv3 ze4<&X5J2D83>DTgU5{y~T#aJ@34Fvs9#c?Df$oS^XajL;pot}M4P@Kf)Hn@o_+nH8 zN!V$Xj-~4RWqmPqXIY-^aZ;dX{EP8CekpmSHLj<`R8LvFo|0F+n>M*=liOI`uw+U* zRB5wHn^oGZHrFE<n#tB*<OZ&srF)z-H&1E*--a^yw9FK(v7nzqA{i`EOKf<`$P{+} zl(8u%rZ6cU_D<=aGC-QNDNqsJ$)L3wzR;Wu#^*QalR;^ISWO0F)$oNvJUu-hEX;&T zybaK=#Imo|D(OA$h;Bp=_v@hrYm|HRZ|Ye+-OQ~SsF8H2xwpB$d7zmcZXRhy`<o$X zmYT<!Pc*a5{q5Dzh4TMCbiwZRb0Q!0#r&Md4+YUH1kT@0caSEQ4AR&I>;JHk?&qmQ z=k1r<{)O$nx;Ni6{h|S%&8*LAm1z#GRnLTGW?L?w?<mf$bX*X&yDcHFBV&d#ch|(~ zZL^kKqCH*t-crR)V;H21;LQ6j&E$J3e!3T~O+RG`b3rvXo#0Cn0QLn;YZb5`vsNHM z1t?H!Sc=x8Ytik>gUVCN5yc2G?Yv9Nnb)>0MwbK;mhTz0!P)M!UeKTq`6M;0gzEd5 z{HNb4A%7S#B6t>V!u0i+mT~@11|^+NPw`DpTlEwj^^dBn%AZ@5Br3(9IYUF`@gkfm z7pvJna43b<au@S2F~^P=&(~&`h+XV`>eZipSPw8p>ozS_BxCE%YaaQ^26(Md8Hr9& zu4Xn8^H3BvYFp1fyD;Rir>~)OWE!5A{Z#MyFRf!aT#l9pmIpV&jc8+FV^GYfCACL= zfV<!KB=>|*Krj$=l6h#*k$Cj5;0}Wjk}RS+ipDgX4yJ(SGM8iveqArvM_dCap?O6e zofaLP03DqcU4zRRN(HH<gggO)QgCH(Z;%ZhMG4@<G==UhUW>}YZ+~7{Tc*)AsWW8~ zLLS6=qM<-uaKK>6W4KG5k~*u?NPWUUGZgSqzx5%NaVIdeLg{r90Ro8@Y7194==v8z zP3Si-gl+6MEir>Fw0_acm_}zNULgeg+ba|K`9e(M7F2$)DALjX*_k?VW{qasrc2;7 z>VKGAQ<)8SbGv~7B;m0dg*GH?R9&)#$Zkt#4ieI;tM93*t8WvfBh1KF-A$^x6P2Dc zn?hJSq>tdrC-r>E|EOnBzJrBRk~j(YB$ZZ;2`a57l1|^xB#;?DPImK<-acO64l`BX zMk@6h78Vab;)T2irpS7f+6$)C0Zh2GRBvh|^`!a9z}^(E;EUlDBjJ`4DaI=%;>z@R zFfr3ZJ_!r2UR|TdrzDMnlbEth5&%X4|FUI|4N_6Cv$47<lQyToq*E@J*J!NmF-5lo zAzzBmkx*!an7|;(IytQ{ibAE}(aFDJ-I7Sg2O3lVu96*^Xi{A*+mWn#)s@<PCN(}J zjhE?0npIx0pFi$9NOy-CAL=7!+S;A~a;laHUfrhBA2eG6QNJbRhe5NC!~(<rx4dtU zkFq%Te`e<0dp5hdkZg80*;_UjHeoj*2nj3MT)BiwxbLDi2^YDDCCa60LE^Cp1}#>p zidH~-5ELz&jbNg-rXICkYQa;rYOUZATdLAhtDagQ`}@wjOMrgt`TTyL&+otU&OY<Z zJkK-tXXc%G=Y40-t#mNJW2y5`$2(o{7%Xq>Kb`Va76^v}qr>5zmv>(tt@79%SzR96 zn7s36J#ACjqXXe#_nE6cJ>KZ^RhxD5{qujwZ@)Ki{;Y0Pq9(4nnw;olwzLjqGZz+; zh2$)9QlX0#IeG5SQwtW-0gUosF+Fz{35(vDS4ggI*26esdJC7UYPqTO`jOXzLAhwb zUH8wQv0&=5ZrJ?73s!(!Iym&17K_<xEHb#-J=#&09IHLc?r=D5g<ADn5x@GaNPj3= zqvF6A6(`D5WOqr5Y%fZY^Hp+URO$nhd#yP&W{U}t*IK4oTCG!yW_V(j`PR9)(V`WW zrPgId>#RM>=FDEpX6xqc+lsch9<V%M-I4V`(V%6}`ZN1r(QB4htuGh8TJ$^1JJwGv zC#)xmzP9|``gP&gMHL21n}fTGn5-l!EGlvuGK~&Ho-^N(r{_9{9)qT1)b&Lct9+Tn z>CLj{W_4#V`9VXboZV*eqFm04T!njxQWZDA4n{KcR*M*wm#5bo^v(hHkBGs7FnmvD zBx`_&`ezieq5=L{B-1U=KsuEvGWWW#?p1HE1@^O7p0wHJCi2Pi-?@BorCjyM*J9+{ zck`9<A?;?(h8F_=@)7N3>xLJ?x-fp~X7c$rJ+6aZdelc?^7sPv{NAxFb(Vp;iErZK z(X)TM!aH%{@Z7ltV<)lS``A0RSI$2Br<t{7-}%F5?4`G7l)8dCzu#gD-K|`4`o1kQ zHGaP`##7P3%)I#Q`|=*)CDMM5dqjx65;<Y9u}sqEXBL>tEajHbN=TP|DZ4Znv#nyw zY}W*Du-(h<558u5&vt@+ij}U78S}NF){wZwc1dWmP2`12Z6zV0wP`~6`658&@J*z; z{94<%f^nhx>KWC`=tf#+yP;rBsF!ZDZ3^w6dqa=XBca{Z@#<gYzhWDzen0<R+mY&% 
z`JdQ6DL7XB75yXszeD~hY-)aMZ~=?u&kL@|zrNrl+l!&MZEuG@wtXDRw8*uR*Dcpd z-Z3smna9iJr@=m~r}8{LtfxvnzI>a_D-S=k6=1P#vq^oJ6bj~sZ26&}Ey#k%d%nG( zAfFrbdZJLMv`inm9Nn}aIL7OCU+_d$>)9EpX4OnxbD6Ccw*|{A%g!qG@g45?@?{S! zisZp1XTz9j1&8I|8dxv*08>83A9=YzJ>F0{Y^g?<_pU3i#YO4|O%X@XnwwG2QkJ#W zX3MIzS+i>i>zaHlUJmBh=7(}?Q=iSHRx9$}=lO<jfaPO;#x8hPcmbP<*0WzY{Owq( z$lG9UX4`CZkk8n0)+3iLaz%4?C^X-fclJwV?U{8WE0%Q&$_h*Ur+=rY-ZQ-;jNa2* zrEWQrAL=$hw`xUEYJ@vvU=zwqa^?1uT$1rp?lgoPiYc3$^m3vtUq!69o;`wVNZoac z6d1H-U54Ih&>4*(ZLKal)0R`4k$&5%UvH=Z?vdCj-z+)-$0dew!D)t=5;g2KXiKyK zeTAtkqb#S)UhWuOR$6t5w$@$~nyhWswV9?k=4hk3s6J+lW<>4L(44B}+Qqt+re*eJ zj;qJ6Q`TwM>DC#qH+?VTd-m%c8=Tj>*9JE!x9fYITY_6cw^iMx+hMvp=Wg2$`+bh> zWj_jT5AD-GW_Zl>n0=q)QRicYdxQPDe*II%0sE7oUxvQce{DKb__cf5vfz@?vZ~t* z$^^&CqE)W%R4Pk!OZ3YOqRlYfHKnXAsKgxegEK>-UDvK(U=mnN8HLH@%nOcomb<ES zwI;*4gZL<WVx1#oa4M#(RLwfF^*R$X>1#`~xzxORIUc<zuaWl*9FYox)2YYQ)#-F# z9MzMSIVi`T>&Ph!mOIL_GZ0H@QHi6pwrYZ-b|Bf^?=Ts0pI#No4e53643o+0fV9Kz zbQT$mM)lEw!wG^jSg6-~WxomqtF$_u<TyiB@K)tymzI^))@Bno8I5|q&M<MOc25=B zED;%3m3rMuqDV<)s3ug^Q*~#Rm{HYH)mhc8dX80{s?t}T(Erge%j7s{H$BbWWXEW2 zl7GQ)BtvBEt(!Q&SM+=2;2e;zp%hq;+pK55P(Moyoc+Vd2z|j9&ho7kVA}-_ecB;6 z*aH80%)WSwEi)`P*>u*hdPZsbqAS&hR!rRqiR6}*<(cb?q~#95a@n#?^{J;?G5Kc( zqrB<{oR8sGlHQX`?b5NF($dr}+=WDzdWXn2PPbuPV^MBk_|~%F*M?s&9=@hBBe!KD z`^q+MLIpE@SmqvOH|G=-<dk!3@r0U6rZ8Snm{)SCX1c$m#`pcx&xl259#pQnDZj+; z_lLZ`o6hR^=Ibu6F3B-xV@c=_m5<$c*2O<vAIdM&XZlln1zE}C;-uJ4ZvKmN{n|kG zY^^$5$pW|m{M=?TmjCs^?2C4ht87Mt0~+Y{Ae<>z{b(UKN1HWTJ(wd~%T)?zTDR(O z%XnT7od2ahD+~Vqb*p-2T*Qz)+ZeT7ULXq82Rq}uQkA+!jms^_wfhWSqbJLqU2Jm~ zxb1a@T4P<dydl-ur|G8}nvE^CmV&AF<@z7%cNiYD-|yJveU$d;_ZS{7co^OI=lX+& zL&ih4rwX37KjRqkzGeH$_?7Lny>gd<dDU}GJ8M)Ns7m3Y@)VvtIfYA0Q`qNA;jAnb zM<NBz8jE*5$v4WnHS66sYc^$V^BU^(HO3lSt>Y!_kmnt{Zj13Y+vWl>K6{Fd=h)=d zwZom>x*ka0ijHhAaN7zBLIz_l`m@7sS2wT?dYx9IDC&N9YTMdgV6qLc!bo<9ky(w! z#$Cok#y5=`;|7C6PT;K(ZE&}KQ2%Sa&~GplthLL(J#mu(`L$%%7}6gcN}Q-3C(%<G z<H#^%;D!PA+##!%^>|aY0V#<OS#oMT@~;dEtbxF_*L|VRYV2oie<;8N$@Yc)q{P?R zPE!53QwdJW|1;Q}-q{}Mf#1*pQ(f%ZbF(D%$P3Js(8&%O-RN2Rf#iv&fDOeac@*ri zL{6<yJ{B|P)au>1Z2;2W|FYC+kKec*p6ZP9p<ZbgJK!uWANAJT`FfMLh6QSJea_)$ z%7+K@%3N91V!OY@?F$WSxp_iirom#u1S_kk^~|4z=8~Y*pr=%i5hVYOErS$(8E_G1 z|AI1UBPH2v*2S)8H?h0;OZ<Izi96)3b3g9!CX@1i(r7oE#X7;?kml!rUpvD8*B3(? z@3Z^akJ(O`-D&d@%**WM5vfrlu3XAfu{-UzAcy&+LmoY3#vHL&zSo0G3rLkoe#YP= z|40mFs$EG{6abcIg4igF#;DZlv`i7erPhO70TGpA1#P5qDxeB7kUVrqBWDd{r_pd{ zQz2?ek-N9JO8&?4{x7{WKLAW87r=?}*g=&15O5rDVt88e4Az&v;T6eaLbk^fX&|)P zJ+upLdWu!i^E5=$=n-n5cG^yp=`bBYtKYz0L(TiBnf8&NxzJjzl+QG@gWjbp=sNnC zj-kcd=mVCGcv`3%`cs?y6YMtHk{pDTk(%hI^fX(^W}{`BXbP`j0i?VwIRworOTM0b z2fhdCV^*B(qbYFx0s2ykKJi1e{0e$Cc?LNxriHYZt!ICN9(K}JrAFyZUQH9}Aic%f z;QAKbpn1n|kX9npPgp)1N*+!Ak$$c)T7sDVjkX}i2^!*KM3ZJWw7Y~ZrOT)b-0#u5 zEQeKzNU}89nA`#CUizE|_)9{Ed<1AJb<iF35NhLXI!<3Q6C1}KWcy*>V1L%UgS@rT zTC8Szkn=|n_Hi0yRji8V^L*4+KD2r+c-v?X((k7utc}Ik5c`GLqX`YyCv%gdl7B=W z9!*i?a2NdoX?(##kU<K<E7mAQ${J1e*_%<q#q?u3LT?}+AE4g9q`$M#Fdy=p_{QY? 
z<i6y`B>SX`CeTb;K&xmSeU~0at3FRJ&|leUZh+)r<webnnp4TUP`4%2h`i5$^lZev z6|Iz@0hqT@$}HwaDJHPXFzzg6+t@u=B)`kv<y!9H*YZz9T)ZaUS1!?Ll68nF4?Wri zEtyZt&;mE1w(mlD_tA^=3LC{rSS3pIHo||!C-P>PpYX%{1F=bLQ_g5^9X>Yv>G0`f zFGiVW=v@@G`50>9FDwsvC}%6!YW5LwekXrYWD2YBiE*Mq%n>oMMQj%@i(f0(Df^Z8 zG*dNQn*F-2;qMH;k!(wTpXAu5MZQWgrqs}Q=+9NqrmK;^ZkX$6J>5*b^aJS2U9_9_ zqukHYEA$rqo<2hh5c43<%aQsu(7H|R2QWL><LnpgMfM8&kbNZ$_rjF%OL#qR;;nos z-vqOrAK`EF6T&GLiH)KMW~VqL-c?9Zl%%E_W{PI3X0P@&U72o*ZlV5HXHK3SeKvOX zgJBxB4_`if&+spX|CpSYyg@zlzY-(odfJR!?tl*Nfq4v?c!*x2U(s*XynfC&(?H{G zxKE8{6==13HknO@`4*cAGZ$t)TL9C=7P4h9@+|*<W4EyHvpd+2RC6E7wue2+4#7Od zo`!jg9c6!DpRmt4l$Z->vmcAJAg@J9oA_itgU^Cl%2&a3^XvFJw9sDO&j<P2B1iZ| zrRWmZiXGyo;(76=_*&sgg%VW4$~<MMa*J|Uc|&<eIjwPNS~SZvJ2lU19oic0T<r?& zecA)s6WTMl?Y8R{>elPt)Ft(Pj8?xyxes16<OQ{d*=kL$ay>taeqj^cn$2u3YD~-L zh?U|8;y*Q4u~Wj$-ebLDxwty{6Vb~5E>^L5{5j?oE=`@dif+e!cR&A-f5HE#jAC>6 zpIDjlL-rJ3C7QTa9an#&j8bmVoWO1Bw^YYBup$1UxJBHO{2A40cCw?Io%{`QE5~>a z9YwF+%<n~@U-RXBD@BzW&1qVWdVf@NJ?eWB-@-<VH<g|AvGDP~u~Y0Gj2f@AX-YBw zHm_y-G2)(OMRbx~OWo{86k*S>-?IT?Y@gW6rt=K6R-BvJcuXK(7asPeFj7qBk(KaK ztev0YbH%gTBVrt;c}M6!=|(142)c9b8m8}{2W;o17}HuXn*4@U<Ia39#``aZpOquF z<{iye=-ES}f@a~aekFg6>d-enhKbUxxOYDd9o#}8ejlw*_OQhm_rHbllhXiOK|yB1 zXqk_^ZN&W|k9#o|$^%vZj`9ChjQVZt&-7j9M$a0eGDUK3S6VPKbz=P83bPo?r03C& zcWDo5enT@@K9S-c-U%&#pT3QI&PPb!PGRJ40X?KtAlGh;%-15skB6sF1m;$Hjd9w5 zJWoPDZ&#*Z<h>`k0wr9Id(?E?6JMd_$$O~@Z8s}<OL8l9Bp*s%K}%_Nav#Rub;$%> zLYp-)K2H-+YB0vW!d}4r=yz-@#_cKe9>ya-v(YCoKSlm0X`Z28<+m6+>yx)9-@+WK z4D+KM7-yzo^ty)rjCz|QhG^{YWxOxhD!OrpI!ZH>dy_6^q-DvK7<r$iJvt4>o*pXF z?1AoXRj%S8<hPviSP;}JG`ol*jdSNj>L*<qo>*5qLB6^$wz?`598*~l7+qdgTH-JE zc|C4dQK8deFR<n3<>s6}yG5P;3(lyb#n;;Dj+b=Cl@i~SDV5Ui>jI(c0zzlp4N~jH zGI4jOD(1deG6Kn~{z)>Dl8lT<GOIgG;mQhki_aZ@z1im;U<+nO;l88U7jwr?s_t*8 z?mJbt8E%gU0o*ONWzFt5>vXroTh}e?ZRu=AbbTgcldoxsv9f~tj3ziuaK-a|-F+;7 z5>p*Kzoo8^lirL>#qGZ4mUw}$S>_-v{4HIJ<Lxt}EzJ&(CstVzXHAQI3*+Q#j9UV# z2sNq6#I;Rvotlh$xhxHBb@x>a_1->UrG=e=jK#jiT~|coqAMoT$O<6c=6L>%$8G1m zh&;O~y7@w`L-e-Tmb<05w|BEUzI$f$Las;Ju^8frbAM}RZz~eM9aY#i+l{pNrdTx2 zHX%W`EQc&ps&q?yEt1l?!W}pG8hy)pS9GFb?7eZCb%Q5iw?_t($H?B|?wu3$dE)gB zU#zRy*_TVbvu@}wh`0+b=2TYnS+i2r+LvibJ2K1{I+l#^RF^6)U2U^Q>XpeH`KCf0 z;_gLm<Rt1tnI=eU$pq?MGy!6`Vho|;i_sp-<A$crUTYobRvA97@mqcF-Y*I5?>qU~ z#gwizMeDbINzy5`WJJ-xKj)4I0`bwKrGn_1(4xrWB-KBzvSQr;kNdi<ZXi@2wWDsk zVs$}OzQ-e*ck4ie7Qz$nnHf#_-L%k=phz$fi}Oy&89K)qHCJ+a&T&S9clw~mPpYfE zQE`3A$j@TU%V}9w7iW3@1A9q|-!|LVHgiGL-O}5cuG_Xb7x_~B2_w9;i=`N-@VMef zO-=Pd^=2)IN&<cwf2*%$dFK>#HRLg#(-ajB9!t5nL#R<ff3FycS$d)wGA_lhRrP=I zfKCquRw>Ng8n<>%N!c-@$Mb&~Xdrn?hEmb_VCgc&>jD@1Ctl>gD0>;b0vT3Hc-x!> zy}ibZc&!+$dV5=a?$+MU-mZaU&qANu>gyd8Q4#I!Zs|PN`~%6Sw>sjjx5rS3WvmXW z&8e}^$F|Jui?A)T7eoiGSQKxW6HRc&n>rg~eZ}BK2i+K-RVtTMNt7P9^x&R{UXkE> zRo*ceA?i`Nib_!Ziv}2}H2pamqeTNeMYEoxanO_$EuzxoB}a;;InfJsK<!wuN_A2o zCog>4<;q<x9hUHy`hwJA{o#)apO^UXlY@_*UUT+#E9o=gH9($GI-Jfk+(PrMbb8G{ zj#?>AKY!I|YnfA9OIhL$n#yY_SHa{6_)_H-^20q_V)(=0Plvx*q2ynHqX0Kx8sL<& zntJ$SWCr#E&jHO3JO;QzGmm`0mjYS<8SqaNg|ry_eSkJj^c~O_0%ZI%0qZrtRKsK- z%~r^a1<XLYUu%Au{1|kz@)4B-Ch^+jBFI{Gh2+)UK@RY;(&f4y71oTZK(m<cBgzaD zS#4lXAj$#GokCd55!sIs6+Q_#MdVtEBJ~h?;P)boe>+j>3q<8ZM57T`MJ@pJG0TXm z&;a9h6HUk<nz)82jPMNz*LVd{6L?MgP}~}#)(WD@pig}n3rpnTTUh<JLuMw@k<~Qs z2vH1iTmjydbBMnEI8i6!T7>*Bd6Q^4{43vATdjsZ$BgMgtgGZMbCBktg#V>{N*dyW zG)H{aFGjoS8%Bxa)G7W%yTr$I6rhk5jh%Ts)ZN?14Oz2PN?DuRlAT%1m`TQ(P<CU< zQiGwf%*aUD31yASl7zW$`@SzFr4U&nOGM~agh&#}GJZ4TR?~C$JkRfC`<`>IbA8Xb zuIqfh-#^UjeNfhRB?Mo8o58?KH(+?8&s3P7fjidZ;$OuN{fr?8d|Xy~TCQ(iEUvW$ z3K_gdn!jL;diJdP-7R+MQ`<8gb6H$EOMorrrdHNTDQ831d~zjtFi)Th2)f8?4Z3;h zC4R|2nh9~T#eb_+Vz42D_c7x(dWqv$9AsFQooE3Q-ORbdaoXfCpABpOy|ArR>%oC? 
zfwNC8u&wTI0yjyAXco2DkFX81AQ+6>8AqAf7@c*ahZR@FGX3|BGpevzF%PpC8%M;4 z2GvxzqAD1x)Kcp0+}qCdO4LR-pf?wreBNHyXcMqLvFCA1Lm!znL}Ht)6v(>i+w?7G zsga}-vS^YySKkufN514xB7ip#SmZ0>J*nWPwSNt-&IWut!ad|;KIZfN!<lk^3q{$% z#PyJ(&@w%Y`OvlSw!GSx*T|b!=f?8;xboMz^6yi>zj(@2kneS^C!Wc%;z?aeu%`UV zPfD<g`)bko!LPi8L0O$NZkaVMp~~&g=Bb?%d~`KEWUNo}QH-D5sZu<z>;bHT)ayR- zi0-8=s8umo+aDdQfmx|tMunNkh<Z8;*ksQiNNUKQNcf}R^!h@CG-pAPAf#1vT_nUz zVctQVi^NZgJQ+1f`iQ&8Z<m#)o9iYvAqAHagz4v}V$~!I3+xm#ZJcBDIkZS`2KPI^ zQ{fuBCK6UI@a9^=9}X9kFIZs49viTwutocUwY=F@jsWb&tiIO=SJvX^({smi>v8Sg zE!k0S*Q-xew^ip=-yKj!EQ}a>)YYEy&-1B!1yn=5P0z)f#p_ua;LX|(f@{*06of~| z%Jt+D8-;2Q^>Wqol1MjH)Gx;nn2z@f2<Jx=I@QL*3_e=}PQ6936o>XOBKN+RBb`;R z3DGZ?$P?~mpD`rXh}|P%vNxrl0}`*MvAZ6dTRk#w*2jE@C6gZo4RF>K{@W!%OV(YP zBn&hk0W(yytRn5Dx)=9cK$E4>(?W`(&0nB5OZ)!x;!9BkSE+Pbi`z~Z3okwGc=6)X z=SyoU9miLtiUNu@i#jY8Oq|!8*IW|2hqDGY=C)R_EB=Wj#TlMt=5JC8nVpL=#08UD z12xdr{n_hzvcmnA1``Oo*c5+(DE0iat@e*wSb3RO=a!PMMQt)f_XGS7zm-x?*z2>; zn8%9QcnvVDK;D!8WnfzJ0t<<Wq!B6k{xjof(zM>XLXxE2tsce6y$3g!w8w_|x%xrx zeHeTIdnSW~Hrvp%i62As<C{^reJ<Hkmy5g)&l53wSn8SQKy?gb1OUr6^8jFm<-$z) zo%poo0Dh?ZVUnOH11XXjwA>m%-qZDN^P<Uquw|T=NyZ!9_7{CNg2gA8j-PklFyBB8 zJ~F8Jh;BgKLUwNsDlPZqr4rJe6(Yh?FrftJQS>M;7<MPq9n^CHl%jlRzs;jJF>kZX zZDc)G3tZ3TjvTsa8~@!c{yRZr`JC!BtLn6y>hx(<$UWY`p)MkNRNTWS-qpx#;a22x zY|wLT#v&yCa;jjLyQJ3KK>o<d!i5C*#-`rsstC@W@}2^IaD{gLn+WN&Xe-EmJcEE? zxFl1$LHK3HEwTNNMpxuE?iSQ3t)l;0t1rSF+_-9h&fX^AHZ9>vtCWA&wCC7li%31X zHo^OMTF$1?K6237(PL%y9ctPOt`uD;)DH#;e`<@0j?i5R)o$g|T?r~QHfvNixhOQy zz<a0;FqV{W#a=D{cM8AlO}2O+m{UcLc~{Q7XxB3*!`wWxdspvRG8!2&<Mqtjjk_VC zfLDp1(SW25TU2MjK<Tj$`fCH4e7B6^p6a1s{oG%Q5Vx%dy3(E(IRrkv40U*knpOL> zzW5nh`Sz<smAKCXlpo(z8tY<kMM2b7sWJ=pl7#JbS(ZE72_J_Zx!5Z5@s#_r4;i<H zh{S%7VL_<O%xOuq{n7dQkD<`$oKndEnalA!wn{6do;e=@6O2RLYMbms-G_QQ+<ReC zODD=558bxyZ{DygAp=-^#A91Tbu7~tk6X&O^Rf&Nkhe+;CjqHR`zD8j+hGFI$!i0} zUBcT+M^7EEg`56qlVy{imX}tLmM>UTG;se?#PK`@%gmCxIp_+%UzUB@(A*)_q1z}+ z)k-bH`^1{LE}J4<8<R1MQu1DZ*BdWVGqM(2h}wdLw6XTd*B%4#JTMB?3n!g^5pBd) z-MX@na%$tqh6d*&#KhI4XZNHIn%{yW-!DgLILOt{a$j%xs64hmrlNMOSJc&J=0-3~ za0=v*p0flSks{bNmZ1DiEj-+om95fo8&{j7+z$(*on&Jt`r#@9!NnlHvg8|1m|}U9 zHR|<|_d{RM^4(z%ZT_<IP8e%1^GKDLoqskx9C^foeQaM=xwdI>=0nrihsEa(rIki& z7&OXv2ig<#XMEvk4voMJ%ZTLMNg163kwtc*kxQa77Vl)HPnvSBCq#F7VVw9|<XoMS zIZefg`hTJx9~C_mB;BVx&J&8-zLM0qePZ;35mr(s2qvr+Q!oqivCoy0V3ODE^i*|X z+*T`-h}-6QTKxFqR%^`r<s8Fvod?ui0zN*#)?Vq+tZN`FZOpP7Nu=*REPuP+O>Z-B z;@0$<!?~)4GPjq?<%jirQp)bf^VM`kt;$Es;S9Y`J~$g-*IhwA1&7zzCBE_y!@)c@ zueHNF+V47@jjs-5YA~ogdJ|Xkyn2Mv))q18cTF2?9~sFC^m9#=3@xb7OpBf2N#`~y z2KS7p>MaQIs@A3n8Oer^CQI*&aaj|1=N#gMhMU86`TX*nupN?-xa5W0D3P{)_j7l# z%*f6mT|i}x96{uJiG8DATzHKeQ~6MFO+htmraI%b$CAbOhhjoJbuD7)-QX38h??0V zX@LsVVU*Bx(Nn?Z(WebhS)bmmadQN^sC`hKpAsHNkH?LBjN6Sf*_Sxk?pHm?zW9aQ zD*H+^ZCJoozT5S#Tj<j5ahRKBXG}tqtGdknrV(|s>L;gTK%>|@sg|bPO{_=u-r?%) z4Q?WHy%lSkVBi>I+`u;(F>Kuul{txx@$juW1J2FZOW+;c`t#x-FH1;6ss7)u_nOvE zRUYJ4HXjY)U2%-#;yd$j>L}W>zbNDzLnZ?m&>QJ$Qlj$+o>Nmnp$Io61b#p8UI16Z zU2L*FeEa#Febw=Oq`?A9asIRFTW;$yD;dLcHQ|NoVGJ6&Ii<-@Ug-yXy2Z)ay3HzC zCS3g+O3f~*heDZwz`-!&4^WNLZ$LFv+|U0Gs-dUj*xl~NkAn7vFZywa+|~l72D>se ztKYN_nBkdQ1&F&&aYRalCi4Vsc=^O0oOv=W>FRU2{_HD4n_`>q$eArM#aZz*{yg7y zFFQOkYIf>G_uz}W82!<AdfWXuHboEiWPTblP-yeDE!-L(Q0wgcn7lYWyIlVDi!Hz7 z+@f#7iEfwWuNw<3CH)?D6H}tg&UfcLL1D2HV?+3@S9UD}0-Ih!9UHiY$BE^^(dWLQ zV+TJ7%jaMdK8{E{9}7r!ols(}I@+csvHW?W;WOK8+t<9#K@|7N5hbkjL$O7UI{W)_ z5ll$++xdDC!mp)ODw+0LlY(=|hq_LfEQ5lz$sn1OoEDvDIv^Ogz)*r9DDe0iXR)Ah zUCmcqg;8M0c#x^WY0RAb*W_`zhsbBg<0M!;-Dj(q`(46&+Mzo4UPALMdEWC0`AyG0 znJ21e%5ON=5ay$_jD*~f172UJ2L!iN*1pY04J~RK6g6VUA|GUI*nfN76B$%HuNRYX zW&PWF+#NO7Aiw3$d{(j~i~0On5@uU<ebHATp2)n;^Le5YZw(5M$2B7+#{;^2A57Wm 
z*ea&Hfam(%oq9(u>5P`z+F~|*GXwZd!O$U}lwis~$^}kWE<3~}D#1w`OLX+Yd3qDP zc9=?(E<=n51;=E3`mC9*g{(Qw14}f-UN$9oVDQFf2IfD^bTA$`cVD0s%>qa<0n%)w zcMRMyE<_*{^21tFlW+xS1C)os5kPq;2o8jRKp+a?iE?E}Gt%?MxZ@l(@GkCHAc)qZ zL3G4Y5K{=`4*H3@QfgZsLV-0=kxN<_PhBj|<$^cO4Xw+}8|z^KMC{C`HgtCz)V)&J zO;`w!26!^1Ls3zgRa6WV6?XMAFgyM(Ie63HROVinu+%YHuO^0wrO~hcpCD0LRazG? zUb}HPgZZZohWtab`XvECFf~Oxr}reDg0sTmU4V*~IJ^d)h}-R;Ex;1z<b8oi2`3eh z1p>k-k=*@)LZJ#EkTQa@OS((yC<L}^L$&{JeyH{^I8*@%S3)RJ)#jH51@;AjD1hP0 zNH`Vd1&6~GkVps^f%vt5=cfN-EpRYM0Rcr)y&@1uiW3kR2LE*)3<g&K!yr)Fjp>}z z_0xHw-UX=yRX`{y!zrGAbJJf|uoE#Q`U!Lm>L^{4Zsu;(Aj-;=^+P~VFp%z&662r! z%0Dl6XYJgT&K>mU-sxuUn$yBhSqklZP)d`4#^b1vGa#6n3;GHM)0glrv+M_V?YGP_ z`bPn@7YadxLYfb1DKH3!F&+Aircj;jqS6pQB<0@_w_nOs8|Und^}^yEv6NZ`0Tn@Z zbf*qN-QLcz<Dd)yX?vjG6a@UkU>C0jgizsZ|K;;7q2fj}kn`H*jHiw&<_TRU*@bmZ z^=^Z2R2cxbwOl#9+7s&3G_x`W0OA|lYRitkv#8@=*x$#6&4NRho%lbLmunpH{3>23 zP^^qA>+m=x9MR?Z_yv334cmPNOkXlukxLxctGJj%m>Jq-ZJdn6gn|^dss<IoC{Im& zlJ@A;lPPV1b89)0d#vA@xm11X_v;vb{LVj)?NULJi#V^`3ll*n1F>kG0zQEcXG5JP zs?(3$`%d{Jeo`TN>C;%;$fIx3j(pSivgb+K!y@0p%V4ehlnd{vwS2=5{5w6gu>6xA z2n4j7pWUSWpTb5FH7HF~sEW3u(mSVzk}ayrQo9W>-d;GWd{a{b0?P0F*%{x>I1o&& zhMn9{MbFra;7FvX{Eto=x{oG@JM#QP4R@M9HBA8m+i`7*B@!-qQL2X`&D6@FEQkIW z&<cdUV!-qjv!moj|AGoae!e0Z{s0M5p@NWq!tfiwAN^rL-+bcQ4BDr?NRGarc<lPf zE%zZq5K=tUSpSK5W0FX?sE}4gta!@wVngM-L{Y2Zwj0jLQ|E8$-ZW$G@fC+;w-U|K zy5nVyI;NdE4Q%RPMlys^z$VX@50O63GbgT8zmAO_xw}lNZq<DLO5pRIl!~Uw$-u=| zdXT<n9ppZl!-Au7DgDY8rzh^>6&s&9Jmx#&WSf1j?2G9^qv8N3cVyaT>r*Cc)<F3Z zZlq@A{)e+f*}CcLlMp{^lhTCJ{H_WWBgX6Vr3fqN*ZjiX@E|Y#*;qsEl6l*sZw4+s zQ~d1HORUQ(Xe8s4P$IPe)48Vjk<=$L3kl<@M&jl{9E&sD+k9{I6d#`<j>h9XZQ`#5 z0S-(NoPSf4bnhV$1bDY7A^(04u`<Ov(`s#3x<JUC()&#+s3l-V`NShdYH2o>KpP5l z5eWIGVh99EB6sAGYN>~JCQuSft3moCI-Wm&er-E5Dl{}L6mSLthMZ_<X=-Y3YT^;i zTwshLGQN%ty{zFh3{O~F27-JKi63~x*%id)uz?PHE)9g7=el!(Q&ej@tlgLeB+YUx z?1>ZmaMyR^QT`|5P%+1BcHI+w6vqr)D||2i9T=iF#>@MP7iG@~$}<qgAt7<nNQdJ; Dlcg*@ literal 0 HcmV?d00001 -- GitLab From 74c105c3f70a7ef24253117aff21f2def797258a Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 3 Jan 2025 22:52:50 -0800 Subject: [PATCH 009/109] done blowing up --- hercules_cg/src/gpu.rs | 1019 ++++++++++++++++------------------------ 1 file changed, 407 insertions(+), 612 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 768324ca..f2cfd9cf 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -2,12 +2,11 @@ extern crate bitvec; extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::env::var; use std::fmt::{Error, Write}; use std::iter::FromIterator; use std::os::unix::thread; -use bitvec::field; - use self::hercules_ir::*; /* @@ -30,37 +29,19 @@ pub fn gpu_codegen<W: Write>( ) -> Result<(), Error> { /* * We assert the following: - * - Array element type can't be another array - * - Any array field in a struct must have known size * - Fork node must have >= 1 reduce nodes * - If the returned data type is a collection, it must have - * originated from a parameter. Technically could extend to - * multiple parameters but we aren't going to. + * originated from a single known parameter. Can relax to allow + * one of multiple parameters. * * We don't assert but assume the following: - * - Global memory can't be used in a phi or select node * - max_num_blocks is within constraint of 1D grid size. This can be * relaxed if we want to support larger grids. 
+ * - product types are packed with padding inserted for each element to + * be aligned for its type and for full product to be aligned to its + * largest element */ - for ty in types.iter() { - match ty { - Type::Array(type_id, _) => { - if let Type::Array(..) = types[type_id.idx()] { - panic!("Array element type can't be another array"); - } - } - Type::Product(type_ids) | Type::Summation(type_ids) => { - for type_id in type_ids.iter() { - if let Type::Array(_, extents) = &types[type_id.idx()] && multiply_dynamic_constants(dynamic_constants, &extents).is_none() { - panic!("Array field in product msut have known size") - } - } - } - _ => {} - } - } - let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) .filter(|idx| function.nodes[*idx].is_reduce()) .map(NodeID::new) @@ -246,8 +227,9 @@ impl GPUContext<'_> { self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; - self.codegen_struct_def(&mut top)?; - self.codegen_reused_locals(&mut top)?; + // self.codegen_struct_def(&mut top)?; + self.codegen_phi_declarations(&mut top)?; + self.codegen_helpers(&mut top)?; let (fork_tree, fork_control_map) = self.make_fork_structures(); let (root_forks, num_blocks) = @@ -260,7 +242,6 @@ impl GPUContext<'_> { .next() .unwrap(); let (begin_control, end_control) = self.get_begin_end_control(start, ret); - let global_refs = self.get_global_refs(); // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) @@ -277,7 +258,6 @@ impl GPUContext<'_> { &fork_control_map, &begin_control, &end_control, - &global_refs, &cumul_factors, num_threads, num_blocks, @@ -320,9 +300,9 @@ impl GPUContext<'_> { write!(w, ", ")?; } let param_type = if self.types[ty.idx()].is_primitive() { - self.get_type(*ty, false, false) + self.get_type(*ty, false) } else { - format!("{} __restrict__", self.get_type(*ty, true, true)) + format!("{} __restrict__", self.get_type(*ty, false)) }; write!(w, "{} p{}", param_type, idx)?; } @@ -332,13 +312,13 @@ impl GPUContext<'_> { write!( w, "{} __restrict__ ret", - self.get_type(*self.return_type_id, true, true) + self.get_type(*self.return_type_id, true) )?; } // Type is char since it's simplest to use single bytes for indexing, // casting will be needed for use with different types. - write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n\tsize_t alignment;\n")?; + write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n")?; Ok(()) } @@ -376,83 +356,21 @@ impl GPUContext<'_> { Ok(()) } - // Emit struct definitions for each typeid of product or summation type. If - // multiple typeids have the same type, they're separately emitted. Lastly emit - // dummy alignment for later use in dynamic shared memory slices. 
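A minimal sketch of the packing rule the comment above describes, not part of the patch: each field is placed at the next offset that is a multiple of its own alignment, and the whole product is then padded to the alignment of its largest field. The per-field sizes and alignments are assumed to be supplied by the caller (stand-ins for the kernel context's own size/alignment queries).

fn product_layout(field_sizes: &[usize], field_aligns: &[usize]) -> (Vec<usize>, usize) {
    // Sketch only: field offsets and total size for a packed-with-padding product type.
    let mut offsets = Vec::with_capacity(field_sizes.len());
    let mut cursor = 0usize;
    for (&size, &align) in field_sizes.iter().zip(field_aligns) {
        // Round the running offset up to this field's alignment before placing it.
        cursor = (cursor + align - 1) / align * align;
        offsets.push(cursor);
        cursor += size;
    }
    // Pad the full product out to the alignment of its largest element.
    let max_align = field_aligns.iter().copied().max().unwrap_or(1);
    let total = (cursor + max_align - 1) / max_align * max_align;
    (offsets, total)
}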
- fn codegen_struct_def(&self, w: &mut String) -> Result<(), Error> { - for type_id in self.typing.iter() { - let type_id_idx = type_id.idx(); - match &self.types[type_id_idx] { - Type::Product(ref product_ty_ids) => { - let product_size = self.get_size(*type_id); - write!(w, "\ttypedef struct alignas({}) Product_{} {{\n", product_size, type_id_idx)?; - let mut cumul_size = 0; - for (i, product_ty_id) in product_ty_ids.iter().enumerate() { - let field_alignment = self.get_alignment(*product_ty_id); - if (cumul_size % field_alignment) != 0 { - let padding = field_alignment - cumul_size % field_alignment; - cumul_size += padding; - write!( - w, - "\t\tchar[{}] pad{};\n", - padding, - i, - )?; - } - write!( - w, - "\t\t{} field_{};\n", - self.get_type(*product_ty_id, false, false), - i - )?; - cumul_size += self.get_size(*product_ty_id); - } - write!(w, "\t}} __attribute__((packed)) Product_{};\n", type_id_idx)?; - } - Type::Summation(ref summation_ty_ids) => { - let summation_size = self.get_size(*type_id); - write!( - w, - "\ttypedef struct alignas({}) Summation_{} {{\n\t\t union {{\n", - summation_size, - type_id_idx - )?; - for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { - write!( - w, - "\t\t\t{} field_{};\n", - self.get_type(*summation_ty_id, false, false), - i - )?; - } - write!( - w, - "\t\t}};\n\t}} __attribute__((packed)) Summation_{};\n", - type_id_idx - )?; - } - _ => {} + // We declare all phi values upfront + fn codegen_phi_declarations(&self, w: &mut String) -> Result<(), Error> { + for id in (0..self.function.nodes.len()).map(NodeID::new) { + if let Node::Phi {..} = &self.function.nodes[id.idx()] { + write!(w, "\t{};\n", self.get_value(id, true, false))?; } } - Ok(()) } - // We generate all phi values and all flags for phi and select upfront that - // indicate if collection, whether their current value is global - fn codegen_reused_locals(&self, w: &mut String) -> Result<(), Error> { - for id in (0..self.function.nodes.len()).map(NodeID::new) { - match &self.function.nodes[id.idx()] { - Node::Phi {..} => { - write!(w, "\t{};\n", self.get_value(id, true, true, false))?; - } - _ => {} - } - let global_flag = self.get_global_flag(id, true); - if global_flag.is_some() { - write!(w, "\t{};\n", global_flag.unwrap())?; - } - } + // Emit helper registers that are used throughout the kernel- alignment + // is for proper dynamic shared memory allocation + fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { + write!(w, "\tsize_t alignment;\n")?; + write!(w, "\tsize_t max_variant_size;\n")?; Ok(()) } @@ -539,36 +457,6 @@ impl GPUContext<'_> { (begin_visited, end_visited) } - // Get all globals and global references, where for GPU purposes global = - // collection parameter - fn get_global_refs(&self) -> HashSet<NodeID> { - // We start with collection parameters, and follow any reduce or write users. - let mut queued_nodes: VecDeque<NodeID> = (0..self.function.nodes.len()) - .filter(|idx| { - self.function.nodes[*idx].is_parameter() - && !self.types[self.typing[*idx].idx()].is_primitive() - }) - .map(NodeID::new) - .collect(); - - let def_use = def_use(&self.function); - let mut global_nodes = HashSet::new(); - - while !queued_nodes.is_empty() { - let node_id = queued_nodes.pop_front().unwrap(); - global_nodes.insert(node_id); - let node_users = def_use.get_users(node_id); - for user in node_users { - match self.function.nodes[user.idx()] { - Node::Write { .. } | Node::Reduce { .. 
} => queued_nodes.push_back(*user), - _ => {} - } - } - } - - global_nodes - } - /* * If tree has a single root fork of known size s <= max_num_blocks * with parallel-fork schedule, then set num_blocks to s, else set num_blocks to 1. @@ -593,7 +481,7 @@ impl GPUContext<'_> { let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else { panic!("Expected fork node"); }; - let fork_size = multiply_dynamic_constants(self.dynamic_constants, factors); + let fork_size = self.multiply_fork_factors(factors); if let Some(fork_size) = fork_size && fork_size <= max_num_blocks && self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) @@ -641,7 +529,7 @@ impl GPUContext<'_> { self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) }) && let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] - && let Some(fork_size) = multiply_dynamic_constants(self.dynamic_constants, factors) + && let Some(fork_size) = self.multiply_fork_factors(factors) && fork_size <= self.kernel_params.max_num_threads / cumul_factor { if fork_size % self.kernel_params.greedy_associative_thresh == 0 @@ -713,7 +601,6 @@ impl GPUContext<'_> { fork_control_map: &HashMap<NodeID, Vec<NodeID>>, begin_control: &HashSet<NodeID>, end_control: &HashSet<NodeID>, - global_refs: &HashSet<NodeID>, cumul_factors: &HashMap<NodeID, usize>, num_threads: usize, num_blocks: usize, @@ -737,7 +624,7 @@ impl GPUContext<'_> { for control in begin_control { let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; for data in control_to_data.get(control).unwrap() { - self.codegen_data_node(*data, state, num_threads, body, &mut 1, global_refs)?; + self.codegen_data_node(*data, state, num_threads, body, &mut 1)?; } let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; self.codegen_control_node(*control, term, 1)?; @@ -749,7 +636,7 @@ impl GPUContext<'_> { for control in fork_control_map.get(&root_forks[0]).unwrap() { for data in control_to_data.get(control).unwrap() { let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; - self.codegen_data_node(*data, state, num_threads, body, &mut 1, global_refs)?; + self.codegen_data_node(*data, state, num_threads, body, &mut 1)?; } let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; self.codegen_control_node(*control, term, 1)?; @@ -769,9 +656,8 @@ impl GPUContext<'_> { thread_quota: usize, w: &mut String, num_tabs: &mut usize, - global_refs: &HashSet<NodeID>, ) -> Result<(), Error> { - let declare_variable = self.get_value(id, true, false, false).to_string(); + let declare_variable = self.get_value(id, true, false).to_string(); let tabs = "\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { // Phi registers were already emitted. 
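A small sketch, not from the patch, of the block-count rule stated in the comment above: a lone root fork whose size is statically known, fits within max_num_blocks, and carries a parallel-fork schedule becomes the grid size; otherwise everything runs in a single block. The slice arguments are simplified stand-ins for the real IR queries.

fn pick_num_blocks(
    root_fork_sizes: &[Option<usize>], // statically known size of each root fork, if any
    is_parallel_fork: &[bool],         // whether each root fork has a parallel-fork schedule
    max_num_blocks: usize,
) -> usize {
    // Sketch only: "single root fork of known size s <= max_num_blocks with
    // parallel-fork schedule => num_blocks = s, else num_blocks = 1".
    match (root_fork_sizes, is_parallel_fork) {
        ([Some(s)], [true]) if *s <= max_num_blocks => *s,
        _ => 1,
    }
}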
@@ -799,17 +685,18 @@ impl GPUContext<'_> { } } Node::Reduce { control: _, init, reduct: _ } => { - let init_val = self.get_value(*init, false, false, false); + let init_val = self.get_value(*init, false, false); write!(w, "{}{} = {};\n", tabs, declare_variable, init_val)?; } // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { write!(w, "{}{};\n", tabs, declare_variable)?; - let define_variable = self.get_value(id, false, false, false); + let define_variable = self.get_value(id, false, false); self.codegen_constant( if self.types[self.typing[id.idx()].idx()].is_primitive() { define_variable } else { format!("*{}", define_variable)}, *cons_id, + true, w, *num_tabs, )?; @@ -824,7 +711,7 @@ impl GPUContext<'_> { "{}{} = !{};\n", tabs, declare_variable, - self.get_value(*input, false, false, false), + self.get_value(*input, false, false), )?; } ty if ty.is_fixed() => { @@ -833,7 +720,7 @@ impl GPUContext<'_> { "{}{} = ~{};\n", tabs, declare_variable, - self.get_value(*input, false, false, false), + self.get_value(*input, false, false), )?; } _ => panic!("Unsupported type for not operator"), @@ -845,7 +732,7 @@ impl GPUContext<'_> { "{}{} = -{};\n", tabs, declare_variable, - self.get_value(*input, false, false, false), + self.get_value(*input, false, false), )?; } _ => { @@ -858,14 +745,14 @@ impl GPUContext<'_> { "{}{} = static_cast<{}>({});\n", tabs, declare_variable, - self.get_type(*dst_ty_id, false, false), - self.get_value(*input, false, false, false), + self.get_type(*dst_ty_id, false), + self.get_value(*input, false, false), )?; } }, Node::Binary { op, left, right } => { - let left_val = self.get_value(*left, false, false, false); - let right_val = self.get_value(*right, false, false, false); + let left_val = self.get_value(*left, false, false); + let right_val = self.get_value(*right, false, false); match (op, &self.types[self.typing[left.idx()].idx()]) { (BinaryOperator::Rem, Type::Float32) => write!( w, @@ -925,17 +812,12 @@ impl GPUContext<'_> { TernaryOperator::Select => { write!( w, - "{}{} = {} ? {} : {};\n{}{} = {} ? {} : {};\n", + "{}{} = {} ? {} : {};", tabs, declare_variable, - self.get_value(*first, false, false, false), - self.get_value(*second, false, false, false), - self.get_value(*third, false, false, false), - tabs, - self.get_value(id, false, false, false), - self.get_value(*first, false, false, false), - global_refs.contains(second), - global_refs.contains(third) + self.get_value(*first, false, false), + self.get_value(*second, false, false), + self.get_value(*third, false, false), )?; } }, @@ -948,105 +830,59 @@ impl GPUContext<'_> { tabs, declare_variable, func_name, - self.get_value(args[0], false, false, false), + self.get_value(args[0], false, false), )?; } + // Main difference between read and write is codegen_copy takes the + // returned node's type for read and data node's type for write Node::Read { collect, indices } => { - // Copy from global memory or from shared memory or registers. - // Generate if-else for phi and select where we don't statically know - // the case. 
write!(w, "{}{};\n", tabs, declare_variable); - let define_variable = self.get_value(id, false, false, false); - let global_flag = self.get_global_flag(*collect, false); - let has_global_flag = global_flag.is_some(); - if has_global_flag { - write!(w, "{}if ({}) {{\n{}\t", tabs, global_flag.unwrap(), tabs); - *num_tabs += 1; - } - if global_refs.contains(collect) || has_global_flag { - let is_char = self.is_parameter_char(self.typing[collect.idx()]); - let global_collect = self.codegen_collect(*collect, indices, true, has_global_flag, is_char); - let type_id = self.typing[id.idx()]; - let is_array = self.types[type_id.idx()].is_array(); - self.codegen_copy_from_to_global( - false, - type_id, - &define_variable, - &global_collect, - indices, - if is_array { - Some(thread_quota) - } else { - None - }, - !is_array, - false, - is_char, - w, - *num_tabs, - )?; - } - if has_global_flag { - write!(w, "{}}} else {{\n", tabs); - } - if !global_refs.contains(collect) || has_global_flag { - let local_collect = self.codegen_collect(*collect, indices, false, has_global_flag, false); - write!(w, "{}{} = {};\n", tabs, define_variable, local_collect)?; - } - if has_global_flag { - write!(w, "{}}}\n", tabs); - *num_tabs -= 1; - } + let define_variable = self.get_value(id, false, false); + let is_char = self.is_char(self.typing[collect.idx()]); + let collect_with_indices = self.codegen_collect(*collect, indices, is_char); + let type_id = self.typing[id.idx()]; + self.codegen_copy( + false, + type_id, + &define_variable, + &collect_with_indices, + if !self.types[type_id.idx()].is_primitive() { + Some(thread_quota) + } else { + None + }, + false, + w, + *num_tabs, + )?; } Node::Write { collect, data, indices, } => { - // Only difference vs read is the LHS vs RHS, and creating write- - // labeled reference after write!(w, "{}{};\n", tabs, declare_variable); - let global_flag = self.get_global_flag(*collect, false); - let has_global_flag = global_flag.is_some(); - if has_global_flag { - write!(w, "{}if ({}) {{\n", tabs, global_flag.unwrap()); - *num_tabs += 1; - } - let data_variable = self.get_value(*data, false, false, global_refs.contains(collect)); - if global_refs.contains(collect) || has_global_flag { - let is_char = self.is_parameter_char(self.typing[collect.idx()]); - let global_collect = self.codegen_collect(*collect, indices, true, has_global_flag, is_char); - let type_id = self.typing[id.idx()]; - let is_array = self.types[type_id.idx()].is_array(); - self.codegen_copy_from_to_global( - true, - type_id, - &data_variable, - &global_collect, - indices, - if is_array { - Some(thread_quota) - } else { - None - }, - !is_array, - state == 0, - is_char, - w, - *num_tabs, - )?; - } - if has_global_flag { - write!(w, "{}}} else {{\n", tabs); - } - if !global_refs.contains(collect) || has_global_flag { - let local_collect = self.codegen_collect(*collect, indices, false, has_global_flag, false); - write!(w, "{}{} = {};\n", tabs, local_collect, data_variable)?; - } - if has_global_flag { - write!(w, "{}}}\n", tabs); - *num_tabs -= 1; - } + let data_variable = self.get_value(*data, false, false); + let is_char = self.is_char(self.typing[collect.idx()]); + let collect_with_indices = self.codegen_collect(*collect, indices, is_char); + let type_id = self.typing[data.idx()]; + self.codegen_copy( + true, + type_id, + &data_variable, + &collect_with_indices, + if !self.types[type_id.idx()].is_primitive() { + Some(thread_quota) + } else { + None + }, + state == 0, + w, + *num_tabs, + )?; + let define_variable = 
self.get_value(id, false, false); + let collect_variable = self.get_value(*collect, false, false); + write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } _ => { panic!("Unsupported data node type") @@ -1056,13 +892,10 @@ impl GPUContext<'_> { for phi in phis { write!( w, - "{}{} = {};\n{}{} = {};\n", - tabs, - self.get_value(*phi, false, false, false), - self.get_value(id, false, false, false), + "{}{} = {};\n", tabs, - self.get_global_flag(*phi, false).unwrap(), - global_refs.contains(&id) + self.get_value(*phi, false, false), + self.get_value(id, false, false), )?; } } @@ -1094,7 +927,7 @@ impl GPUContext<'_> { w, "{}if ({}) {{\n", tabs, - self.get_value(*cond, false, false, false) + self.get_value(*cond, false, false) )?; write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; write!(w, "{}}} else {{\n", tabs)?; @@ -1108,7 +941,7 @@ impl GPUContext<'_> { Node::Join { control: _ } => {} Node::Return { control: _, data } => { if self.types[self.typing[data.idx()].idx()].is_primitive() { - let return_val = self.get_value(*data, false, false, false); + let return_val = self.get_value(*data, false, false); write!( w, "{}if (threadIdx.x == 0) {{\n{}\t*ret = {};\n{}}}\n", @@ -1124,40 +957,27 @@ impl GPUContext<'_> { Ok(()) } - // Handles reads/writes from global memory aka parameter node. We tack local - // (shmem + reg) array indexing and struct field access onto data, and tack - // global pointer offset onto global. Thread parallelization is used only for - // shared memory arrays. is_char indicates the global is a char type and we - // need to multiply the global index by the element size. - fn codegen_copy_from_to_global( + // Handles copying data to/from global and shared memory. Thread parallelization + // is used only for arrays (possibly inside another collection). is_char indicates + // a char type and we need to including element size in indexing. 
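Concretely, for a write into a one-dimensional float array with extent dc0, a thread quota of 32, and no block restriction, the loop this copy helper emits is intended to look roughly like the following, where dst and src stand in for the generated collect and data expressions:

    for (int i = threadIdx.x; i < dc0; i += 32) {
        *reinterpret_cast<float*>(dst + i) = *reinterpret_cast<float*>(src + i);
    }
    __syncthreads();

Product and summation copies go through the same routine but recurse into their fields or variants at byte offsets, since those collections are char-addressed.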
+ fn codegen_copy( &self, is_write: bool, type_id: TypeID, data: &String, - global: &String, - indices: &[Index], + collect: &String, thread_quota: Option<usize>, - thread_restrict: bool, block_restrict: bool, - is_char: bool, w: &mut String, num_tabs: usize, ) -> Result<(), Error> { let tabs = "\t".repeat(num_tabs); match &self.types[type_id.idx()] { Type::Array(element_type_id, extents) => { - let Index::Position(array_indices) = &indices[0] else { - panic!("Expected position index for array access") - }; - if matches!(self.types[element_type_id.idx()], Type::Array(..)) { - panic!("Nested arrays are not supported"); - } let rem_array_size = { let s = extents .iter() - .enumerate() - .filter(|(i, _)| *i >= array_indices.len()) - .map(|(_, id)| format!("dc{}", id.idx())) + .map(|id| format!("dc{}", id.idx())) .collect::<Vec<_>>() .join(" * "); if s.is_empty() { @@ -1166,124 +986,140 @@ impl GPUContext<'_> { s } }; - // If we parallelize over threads, then we index by threadIdx.x, - // else we gate the loop by threadIdx.x == 0 + // Either we parallelize over threads or gate the loop by threadIdx.x + // == 0 + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; + extra_tab = "\t"; + } let has_thread_quota = thread_quota.is_some(); - let begin_copy = if has_thread_quota { - format!( - "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", - tabs, rem_array_size, thread_quota.unwrap() - ) - } else { - format!( - "{}if (threadIdx.x == 0) {{\n{}\tfor (int i = 0; i < {}; i++) {{\n", - tabs, tabs, rem_array_size - ) - }; - write!(w, "{}", begin_copy)?; - let new_global = if is_char { + write!(w, "{}", if has_thread_quota { + format!( + "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", + tabs, rem_array_size, thread_quota.unwrap() + ) + } else { + format!( + "{}if (threadIdx.x == 0) {{\n{}\tfor (int i = 0; i < {}; i++) {{\n", + tabs, tabs, rem_array_size + ) + } + ); + let element_type_name = self.get_type(*element_type_id, true); + let (new_collect, new_data) = if self.is_char(type_id) { + (format!( + "{} + i * {}", + collect, + self.get_size(*element_type_id, None) + ), format!( - "{} + i * sizeof({})", - global, - self.get_type(*element_type_id, false, false) - ) + "{} + i * {}", + data, + self.get_size(*element_type_id, None) + )) } else { - format!("{} + i", global) + (format!("{} + i", collect), format!("{} + i", data)) }; - self.codegen_copy_from_to_global( + let new_collect = format!("{}reinterpret_cast<{}>({})", if self.types[element_type_id.idx()].is_primitive() { "*" } else { "" }, element_type_name, new_collect); + let new_data = format!("{}reinterpret_cast<{}>({})", if self.types[element_type_id.idx()].is_primitive() { "*" } else { "" }, element_type_name, new_data); + self.codegen_copy( is_write, *element_type_id, - &format!("{} + i", data), - &new_global, - &indices[1..], + &new_data, + &new_collect, None, false, - false, - is_char, w, - num_tabs + if has_thread_quota { 1 } else { 2 }, + num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 1 } else { 2 }, )?; if !has_thread_quota { write!(w, "{}\t}}\n", tabs)?; } + if block_restrict { + write!(w, "{}}}\n", tabs)?; + } write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; } - Type::Product(fields) | Type::Summation(fields) => { - if !is_char { - panic!("Global product or summation must be char addressed") + Type::Product(fields) => { + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) 
{{\n", tabs)?; + extra_tab = "\t"; } - let is_product = matches!(self.types[type_id.idx()], Type::Product(..)); - if indices.is_empty() { - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - if thread_restrict { - write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; - extra_tab2 = "\t"; - } - let reinterpret = format!("*reinterpret_cast<{}_{}*>", if is_product { "Product" } else { "Summation" }, type_id.idx()); - let reinterpret_global = format!("{}({})", reinterpret, global); - let reinterpret_data = format!("{}({})", reinterpret, data); - write!( - w, - "{}{}{}{} = {};\n", - tabs, - extra_tab, - extra_tab2, - if is_write { &reinterpret_global } else { &reinterpret_data }, - if is_write { &reinterpret_data } else { &reinterpret_global } - )?; - if thread_restrict { - write!(w, "{}{}}}\n", tabs, extra_tab)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - } else if is_product { - // Iterate over fields in product to find offset - let Index::Field(field_index) = &indices[0] else { - panic!("Expected field index for product access") - }; - let offset = (0..*field_index) - .map(|i| self.get_size(fields[i])) - .sum::<usize>(); - let new_global = format!("{} + {}", global, offset); - let new_data = format!("{} + {}", data, offset); - self.codegen_copy_from_to_global( + let has_thread_quota = thread_quota.is_some(); + if !has_thread_quota { + write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; + extra_tab2 = "\t"; + } + for (i, field) in fields.iter().enumerate() { + let offset = self.get_size(type_id, Some(i)); + let field_type_name = self.get_type(*field, true); + let new_collect = format!("{}reinterpret_cast<{}>({} + {})", if self.types[field.idx()].is_primitive() { "*" } else { "" }, field_type_name, collect, offset); + let new_data = format!("{}reinterpret_cast<{}>({} + {})", if self.types[field.idx()].is_primitive() { "*" } else { "" }, field_type_name, data, offset); + self.codegen_copy( is_write, - fields[*field_index], + *field, &new_data, - &new_global, - &indices[1..], - None, - thread_restrict, - block_restrict, - is_char, + &new_collect, + thread_quota, + false, w, - num_tabs + 1, + num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 0 } else { 1 }, )?; - } else { - // All variants of summations have zero offset - let Index::Variant(variant_index) = &indices[0] else { - panic!("Expected variant index for summation access") - }; - self.codegen_copy_from_to_global( + } + if !has_thread_quota { + write!(w, "{}\t}}\n", tabs)?; + } + if block_restrict { + write!(w, "{}}}\n", tabs)?; + } + write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; + } + Type::Summation(variants) => { + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; + extra_tab = "\t"; + } + let has_thread_quota = thread_quota.is_some(); + if !has_thread_quota { + write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; + extra_tab2 = "\t"; + } + // We can guarantee correctness for summation by just copying the + // largest variant. 
+ let max_variant_size = self.get_size(type_id, None); + write!(w, "{}{}{}max_variant_size = {};\n", tabs, extra_tab, extra_tab2, max_variant_size)?; + for (i, variant) in variants.iter().enumerate() { + let prefix = if i == 0 { "if" } else { "else if" }; + let variant_size = self.get_size(*variant, None); + write!(w, "{}{}{}{} (max_variant_size == {}) {{\n", tabs, extra_tab, extra_tab2, prefix, variant_size)?; + let field_type_name = self.get_type(*variant, true); + let new_collect = format!("{}reinterpret_cast<{}>({})", if self.types[variant.idx()].is_primitive() { "*" } else { "" }, field_type_name, collect); + let new_data = format!("{}reinterpret_cast<{}>({})", if self.types[variant.idx()].is_primitive() { "*" } else { "" }, field_type_name, data); + self.codegen_copy( is_write, - fields[*variant_index], - &data, - &global, - &indices[1..], - None, - thread_restrict, - block_restrict, - is_char, + *variant, + &new_data, + &new_collect, + thread_quota, + false, w, - num_tabs + 1, + num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 0 } else { 1 }, )?; + write!(w, "{}{}{}}}\n", tabs, extra_tab, extra_tab2)?; + } + if !has_thread_quota { + write!(w, "{}\t}}\n", tabs)?; } + if block_restrict { + write!(w, "{}}}\n", tabs)?; + } + write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; } // Primitive types _ => { @@ -1293,34 +1129,21 @@ impl GPUContext<'_> { write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; extra_tab = "\t"; } - if thread_restrict { + let has_thread_quota = thread_quota.is_some(); + if has_thread_quota { write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; extra_tab2 = "\t"; } - if is_char { - let type_name = self.get_type(type_id, true, false); - let reinterpret = format!("*reinterpret_cast<{}>", type_name); - let reinterpret_global = format!("{}({})", reinterpret, global); - let reinterpret_data = format!("{}({})", reinterpret, data); - write!( - w, - "{}{}{}{} = {};\n", - tabs, - extra_tab, - extra_tab2, - if is_write { &reinterpret_global } else { &reinterpret_data }, - if is_write { &reinterpret_data } else { &reinterpret_global } - )?; - } else { - write!( - w, - "{}*{} = *{};\n", - tabs, - if is_write { &global } else { data }, - if is_write { data } else { &global } - )?; - } - if thread_restrict { + write!( + w, + "{}{}{}{} = {};\n", + tabs, + extra_tab, + extra_tab2, + if is_write { collect } else { data }, + if is_write { data } else { collect } + )?; + if has_thread_quota { write!(w, "{}{}}}\n", tabs, extra_tab)?; } if block_restrict { @@ -1331,59 +1154,14 @@ impl GPUContext<'_> { Ok(()) } - // // Read/writes to local collections consist of local name + array indexing - // // and struct field access. 
- // fn codegen_local_collect(&self, collect: NodeID, indices: &[Index], has_global_flag: bool) -> String { - // let mut index_ptr_name = "".to_string(); - // for index in indices { - // match index { - // Index::Field(field) => { - // index_ptr_name.push_str(&format!(".field_{}", field)); - // } - // Index::Variant(variant) => { - // index_ptr_name.push_str(&format!(".field_{}", variant)); - // } - // Index::Position(indices) => { - // index_ptr_name.push_str( - // &indices - // .iter() - // .map(|index| format!("[{}]", self.get_value(*index, false, false, false))) - // .collect::<Vec<_>>() - // .join(""), - // ); - // } - // } - // } - // let name = self.get_value(collect, false, false, false); - // let full_name = if has_global_flag { - // format!("reinterpret_cast<{}>({})", self.get_type(self.typing[collect.idx()], false, false), name) - // } else { - // name - // }; - // format!("{} + {}", full_name, index_ptr_name) - // } - // Read/writes to global collections consist of global name + pointer offset. - fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_global: bool, has_global_flag: bool, is_char: bool) -> String { - let mut index_ptr_name = "0".to_string(); + fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { + let mut index_ptr = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { match index { - // Sum the offset of prior fields in bytes Index::Field(field) => { - let offset = (0..*field) - .map(|i| { - format!( - "offsetof({}, field_{})", - self.get_type(type_id, false, false), - i - ) - }) - .collect::<Vec<_>>() - .join(" + "); - if *field > 0 { - index_ptr_name.push_str(&format!(" + {}", offset)); - } + self.get_size(type_id, Some(*field)); } // Variants of summations have zero offset Index::Variant(_) => {} @@ -1408,112 +1186,213 @@ impl GPUContext<'_> { cumulative_offset = format!( "{} * ({} + ", cumulative_offset, - self.get_value(*index, false, false, false) + self.get_value(*index, false, false) ); } - index_ptr_name.push_str(&format!( + index_ptr.push_str(&format!( " + {}{}", cumulative_offset, ")".repeat(array_indices.len()) )); if is_char { - let element_size = - format!("sizeof({})", self.get_type(*element_type, false, false)); - index_ptr_name.push_str(&format!(" * {}", element_size)); + let element_size = self.get_size(*element_type, None); + index_ptr.push_str(&format!(" * {}", element_size)); } } } } - let name = self.get_value(collect, false, false, false); - let full_name = if is_global && has_global_flag { - format!("reinterpret_cast<{}>({})", self.get_type(type_id, true, true), name) - } else if has_global_flag { - format!("reinterpret_cast<{}>({})", self.get_type(type_id, false, false), name) - } else { - name - }; - format!("{} + {}", full_name, index_ptr_name) + let name = self.get_value(collect, false, false); + format!("{} + {}", name, index_ptr) } // Standalone function allows us to handle recursive initialization for - // product and summation collections + // product and summation collections. `allow_allocate` prevents unnecessary + // shared memory allocations for nested product and summation collections. + // Since not initialized, array collections don't need to be recursed into. 
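For instance, an array-of-float constant with extent dc0 is meant to bump-allocate its backing storage out of the dynamic shared memory buffer roughly as follows, where the name constant5 is illustrative, 4 is the float alignment, and 4 * dc0 is the computed size:

    dynamic_shared_offset = ((dynamic_shared_offset + 4 - 1) / 4) * 4;
    constant5 = reinterpret_cast<float*>(dynamic_shared + dynamic_shared_offset);
    dynamic_shared_offset += 4 * dc0;

Product and summation constants follow the same align-then-allocate pattern before recursing into their fields.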
fn codegen_constant( &self, name: String, cons_id: ConstantID, + allow_allocate: bool, w: &mut String, num_tabs: usize, ) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); match &self.constants[cons_id.idx()] { - Constant::Boolean(val) => write!(w, " = {};\n", val)?, - Constant::Integer8(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger8(val) => write!(w, " = {};\n", val)?, - Constant::Integer16(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger16(val) => write!(w, " = {};\n", val)?, - Constant::Integer32(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger32(val) => write!(w, " = {}ul;\n", val)?, - Constant::Integer64(val) => write!(w, " = {}ll;\n", val)?, - Constant::UnsignedInteger64(val) => write!(w, " = {}ull;\n", val)?, - Constant::Float32(val) => write!(w, " = {}f;\n", val)?, - Constant::Float64(val) => write!(w, " = {};\n", val)?, - Constant::Product(_, fields) => { - write!(w, ";\n")?; - for (i, field) in fields.iter().enumerate() { - // Array size was set by struct definition and we don't emit array content - if !self.constants[field.idx()].is_array() { - // // Don't need type declaration for the fields - // self.codegen_constant( - // format!("{}.field_{}", name, i), - // format!("{}.field_{}", name, i), - // *field, - // w, - // )?; - - } + Constant::Boolean(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Integer8(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::UnsignedInteger8(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Integer16(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::UnsignedInteger16(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Integer32(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::UnsignedInteger32(val) => write!(w, "{}{} = {}ul;\n", tabs, name, val)?, + Constant::Integer64(val) => write!(w, "{}{} = {}ll;\n", tabs, name, val)?, + Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?, + Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, val)?, + Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + // All three followign collections involve align then allocate from the + // single dynamic shared memory buffer by using and updating the offset. 
+ Constant::Product(type_id, constant_fields) => { + if allow_allocate { + let alignment = self.get_alignment(*type_id); + let size = self.get_size(*type_id, None); + write!( + w, + "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; + {}{} = dynamic_shared + dynamic_shared_offset;\n + {}dynamic_shared_offset += {};\n", + tabs, + alignment, + alignment, + alignment, + tabs, + name, + tabs, + size, + )?; + } + let Type::Product(type_fields) = &self.types[type_id.idx()] else { panic!("Product constant should have product type") }; + for i in 0..constant_fields.len() { + // For each field update offset and issue recursive call + let field_constant = &self.constants[constant_fields[i].idx()]; + let field_type = self.get_type(type_fields[i], true); + let offset = self.get_size(type_fields[i], Some(i)); + self.codegen_constant(format!("*reinterpret_cast<{}>({}{})", field_type, name, offset), constant_fields[i], false, w, num_tabs); } } - Constant::Summation(_, variant, field) => { - write!(w, ";\n\t{}.tag = {};\n", name, variant)?; - // See two comments in Constant::Product - if !self.constants[field.idx()].is_array() { - self.codegen_constant( - format!("\t{}.field_{}", name, variant), - format!("\t{}.field_{}", name, variant), - *field, + Constant::Summation(type_id, variant, field) => { + if allow_allocate { + let alignment = self.get_alignment(*type_id); + let size = self.get_size(*type_id, None); + write!( w, + "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; + {}{} = dynamic_shared + dynamic_shared_offset;\n + {}dynamic_shared_offset += {};\n", + tabs, + alignment, + alignment, + alignment, + tabs, + name, + tabs, + size, )?; } + // No offset updating needed since all variants start at 0 + let Type::Summation(variants) = &self.types[type_id.idx()] else { panic!("Summation constant should have summation type") }; + let variant_type = self.get_type(self.typing[variants[*variant as usize].idx()], true); + let variant_constant = &self.constants[field.idx()]; + if variant_constant.is_scalar() { + self.codegen_constant(format!("*reinterpret_cast<{}>{}", variant_type, name) , cons_id, false, w, num_tabs); + } else if !variant_constant.is_array() { + self.codegen_constant(name, cons_id, false, w, num_tabs); + }; } Constant::Array(type_id) => { - let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { + let Type::Array(element_type, _) = &self.types[type_id.idx()] else { panic!("Expected array type") }; - // For now we do element-wise alignment, later could consider (n-1)d array - // alignment. Then we "allocate" from the single dynamic shared memory buffer - // by using and updating the offset. 
- let element_size = - format!("sizeof({})", self.get_type(*element_type, false, false)); - let array_size = extents - .iter() - .map(|id| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join("*"); + let alignment = self.get_alignment(*type_id); + let size = self.get_size(*type_id, None); + let element_type = self.get_type(*element_type, false); write!( w, - ";\n\talignment = {};\n\tdynamic_shared_offset = - ((dynamic_shared_offset + alignment - 1) / alignment) * alignment; - \n\t{} = reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t - dynamic_shared_offset += {}", - element_size, + ";\n{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; + {}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n + {}dynamic_shared_offset += {};\n", + tabs, + alignment, + alignment, + alignment, + tabs, name, - self.get_type(*element_type, false, false), - array_size + element_type, + tabs, + size )?; } } Ok(()) } + // Emit code to calculate data size. For Product types, setting `field_number` + // gives data size up to but not including that field, so = 2 gives 1st field + // and offset to 2nd field. This is useful for generating constant initialization + // and read/write index math. + fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { + match &self.types[type_id.idx()] { + Type::Array(element_type, extents) => { + let array_size = extents + .iter() + .map(|id| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join(" * "); + format!("{} * {}", self.get_alignment(*element_type), array_size) + } + Type::Product(fields) => { + let num_fields = &num_fields.unwrap_or(fields.len()); + let with_field = fields + .iter() + .enumerate() + .filter(|(i, _)| i < num_fields) + .map(|(_, id)| (self.get_size(*id, None), self.get_alignment(*id))) + .fold(String::from("0"), |acc, (size, align)| { + if acc == "0" { + size + } else { + format!("({} + {} - 1) / {}) * {} + {}", acc, align, align, align, size) + } + }); + if num_fields < &fields.len() { + format!("{} - {}", with_field, self.get_size(fields[*num_fields], None)) + } else { + with_field + } + } + Type::Summation(variants) => { + // The argmax variant by size is not guaranteed to be same as + // argmax variant by alignment, eg product of 3 4-byte primitives + // vs 1 8-byte primitive, so we need to calculate both. 
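Working that example through: a product of three 4-byte integers has size 3 * 4 = 12 bytes but only 4-byte alignment, while a lone double has size 8 and 8-byte alignment, so the summation must reserve umax(12, 8) = 12 bytes rounded up to the 8-byte alignment, i.e. (12 + 8 - 1) / 8 * 8 = 16 bytes. Taking the variant that is largest by size alone would under-align the double, and taking it by alignment alone would under-size the product.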
+ let max_size = variants + .iter() + .map(|id| self.get_size(*id, None)) + .fold(String::from("0"), |acc, x| { + if acc == "0" { + x + } else { + format!("umax({}, {})", acc, x) + } + }); + let max_alignment = variants + .iter() + .map(|id| self.get_alignment(*id)) + .max() + .unwrap_or(0); + format!("({} + {} - 1) / {} * {}", max_size, max_alignment, max_alignment, max_alignment) + } + _ => format!("{}", self.get_alignment(type_id)) + } + } + + fn get_alignment(&self, type_id: TypeID) -> usize { + match &self.types[type_id.idx()] { + Type::Array(element_type, _) => self.get_alignment(*element_type), + Type::Product(fields) | Type::Summation(fields) => { + fields + .iter() + .map(|field| self.get_alignment(*field)) + .max() + .unwrap_or(0) + } + Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, + Type::Integer16 | Type::UnsignedInteger16 => 2, + Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, + Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8, + _ => panic!("Unsupported type for alignment"), + } + } + fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { let func_name = match intrinsic { Intrinsic::Abs => match ty { @@ -1629,13 +1508,12 @@ impl GPUContext<'_> { func_name.to_string() } - // Check if a parameter should be represented as char*. Must be a product, - // summation, or array of product/summation types. This should only be - // called on parameters. - fn is_parameter_char(&self, type_id: TypeID) -> bool { + // Check if a type should be represented as char*. Must be a product, + // summation, or array of product/summation types. + fn is_char(&self, type_id: TypeID) -> bool { match &self.types[type_id.idx()] { Type::Product(_) | Type::Summation(_) => true, - Type::Array(element_type, _) => self.is_parameter_char(*element_type), + Type::Array(element_type, _) => self.is_char(*element_type), _ => false, } } @@ -1650,57 +1528,13 @@ impl GPUContext<'_> { Ok(()) } - fn get_size(&self, type_id: TypeID) -> usize { - match &self.types[type_id.idx()] { - Type::Array(element_type, extents) => { - let element_alignment = self.get_alignment(*element_type); - extents - .iter() - .try_fold(element_alignment, |acc, &extent| { - evaluate_dynamic_constant(extent, self.dynamic_constants) - .map(|val| acc.saturating_mul(val)) - }) - .unwrap_or_else(|| panic!("Queried size for array with unknown size")) - } - _ => self.get_alignment(type_id), - } - } - - fn get_alignment(&self, type_id: TypeID) -> usize { - match &self.types[type_id.idx()] { - Type::Array(element_type, _) => self.get_alignment(*element_type), - Type::Product(fields) => { - let product_size = fields - .iter() - .map(|field| self.get_alignment(*field)) - .sum::<usize>(); - let field_alignment = fields - .iter() - .map(|field| self.get_alignment(*field)) - .max() - .unwrap_or(1); - field_alignment * ((product_size + (field_alignment - 1)) / field_alignment) - } , - Type::Summation(fields) => { - fields - .iter() - .map(|field| self.get_alignment(*field)) - .max() - .unwrap_or(0) - } - Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, - Type::Integer16 | Type::UnsignedInteger16 => 2, - Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, - Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8, - _ => panic!("Unsupported type for alignment"), - } - } - fn get_block_name(&self, id: NodeID) -> String { format!("bb_{}", id.idx()) } - fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool, global_pointer: bool) -> String { + // Setting ty = true 
will return with type in declaration format. make_pointer + // is only considered if ty = true and only relevant for primitive types. + fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { if ty { panic!("Dynamic constants shouldn't be re-initialized") @@ -1711,25 +1545,11 @@ impl GPUContext<'_> { panic!("Parameters shouldn't be re-initialized") } format!("p{}", index) - } else if ty - && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { - // Dynamic shared memory arrays have special formatting - let mut declare_array = format!( - "{} (*{}{})", - self.get_type(*element_type, false, false), - self.function.nodes[id.idx()].lower_case_name(), - id.idx() - ); - for extent in extents.iter().skip(1) { - declare_array.push_str(&format!("[dc{}]", extent.idx())); - } - declare_array } else if ty { format!( - "{} {}{}", - self.get_type(self.typing[id.idx()], make_pointer, global_pointer), - self.function.nodes[id.idx()].lower_case_name(), - id.idx() + "{} {}", + self.get_type(self.typing[id.idx()], make_pointer), + self.get_value(id, false, false) ) } else { format!( @@ -1740,53 +1560,28 @@ impl GPUContext<'_> { } } - fn get_global_flag(&self, id: NodeID, ty: bool) -> Option<String> { - let node = &self.function.nodes[id.idx()]; - if (!node.is_phi() && !matches!(node, Node::Ternary { op: TernaryOperator::Select, ..})) || self.types[self.typing[id.idx()].idx()].is_primitive() { - None - } else if ty { - Some(format!( - "bool {}{}_is_global", - self.function.nodes[id.idx()].lower_case_name(), - id.idx() - )) - } else { - Some(format!( - "{}{}_is_global", - self.function.nodes[id.idx()].lower_case_name(), - id.idx() - )) - } - } - + // Setting make_pointer = true will only affect primitive types- the + // collections are already pointers fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { // Product and summation collections are char* for byte-addressability - // since we can have variable type fields. is_global can only be true - // if make_pointer is true, with the exception of recursive call - // from array match arm + // since we can have variable type fields Type::Product(_) | Type::Summation(_) => { - if make_pointer { - "char*".to_string() - } else { - "char".to_string() - } + "char*".to_string() } Type::Array(element_type, _) => { - // This suffix lets us work with references of dynamic shared memory - // and use n-d array indexing. 
self.get_type(*element_type, true) } _ => convert_type(&self.types[id.idx()], make_pointer), } } -} -fn multiply_dynamic_constants(dcs: &Vec<DynamicConstant>, factors: &[DynamicConstantID]) -> Option<usize> { - factors.iter().try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, dcs) - .map(|val| acc.saturating_mul(val)) - }) + fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Option<usize> { + factors.iter().try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, self.dynamic_constants) + .map(|val| acc.saturating_mul(val)) + }) + } } // TODO: Add float8, float16, bfloat16 dtypes if they come -- GitLab From a06ac462625bed1930e0407c2c0a7f3c1e76b4f2 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 3 Jan 2025 23:18:20 -0800 Subject: [PATCH 010/109] theoreticlaly just speicla case left --- hercules_cg/src/gpu.rs | 91 ++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 48 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index f2cfd9cf..122af64b 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -47,12 +47,17 @@ pub fn gpu_codegen<W: Write>( .map(NodeID::new) .collect(); + // Fork reduce map should have all reduces contained in some key let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); + // Reduct reduce map should have all non-parallel and non-associative reduces + // contained in some key. Unlike fork, reduct is not involved in any assertions, + // put it here for convenience but can move. + let reduct_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); for reduce_node in &reduce_nodes { if let Node::Reduce { control, init: _, - reduct: _, + reduct, } = &function.nodes[reduce_node.idx()] { match function.nodes[control.idx()] { @@ -71,6 +76,13 @@ pub fn gpu_codegen<W: Write>( panic!("Reduce's control must be a join or region node"); } } + if !function.schedules[reduce_node.idx()].contains(&Schedule::ParallelReduce) + && !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) { + reduct_reduce_map + .entry(*reduct) + .or_default() + .push(*reduce_node); + } } } for idx in 0..function.nodes.len() { @@ -160,6 +172,7 @@ pub fn gpu_codegen<W: Write>( bbs, kernel_params, fork_reduce_map, + reduct_reduce_map, label_data_for_phi, return_type_id, }; @@ -187,6 +200,7 @@ struct GPUContext<'a> { bbs: &'a Vec<NodeID>, kernel_params: &'a GPUKernelParams, fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, + reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>, return_type_id: &'a TypeID, } @@ -367,7 +381,9 @@ impl GPUContext<'_> { } // Emit helper registers that are used throughout the kernel- alignment - // is for proper dynamic shared memory allocation + // is for proper dynamic shared memory allocation, max_variant_size is + // for variant selection during read/write copies since we don't keep + // tag (don't need and it can double summation memory usage due to alignment) fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!(w, "\tsize_t alignment;\n")?; write!(w, "\tsize_t max_variant_size;\n")?; @@ -555,45 +571,6 @@ impl GPUContext<'_> { } } - // /* - // * For each parallel reduce with a reduct write, meaning it's at the end of - // * a potential parallel reduction chain, we walk back to beginning of chain - // * and update the write's collect to be the beginning's init. 
- // */ - // fn update_write_collects(&self) -> HashMap<NodeID, NodeID> { - // let mut write_collect_map = HashMap::new(); - // let mut parallel_reduces: HashSet<NodeID> = (0..self.function.nodes.len()) - // .map(NodeID::new) - // .filter(|&node_id| { - // self.function.schedules[node_id.idx()].contains(&Schedule::ParallelReduce) - // }) - // .collect(); - // for reduce in parallel_reduces.clone() { - // if let Node::Reduce { - // control: _, - // init, - // reduct, - // } = &self.function.nodes[reduce.idx()] - // && let Node::Write { .. } = &self.function.nodes[reduct.idx()] - // { - // parallel_reduces.remove(&reduce); - // while parallel_reduces.contains(&init) { - // let Node::Reduce { - // control: _, - // init, - // reduct: _, - // } = &self.function.nodes[init.idx()] - // else { - // panic!("Expected reduce node"); - // }; - // parallel_reduces.remove(&init); - // } - // write_collect_map.insert(*reduct, *init); - // } - // } - // write_collect_map - // } - fn codegen_data_control( &self, root_forks: &Vec<NodeID>, @@ -684,10 +661,8 @@ impl GPUContext<'_> { _ => { panic!("Unsupported state for ThreadID") } } } - Node::Reduce { control: _, init, reduct: _ } => { - let init_val = self.get_value(*init, false, false); - write!(w, "{}{} = {};\n", tabs, declare_variable, init_val)?; - } + // Fork initializes the reduce and reduct updates the reduce + Node::Reduce { control: _, init: _, reduct: _ } => {} // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { @@ -889,16 +864,25 @@ impl GPUContext<'_> { } } if let Some(phis) = self.label_data_for_phi.get(&id) { + let val = self.get_value(id, false, false); for phi in phis { + let phi_val = self.get_value(*phi, false, false); write!( w, "{}{} = {};\n", tabs, - self.get_value(*phi, false, false), - self.get_value(id, false, false), + phi_val, + val, )?; } } + if let Some(reduces) = self.reduct_reduce_map.get(&id) { + let val = self.get_value(id, true, false); + for reduce in reduces { + let reduce_val = self.get_value(*reduce, false, false); + write!(w, "{}{} = {};\n", tabs, reduce_val, val)?; + } + } Ok(()) } @@ -937,7 +921,18 @@ impl GPUContext<'_> { Node::Fork { control: _, factors: _, - } => {} + } => { + // Emitting reduces before the fork allows the reduce to be + // used outside of the fork. 
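In the emitted CUDA this means each reduce is just a register that is seeded at its owning fork and overwritten wherever its reduct is computed, roughly as below with illustrative names:

    // at the fork that owns the reduce
    reduce12 = init4;
    // at the node registered as the reduce's reduct
    reduce12 = add9;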
+ for &reduce in self.fork_reduce_map.get(&id).unwrap() { + let reduce_val = self.get_value(reduce, true, false); + let Node::Reduce { control: _, init, reduct: _ } = &self.function.nodes[reduce.idx()] else { + panic!("Expected reduce node"); + }; + let init_val = self.get_value(*init, true, false); + write!(w, "{}{} = {};\n", tabs, reduce_val, init_val)?; + } + } Node::Join { control: _ } => {} Node::Return { control: _, data } => { if self.types[self.typing[data.idx()].idx()].is_primitive() { -- GitLab From 5d78595c490660a275a0da1df35d6e7cc961bb11 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sat, 4 Jan 2025 18:16:57 -0600 Subject: [PATCH 011/109] beating around the bush --- hercules_cg/src/gpu.rs | 662 ++++++++++++++++++++++++++++------------- 1 file changed, 447 insertions(+), 215 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 122af64b..b129fcde 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -2,10 +2,7 @@ extern crate bitvec; extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; -use std::env::var; use std::fmt::{Error, Write}; -use std::iter::FromIterator; -use std::os::unix::thread; use self::hercules_ir::*; @@ -22,8 +19,7 @@ pub fn gpu_codegen<W: Write>( reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, - antideps: &Vec<(NodeID, NodeID)>, - bbs: &Vec<NodeID>, + bbs: &BasicBlocks, collection_objects: &FunctionCollectionObjects, w: &mut W, ) -> Result<(), Error> { @@ -38,8 +34,9 @@ pub fn gpu_codegen<W: Write>( * - max_num_blocks is within constraint of 1D grid size. This can be * relaxed if we want to support larger grids. * - product types are packed with padding inserted for each element to - * be aligned for its type and for full product to be aligned to its + * be aligned for its type and for full product to be aligned to its * largest element + * - similarly, summation types must be aligned to their largest element */ let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) @@ -77,7 +74,8 @@ pub fn gpu_codegen<W: Write>( } } if !function.schedules[reduce_node.idx()].contains(&Schedule::ParallelReduce) - && !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) { + && !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) + { reduct_reduce_map .entry(*reduct) .or_default() @@ -141,7 +139,6 @@ pub fn gpu_codegen<W: Write>( max_num_blocks: 1024, max_num_threads: 1024, threads_per_warp: 32, - greedy_associative_thresh: 32, }; let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { @@ -168,7 +165,6 @@ pub fn gpu_codegen<W: Write>( reverse_postorder, typing, control_subgraph, - antideps, bbs, kernel_params, fork_reduce_map, @@ -179,13 +175,11 @@ pub fn gpu_codegen<W: Write>( ctx.codegen_function(w) } -// Kernel parameters that are fixed prior to codegen. See description of -// greedy_associative_thresh in codegen_function. +// Kernel parameters that are fixed prior to codegen. 
struct GPUKernelParams { max_num_blocks: usize, max_num_threads: usize, threads_per_warp: usize, - greedy_associative_thresh: usize, } struct GPUContext<'a> { @@ -196,8 +190,7 @@ struct GPUContext<'a> { reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, - antideps: &'a Vec<(NodeID, NodeID)>, - bbs: &'a Vec<NodeID>, + bbs: &'a BasicBlocks, kernel_params: &'a GPUKernelParams, fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, @@ -212,6 +205,13 @@ struct CudaGoto { term: String, } +#[derive(Clone, Copy, PartialEq, Debug)] +enum KernelState { + OutBlockFork, + InBlockFork, + InThreadFork, +} + impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { // All possible includes followed by macros for intrinsic calls on @@ -241,14 +241,14 @@ impl GPUContext<'_> { self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; - // self.codegen_struct_def(&mut top)?; - self.codegen_phi_declarations(&mut top)?; + self.codegen_declare_all(&mut top)?; self.codegen_helpers(&mut top)?; let (fork_tree, fork_control_map) = self.make_fork_structures(); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); - let (cumul_factors, num_threads) = self.get_cumulative_factors(&fork_tree, &root_forks); + let thread_root_forks = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); + let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, &thread_root_forks); let start = NodeID::new(0); let ret = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_return()) @@ -256,6 +256,7 @@ impl GPUContext<'_> { .next() .unwrap(); let (begin_control, end_control) = self.get_begin_end_control(start, ret); + // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) @@ -267,12 +268,17 @@ impl GPUContext<'_> { .collect(); self.codegen_data_control( - &root_forks, + if num_blocks > 1 { + Some(root_forks[0]) + } else { + None + }, + &thread_root_forks, &fork_tree, &fork_control_map, &begin_control, &end_control, - &cumul_factors, + &fork_thread_quota_map, num_threads, num_blocks, &mut gotos, @@ -370,12 +376,10 @@ impl GPUContext<'_> { Ok(()) } - // We declare all phi values upfront - fn codegen_phi_declarations(&self, w: &mut String) -> Result<(), Error> { + // To abide by c++ reassignment restrictions, we declare all values upfront. + fn codegen_declare_all(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if let Node::Phi {..} = &self.function.nodes[id.idx()] { - write!(w, "\t{};\n", self.get_value(id, true, false))?; - } + write!(w, "\t{};\n", self.get_value(id, true, false))?; } Ok(()) } @@ -509,132 +513,270 @@ impl GPUContext<'_> { } /* - * Once inside the block-level forks, we initiate a cumul_factor at 1. If - * encountering a child fork with known size s < max_num_threads / cumul_factor, - * with all reduces being parallel or associative, then we parallelize along - * s, else we serialize. Then step into child and update cumul_factor if needed. - * One exception is if fork factor is a multiple of greedy_associative_thresh - * and at least one reduce is associative, in which case we use warp reduction - * and disable cumul_factor change for its subtree. 
At end, we've mapped - * each fork to its cumulative factor, and if not present fork uses it's parent's - * factor. + * This analysis determines the parallelization strategy within threadblocks. + * We run post-order traversal on the fork tree to get the thread quota per + * subtree. In particular, each fork starts with a base factor as the + * maximum over its descendants (leafs have base 1). We traverse up (details + * in helper) and pass the factor and a map from fork node to + * (max quota of its siblings (including itself), its quota, its fork factor) + * - all three are needed for codegen. A node is in the map IFF it will be parallelized. + * If not, the fork will use the parent's quota. Nodes may be removed from the + * map when traversing up the tree due to either of the max scenarios. */ - fn get_cumulative_factors( + fn get_thread_quotas( &self, fork_tree: &HashMap<NodeID, Vec<NodeID>>, root_forks: &Vec<NodeID>, - ) -> (HashMap<NodeID, usize>, usize) { - let mut cumul_factors = HashMap::new(); - for root_fork in root_forks { - cumul_factors.insert(*root_fork, 1); - self.recurse_cumul_factors(*root_fork, fork_tree, 1, &mut cumul_factors); - } - let num_threads = *cumul_factors.values().max().unwrap(); - (cumul_factors, num_threads) + ) -> (HashMap<NodeID, (usize, usize, usize)>, usize) { + // We clone to add dummy root-of-roots fork + let mut fork_tree = fork_tree.clone(); + fork_tree.insert(root_forks[0], root_forks.clone()); + let (tree_map, tree_quota, _) = self.recurse_thread_quotas(root_forks[0], &fork_tree, true); + (tree_map, tree_quota) } - fn recurse_cumul_factors( + // Helper function for post-order traversal of fork tree + fn recurse_thread_quotas( &self, curr_fork: NodeID, fork_tree: &HashMap<NodeID, Vec<NodeID>>, - cumul_factor: usize, - cumul_factors: &mut HashMap<NodeID, usize>, - ) { + is_root: bool, + ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) { + // Subsubtree map is the union of all keys for grandchildren and lower + // nodes, and subtree_quota is constructed map from children to their + // quota + let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree + .get(&curr_fork) + .unwrap() + .iter() + .map(|child| (child, self.recurse_thread_quotas(*child, fork_tree, false))) + .fold( + (HashMap::new(), HashMap::new(), 0), + |(mut subsubtree_map, mut children_quota_map, subtree_quota), (child, (curr_map, curr_quota, use_curr))| { + subsubtree_map.extend(curr_map); + if use_curr { + children_quota_map.insert(child, curr_quota); + } + (subsubtree_map, children_quota_map, subtree_quota.max(curr_quota)) + }, + ); + // First update children_quota_map items with full information and add + // to subsubtree_map + for (&child, quota) in children_quota_map.iter() { + let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else { + panic!("Expected fork node"); + }; + let fork_size = self.multiply_fork_factors(factors).unwrap_or(0); + subsubtree_map.insert(*child, (subtree_quota, *quota, fork_size)); + } + let subtree_map = subsubtree_map; + if is_root { + return (subtree_map, subtree_quota, true) + } + /* + * A node can only be considered for parallelization if: + * a) it has statically known size + * b) the known size is less than or equal to the max_num_threads + * c) the known size is a power of 2 + * d) all reduces are parallel-reduce or associative + * + * Note: in what follows, there are a few cases where we choose between + * parallelizing the fork vs its subtree, by taking max factor over subtree. 
+ * However, parts of the subtree may have had smaller quotas and didn't + * need to be discarded. For now we avoid this complexity and discard full. + */ let reduces = &self.fork_reduce_map[&curr_fork]; - if reduces.iter().all(|&reduce| { - self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) - || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) - }) && let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] + if let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] && let Some(fork_size) = self.multiply_fork_factors(factors) - && fork_size <= self.kernel_params.max_num_threads / cumul_factor + && fork_size <= self.kernel_params.max_num_threads + && fork_size.is_power_of_two() + && reduces.iter().all(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) + || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) + }) { - if fork_size % self.kernel_params.greedy_associative_thresh == 0 - && reduces.iter().any(|&reduce| { + /* + * If there's an associative reduce, + * if fork and subtree fit in warp, parallelize both + * else if fork is a multiple of warp size, parallelize the max between them + * else parallelize subtree + * Else, parallelize both + */ + if fork_size <= self.kernel_params.max_num_threads / subtree_quota { + if reduces.iter().any(|&reduce| { self.function.schedules[reduce.idx()].contains(&Schedule::Associative) }) { - cumul_factors.insert(curr_fork, cumul_factor * fork_size); - } else { - let mut max_factor = cumul_factor * fork_size; - for child in fork_tree[&curr_fork].iter() { - self.recurse_cumul_factors(*child, fork_tree, cumul_factor * fork_size, cumul_factors); - max_factor = max_factor.max(cumul_factors[child]); + if self.kernel_params.threads_per_warp % (fork_size * subtree_quota) == 0 { + (subtree_map, fork_size * subtree_quota, true) + } else if fork_size % self.kernel_params.threads_per_warp == 0 { + if fork_size >= subtree_quota { + (HashMap::new(), fork_size, true) + } else { + (subtree_map, subtree_quota, false) + } + } else { + (subtree_map, subtree_quota, false) + } + } else { + (subtree_map, fork_size * subtree_quota, true) } - cumul_factors.insert(curr_fork, max_factor); } - } else { - let mut max_factor = cumul_factor; - for child in fork_tree[&curr_fork].iter() { - self.recurse_cumul_factors(*child, fork_tree, cumul_factor, cumul_factors); - max_factor = max_factor.max(cumul_factors[child]); + // We have to choose either the fork or its subtree + else if fork_size >= subtree_quota { + (HashMap::new(), fork_size, true) + } else { + (subtree_map, subtree_quota, false) } - cumul_factors.insert(curr_fork, max_factor); + } else { + (subtree_map, subtree_quota, false) } } - fn codegen_data_control( + fn get_thread_root_forks( &self, root_forks: &Vec<NodeID>, fork_tree: &HashMap<NodeID, Vec<NodeID>>, + num_blocks: usize, + ) -> Vec<NodeID> { + if num_blocks > 1 { + root_forks.clone() + } else { + fork_tree.get(&root_forks[0]).unwrap().to_vec() + } + } + + fn codegen_data_control( + &self, + block_fork: Option<NodeID>, + thread_root_forks: &Vec<NodeID>, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, fork_control_map: &HashMap<NodeID, Vec<NodeID>>, begin_control: &HashSet<NodeID>, end_control: &HashSet<NodeID>, - cumul_factors: &HashMap<NodeID, usize>, + fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, num_threads: usize, num_blocks: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { - let control_to_data = 
- (0..self.bbs.len()).fold(HashMap::<NodeID, Vec<NodeID>>::new(), |mut map, id| { - if let Some(control) = self.bbs.get(id) { - map.entry(*control).or_default().push(NodeID::new(id)); - }; - map - }); - // Define the following states: // 0 is above block fork, 1 is in block fork above any thread fork, 2 is // in any thread fork, 3 is below block fork // If num_blocks > 1, initialize state to 0, else 1 - let mut state = if num_blocks > 1 { 0 } else { 1 }; + let has_block_fork = block_fork.is_some(); + let mut state = if has_block_fork { + KernelState::OutBlockFork + } else { + KernelState::InBlockFork + }; // Then generate data and control for each control in begin_control for control in begin_control { - let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; - for data in control_to_data.get(control).unwrap() { - self.codegen_data_node(*data, state, num_threads, body, &mut 1)?; + let body = &mut gotos.get_mut(control).unwrap().body; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; } - let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; + let term = &mut gotos.get_mut(control).unwrap().term; self.codegen_control_node(*control, term, 1)?; } // Then if num_blocks > 1, set state to 1 and generate data and control // for the single root fork - if num_blocks > 1 { - state = 1; - for control in fork_control_map.get(&root_forks[0]).unwrap() { - for data in control_to_data.get(control).unwrap() { - let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; - self.codegen_data_node(*data, state, num_threads, body, &mut 1)?; + if has_block_fork { + state = KernelState::InBlockFork; + for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { + for data in self.bbs.1[control.idx()].iter() { + let body = &mut gotos.get_mut(control).unwrap().body; + self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; } - let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; + let term = &mut gotos.get_mut(control).unwrap().term; self.codegen_control_node(*control, term, 1)?; } } // Set state to 2 and begin DFS through fork_tree (after root_fork if // visited in previous step), updating thread_quota + for &root_fork in thread_root_forks { + self.codegen_data_control_traverse( + root_fork, + fork_tree, + fork_control_map, + fork_thread_quota_map, + 1, + num_threads, + gotos, + )?; + } // If num_blocks > 1, set state to 3, else 1 + state = if num_blocks > 1 { + KernelState::OutBlockFork + } else { + KernelState::InBlockFork + }; + for control in end_control { + let body = &mut gotos.get_mut(control).unwrap().body; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; + } + let term = &mut gotos.get_mut(control).unwrap().term; + self.codegen_control_node(*control, term, 1)?; + } // Then generate data and control for each control in end_control Ok(()) } + fn codegen_data_control_traverse( + &self, + curr_fork: NodeID, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + fork_control_map: &HashMap<NodeID, Vec<NodeID>>, + fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, + parent_quota: usize, + num_threads: usize, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + let (available_thread_quota, use_thread_quota, fork_factor) = fork_thread_quota_map + .get(&curr_fork) + .map(|(a, u, f)| (*a, *u, Some(*f))) + 
.unwrap_or((parent_quota, parent_quota, None)); + for control in fork_control_map.get(&curr_fork).unwrap() { + let body = &mut gotos.get_mut(control).unwrap().body; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node( + *data, + KernelState::InThreadFork, + available_thread_quota, + use_thread_quota, + fork_factor, + body, + &mut 1, + )?; + } + let term = &mut gotos.get_mut(control).unwrap().term; + self.codegen_control_node(*control, term, 1)?; + } + for child in fork_tree.get(&curr_fork).unwrap() { + self.codegen_data_control_traverse( + *child, + fork_tree, + fork_control_map, + fork_thread_quota_map, + use_thread_quota, + num_threads, + gotos, + )?; + } + Ok(()) + } + fn codegen_data_node( &self, id: NodeID, - state: usize, - thread_quota: usize, + state: KernelState, + available_thread_quota: usize, + use_thread_quota: usize, + fork_factor: Option<usize>, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { - let declare_variable = self.get_value(id, true, false).to_string(); + let define_variable = self.get_value(id, false, false).to_string(); let tabs = "\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { // Phi registers were already emitted. @@ -642,34 +784,57 @@ impl GPUContext<'_> { control: _, data: _, } => {} - Node::ThreadID { - control, - dimension, - } => { + Node::ThreadID { control, dimension } => { let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { panic!("Expected ThreadID's control to be a fork node"); }; match state { - 1 => { - // Violating DRY with the naming but unsure how to map + KernelState::InBlockFork => { + // Violating DRY with the naming but unsure how to map // DynamicConstantID to NodeID to use `get_value` - let divide = factors.iter().skip(dimension + 1).map(|f| format!("dc{}", f.idx())).collect::<Vec<_>>().join(" * "); + let divide = { + let divide = factors + .iter() + .skip(dimension + 1) + .map(|f| format!("dc{}", f.idx())) + .collect::<Vec<_>>() + .join(" * "); + if divide.is_empty() { + "1".to_string() + } else { + divide + } + }; let modulo = format!("dc{}", factors[*dimension].idx()); - write!(w, "{}{} = (blockIdx.x / ({})) % {};\n", tabs, declare_variable, divide, modulo)?; + write!( + w, + "{}{} = (blockIdx.x / ({})) % {};\n", + tabs, define_variable, divide, modulo + )?; + } + KernelState::InThreadFork => { + todo!() + } + _ => { + panic!("Unsupported state for ThreadID") } - 2 => {} - _ => { panic!("Unsupported state for ThreadID") } } } // Fork initializes the reduce and reduct updates the reduce - Node::Reduce { control: _, init: _, reduct: _ } => {} + Node::Reduce { + control: _, + init: _, + reduct: _, + } => {} // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { - write!(w, "{}{};\n", tabs, declare_variable)?; - let define_variable = self.get_value(id, false, false); self.codegen_constant( - if self.types[self.typing[id.idx()].idx()].is_primitive() { define_variable } else { format!("*{}", define_variable)}, + if self.types[self.typing[id.idx()].idx()].is_primitive() { + define_variable + } else { + format!("*{}", define_variable) + }, *cons_id, true, w, @@ -685,7 +850,7 @@ impl GPUContext<'_> { w, "{}{} = !{};\n", tabs, - declare_variable, + define_variable, self.get_value(*input, false, false), )?; } @@ -694,7 +859,7 @@ impl GPUContext<'_> { w, "{}{} = ~{};\n", tabs, - declare_variable, + define_variable, self.get_value(*input, false, false), )?; } @@ -706,7 +871,7 @@ impl GPUContext<'_> { w, "{}{} = -{};\n", tabs, - 
declare_variable, + define_variable, self.get_value(*input, false, false), )?; } @@ -719,7 +884,7 @@ impl GPUContext<'_> { w, "{}{} = static_cast<{}>({});\n", tabs, - declare_variable, + define_variable, self.get_type(*dst_ty_id, false), self.get_value(*input, false, false), )?; @@ -732,29 +897,29 @@ impl GPUContext<'_> { (BinaryOperator::Rem, Type::Float32) => write!( w, "{}{} = fmodf({}, {});\n", - tabs, declare_variable, left_val, right_val, + tabs, define_variable, left_val, right_val, )?, (BinaryOperator::Rem, Type::Float64) => write!( w, "{}{} = fmod({}, {});\n", - tabs, declare_variable, left_val, right_val, + tabs, define_variable, left_val, right_val, )?, // Doesn't need special syntax but bool type (BinaryOperator::Or, Type::Boolean) => write!( w, "{}{} = {} || {};\n", - tabs, declare_variable, left_val, right_val, + tabs, define_variable, left_val, right_val, )?, (BinaryOperator::And, Type::Boolean) => write!( w, "{}{} = {} && {};\n", - tabs, declare_variable, left_val, right_val, + tabs, define_variable, left_val, right_val, )?, (op, _) => write!( w, "{}{} = {} {} {};\n", tabs, - declare_variable, + define_variable, left_val, match op { BinaryOperator::Add => "+", @@ -789,7 +954,7 @@ impl GPUContext<'_> { w, "{}{} = {} ? {} : {};", tabs, - declare_variable, + define_variable, self.get_value(*first, false, false), self.get_value(*second, false, false), self.get_value(*third, false, false), @@ -803,16 +968,14 @@ impl GPUContext<'_> { w, "{}{} = {}({});\n", tabs, - declare_variable, + define_variable, func_name, self.get_value(args[0], false, false), )?; } - // Main difference between read and write is codegen_copy takes the + // Main difference between read and write is codegen_copy takes the // returned node's type for read and data node's type for write Node::Read { collect, indices } => { - write!(w, "{}{};\n", tabs, declare_variable); - let define_variable = self.get_value(id, false, false); let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); let type_id = self.typing[id.idx()]; @@ -822,7 +985,7 @@ impl GPUContext<'_> { &define_variable, &collect_with_indices, if !self.types[type_id.idx()].is_primitive() { - Some(thread_quota) + Some(use_thread_quota) } else { None }, @@ -836,7 +999,6 @@ impl GPUContext<'_> { data, indices, } => { - write!(w, "{}{};\n", tabs, declare_variable); let data_variable = self.get_value(*data, false, false); let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); @@ -847,15 +1009,14 @@ impl GPUContext<'_> { &data_variable, &collect_with_indices, if !self.types[type_id.idx()].is_primitive() { - Some(thread_quota) + Some(use_thread_quota) } else { None }, - state == 0, + state == KernelState::OutBlockFork, w, *num_tabs, )?; - let define_variable = self.get_value(id, false, false); let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } @@ -867,13 +1028,7 @@ impl GPUContext<'_> { let val = self.get_value(id, false, false); for phi in phis { let phi_val = self.get_value(*phi, false, false); - write!( - w, - "{}{} = {};\n", - tabs, - phi_val, - val, - )?; + write!(w, "{}{} = {};\n", tabs, phi_val, val,)?; } } if let Some(reduces) = self.reduct_reduce_map.get(&id) { @@ -922,11 +1077,16 @@ impl GPUContext<'_> { control: _, factors: _, } => { - // Emitting reduces before the fork allows the reduce to be + // Emitting reduces 
before the fork allows the reduce to be // used outside of the fork. for &reduce in self.fork_reduce_map.get(&id).unwrap() { let reduce_val = self.get_value(reduce, true, false); - let Node::Reduce { control: _, init, reduct: _ } = &self.function.nodes[reduce.idx()] else { + let Node::Reduce { + control: _, + init, + reduct: _, + } = &self.function.nodes[reduce.idx()] + else { panic!("Expected reduce node"); }; let init_val = self.get_value(*init, true, false); @@ -952,8 +1112,8 @@ impl GPUContext<'_> { Ok(()) } - // Handles copying data to/from global and shared memory. Thread parallelization - // is used only for arrays (possibly inside another collection). is_char indicates + // Handles copying data to/from global and shared memory. Thread parallelization + // is used only for arrays (possibly inside another collection). is_char indicates // a char type and we need to including element size in indexing. fn codegen_copy( &self, @@ -981,7 +1141,7 @@ impl GPUContext<'_> { s } }; - // Either we parallelize over threads or gate the loop by threadIdx.x + // Either we parallelize over threads or gate the loop by threadIdx.x // == 0 let mut extra_tab = ""; let mut extra_tab2 = ""; @@ -990,10 +1150,15 @@ impl GPUContext<'_> { extra_tab = "\t"; } let has_thread_quota = thread_quota.is_some(); - write!(w, "{}", if has_thread_quota { + write!( + w, + "{}", + if has_thread_quota { format!( "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", - tabs, rem_array_size, thread_quota.unwrap() + tabs, + rem_array_size, + thread_quota.unwrap() ) } else { format!( @@ -1004,21 +1169,37 @@ impl GPUContext<'_> { ); let element_type_name = self.get_type(*element_type_id, true); let (new_collect, new_data) = if self.is_char(type_id) { - (format!( - "{} + i * {}", - collect, - self.get_size(*element_type_id, None) - ), - format!( - "{} + i * {}", - data, - self.get_size(*element_type_id, None) - )) + ( + format!( + "{} + i * {}", + collect, + self.get_size(*element_type_id, None) + ), + format!("{} + i * {}", data, self.get_size(*element_type_id, None)), + ) } else { (format!("{} + i", collect), format!("{} + i", data)) }; - let new_collect = format!("{}reinterpret_cast<{}>({})", if self.types[element_type_id.idx()].is_primitive() { "*" } else { "" }, element_type_name, new_collect); - let new_data = format!("{}reinterpret_cast<{}>({})", if self.types[element_type_id.idx()].is_primitive() { "*" } else { "" }, element_type_name, new_data); + let new_collect = format!( + "{}reinterpret_cast<{}>({})", + if self.types[element_type_id.idx()].is_primitive() { + "*" + } else { + "" + }, + element_type_name, + new_collect + ); + let new_data = format!( + "{}reinterpret_cast<{}>({})", + if self.types[element_type_id.idx()].is_primitive() { + "*" + } else { + "" + }, + element_type_name, + new_data + ); self.codegen_copy( is_write, *element_type_id, @@ -1027,7 +1208,9 @@ impl GPUContext<'_> { None, false, w, - num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 1 } else { 2 }, + num_tabs + + if block_restrict { 1 } else { 0 } + + if has_thread_quota { 1 } else { 2 }, )?; if !has_thread_quota { write!(w, "{}\t}}\n", tabs)?; @@ -1052,8 +1235,28 @@ impl GPUContext<'_> { for (i, field) in fields.iter().enumerate() { let offset = self.get_size(type_id, Some(i)); let field_type_name = self.get_type(*field, true); - let new_collect = format!("{}reinterpret_cast<{}>({} + {})", if self.types[field.idx()].is_primitive() { "*" } else { "" }, field_type_name, collect, offset); - let new_data = 
format!("{}reinterpret_cast<{}>({} + {})", if self.types[field.idx()].is_primitive() { "*" } else { "" }, field_type_name, data, offset); + let new_collect = format!( + "{}reinterpret_cast<{}>({} + {})", + if self.types[field.idx()].is_primitive() { + "*" + } else { + "" + }, + field_type_name, + collect, + offset + ); + let new_data = format!( + "{}reinterpret_cast<{}>({} + {})", + if self.types[field.idx()].is_primitive() { + "*" + } else { + "" + }, + field_type_name, + data, + offset + ); self.codegen_copy( is_write, *field, @@ -1062,7 +1265,9 @@ impl GPUContext<'_> { thread_quota, false, w, - num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 0 } else { 1 }, + num_tabs + + if block_restrict { 1 } else { 0 } + + if has_thread_quota { 0 } else { 1 }, )?; } if !has_thread_quota { @@ -1088,14 +1293,40 @@ impl GPUContext<'_> { // We can guarantee correctness for summation by just copying the // largest variant. let max_variant_size = self.get_size(type_id, None); - write!(w, "{}{}{}max_variant_size = {};\n", tabs, extra_tab, extra_tab2, max_variant_size)?; + write!( + w, + "{}{}{}max_variant_size = {};\n", + tabs, extra_tab, extra_tab2, max_variant_size + )?; for (i, variant) in variants.iter().enumerate() { let prefix = if i == 0 { "if" } else { "else if" }; let variant_size = self.get_size(*variant, None); - write!(w, "{}{}{}{} (max_variant_size == {}) {{\n", tabs, extra_tab, extra_tab2, prefix, variant_size)?; + write!( + w, + "{}{}{}{} (max_variant_size == {}) {{\n", + tabs, extra_tab, extra_tab2, prefix, variant_size + )?; let field_type_name = self.get_type(*variant, true); - let new_collect = format!("{}reinterpret_cast<{}>({})", if self.types[variant.idx()].is_primitive() { "*" } else { "" }, field_type_name, collect); - let new_data = format!("{}reinterpret_cast<{}>({})", if self.types[variant.idx()].is_primitive() { "*" } else { "" }, field_type_name, data); + let new_collect = format!( + "{}reinterpret_cast<{}>({})", + if self.types[variant.idx()].is_primitive() { + "*" + } else { + "" + }, + field_type_name, + collect + ); + let new_data = format!( + "{}reinterpret_cast<{}>({})", + if self.types[variant.idx()].is_primitive() { + "*" + } else { + "" + }, + field_type_name, + data + ); self.codegen_copy( is_write, *variant, @@ -1104,7 +1335,9 @@ impl GPUContext<'_> { thread_quota, false, w, - num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 0 } else { 1 }, + num_tabs + + if block_restrict { 1 } else { 0 } + + if has_thread_quota { 0 } else { 1 }, )?; write!(w, "{}{}{}}}\n", tabs, extra_tab, extra_tab2)?; } @@ -1225,7 +1458,7 @@ impl GPUContext<'_> { Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?, Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, val)?, Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, - // All three followign collections involve align then allocate from the + // All three followign collections involve align then allocate from the // single dynamic shared memory buffer by using and updating the offset. 
Constant::Product(type_id, constant_fields) => { if allow_allocate { @@ -1236,23 +1469,24 @@ impl GPUContext<'_> { "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; {}{} = dynamic_shared + dynamic_shared_offset;\n {}dynamic_shared_offset += {};\n", - tabs, - alignment, - alignment, - alignment, - tabs, - name, - tabs, - size, + tabs, alignment, alignment, alignment, tabs, name, tabs, size, )?; } - let Type::Product(type_fields) = &self.types[type_id.idx()] else { panic!("Product constant should have product type") }; + let Type::Product(type_fields) = &self.types[type_id.idx()] else { + panic!("Product constant should have product type") + }; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call let field_constant = &self.constants[constant_fields[i].idx()]; let field_type = self.get_type(type_fields[i], true); let offset = self.get_size(type_fields[i], Some(i)); - self.codegen_constant(format!("*reinterpret_cast<{}>({}{})", field_type, name, offset), constant_fields[i], false, w, num_tabs); + self.codegen_constant( + format!("*reinterpret_cast<{}>({}{})", field_type, name, offset), + constant_fields[i], + false, + w, + num_tabs, + ); } } Constant::Summation(type_id, variant, field) => { @@ -1264,22 +1498,24 @@ impl GPUContext<'_> { "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; {}{} = dynamic_shared + dynamic_shared_offset;\n {}dynamic_shared_offset += {};\n", - tabs, - alignment, - alignment, - alignment, - tabs, - name, - tabs, - size, + tabs, alignment, alignment, alignment, tabs, name, tabs, size, )?; } // No offset updating needed since all variants start at 0 - let Type::Summation(variants) = &self.types[type_id.idx()] else { panic!("Summation constant should have summation type") }; - let variant_type = self.get_type(self.typing[variants[*variant as usize].idx()], true); + let Type::Summation(variants) = &self.types[type_id.idx()] else { + panic!("Summation constant should have summation type") + }; + let variant_type = + self.get_type(self.typing[variants[*variant as usize].idx()], true); let variant_constant = &self.constants[field.idx()]; if variant_constant.is_scalar() { - self.codegen_constant(format!("*reinterpret_cast<{}>{}", variant_type, name) , cons_id, false, w, num_tabs); + self.codegen_constant( + format!("*reinterpret_cast<{}>{}", variant_type, name), + cons_id, + false, + w, + num_tabs, + ); } else if !variant_constant.is_array() { self.codegen_constant(name, cons_id, false, w, num_tabs); }; @@ -1296,15 +1532,7 @@ impl GPUContext<'_> { ";\n{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; {}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n {}dynamic_shared_offset += {};\n", - tabs, - alignment, - alignment, - alignment, - tabs, - name, - element_type, - tabs, - size + tabs, alignment, alignment, alignment, tabs, name, element_type, tabs, size )?; } } @@ -1312,7 +1540,7 @@ impl GPUContext<'_> { } // Emit code to calculate data size. For Product types, setting `field_number` - // gives data size up to but not including that field, so = 2 gives 1st field + // gives data size up to but not including that field, so = 2 gives 1st field // and offset to 2nd field. This is useful for generating constant initialization // and read/write index math. 
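// As a concrete illustration of the array case below (values hypothetical):
// for an Array(Float32, [dc0, dc1]) type the returned size expression is the
// string "4 * dc0 * dc1", since a primitive element's alignment doubles as
// its size; product and summation sizes are likewise built up as strings from
// their fields' sizes and alignments.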
fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { @@ -1336,11 +1564,18 @@ impl GPUContext<'_> { if acc == "0" { size } else { - format!("({} + {} - 1) / {}) * {} + {}", acc, align, align, align, size) + format!( + "({} + {} - 1) / {}) * {} + {}", + acc, align, align, align, size + ) } }); if num_fields < &fields.len() { - format!("{} - {}", with_field, self.get_size(fields[*num_fields], None)) + format!( + "{} - {}", + with_field, + self.get_size(fields[*num_fields], None) + ) } else { with_field } @@ -1349,37 +1584,38 @@ impl GPUContext<'_> { // The argmax variant by size is not guaranteed to be same as // argmax variant by alignment, eg product of 3 4-byte primitives // vs 1 8-byte primitive, so we need to calculate both. - let max_size = variants - .iter() - .map(|id| self.get_size(*id, None)) - .fold(String::from("0"), |acc, x| { + let max_size = variants.iter().map(|id| self.get_size(*id, None)).fold( + String::from("0"), + |acc, x| { if acc == "0" { x } else { format!("umax({}, {})", acc, x) } - }); + }, + ); let max_alignment = variants .iter() .map(|id| self.get_alignment(*id)) .max() .unwrap_or(0); - format!("({} + {} - 1) / {} * {}", max_size, max_alignment, max_alignment, max_alignment) + format!( + "({} + {} - 1) / {} * {}", + max_size, max_alignment, max_alignment, max_alignment + ) } - _ => format!("{}", self.get_alignment(type_id)) + _ => format!("{}", self.get_alignment(type_id)), } } fn get_alignment(&self, type_id: TypeID) -> usize { match &self.types[type_id.idx()] { Type::Array(element_type, _) => self.get_alignment(*element_type), - Type::Product(fields) | Type::Summation(fields) => { - fields - .iter() - .map(|field| self.get_alignment(*field)) - .max() - .unwrap_or(0) - } + Type::Product(fields) | Type::Summation(fields) => fields + .iter() + .map(|field| self.get_alignment(*field)) + .max() + .unwrap_or(0), Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, Type::Integer16 | Type::UnsignedInteger16 => 2, Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, @@ -1504,7 +1740,7 @@ impl GPUContext<'_> { } // Check if a type should be represented as char*. Must be a product, - // summation, or array of product/summation types. + // summation, or array of product/summation types. 
fn is_char(&self, type_id: TypeID) -> bool { match &self.types[type_id.idx()] { Type::Product(_) | Type::Summation(_) => true, @@ -1555,18 +1791,14 @@ impl GPUContext<'_> { } } - // Setting make_pointer = true will only affect primitive types- the + // Setting make_pointer = true will only affect primitive types- the // collections are already pointers fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { // Product and summation collections are char* for byte-addressability // since we can have variable type fields - Type::Product(_) | Type::Summation(_) => { - "char*".to_string() - } - Type::Array(element_type, _) => { - self.get_type(*element_type, true) - } + Type::Product(_) | Type::Summation(_) => "char*".to_string(), + Type::Array(element_type, _) => self.get_type(*element_type, true), _ => convert_type(&self.types[id.idx()], make_pointer), } } -- GitLab From dbb1d863322d7ad129762c02820f17cccb288441 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 13:28:33 -0600 Subject: [PATCH 012/109] cg will solve it promise --- hercules_cg/src/gpu.rs | 487 +++++++++++++++++++---------------------- 1 file changed, 229 insertions(+), 258 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index b129fcde..00b0051d 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -89,7 +89,7 @@ pub fn gpu_codegen<W: Write>( .get(&NodeID::new(idx)) .map_or(true, |reduces| reduces.is_empty()) { - panic!("Join node {} has no reduce nodes", idx); + panic!("Fork node {} has no reduce nodes", idx); } } @@ -157,6 +157,8 @@ pub fn gpu_codegen<W: Write>( }; let label_data_for_phi = &label_data_for_phi(); + let fork_join_map = &fork_join_map(function, control_subgraph); + let ctx = GPUContext { function, types, @@ -170,6 +172,7 @@ pub fn gpu_codegen<W: Write>( fork_reduce_map, reduct_reduce_map, label_data_for_phi, + fork_join_map, return_type_id, }; ctx.codegen_function(w) @@ -195,6 +198,7 @@ struct GPUContext<'a> { fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>, + fork_join_map: &'a HashMap<NodeID, NodeID>, return_type_id: &'a TypeID, } @@ -244,18 +248,17 @@ impl GPUContext<'_> { self.codegen_declare_all(&mut top)?; self.codegen_helpers(&mut top)?; - let (fork_tree, fork_control_map) = self.make_fork_structures(); + let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); - let thread_root_forks = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); - let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, &thread_root_forks); + let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); + let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); let start = NodeID::new(0); let ret = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_return()) .map(NodeID::new) .next() .unwrap(); - let (begin_control, end_control) = self.get_begin_end_control(start, ret); // We use CUDA's goto to jump between basic blocks. 
let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) @@ -269,18 +272,15 @@ impl GPUContext<'_> { self.codegen_data_control( if num_blocks > 1 { - Some(root_forks[0]) + Some(thread_root_root_fork) } else { None }, &thread_root_forks, &fork_tree, &fork_control_map, - &begin_control, - &end_control, &fork_thread_quota_map, num_threads, - num_blocks, &mut gotos, )?; @@ -376,10 +376,19 @@ impl GPUContext<'_> { Ok(()) } - // To abide by c++ reassignment restrictions, we declare all values upfront. + // To abide by c++ reassignment restrictions, we declare all data values + // upfront. We also declare an iteration variable for each fork, which will + // be used for non-parallelized forks. Thus, some may go unused, but we don't + // know which points at time of this call- could move this function after that + // analysis but for now not. fn codegen_declare_all(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - write!(w, "\t{};\n", self.get_value(id, true, false))?; + if !self.function.nodes[id.idx()].is_control() { + write!(w, "\t{};\n", self.get_value(id, true, false))?; + } + if self.function.nodes[id.idx()].is_fork() { + write!(w, "\tunsigned int {} = 0;\n", self.get_fork_iter(id))?; + } } Ok(()) } @@ -394,87 +403,36 @@ impl GPUContext<'_> { Ok(()) } - /* Create two fork structures: - * First, fork_forward_adjacency is a map from each fork node F to all forks satisfying: + /* Create fork_tree, a map from each fork node F to all forks satisfying: * a) domination by F * b) no domination by F's join * c) no domination by any other fork that's also dominated by F, where we don't count self-domination + * Note that the fork_tree also includes the start node, to include all controls + * outside any fork. 
+ * * Second, fork_control_map is a map from fork node to all control nodes (including itself) satisfying: * a) domination by F * b) no domination by F's join * c) no domination by any other fork that's also dominated by F, where we do count self-domination */ - fn make_fork_structures(&self) -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, Vec<NodeID>>) { - let mut fork_tree: HashMap<NodeID, Vec<NodeID>> = (0..self.function.nodes.len()) - .filter(|idx| self.function.nodes[*idx].is_fork()) - .map(|idx| (NodeID::new(idx), vec![])) - .collect(); - let mut fork_control_map = HashMap::new(); - let mut queued_nodes = VecDeque::new(); - - for (fork_node, fork_children) in fork_tree.iter_mut() { - let mut control_vec = vec![]; - queued_nodes.push_back(*fork_node); - while !queued_nodes.is_empty() { - let node = queued_nodes.pop_front().unwrap(); - control_vec.push(node); - for child in self.control_subgraph.succs(node) { - if self.function.nodes[child.idx()].is_fork() { - fork_children.push(child); - } else if self.function.nodes[child.idx()].is_join() { - control_vec.push(child); - } else { - queued_nodes.push_back(child); - } + fn make_fork_structures(&self, fork_join_map: &HashMap<NodeID, NodeID>) -> (HashMap<NodeID, HashSet<NodeID>>, HashMap<NodeID, HashSet<NodeID>>) { + let dom = dominator(self.control_subgraph, NodeID::new(0)); + let fork_nesting = compute_fork_join_nesting(self.function, &dom, fork_join_map); + fork_nesting.into_iter().fold( + (HashMap::new(), HashMap::new()), + |(mut fork_tree, mut fork_control_map), (control, forks)| { + let nested_fork = forks.first().copied().unwrap_or(NodeID::new(0)); + if self.function.nodes[control.idx()].is_fork() { + fork_tree.entry(nested_fork).or_insert_with(HashSet::new).insert(control); + } else { + fork_control_map.entry(nested_fork).or_insert_with(HashSet::new).insert(control); } - } - fork_control_map.insert(*fork_node, control_vec); - } - (fork_tree, fork_control_map) - } - - // Get control nodes succeeding the start and preceding all forks, and - // control nodes preceding the return and succeeding all joins - fn get_begin_end_control( - &self, - start: NodeID, - ret: NodeID, - ) -> (HashSet<NodeID>, HashSet<NodeID>) { - let mut begin_visited = HashSet::new(); - let mut begin_worklist = VecDeque::new(); - begin_worklist.push_back(start); - - while let Some(node) = begin_worklist.pop_front() { - if begin_visited.contains(&node) { - continue; - } - if self.function.nodes[node.idx()].is_fork() { - continue; - } - begin_visited.insert(node); - for pred in self.control_subgraph.preds(node) { - begin_worklist.push_back(pred); - } - } - - let mut end_visited = HashSet::new(); - let mut end_worklist = VecDeque::new(); - end_worklist.push_back(ret); - - while let Some(node) = end_worklist.pop_front() { - if end_visited.contains(&node) { - continue; - } - if self.function.nodes[node.idx()].is_join() { - continue; - } - end_visited.insert(node); - for succ in self.control_subgraph.preds(node) { - end_worklist.push_back(succ); - } - } - - (begin_visited, end_visited) + for i in 0..forks.len()-1 { + fork_tree.entry(forks[i+1]).or_insert_with(HashSet::new).insert(forks[i]); + } + (fork_tree, fork_control_map) + }, + ) } /* @@ -483,21 +441,15 @@ impl GPUContext<'_> { */ fn get_root_forks_and_num_blocks( &self, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, max_num_blocks: usize, - ) -> (Vec<NodeID>, usize) { - let mut root_forks: HashSet<NodeID> = fork_tree.keys().copied().collect(); - for (_, children) in 
fork_tree.iter() { - for child in children { - root_forks.remove(child); - } - } - let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); + ) -> (HashSet<NodeID>, usize) { + let root_forks: HashSet<NodeID> = fork_tree.get(&NodeID::new(0)).unwrap().clone(); if root_forks.len() != 1 { return (root_forks, 1); } - let root_fork = root_forks[0]; + let root_fork = root_forks.iter().next().unwrap(); let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else { panic!("Expected fork node"); }; @@ -517,7 +469,7 @@ impl GPUContext<'_> { * We run post-order traversal on the fork tree to get the thread quota per * subtree. In particular, each fork starts with a base factor as the * maximum over its descendants (leafs have base 1). We traverse up (details - * in helper) and pass the factor and a map from fork node to + * in helper) and pass the factor and a map from fork node to a tuple of * (max quota of its siblings (including itself), its quota, its fork factor) * - all three are needed for codegen. A node is in the map IFF it will be parallelized. * If not, the fork will use the parent's quota. Nodes may be removed from the @@ -525,13 +477,10 @@ impl GPUContext<'_> { */ fn get_thread_quotas( &self, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, - root_forks: &Vec<NodeID>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + root_fork: NodeID, ) -> (HashMap<NodeID, (usize, usize, usize)>, usize) { - // We clone to add dummy root-of-roots fork - let mut fork_tree = fork_tree.clone(); - fork_tree.insert(root_forks[0], root_forks.clone()); - let (tree_map, tree_quota, _) = self.recurse_thread_quotas(root_forks[0], &fork_tree, true); + let (tree_map, tree_quota, _) = self.recurse_thread_quotas(root_fork, fork_tree, true); (tree_map, tree_quota) } @@ -539,7 +488,7 @@ impl GPUContext<'_> { fn recurse_thread_quotas( &self, curr_fork: NodeID, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, is_root: bool, ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) { // Subsubtree map is the union of all keys for grandchildren and lower @@ -566,7 +515,7 @@ impl GPUContext<'_> { let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else { panic!("Expected fork node"); }; - let fork_size = self.multiply_fork_factors(factors).unwrap_or(0); + let fork_size = self.multiply_fork_factors(factors).unwrap(); subsubtree_map.insert(*child, (subtree_quota, *quota, fork_size)); } let subtree_map = subsubtree_map; @@ -580,7 +529,7 @@ impl GPUContext<'_> { * c) the known size is a power of 2 * d) all reduces are parallel-reduce or associative * - * Note: in what follows, there are a few cases where we choose between + * Note: there are a few cases where we choose between * parallelizing the fork vs its subtree, by taking max factor over subtree. * However, parts of the subtree may have had smaller quotas and didn't * need to be discarded. For now we avoid this complexity and discard full. 
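// A worked instance of the quota rule above, with hypothetical numbers: take
// max_num_threads = 1024, threads_per_warp = 32, a fork of size 8, and a
// subtree quota of 64. Since 8 <= 1024 / 64 the fork fits; with no
// associative reduce both levels are parallelized with quota 8 * 64 = 512.
// With an associative reduce, 32 % 512 != 0 and 8 % 32 != 0, so only the
// subtree keeps its quota of 64 and the fork itself stays serialized.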
@@ -634,68 +583,74 @@ impl GPUContext<'_> { fn get_thread_root_forks( &self, - root_forks: &Vec<NodeID>, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, + root_forks: &HashSet<NodeID>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, num_blocks: usize, - ) -> Vec<NodeID> { + ) -> (NodeID, HashSet<NodeID>) { if num_blocks > 1 { - root_forks.clone() + (NodeID::new(0), root_forks.clone()) } else { - fork_tree.get(&root_forks[0]).unwrap().to_vec() + let root_fork = root_forks.iter().next().unwrap(); + (*root_fork, fork_tree.get(&root_fork).unwrap().iter().copied().collect()) } } fn codegen_data_control( &self, block_fork: Option<NodeID>, - thread_root_forks: &Vec<NodeID>, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, - fork_control_map: &HashMap<NodeID, Vec<NodeID>>, - begin_control: &HashSet<NodeID>, - end_control: &HashSet<NodeID>, + thread_root_forks: &HashSet<NodeID>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, num_threads: usize, - num_blocks: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { // Define the following states: // 0 is above block fork, 1 is in block fork above any thread fork, 2 is // in any thread fork, 3 is below block fork - // If num_blocks > 1, initialize state to 0, else 1 + // First emit data and control gen for each control node outside any fork. + // Recall that this was tracked through a fake fork node with NodeID 0. + // If num_blocks > 1, initialize state to 0, else 1. This is because + // if there is no block fork, then everything is in a single block, which + // is semantically the same as being directly nested in the block fork. let has_block_fork = block_fork.is_some(); let mut state = if has_block_fork { KernelState::OutBlockFork } else { KernelState::InBlockFork }; - // Then generate data and control for each control in begin_control - for control in begin_control { - let body = &mut gotos.get_mut(control).unwrap().body; + for control in fork_control_map.get(&NodeID::new(0)).unwrap() { + let goto = gotos.get_mut(control).unwrap(); + let init = &mut goto.init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(*control, None, None, false, init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; + self.codegen_data_node(*data, state, num_threads, None, None, body, &mut tabs)?; } - let term = &mut gotos.get_mut(control).unwrap().term; - self.codegen_control_node(*control, term, 1)?; } - // Then if num_blocks > 1, set state to 1 and generate data and control - // for the single root fork + // Then generate data and control for the single block fork if it exists if has_block_fork { state = KernelState::InBlockFork; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { + let goto = gotos.get_mut(control).unwrap(); + let init = &mut goto.init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(*control, None, None, false, init, term)?; for data in self.bbs.1[control.idx()].iter() { - let body = &mut gotos.get_mut(control).unwrap().body; - self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; + self.codegen_data_node(*data, state, num_threads, None, Some(block_fork.unwrap()), body, &mut tabs)?; } - let term = &mut gotos.get_mut(control).unwrap().term; - self.codegen_control_node(*control, term, 
1)?; } } - // Set state to 2 and begin DFS through fork_tree (after root_fork if - // visited in previous step), updating thread_quota + // Then generate for the thread fork tree by setting state to 2, traverse, + // and update the thread quota. Any traversal is fine, we choose pre-order. + state = KernelState::InThreadFork; for &root_fork in thread_root_forks { self.codegen_data_control_traverse( root_fork, + state, fork_tree, fork_control_map, fork_thread_quota_map, @@ -704,57 +659,46 @@ impl GPUContext<'_> { gotos, )?; } - // If num_blocks > 1, set state to 3, else 1 - state = if num_blocks > 1 { - KernelState::OutBlockFork - } else { - KernelState::InBlockFork - }; - for control in end_control { - let body = &mut gotos.get_mut(control).unwrap().body; - for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; - } - let term = &mut gotos.get_mut(control).unwrap().term; - self.codegen_control_node(*control, term, 1)?; - } - // Then generate data and control for each control in end_control Ok(()) } fn codegen_data_control_traverse( &self, curr_fork: NodeID, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, - fork_control_map: &HashMap<NodeID, Vec<NodeID>>, + state: KernelState, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, parent_quota: usize, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { - let (available_thread_quota, use_thread_quota, fork_factor) = fork_thread_quota_map + let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map .get(&curr_fork) .map(|(a, u, f)| (*a, *u, Some(*f))) .unwrap_or((parent_quota, parent_quota, None)); for control in fork_control_map.get(&curr_fork).unwrap() { - let body = &mut gotos.get_mut(control).unwrap().body; + let goto = gotos.get_mut(control).unwrap(); + let init = &mut goto.init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor.is_some(), init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, - KernelState::InThreadFork, - available_thread_quota, + state, use_thread_quota, - fork_factor, + parallel_factor, + Some(curr_fork), body, - &mut 1, + &mut tabs, )?; } - let term = &mut gotos.get_mut(control).unwrap().term; - self.codegen_control_node(*control, term, 1)?; } for child in fork_tree.get(&curr_fork).unwrap() { self.codegen_data_control_traverse( *child, + state, fork_tree, fork_control_map, fork_thread_quota_map, @@ -766,20 +710,28 @@ impl GPUContext<'_> { Ok(()) } + // state dictates where we are in the kernel, and affects ThreadID and Write + // use_thread_quota is the number of threads used by the node, and affects + // ThreadID, Read, Write, and associative Binops + // parallel_factor is parallelization degree, and affects ThreadID and associative + // Binops + // nesting_fork is the fork node that the node is nested in, and affects ThreadID + // and Reduce fn codegen_data_node( &self, id: NodeID, state: KernelState, - available_thread_quota: usize, use_thread_quota: usize, - fork_factor: Option<usize>, + parallel_factor: Option<usize>, + nesting_fork: Option<NodeID>, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { let define_variable = self.get_value(id, false, false).to_string(); let tabs = 
"\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { - // Phi registers were already emitted. + // Phi registers were already emitted and the data nodes it uses will + // update the phi Node::Phi { control: _, data: _, @@ -788,24 +740,10 @@ impl GPUContext<'_> { let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { panic!("Expected ThreadID's control to be a fork node"); }; + let divide = multiply_dcs(&factors[dimension + 1..]); + let modulo = format!("dc{}", factors[*dimension].idx()); match state { KernelState::InBlockFork => { - // Violating DRY with the naming but unsure how to map - // DynamicConstantID to NodeID to use `get_value` - let divide = { - let divide = factors - .iter() - .skip(dimension + 1) - .map(|f| format!("dc{}", f.idx())) - .collect::<Vec<_>>() - .join(" * "); - if divide.is_empty() { - "1".to_string() - } else { - divide - } - }; - let modulo = format!("dc{}", factors[*dimension].idx()); write!( w, "{}{} = (blockIdx.x / ({})) % {};\n", @@ -813,19 +751,40 @@ impl GPUContext<'_> { )?; } KernelState::InThreadFork => { - todo!() + if parallel_factor.is_none() { + let fork_iter = self.get_fork_iter(*control); + write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; + } else { + + + } } _ => { panic!("Unsupported state for ThreadID") } } } - // Fork initializes the reduce and reduct updates the reduce + // Only initialize the reduce, as reduct will update the reduce. If + // serialized, add gate to prevent re-assignment when we hit this reduce + // again Node::Reduce { control: _, - init: _, + init, reduct: _, - } => {} + } => { + let init_val = self.get_value(*init, false, false); + if parallel_factor.is_none() { + let Some(nesting_fork) = nesting_fork else { + panic!("Expected reduce to be nested in a fork node"); + }; + let fork_iter = self.get_fork_iter(nesting_fork); + write!(w, "{}if ({} == 0) {{\n", tabs, fork_iter)?; + write!(w, "{}\t{} = {};\n", tabs, define_variable, init_val)?; + write!(w, "{}}}\n", tabs)?; + } else { + write!(w, "{}{} = {};\n", tabs, define_variable, init_val)?; + } + } // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { @@ -1044,72 +1003,96 @@ impl GPUContext<'_> { fn codegen_control_node( &self, id: NodeID, - w: &mut String, - num_tabs: usize, - ) -> Result<(), Error> { - let tabs = "\t".repeat(num_tabs); - match &self.function.nodes[id.idx()] { + available_thread_quota: Option<usize>, + use_thread_quota: Option<usize>, + is_parallel: bool, + w_init: &mut String, + w_term: &mut String, + ) -> Result<usize, Error> { + let tabs = match &self.function.nodes[id.idx()] { Node::Start | Node::Region { preds: _ } - | Node::Projection { - control: _, - selection: _, - } => { + | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w, "{}goto {}\n", tabs, self.get_block_name(succ))?; + write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + 1 } Node::If { control: _, cond } => { let mut succs = self.control_subgraph.succs(id); let succ1 = succs.next().unwrap(); let succ2 = succs.next().unwrap(); write!( - w, - "{}if ({}) {{\n", - tabs, + w_term, + "\tif ({}) {{\n", self.get_value(*cond, false, false) )?; - write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; - write!(w, "{}}} else {{\n", tabs)?; - write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ2))?; - write!(w, "{}}}\n", tabs)?; + write!(w_term, "\t\tgoto {};\n", get_block_name(succ1))?; + 
write!(w_term, "\t}} else {{\n")?; + write!(w_term, "\t\tgoto {};\n", get_block_name(succ2))?; + write!(w_term, "\t}}\n")?; + 1 } - Node::Fork { - control: _, - factors: _, - } => { - // Emitting reduces before the fork allows the reduce to be - // used outside of the fork. - for &reduce in self.fork_reduce_map.get(&id).unwrap() { - let reduce_val = self.get_value(reduce, true, false); - let Node::Reduce { - control: _, - init, - reduct: _, - } = &self.function.nodes[reduce.idx()] - else { - panic!("Expected reduce node"); + Node::Fork { control: _, factors: _ } => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { + write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + write!(w_term, "\t\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\t}}\n")?; + write!(w_term, "\telse {{\n")?; + let join = self.fork_join_map.get(&id).unwrap(); + write!(w_term, "\t\tgoto {};\n", get_block_name(*join))?; + write!(w_term, "\t}}\n")?; + 2 + } else { + write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + 1 + } + } + Node::Join { control } => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + let has_thread_quota = available_thread_quota.is_some(); + if has_thread_quota { + let available_thread_quota = available_thread_quota.unwrap(); + let use_thread_quota = use_thread_quota.unwrap(); + if use_thread_quota < available_thread_quota { + write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + write!(w_term, "\t}}\n")?; + } + write!(w_term, "\t__syncthreads();\n")?; + } + if is_parallel { + write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + } else { + let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { + panic!("Expected join node to use a fork node"); }; - let init_val = self.get_value(*init, true, false); - write!(w, "{}{} = {};\n", tabs, reduce_val, init_val)?; + let fork_size = multiply_dcs(factors); + let fork_iter = self.get_fork_iter(*control); + write!(w_term, "\t{} += 1;\n", fork_iter)?; + write!(w_term, "\tif ({} == {}) {{\n", fork_iter, fork_size)?; + write!(w_term, "\t\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\t}}\n")?; + write!(w_term, "\telse {{\n")?; + write!(w_term, "\t\tgoto {};\n", get_block_name(*control))?; + write!(w_term, "\t}}\n")?; } + if has_thread_quota { 2 } else { 1 } } - Node::Join { control: _ } => {} Node::Return { control: _, data } => { if self.types[self.typing[data.idx()].idx()].is_primitive() { let return_val = self.get_value(*data, false, false); - write!( - w, - "{}if (threadIdx.x == 0) {{\n{}\t*ret = {};\n{}}}\n", - tabs, tabs, return_val, tabs - )?; + write!(w_term, "\tif (threadIdx.x == 0) {{\n")?; + write!(w_term, "\t\t*ret = {};\n", return_val)?; + write!(w_term, "\t}}\n")?; } - write!(w, "{}return;\n", tabs)?; + write!(w_term, "\treturn;\n")?; + 1 } _ => { panic!("Unsupported control node type") } - } - Ok(()) + }; + Ok(tabs) } // Handles copying data to/from global and shared memory. 
Thread parallelization @@ -1129,18 +1112,7 @@ impl GPUContext<'_> { let tabs = "\t".repeat(num_tabs); match &self.types[type_id.idx()] { Type::Array(element_type_id, extents) => { - let rem_array_size = { - let s = extents - .iter() - .map(|id| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join(" * "); - if s.is_empty() { - "1".to_string() - } else { - s - } - }; + let rem_array_size = multiply_dcs(extents); // Either we parallelize over threads or gate the loop by threadIdx.x // == 0 let mut extra_tab = ""; @@ -1464,20 +1436,15 @@ impl GPUContext<'_> { if allow_allocate { let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); - write!( - w, - "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; - {}{} = dynamic_shared + dynamic_shared_offset;\n - {}dynamic_shared_offset += {};\n", - tabs, alignment, alignment, alignment, tabs, name, tabs, size, - )?; + write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; + write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; } let Type::Product(type_fields) = &self.types[type_id.idx()] else { panic!("Product constant should have product type") }; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call - let field_constant = &self.constants[constant_fields[i].idx()]; let field_type = self.get_type(type_fields[i], true); let offset = self.get_size(type_fields[i], Some(i)); self.codegen_constant( @@ -1493,13 +1460,9 @@ impl GPUContext<'_> { if allow_allocate { let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); - write!( - w, - "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; - {}{} = dynamic_shared + dynamic_shared_offset;\n - {}dynamic_shared_offset += {};\n", - tabs, alignment, alignment, alignment, tabs, name, tabs, size, - )?; + write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; + write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; } // No offset updating needed since all variants start at 0 let Type::Summation(variants) = &self.types[type_id.idx()] else { @@ -1529,11 +1492,11 @@ impl GPUContext<'_> { let element_type = self.get_type(*element_type, false); write!( w, - ";\n{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; - {}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n - {}dynamic_shared_offset += {};\n", - tabs, alignment, alignment, alignment, tabs, name, element_type, tabs, size + "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", + tabs, alignment, alignment, alignment )?; + write!(w, "{}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n", tabs, name, element_type)?; + write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; } } Ok(()) @@ -1546,11 +1509,7 @@ impl GPUContext<'_> { fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { - let array_size = extents - .iter() - .map(|id| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join(" * "); + let array_size = multiply_dcs(extents); format!("{} * {}", self.get_alignment(*element_type), array_size) } Type::Product(fields) => { @@ 
-1759,8 +1718,8 @@ impl GPUContext<'_> { Ok(()) } - fn get_block_name(&self, id: NodeID) -> String { - format!("bb_{}", id.idx()) + fn get_fork_iter(&self, fork: NodeID) -> String { + format!("{}_iter", self.get_value(fork, false, false)) } // Setting ty = true will return with type in declaration format. make_pointer @@ -1811,6 +1770,18 @@ impl GPUContext<'_> { } } +fn get_block_name(id: NodeID) -> String { + format!("bb_{}", id.idx()) +} + +fn multiply_dcs(dcs: &[DynamicConstantID]) -> String { + if dcs.is_empty() { + "1".to_string() + } else { + dcs.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * ") + } +} + // TODO: Add float8, float16, bfloat16 dtypes if they come fn convert_type(ty: &Type, make_pointer: bool) -> String { let mut result = match ty { -- GitLab From f188278aa39eb46188d3d67400c5a17b326a54be Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 16:31:59 -0600 Subject: [PATCH 013/109] tmp --- hercules_cg/src/gpu.rs | 72 ++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 00b0051d..d7629ede 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -202,8 +202,13 @@ struct GPUContext<'a> { return_type_id: &'a TypeID, } +// Pre is its own basic block, to separate one-time vs repeated code. It is +// non-trivial only for Fork nodes to create cooperative groups. +// Init, Body, and Term compose the main basic block, with Init and Term populated +// by control flow (Init used only by Fork and Join) and Body populated by data flow. #[derive(Default, Debug)] struct CudaGoto { + pre: String, init: String, body: String, term: String, @@ -230,6 +235,10 @@ impl GPUContext<'_> { #include <cuda_runtime.h> #include <mma.h> #include <helper_cuda.h> +#include <cooperative_groups.h> +#include <cooperative_groups/memcpy_async.h> +#include <cooperative_groups/reduce.h> +namespace cg = cooperative_groups; #define uabs(a) (a) #define umin(a, b) ((a) < (b) ? (a) : (b)) @@ -377,18 +386,12 @@ impl GPUContext<'_> { } // To abide by c++ reassignment restrictions, we declare all data values - // upfront. We also declare an iteration variable for each fork, which will - // be used for non-parallelized forks. Thus, some may go unused, but we don't - // know which points at time of this call- could move this function after that - // analysis but for now not. + // upfront. 
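// In the emitted CUDA this declare-everything-up-front pass looks roughly
// like (types and names hypothetical):
//   int v3;
//   float v8;
//   ...
//   v3 = p0 + p1;   // assigned later, in whichever block defines the value
// so each data value is declared once at kernel scope and only assigned where
// its defining node is scheduled, keeping the goto-based control flow clear
// of C++ scoping and initialization-jumping issues.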
fn codegen_declare_all(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { if !self.function.nodes[id.idx()].is_control() { write!(w, "\t{};\n", self.get_value(id, true, false))?; } - if self.function.nodes[id.idx()].is_fork() { - write!(w, "\tunsigned int {} = 0;\n", self.get_fork_iter(id))?; - } } Ok(()) } @@ -622,10 +625,11 @@ impl GPUContext<'_> { }; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { let goto = gotos.get_mut(control).unwrap(); + let pre = &mut goto.pre; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, false, init, term)?; + let mut tabs = self.codegen_control_node(*control, None, None, false, pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, state, num_threads, None, None, body, &mut tabs)?; } @@ -635,10 +639,11 @@ impl GPUContext<'_> { state = KernelState::InBlockFork; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { let goto = gotos.get_mut(control).unwrap(); + let pre = &mut goto.pre; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, false, init, term)?; + let mut tabs = self.codegen_control_node(*control, None, None, false, pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, state, num_threads, None, Some(block_fork.unwrap()), body, &mut tabs)?; } @@ -679,10 +684,11 @@ impl GPUContext<'_> { .unwrap_or((parent_quota, parent_quota, None)); for control in fork_control_map.get(&curr_fork).unwrap() { let goto = gotos.get_mut(control).unwrap(); + let pre = &mut goto.pre; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor.is_some(), init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor.is_some(), pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, @@ -752,7 +758,7 @@ impl GPUContext<'_> { } KernelState::InThreadFork => { if parallel_factor.is_none() { - let fork_iter = self.get_fork_iter(*control); + let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; } else { @@ -777,7 +783,7 @@ impl GPUContext<'_> { let Some(nesting_fork) = nesting_fork else { panic!("Expected reduce to be nested in a fork node"); }; - let fork_iter = self.get_fork_iter(nesting_fork); + let fork_iter = self.get_fork_iter(nesting_fork, false); write!(w, "{}if ({} == 0) {{\n", tabs, fork_iter)?; write!(w, "{}\t{} = {};\n", tabs, define_variable, init_val)?; write!(w, "{}}}\n", tabs)?; @@ -1003,9 +1009,11 @@ impl GPUContext<'_> { fn codegen_control_node( &self, id: NodeID, + state: KernelState, available_thread_quota: Option<usize>, use_thread_quota: Option<usize>, is_parallel: bool, + w_pre: &mut String, w_init: &mut String, w_term: &mut String, ) -> Result<usize, Error> { @@ -1014,7 +1022,7 @@ impl GPUContext<'_> { | Node::Region { preds: _ } | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; 
1 } Node::If { control: _, cond } => { @@ -1026,9 +1034,9 @@ impl GPUContext<'_> { "\tif ({}) {{\n", self.get_value(*cond, false, false) )?; - write!(w_term, "\t\tgoto {};\n", get_block_name(succ1))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, true))?; write!(w_term, "\t}} else {{\n")?; - write!(w_term, "\t\tgoto {};\n", get_block_name(succ2))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, true))?; write!(w_term, "\t}}\n")?; 1 } @@ -1036,15 +1044,15 @@ impl GPUContext<'_> { let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; - write!(w_term, "\t\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, true))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; let join = self.fork_join_map.get(&id).unwrap(); - write!(w_term, "\t\tgoto {};\n", get_block_name(*join))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*join, true))?; write!(w_term, "\t}}\n")?; 2 } else { - write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; 1 } } @@ -1061,19 +1069,19 @@ impl GPUContext<'_> { write!(w_term, "\t__syncthreads();\n")?; } if is_parallel { - write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; } else { let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { panic!("Expected join node to use a fork node"); }; let fork_size = multiply_dcs(factors); - let fork_iter = self.get_fork_iter(*control); + let fork_iter = self.get_fork_iter(*control, false); write!(w_term, "\t{} += 1;\n", fork_iter)?; write!(w_term, "\tif ({} == {}) {{\n", fork_iter, fork_size)?; - write!(w_term, "\t\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, true))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; - write!(w_term, "\t\tgoto {};\n", get_block_name(*control))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*control, false))?; write!(w_term, "\t}}\n")?; } if has_thread_quota { 2 } else { 1 } @@ -1718,8 +1726,20 @@ impl GPUContext<'_> { Ok(()) } - fn get_fork_iter(&self, fork: NodeID) -> String { - format!("{}_iter", self.get_value(fork, false, false)) + fn get_cg_name(&self, start_or_fork: NodeID) -> String { + format!("cg_{}", self.get_value(start_or_fork, false, false)) + } + + fn get_fork_iter(&self, fork: NodeID, ty: bool) -> String { + if ty { + format!("unsigned int iter_{}", self.get_value(fork, false, false)) + } else { + format!("iter_{}", self.get_value(fork, false, false)) + } + } + + fn get_block_name(&self, id: NodeID, pre: bool) -> String { + format!("bb_{}{}", self.get_value(id, false, false), if pre { "_pre" } else { "" }) } // Setting ty = true will return with type in declaration format. 
make_pointer @@ -1770,10 +1790,6 @@ impl GPUContext<'_> { } } -fn get_block_name(id: NodeID) -> String { - format!("bb_{}", id.idx()) -} - fn multiply_dcs(dcs: &[DynamicConstantID]) -> String { if dcs.is_empty() { "1".to_string() -- GitLab From bb68a0dd97531c62f5b486a1c3d16ed20e314802 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 18:01:50 -0600 Subject: [PATCH 014/109] just reduct left --- hercules_cg/src/gpu.rs | 388 ++++++++--------------------------------- 1 file changed, 70 insertions(+), 318 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index d7629ede..9e70956f 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -254,7 +254,7 @@ namespace cg = cooperative_groups; self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; - self.codegen_declare_all(&mut top)?; + self.codegen_declare_data(&mut top)?; self.codegen_helpers(&mut top)?; let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); @@ -347,7 +347,10 @@ namespace cg = cooperative_groups; // Type is char since it's simplest to use single bytes for indexing, // casting will be needed for use with different types. - write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n")?; + write!(w, ") {{\n")?; + write!(w, "\textern __shared__ char dynamic_shared[];\n")?; + write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; + write!(w, "}}\n")?; Ok(()) } @@ -387,7 +390,7 @@ namespace cg = cooperative_groups; // To abide by c++ reassignment restrictions, we declare all data values // upfront. - fn codegen_declare_all(&self, w: &mut String) -> Result<(), Error> { + fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { if !self.function.nodes[id.idx()].is_control() { write!(w, "\t{};\n", self.get_value(id, true, false))?; @@ -403,6 +406,8 @@ namespace cg = cooperative_groups; fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!(w, "\tsize_t alignment;\n")?; write!(w, "\tsize_t max_variant_size;\n")?; + write!(w, "\tgrid_group grid = this_grid();\n")?; + write!(w, "\tthread_block block = this_thread_block();\n")?; Ok(()) } @@ -614,28 +619,20 @@ namespace cg = cooperative_groups; // First emit data and control gen for each control node outside any fork. // Recall that this was tracked through a fake fork node with NodeID 0. - // If num_blocks > 1, initialize state to 0, else 1. This is because - // if there is no block fork, then everything is in a single block, which - // is semantically the same as being directly nested in the block fork. 
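The prologue that codegen_kernel_begin and codegen_helpers are building toward looks roughly like the sketch below. The identifiers dynamic_shared, dynamic_shared_offset, alignment, grid, and block mirror the strings written in this hunk; the example kernel, its template dynamic constant, its parameter, and the alignment rounding are assumptions made only for illustration.

    #include <cstdint>
    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    // Illustrative prologue; only the identifier names are taken from the generator.
    template <long long int dc_p0>
    __global__ void __launch_bounds__(1024) example_kernel(float *p0) {
        extern __shared__ char dynamic_shared[];   // one byte-addressed shared pool
        uint64_t dynamic_shared_offset = 0;
        size_t alignment;
        cg::grid_group grid = cg::this_grid();
        cg::thread_block block = cg::this_thread_block();

        // Carving an aligned float buffer of dc_p0 elements out of the pool
        // (arithmetic is indicative only):
        alignment = alignof(float);
        dynamic_shared_offset =
            (dynamic_shared_offset + alignment - 1) / alignment * alignment;
        float *buffer = reinterpret_cast<float *>(dynamic_shared + dynamic_shared_offset);
        dynamic_shared_offset += dc_p0 * sizeof(float);
        (void)grid; (void)buffer; (void)p0;        // silence unused warnings in the sketch
    }
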
- let has_block_fork = block_fork.is_some(); - let mut state = if has_block_fork { - KernelState::OutBlockFork - } else { - KernelState::InBlockFork - }; + let mut state = KernelState::OutBlockFork; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { let goto = gotos.get_mut(control).unwrap(); let pre = &mut goto.pre; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, false, pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, None, None, None, pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, num_threads, None, None, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, None, None, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists - if has_block_fork { + if block_fork.is_some() { state = KernelState::InBlockFork; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { let goto = gotos.get_mut(control).unwrap(); @@ -643,9 +640,9 @@ namespace cg = cooperative_groups; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, false, pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, num_threads, None, Some(block_fork.unwrap()), body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), body, &mut tabs)?; } } } @@ -688,12 +685,12 @@ namespace cg = cooperative_groups; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor.is_some(), pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor, pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, state, - use_thread_quota, + Some(use_thread_quota), parallel_factor, Some(curr_fork), body, @@ -727,7 +724,7 @@ namespace cg = cooperative_groups; &self, id: NodeID, state: KernelState, - use_thread_quota: usize, + use_thread_quota: Option<usize>, parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, w: &mut String, @@ -761,8 +758,9 @@ namespace cg = cooperative_groups; let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; } else { - - + // We can directly use use_thread_quota and not worry about available + // because Fork basic block's init section already does gating + write!(w, "{}{} = (threadIdx.x % {}) / {};\n", tabs, define_variable, use_thread_quota.unwrap(), use_thread_quota.unwrap() / parallel_factor.unwrap())?; } } _ => { @@ -943,45 +941,45 @@ namespace cg = cooperative_groups; Node::Read { collect, indices } => { let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); - let type_id = self.typing[id.idx()]; - self.codegen_copy( - false, - type_id, - &define_variable, - &collect_with_indices, - if !self.types[type_id.idx()].is_primitive() { - Some(use_thread_quota) + let data_type_id = self.typing[id.idx()]; + if 
self.types[data_type_id.idx()].is_primitive() { + if is_char { + let type_name = self.get_type(data_type_id, true); + write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?; } else { - None - }, - false, - w, - *num_tabs, - )?; + write!(w, "{}{} = *{};\n", tabs, define_variable, collect_with_indices)?; + } + } else { + let nested_fork = nesting_fork.unwrap(); + let cg_name = self.get_cg_name(nested_fork, false); + let data_size = self.get_size(data_type_id, None); + write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, define_variable, collect_with_indices, data_size)?; + write!(w, "{}wait({});\n", tabs, cg_name)?; + } } Node::Write { collect, data, indices, } => { - let data_variable = self.get_value(*data, false, false); let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); - let type_id = self.typing[data.idx()]; - self.codegen_copy( - true, - type_id, - &data_variable, - &collect_with_indices, - if !self.types[type_id.idx()].is_primitive() { - Some(use_thread_quota) + let data_variable = self.get_value(*data, false, false); + let data_type_id = self.typing[data.idx()]; + if self.types[data_type_id.idx()].is_primitive() { + if is_char { + let type_name = self.get_type(data_type_id, true); + write!(w, "{}*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; } else { - None - }, - state == KernelState::OutBlockFork, - w, - *num_tabs, - )?; + write!(w, "{}*{} = {};\n", tabs, collect_with_indices, data_variable)?; + } + } else { + let nested_fork = nesting_fork.unwrap(); + let cg_name = self.get_cg_name(nested_fork, false); + let data_size = self.get_size(data_type_id, None); + write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, collect_with_indices, data_variable, data_size)?; + write!(w, "{}wait({});\n", tabs, cg_name)?; + } let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } @@ -1009,17 +1007,20 @@ namespace cg = cooperative_groups; fn codegen_control_node( &self, id: NodeID, - state: KernelState, available_thread_quota: Option<usize>, use_thread_quota: Option<usize>, - is_parallel: bool, + parallel_factor: Option<usize>, w_pre: &mut String, w_init: &mut String, w_term: &mut String, ) -> Result<usize, Error> { let tabs = match &self.function.nodes[id.idx()] { - Node::Start - | Node::Region { preds: _ } + Node::Start => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + 1 + } + Node::Region { preds: _ } | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; @@ -1041,6 +1042,17 @@ namespace cg = cooperative_groups; 1 } Node::Fork { control: _, factors: _ } => { + // We don't do anything smart to mitigate control flow divergence + // if use_thread_quota < warp size + let cg_name = self.get_cg_name(id, false); + if use_thread_quota.is_some() { + let use_thread_quota = use_thread_quota.unwrap(); + let use_thread_per_id = use_thread_quota / parallel_factor.unwrap(); + write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_per_id, cg_name, use_thread_per_id)?; + let cg_name_full = self.get_cg_name(id, true); + write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", 
use_thread_quota, cg_name_full, use_thread_quota)?; + } + write!(w_pre, "\tgoto {};\n", self.get_block_name(id, false))?; let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; @@ -1066,9 +1078,8 @@ namespace cg = cooperative_groups; write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; write!(w_term, "\t}}\n")?; } - write!(w_term, "\t__syncthreads();\n")?; } - if is_parallel { + if parallel_factor.is_some() { write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; } else { let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { @@ -1103,265 +1114,6 @@ namespace cg = cooperative_groups; Ok(tabs) } - // Handles copying data to/from global and shared memory. Thread parallelization - // is used only for arrays (possibly inside another collection). is_char indicates - // a char type and we need to including element size in indexing. - fn codegen_copy( - &self, - is_write: bool, - type_id: TypeID, - data: &String, - collect: &String, - thread_quota: Option<usize>, - block_restrict: bool, - w: &mut String, - num_tabs: usize, - ) -> Result<(), Error> { - let tabs = "\t".repeat(num_tabs); - match &self.types[type_id.idx()] { - Type::Array(element_type_id, extents) => { - let rem_array_size = multiply_dcs(extents); - // Either we parallelize over threads or gate the loop by threadIdx.x - // == 0 - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - let has_thread_quota = thread_quota.is_some(); - write!( - w, - "{}", - if has_thread_quota { - format!( - "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", - tabs, - rem_array_size, - thread_quota.unwrap() - ) - } else { - format!( - "{}if (threadIdx.x == 0) {{\n{}\tfor (int i = 0; i < {}; i++) {{\n", - tabs, tabs, rem_array_size - ) - } - ); - let element_type_name = self.get_type(*element_type_id, true); - let (new_collect, new_data) = if self.is_char(type_id) { - ( - format!( - "{} + i * {}", - collect, - self.get_size(*element_type_id, None) - ), - format!("{} + i * {}", data, self.get_size(*element_type_id, None)), - ) - } else { - (format!("{} + i", collect), format!("{} + i", data)) - }; - let new_collect = format!( - "{}reinterpret_cast<{}>({})", - if self.types[element_type_id.idx()].is_primitive() { - "*" - } else { - "" - }, - element_type_name, - new_collect - ); - let new_data = format!( - "{}reinterpret_cast<{}>({})", - if self.types[element_type_id.idx()].is_primitive() { - "*" - } else { - "" - }, - element_type_name, - new_data - ); - self.codegen_copy( - is_write, - *element_type_id, - &new_data, - &new_collect, - None, - false, - w, - num_tabs - + if block_restrict { 1 } else { 0 } - + if has_thread_quota { 1 } else { 2 }, - )?; - if !has_thread_quota { - write!(w, "{}\t}}\n", tabs)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; - } - Type::Product(fields) => { - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - let has_thread_quota = thread_quota.is_some(); - if !has_thread_quota { - write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, 
extra_tab)?; - extra_tab2 = "\t"; - } - for (i, field) in fields.iter().enumerate() { - let offset = self.get_size(type_id, Some(i)); - let field_type_name = self.get_type(*field, true); - let new_collect = format!( - "{}reinterpret_cast<{}>({} + {})", - if self.types[field.idx()].is_primitive() { - "*" - } else { - "" - }, - field_type_name, - collect, - offset - ); - let new_data = format!( - "{}reinterpret_cast<{}>({} + {})", - if self.types[field.idx()].is_primitive() { - "*" - } else { - "" - }, - field_type_name, - data, - offset - ); - self.codegen_copy( - is_write, - *field, - &new_data, - &new_collect, - thread_quota, - false, - w, - num_tabs - + if block_restrict { 1 } else { 0 } - + if has_thread_quota { 0 } else { 1 }, - )?; - } - if !has_thread_quota { - write!(w, "{}\t}}\n", tabs)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; - } - Type::Summation(variants) => { - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - let has_thread_quota = thread_quota.is_some(); - if !has_thread_quota { - write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; - extra_tab2 = "\t"; - } - // We can guarantee correctness for summation by just copying the - // largest variant. - let max_variant_size = self.get_size(type_id, None); - write!( - w, - "{}{}{}max_variant_size = {};\n", - tabs, extra_tab, extra_tab2, max_variant_size - )?; - for (i, variant) in variants.iter().enumerate() { - let prefix = if i == 0 { "if" } else { "else if" }; - let variant_size = self.get_size(*variant, None); - write!( - w, - "{}{}{}{} (max_variant_size == {}) {{\n", - tabs, extra_tab, extra_tab2, prefix, variant_size - )?; - let field_type_name = self.get_type(*variant, true); - let new_collect = format!( - "{}reinterpret_cast<{}>({})", - if self.types[variant.idx()].is_primitive() { - "*" - } else { - "" - }, - field_type_name, - collect - ); - let new_data = format!( - "{}reinterpret_cast<{}>({})", - if self.types[variant.idx()].is_primitive() { - "*" - } else { - "" - }, - field_type_name, - data - ); - self.codegen_copy( - is_write, - *variant, - &new_data, - &new_collect, - thread_quota, - false, - w, - num_tabs - + if block_restrict { 1 } else { 0 } - + if has_thread_quota { 0 } else { 1 }, - )?; - write!(w, "{}{}{}}}\n", tabs, extra_tab, extra_tab2)?; - } - if !has_thread_quota { - write!(w, "{}\t}}\n", tabs)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; - } - // Primitive types - _ => { - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - let has_thread_quota = thread_quota.is_some(); - if has_thread_quota { - write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; - extra_tab2 = "\t"; - } - write!( - w, - "{}{}{}{} = {};\n", - tabs, - extra_tab, - extra_tab2, - if is_write { collect } else { data }, - if is_write { data } else { collect } - )?; - if has_thread_quota { - write!(w, "{}{}}}\n", tabs, extra_tab)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - } - } - Ok(()) - } - // Read/writes to global collections consist of global name + pointer offset. 
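The recursive codegen_copy removed above is superseded by the cooperative-groups copies emitted in the Read and Write cases earlier in this commit. A minimal sketch of that primitive, assuming a fixed 32-thread tile and raw byte pointers (both invented here):

    #include <cooperative_groups.h>
    #include <cooperative_groups/memcpy_async.h>
    namespace cg = cooperative_groups;

    // Illustrative only: one 32-thread tile cooperatively copies `bytes` bytes.
    __device__ void copy_collection(char *dst, const char *src, size_t bytes,
                                    const cg::thread_block &block) {
        cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
        cg::memcpy_async(tile, dst, src, bytes);  // every thread in the tile participates
        cg::wait(tile);                           // block until the async copy lands
    }
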
fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { let mut index_ptr = "0".to_string(); @@ -1726,8 +1478,8 @@ namespace cg = cooperative_groups; Ok(()) } - fn get_cg_name(&self, start_or_fork: NodeID) -> String { - format!("cg_{}", self.get_value(start_or_fork, false, false)) + fn get_cg_name(&self, fork: NodeID, full: bool) -> String { + format!("cg_{}{}", self.get_value(fork, false, false), if full { "_full" } else { "" }) } fn get_fork_iter(&self, fork: NodeID, ty: bool) -> String { -- GitLab From 3814bbd82e27de0e40981ba405c20e03cf783e9e Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 18:27:00 -0600 Subject: [PATCH 015/109] sync --- hercules_cg/src/gpu.rs | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 9e70956f..6cf653f3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -216,9 +216,16 @@ struct CudaGoto { #[derive(Clone, Copy, PartialEq, Debug)] enum KernelState { - OutBlockFork, - InBlockFork, - InThreadFork, + OutBlock, + InBlock, + InThread, +} + +#[derive(Clone, Copy, PartialEq, Debug)] +enum CGType { + UsePerId, + Use, + Available, } impl GPUContext<'_> { @@ -619,7 +626,7 @@ namespace cg = cooperative_groups; // First emit data and control gen for each control node outside any fork. // Recall that this was tracked through a fake fork node with NodeID 0. - let mut state = KernelState::OutBlockFork; + let mut state = KernelState::OutBlock; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { let goto = gotos.get_mut(control).unwrap(); let pre = &mut goto.pre; @@ -633,7 +640,7 @@ namespace cg = cooperative_groups; } // Then generate data and control for the single block fork if it exists if block_fork.is_some() { - state = KernelState::InBlockFork; + state = KernelState::InBlock; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { let goto = gotos.get_mut(control).unwrap(); let pre = &mut goto.pre; @@ -648,7 +655,7 @@ namespace cg = cooperative_groups; } // Then generate for the thread fork tree by setting state to 2, traverse, // and update the thread quota. Any traversal is fine, we choose pre-order. 
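The three CGType flavors introduced here map onto nested thread tiles of increasing size: UsePerId covers the threads owned by one ThreadID, Use covers all ThreadIDs of the innermost fork, and Available adds the threads left idle by that fork. A schematic with invented numbers (available quota 8, use quota 4, parallel factor 2, so each ThreadID owns 2 consecutive CUDA threads):

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    // Schematic only; tile sizes are made up for illustration.
    __global__ void fork_tile_example() {
        cg::thread_block block = cg::this_thread_block();
        cg::thread_block_tile<2> cg_per_id    = cg::tiled_partition<2>(block); // UsePerId
        cg::thread_block_tile<4> cg_use       = cg::tiled_partition<4>(block); // Use
        cg::thread_block_tile<8> cg_available = cg::tiled_partition<8>(block); // Available
        cg_available.sync();   // what the matching Join synchronizes before moving on
        (void)cg_per_id; (void)cg_use;
    }
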
- state = KernelState::InThreadFork; + state = KernelState::InThread; for &root_fork in thread_root_forks { self.codegen_data_control_traverse( root_fork, @@ -746,14 +753,14 @@ namespace cg = cooperative_groups; let divide = multiply_dcs(&factors[dimension + 1..]); let modulo = format!("dc{}", factors[*dimension].idx()); match state { - KernelState::InBlockFork => { + KernelState::InBlock => { write!( w, "{}{} = (blockIdx.x / ({})) % {};\n", tabs, define_variable, divide, modulo )?; } - KernelState::InThreadFork => { + KernelState::InThread => { if parallel_factor.is_none() { let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; @@ -951,7 +958,7 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_name = self.get_cg_name(nested_fork, false); + let cg_name = self.get_cg_name(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, define_variable, collect_with_indices, data_size)?; write!(w, "{}wait({});\n", tabs, cg_name)?; @@ -975,7 +982,7 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_name = self.get_cg_name(nested_fork, false); + let cg_name = self.get_cg_name(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, collect_with_indices, data_variable, data_size)?; write!(w, "{}wait({});\n", tabs, cg_name)?; @@ -1044,13 +1051,16 @@ namespace cg = cooperative_groups; Node::Fork { control: _, factors: _ } => { // We don't do anything smart to mitigate control flow divergence // if use_thread_quota < warp size - let cg_name = self.get_cg_name(id, false); + let cg_name = self.get_cg_name(id, CGType::UsePerId); if use_thread_quota.is_some() { let use_thread_quota = use_thread_quota.unwrap(); let use_thread_per_id = use_thread_quota / parallel_factor.unwrap(); write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_per_id, cg_name, use_thread_per_id)?; - let cg_name_full = self.get_cg_name(id, true); - write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_quota, cg_name_full, use_thread_quota)?; + let cg_name_use = self.get_cg_name(id, CGType::Use); + write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_quota, cg_name_use, use_thread_quota)?; + let available_thread_quota = available_thread_quota.unwrap(); + let cg_name_available = self.get_cg_name(id, CGType::Available); + write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", available_thread_quota, cg_name_available, available_thread_quota)?; } write!(w_pre, "\tgoto {};\n", self.get_block_name(id, false))?; let succ = self.control_subgraph.succs(id).next().unwrap(); @@ -1080,6 +1090,8 @@ namespace cg = cooperative_groups; } } if parallel_factor.is_some() { + let cg_name_available = self.get_cg_name(id, CGType::Available); + write!(w_term, "\t{}.sync();\n", cg_name_available)?; write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; } else { let Node::Fork { factors, .. 
} = &self.function.nodes[control.idx()] else { @@ -1478,8 +1490,8 @@ namespace cg = cooperative_groups; Ok(()) } - fn get_cg_name(&self, fork: NodeID, full: bool) -> String { - format!("cg_{}{}", self.get_value(fork, false, false), if full { "_full" } else { "" }) + fn get_cg_name(&self, fork: NodeID, cg_type: CGType) -> String { + format!("cg_{}{}", self.get_value(fork, false, false), if cg_type == CGType::Use { "_use" } else if cg_type == CGType::Available { "_available" } else { "" }) } fn get_fork_iter(&self, fork: NodeID, ty: bool) -> String { -- GitLab From 982f23dda521dd8b61fc6621e1560ad3dd8a5864 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 22:13:02 -0600 Subject: [PATCH 016/109] el fin --- hercules_cg/src/gpu.rs | 349 +++++++++++++++++++++++---------------- hercules_ir/src/ir.rs | 2 +- hercules_opt/src/pass.rs | 13 ++ 3 files changed, 219 insertions(+), 145 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 6cf653f3..f1d949ba 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -16,7 +16,6 @@ pub fn gpu_codegen<W: Write>( types: &Vec<Type>, constants: &Vec<Constant>, dynamic_constants: &Vec<DynamicConstant>, - reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, bbs: &BasicBlocks, @@ -37,6 +36,9 @@ pub fn gpu_codegen<W: Write>( * be aligned for its type and for full product to be aligned to its * largest element * - similarly, summation types must be aligned to their largest element + * + * Major TODOs: + * - Matmul/Conv detection */ let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) @@ -164,7 +166,6 @@ pub fn gpu_codegen<W: Write>( types, constants, dynamic_constants, - reverse_postorder, typing, control_subgraph, bbs, @@ -190,7 +191,6 @@ struct GPUContext<'a> { types: &'a Vec<Type>, constants: &'a Vec<Constant>, dynamic_constants: &'a Vec<DynamicConstant>, - reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, bbs: &'a BasicBlocks, @@ -208,8 +208,8 @@ struct GPUContext<'a> { // by control flow (Init used only by Fork and Join) and Body populated by data flow. #[derive(Default, Debug)] struct CudaGoto { - pre: String, init: String, + post_init: String, body: String, term: String, } @@ -232,49 +232,19 @@ impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { // All possible includes followed by macros for intrinsic calls on // types with no library support - write!( - w, - " -#include <assert.h> -#include <stdio.h> -#include <stddef.h> -#include <cuda.h> -#include <cuda_runtime.h> -#include <mma.h> -#include <helper_cuda.h> -#include <cooperative_groups.h> -#include <cooperative_groups/memcpy_async.h> -#include <cooperative_groups/reduce.h> -namespace cg = cooperative_groups; - -#define uabs(a) (a) -#define umin(a, b) ((a) < (b) ? (a) : (b)) -#define umax(a, b) ((a) > (b) ? 
(a) : (b)) -#define powi(a, b) ({{ int res = 1; for(int i = 0; i < b; ++i) res *= a; res; }}) -#define roundi(a) (a) -#define isqrt(a) ((int)sqrtf((float)(a))) - -", - )?; - let mut top = String::new(); - self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; self.codegen_helpers(&mut top)?; + self.codegen_goto_start(&mut top)?; + write!(w, "{}", top)?; let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); - let start = NodeID::new(0); - let ret = (0..self.function.nodes.len()) - .filter(|idx| self.function.nodes[*idx].is_return()) - .map(NodeID::new) - .next() - .unwrap(); // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) @@ -300,12 +270,10 @@ namespace cg = cooperative_groups; &mut gotos, )?; - // Punting on implementation but can likely run einsum -> matmul/conv - // detector on hierarhical fork joins between block edge and given - // thread edge. + let mut rest = String::new(); + self.codegen_gotos(&mut gotos, &mut rest)?; + write!(w, "{}", rest)?; - // finish kernel - write!(w, "{}", top)?; write!(w, "}}\n")?; Ok(()) @@ -313,6 +281,29 @@ namespace cg = cooperative_groups; // Emit kernel signature, arguments, and dynamic shared memory declaration fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { + write!(w, " +#include <assert.h> +#include <stdio.h> +#include <stddef.h> +#include <cuda.h> +#include <cuda_runtime.h> +#include <mma.h> +#include <helper_cuda.h> +#include <cooperative_groups.h> +#include <cooperative_groups/memcpy_async.h> +#include <cooperative_groups/reduce.h> +namespace cg = cooperative_groups; + +#define uabs(a) (a) +#define umin(a, b) ((a) < (b) ? (a) : (b)) +#define umax(a, b) ((a) > (b) ? 
(a) : (b)) +#define powi(a, b) ({{ int res = 1; for(int i = 0; i < b; ++i) res *= a; res; }}) +#define roundi(a) (a) +#define isqrt(a) ((int)sqrtf((float)(a))) + +", + )?; + write!( w, "__global__ void __launch_bounds__({}) {}(", @@ -357,7 +348,6 @@ namespace cg = cooperative_groups; write!(w, ") {{\n")?; write!(w, "\textern __shared__ char dynamic_shared[];\n")?; write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; - write!(w, "}}\n")?; Ok(()) } @@ -413,8 +403,25 @@ namespace cg = cooperative_groups; fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!(w, "\tsize_t alignment;\n")?; write!(w, "\tsize_t max_variant_size;\n")?; - write!(w, "\tgrid_group grid = this_grid();\n")?; - write!(w, "\tthread_block block = this_thread_block();\n")?; + write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; + write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?; + Ok(()) + } + + fn codegen_goto_start(&self, w: &mut String) -> Result<(), Error> { + let block_start = self.get_block_name(NodeID::new(0), false); + write!(w, "goto {};\n", block_start)?; + Ok(()) + } + + fn codegen_gotos(&self, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> { + write!(w, "\n")?; + for (_, goto) in gotos.iter() { + write!(w, "{}\n", goto.init)?; + write!(w, "{}\n", goto.post_init)?; + write!(w, "{}\n", goto.body)?; + write!(w, "{}\n\n", goto.term)?; + } Ok(()) } @@ -629,13 +636,13 @@ namespace cg = cooperative_groups; let mut state = KernelState::OutBlock; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { let goto = gotos.get_mut(control).unwrap(); - let pre = &mut goto.pre; let init = &mut goto.init; + let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, None, pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, None, None, None, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, None, None, false, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists @@ -643,13 +650,13 @@ namespace cg = cooperative_groups; state = KernelState::InBlock; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { let goto = gotos.get_mut(control).unwrap(); - let pre = &mut goto.pre; let init = &mut goto.init; + let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, body, &mut tabs)?; } } } @@ -686,13 +693,27 @@ namespace cg = cooperative_groups; .get(&curr_fork) .map(|(a, u, f)| (*a, *u, Some(*f))) .unwrap_or((parent_quota, parent_quota, None)); + let reduces = &self.fork_reduce_map[&curr_fork]; + let reducts = if parallel_factor.is_some() { + reduces + .iter() + .map(|&reduce| { + let Node::Reduce { control: _, init: _, reduct} = &self.function.nodes[reduce.idx()] else { + panic!("Expected reduce node"); + }; + *reduct + }) + 
.collect() + } else { + HashSet::new() + }; for control in fork_control_map.get(&curr_fork).unwrap() { let goto = gotos.get_mut(control).unwrap(); - let pre = &mut goto.pre; let init = &mut goto.init; + let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor, pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, @@ -700,6 +721,7 @@ namespace cg = cooperative_groups; Some(use_thread_quota), parallel_factor, Some(curr_fork), + reducts.contains(data), body, &mut tabs, )?; @@ -734,6 +756,7 @@ namespace cg = cooperative_groups; use_thread_quota: Option<usize>, parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, + is_special_reduct: bool, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { @@ -863,55 +886,75 @@ namespace cg = cooperative_groups; Node::Binary { op, left, right } => { let left_val = self.get_value(*left, false, false); let right_val = self.get_value(*right, false, false); - match (op, &self.types[self.typing[left.idx()].idx()]) { - (BinaryOperator::Rem, Type::Float32) => write!( - w, - "{}{} = fmodf({}, {});\n", - tabs, define_variable, left_val, right_val, - )?, - (BinaryOperator::Rem, Type::Float64) => write!( - w, - "{}{} = fmod({}, {});\n", - tabs, define_variable, left_val, right_val, - )?, - // Doesn't need special syntax but bool type - (BinaryOperator::Or, Type::Boolean) => write!( - w, - "{}{} = {} || {};\n", - tabs, define_variable, left_val, right_val, - )?, - (BinaryOperator::And, Type::Boolean) => write!( - w, - "{}{} = {} && {};\n", - tabs, define_variable, left_val, right_val, - )?, - (op, _) => write!( - w, - "{}{} = {} {} {};\n", - tabs, - define_variable, - left_val, - match op { - BinaryOperator::Add => "+", - BinaryOperator::Sub => "-", - BinaryOperator::Mul => "*", - BinaryOperator::Div => "/", - BinaryOperator::Rem => "%", - BinaryOperator::LT => "<", - BinaryOperator::LTE => "<=", - BinaryOperator::GT => ">", - BinaryOperator::GTE => ">=", - BinaryOperator::EQ => "==", - BinaryOperator::NE => "!=", - BinaryOperator::Or => "|", - BinaryOperator::And => "&", - BinaryOperator::Xor => "^", - BinaryOperator::LSh => "<<", - BinaryOperator::RSh => ">>", - }, - right_val, - )?, - }; + let id_type = self.typing[id.idx()]; + if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And + | BinaryOperator::Xor) && is_special_reduct { + let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[left.idx()] { + right_val + } else { + left_val + }; + let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + #[allow(unreachable_patterns)] + let cg_op = match op { + BinaryOperator::Add => "plus", + BinaryOperator::Or => "bit_or", + BinaryOperator::And => "bit_and", + BinaryOperator::Xor => "bit_xor", + _ => unreachable!(), + }; + let id_type_name = self.get_type(id_type, false); + write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, cg_tile, non_reduce_arg, cg_op, id_type_name)?; + } else { + match (op, &self.types[id_type.idx()]) { + (BinaryOperator::Or, Type::Boolean) => write!( + w, + "{}{} = {} || {};\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::And, Type::Boolean) => write!( + w, + "{}{} = {} && 
{};\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::Rem, Type::Float32) => write!( + w, + "{}{} = fmodf({}, {});\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::Rem, Type::Float64) => write!( + w, + "{}{} = fmod({}, {});\n", + tabs, define_variable, left_val, right_val, + )?, + (op, _) => write!( + w, + "{}{} = {} {} {};\n", + tabs, + define_variable, + left_val, + match op { + BinaryOperator::Add => "+", + BinaryOperator::Sub => "-", + BinaryOperator::Mul => "*", + BinaryOperator::Div => "/", + BinaryOperator::Rem => "%", + BinaryOperator::LT => "<", + BinaryOperator::LTE => "<=", + BinaryOperator::GT => ">", + BinaryOperator::GTE => ">=", + BinaryOperator::EQ => "==", + BinaryOperator::NE => "!=", + BinaryOperator::Or => "|", + BinaryOperator::And => "&", + BinaryOperator::Xor => "^", + BinaryOperator::LSh => "<<", + BinaryOperator::RSh => ">>", + }, + right_val, + )?, + }; + } } Node::Ternary { op, @@ -932,16 +975,34 @@ namespace cg = cooperative_groups; } }, Node::IntrinsicCall { intrinsic, args } => { - let ty = &self.types[self.typing[args[0].idx()].idx()]; - let func_name = self.codegen_intrinsic(intrinsic, ty); - write!( - w, - "{}{} = {}({});\n", - tabs, - define_variable, - func_name, - self.get_value(args[0], false, false), - )?; + let id_type = self.typing[id.idx()]; + if matches!(intrinsic, Intrinsic::Max | Intrinsic::Min) && is_special_reduct { + let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[args[0].idx()] { + self.get_value(args[1], false, false) + } else { + self.get_value(args[0], false, false) + }; + let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + #[allow(unreachable_patterns)] + let cg_op = match intrinsic { + Intrinsic::Max => "max", + Intrinsic::Min => "min", + _ => unreachable!(), + }; + let id_type_name = self.get_type(id_type, false); + write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, non_reduce_arg, cg_tile, cg_op, id_type_name)?; + } else { + let ty = &self.types[id_type.idx()]; + let func_name = self.codegen_intrinsic(intrinsic, ty); + write!( + w, + "{}{} = {}({});\n", + tabs, + define_variable, + func_name, + self.get_value(args[0], false, false), + )?; + } } // Main difference between read and write is codegen_copy takes the // returned node's type for read and data node's type for write @@ -958,10 +1019,10 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_name = self.get_cg_name(nested_fork, CGType::UsePerId); + let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); - write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, define_variable, collect_with_indices, data_size)?; - write!(w, "{}wait({});\n", tabs, cg_name)?; + write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, define_variable, collect_with_indices, data_size)?; + write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } } Node::Write { @@ -982,10 +1043,10 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_name = self.get_cg_name(nested_fork, CGType::UsePerId); + let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); - write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, collect_with_indices, data_variable, data_size)?; - write!(w, "{}wait({});\n", tabs, cg_name)?; + write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, 
cg_tile, collect_with_indices, data_variable, data_size)?; + write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; @@ -1017,20 +1078,20 @@ namespace cg = cooperative_groups; available_thread_quota: Option<usize>, use_thread_quota: Option<usize>, parallel_factor: Option<usize>, - w_pre: &mut String, w_init: &mut String, + w_post_init: &mut String, w_term: &mut String, ) -> Result<usize, Error> { let tabs = match &self.function.nodes[id.idx()] { Node::Start => { let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; 1 } Node::Region { preds: _ } | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; 1 } Node::If { control: _, cond } => { @@ -1042,43 +1103,43 @@ namespace cg = cooperative_groups; "\tif ({}) {{\n", self.get_value(*cond, false, false) )?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, true))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, false))?; write!(w_term, "\t}} else {{\n")?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, true))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, false))?; write!(w_term, "\t}}\n")?; 1 } Node::Fork { control: _, factors: _ } => { // We don't do anything smart to mitigate control flow divergence // if use_thread_quota < warp size - let cg_name = self.get_cg_name(id, CGType::UsePerId); + let cg_tile = self.get_cg_tile(id, CGType::UsePerId); if use_thread_quota.is_some() { let use_thread_quota = use_thread_quota.unwrap(); let use_thread_per_id = use_thread_quota / parallel_factor.unwrap(); - write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_per_id, cg_name, use_thread_per_id)?; - let cg_name_use = self.get_cg_name(id, CGType::Use); - write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_quota, cg_name_use, use_thread_quota)?; + write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id)?; + let cg_tile_use = self.get_cg_tile(id, CGType::Use); + write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota)?; let available_thread_quota = available_thread_quota.unwrap(); - let cg_name_available = self.get_cg_name(id, CGType::Available); - write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", available_thread_quota, cg_name_available, available_thread_quota)?; + let cg_tile_available = self.get_cg_tile(id, CGType::Available); + write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota)?; } - write!(w_pre, "\tgoto {};\n", self.get_block_name(id, false))?; + write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { - write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, 
use_thread_quota)?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_post_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; let join = self.fork_join_map.get(&id).unwrap(); - write!(w_term, "\t\tgoto {};\n", self.get_block_name(*join, true))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*join, false))?; write!(w_term, "\t}}\n")?; 2 } else { - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; 1 } } - Node::Join { control } => { + Node::Join { control: fork } => { let succ = self.control_subgraph.succs(id).next().unwrap(); let has_thread_quota = available_thread_quota.is_some(); if has_thread_quota { @@ -1090,21 +1151,21 @@ namespace cg = cooperative_groups; } } if parallel_factor.is_some() { - let cg_name_available = self.get_cg_name(id, CGType::Available); - write!(w_term, "\t{}.sync();\n", cg_name_available)?; - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + let cg_tile_available = self.get_cg_tile(id, CGType::Available); + write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; } else { - let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { + let Node::Fork { factors, .. } = &self.function.nodes[fork.idx()] else { panic!("Expected join node to use a fork node"); }; let fork_size = multiply_dcs(factors); - let fork_iter = self.get_fork_iter(*control, false); + let fork_iter = self.get_fork_iter(*fork, false); write!(w_term, "\t{} += 1;\n", fork_iter)?; write!(w_term, "\tif ({} == {}) {{\n", fork_iter, fork_size)?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(*control, false))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*fork, true))?; write!(w_term, "\t}}\n")?; } if has_thread_quota { 2 } else { 1 } @@ -1490,7 +1551,7 @@ namespace cg = cooperative_groups; Ok(()) } - fn get_cg_name(&self, fork: NodeID, cg_type: CGType) -> String { + fn get_cg_tile(&self, fork: NodeID, cg_type: CGType) -> String { format!("cg_{}{}", self.get_value(fork, false, false), if cg_type == CGType::Use { "_use" } else if cg_type == CGType::Available { "_available" } else { "" }) } @@ -1502,8 +1563,8 @@ namespace cg = cooperative_groups; } } - fn get_block_name(&self, id: NodeID, pre: bool) -> String { - format!("bb_{}{}", self.get_value(id, false, false), if pre { "_pre" } else { "" }) + fn get_block_name(&self, id: NodeID, post: bool) -> String { + format!("bb_{}{}", self.get_value(id, false, false), if post { "_post" } else { "" }) } // Setting ty = true will return with type in declaration format. make_pointer diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 432623c5..2faf2bb6 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -335,7 +335,7 @@ pub enum Schedule { #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum Device { LLVM, - NVVM, + CUDA, // Entry functions are lowered to async Rust code that calls device // functions (leaf nodes in the call graph), possibly concurrently. 
AsyncRust, diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index 3b7c81ed..217d5996 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -935,6 +935,7 @@ impl PassManager { let mut rust_rt = String::new(); let mut llvm_ir = String::new(); + let mut cuda_ir = String::new(); for idx in 0..self.module.functions.len() { match devices[idx] { Device::LLVM => cpu_codegen( @@ -964,6 +965,18 @@ impl PassManager { &mut rust_rt, ) .unwrap(), + Device::CUDA => gpu_codegen( + &self.module.functions[idx], + &self.module.types, + &self.module.constants, + &self.module.dynamic_constants, + &typing[idx], + &control_subgraphs[idx], + &bbs[idx], + &collection_objects[&FunctionID::new(idx)], + &mut cuda_ir, + ) + .unwrap(), _ => todo!(), } } -- GitLab From 6549b02430c939b54a3d1cf74adf63c918862274 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 6 Jan 2025 09:34:15 -0600 Subject: [PATCH 017/109] cleanup comments --- hercules_cg/src/gpu.rs | 374 ++++++++++++++++++++++++----------------- 1 file changed, 223 insertions(+), 151 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index f1d949ba..10e7d9e3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -8,8 +8,8 @@ use self::hercules_ir::*; /* * The top level function to compile a Hercules IR function into CUDA - * kernel for execution on the GPU. We generate CUDA C textually, based - * on the CPU LLVM approach. + * kernel for execution on the GPU. We generate CUDA C textually, with a lot + * of similarities with the CPU LLVM generation plus custom GPU parallelization. */ pub fn gpu_codegen<W: Write>( function: &Function, @@ -24,21 +24,26 @@ pub fn gpu_codegen<W: Write>( ) -> Result<(), Error> { /* * We assert the following: - * - Fork node must have >= 1 reduce nodes + * - There is at least one Fork node + * - Fork node must have >= 1 Reduce nodes * - If the returned data type is a collection, it must have * originated from a single known parameter. Can relax to allow * one of multiple parameters. * * We don't assert but assume the following: - * - max_num_blocks is within constraint of 1D grid size. This can be - * relaxed if we want to support larger grids. - * - product types are packed with padding inserted for each element to + * - max_num_blocks in KernelParams is within constraint of 1D grid size. This + * can be relaxed if we want to support larger grids. + * - Product types are packed with padding inserted for each element to * be aligned for its type and for full product to be aligned to its * largest element - * - similarly, summation types must be aligned to their largest element + * - Summation types must be aligned to their largest element * * Major TODOs: + * - Fix dynamic shared memory allocation to reuse old shmem. The main case + * for improvement is when we have serialized forks with unused intermediate + * values from previous iterations. * - Matmul/Conv detection + * - Add float8, float16, bfloat16 dtypes if they come */ let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) @@ -46,11 +51,11 @@ pub fn gpu_codegen<W: Write>( .map(NodeID::new) .collect(); - // Fork reduce map should have all reduces contained in some key + // Fork Reduce map should have all reduces contained in some key let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); - // Reduct reduce map should have all non-parallel and non-associative reduces - // contained in some key. 
Unlike fork, reduct is not involved in any assertions, - // put it here for convenience but can move. + // Reduct Reduce map should have all non-parallel and non-associative reduces + // contained in some key. Unlike Fork, Reduct is not involved in any assertions. + // It's placed here for convenience but can be moved. let reduct_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); for reduce_node in &reduce_nodes { if let Node::Reduce { @@ -76,7 +81,7 @@ pub fn gpu_codegen<W: Write>( } } if !function.schedules[reduce_node.idx()].contains(&Schedule::ParallelReduce) - && !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) + && !function.schedules[reduce_node.idx()].contains(&Schedule::TightAssociative) { reduct_reduce_map .entry(*reduct) @@ -85,6 +90,9 @@ pub fn gpu_codegen<W: Write>( } } } + if fork_reduce_map.is_empty() { + panic!("Function must have at least one fork node"); + } for idx in 0..function.nodes.len() { if function.nodes[idx].is_fork() && fork_reduce_map @@ -95,6 +103,9 @@ pub fn gpu_codegen<W: Write>( } } + // Obtain the Return node and if it's a collection, use the collection objects + // analysis to determine the origin. Also save the return node id for later + // conversion of primitive Return into Parameter. let (return_node_id, data_node_id) = { let pos = function .nodes @@ -179,7 +190,6 @@ pub fn gpu_codegen<W: Write>( ctx.codegen_function(w) } -// Kernel parameters that are fixed prior to codegen. struct GPUKernelParams { max_num_blocks: usize, max_num_threads: usize, @@ -202,10 +212,14 @@ struct GPUContext<'a> { return_type_id: &'a TypeID, } -// Pre is its own basic block, to separate one-time vs repeated code. It is -// non-trivial only for Fork nodes to create cooperative groups. -// Init, Body, and Term compose the main basic block, with Init and Term populated -// by control flow (Init used only by Fork and Join) and Body populated by data flow. +/* + * For all control nodes besides forks, Init, Body, and Term compose the main basic + * block, with Init and Term populated by control flow (Init used only by Fork and + * Join) and Body populated by data flow. + * For serialized Fork nodes which may be jumped back to by corresponding Join node, + * init and post_init separate one-time code (currently just cooperative group + * creation) from repeated code. + */ #[derive(Default, Debug)] struct CudaGoto { init: String, @@ -214,6 +228,14 @@ struct CudaGoto { term: String, } +/* + * KernelState is used for data and control node organization and generation. + * We define a block fork as one with each ThreadID being a block, and a thread + * fork as one with each ThreadID being a subset of threads within a block. + * OutBlock is outside a potential block fork at the full grid level, InBlock + * is inside a block fork but outside any thread forks, and InThread is inside + * a thread fork. + */ #[derive(Clone, Copy, PartialEq, Debug)] enum KernelState { OutBlock, @@ -221,6 +243,12 @@ enum KernelState { InThread, } +/* + * CGType is used to track cooperative group types. UsePerId is the group of (CUDA) + * threads for a current ThreadID, Use is the union of such threads for all ThreadIDs + * in the current innermost Fork, and Available is Use plus additional threads not + * used in the current Fork. 
+ */ #[derive(Clone, Copy, PartialEq, Debug)] enum CGType { UsePerId, @@ -230,8 +258,7 @@ enum CGType { impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { - // All possible includes followed by macros for intrinsic calls on - // types with no library support + // Emit all code up to the "goto" to Start's block let mut top = String::new(); self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; @@ -240,13 +267,16 @@ impl GPUContext<'_> { self.codegen_goto_start(&mut top)?; write!(w, "{}", top)?; + // Create structures and determine block and thread parallelization strategy let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); + println!("fork join map size: {}", self.fork_join_map.len()); + println!("fork tree size: {}", fork_tree.len()); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); - // We use CUDA's goto to jump between basic blocks. + // Setup for CUDA's "goto" for control flow between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) .map(|idx| { @@ -256,6 +286,7 @@ impl GPUContext<'_> { }) .collect(); + // Core function for the CUDA code of all data and control nodes. self.codegen_data_control( if num_blocks > 1 { Some(thread_root_root_fork) @@ -270,6 +301,7 @@ impl GPUContext<'_> { &mut gotos, )?; + // Emit all code from the previous step let mut rest = String::new(); self.codegen_gotos(&mut gotos, &mut rest)?; write!(w, "{}", rest)?; @@ -279,7 +311,7 @@ impl GPUContext<'_> { Ok(()) } - // Emit kernel signature, arguments, and dynamic shared memory declaration + // Emit kernel headers, signature, arguments, and dynamic shared memory declaration fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { write!(w, " #include <assert.h> @@ -343,8 +375,9 @@ namespace cg = cooperative_groups; )?; } - // Type is char since it's simplest to use single bytes for indexing, - // casting will be needed for use with different types. + // Type is char since it's simplest to use single bytes for indexing + // and it's required for heterogeneous Product and Summation types. + // Casting is later used for conversion to different types like int. write!(w, ") {{\n")?; write!(w, "\textern __shared__ char dynamic_shared[];\n")?; write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; @@ -389,20 +422,23 @@ namespace cg = cooperative_groups; // upfront. fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if !self.function.nodes[id.idx()].is_control() { + if !self.function.nodes[id.idx()].is_control() && + !self.function.nodes[id.idx()].is_dynamic_constant() && + !self.function.nodes[id.idx()].is_parameter() { write!(w, "\t{};\n", self.get_value(id, true, false))?; } } Ok(()) } - // Emit helper registers that are used throughout the kernel- alignment - // is for proper dynamic shared memory allocation, max_variant_size is - // for variant selection during read/write copies since we don't keep - // tag (don't need and it can double summation memory usage due to alignment) + /* + * Emit helper registers that are used throughout the kernel. 
alignment + * is for proper dynamic shared memory allocation. grid and block are + * from CUDA's cooperative groups API and are used specifically for reads and + * writes. + */ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!(w, "\tsize_t alignment;\n")?; - write!(w, "\tsize_t max_variant_size;\n")?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?; Ok(()) @@ -449,8 +485,10 @@ namespace cg = cooperative_groups; } else { fork_control_map.entry(nested_fork).or_insert_with(HashSet::new).insert(control); } - for i in 0..forks.len()-1 { - fork_tree.entry(forks[i+1]).or_insert_with(HashSet::new).insert(forks[i]); + if forks.len() > 1 { + for i in 0..forks.len()-1 { + fork_tree.entry(forks[i+1]).or_insert_with(HashSet::new).insert(forks[i]); + } } (fork_tree, fork_control_map) }, @@ -459,7 +497,9 @@ namespace cg = cooperative_groups; /* * If tree has a single root fork of known size s <= max_num_blocks - * with parallel-fork schedule, then set num_blocks to s, else set num_blocks to 1. + * with parallel-fork schedule, then set num_blocks to s, else set num_blocks + * to 1. Also return the root fork(s) for parallelization strategy within + * threadblocks for threads and their eventual generation. */ fn get_root_forks_and_num_blocks( &self, @@ -493,9 +533,11 @@ namespace cg = cooperative_groups; * maximum over its descendants (leafs have base 1). We traverse up (details * in helper) and pass the factor and a map from fork node to a tuple of * (max quota of its siblings (including itself), its quota, its fork factor) + * from each node to its parents. The parent then compares * - all three are needed for codegen. A node is in the map IFF it will be parallelized. - * If not, the fork will use the parent's quota. Nodes may be removed from the - * map when traversing up the tree due to either of the max scenarios. + * If not, the fork will use the parent's quota and serialize over the Fork's + * ThreadIDs. Nodes may be removed from the map when traversing up the tree + * due to an ancestor having a larger factor that conflicts. */ fn get_thread_quotas( &self, @@ -514,8 +556,10 @@ namespace cg = cooperative_groups; is_root: bool, ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) { // Subsubtree map is the union of all keys for grandchildren and lower - // nodes, and subtree_quota is constructed map from children to their - // quota + // nodes. children_quota_map is a constructed map from parallelized children + // to their quota to update the subsubtree map at grandchildren level to + // subtreemap at children level. subtree_quota is cumulative factor of + // subtree and is then compared to this fork's factor. let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree .get(&curr_fork) .unwrap() @@ -544,18 +588,13 @@ namespace cg = cooperative_groups; if is_root { return (subtree_map, subtree_quota, true) } - /* - * A node can only be considered for parallelization if: - * a) it has statically known size - * b) the known size is less than or equal to the max_num_threads - * c) the known size is a power of 2 - * d) all reduces are parallel-reduce or associative - * - * Note: there are a few cases where we choose between - * parallelizing the fork vs its subtree, by taking max factor over subtree. - * However, parts of the subtree may have had smaller quotas and didn't - * need to be discarded. For now we avoid this complexity and discard full. 
- */ + // A node can only be considered for parallelization if: + // a) it has statically known size + // b) the known size is less than or equal to the max_num_threads + // c) the known size is a power of 2 + // d) all reduces are parallel-reduce or associative + // + // If not, just take the max cumulative factor of its subtree let reduces = &self.fork_reduce_map[&curr_fork]; if let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] && let Some(fork_size) = self.multiply_fork_factors(factors) @@ -563,46 +602,39 @@ namespace cg = cooperative_groups; && fork_size.is_power_of_two() && reduces.iter().all(|&reduce| { self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) - || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) + || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) }) { - /* - * If there's an associative reduce, - * if fork and subtree fit in warp, parallelize both - * else if fork is a multiple of warp size, parallelize the max between them - * else parallelize subtree - * Else, parallelize both - */ - if fork_size <= self.kernel_params.max_num_threads / subtree_quota { - if reduces.iter().any(|&reduce| { - self.function.schedules[reduce.idx()].contains(&Schedule::Associative) - }) { - if self.kernel_params.threads_per_warp % (fork_size * subtree_quota) == 0 { - (subtree_map, fork_size * subtree_quota, true) - } else if fork_size % self.kernel_params.threads_per_warp == 0 { - if fork_size >= subtree_quota { - (HashMap::new(), fork_size, true) - } else { - (subtree_map, subtree_quota, false) - } - } else { - (subtree_map, subtree_quota, false) - } + // If there's an associative Reduce, parallelize the larger factor + // between the Fork and subtree + // Else, all Reduces must be only parallel-reduce, so parallelize + // both if they fit and the larger if not. + // The reason for this distinction is that we only perform Reduces over + // ThreadID-based values over consecutive CUDA threads, so there's no + // opportunity for further nested parallelization. In contrast, this + // restriction doesn't help for parallel Writes, so nested parallelization + // is possible. + if reduces.iter().any(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) + }) || fork_size > self.kernel_params.max_num_threads / subtree_quota { + if fork_size >= subtree_quota { + (HashMap::new(), fork_size, true) } else { - (subtree_map, fork_size * subtree_quota, true) + (subtree_map, subtree_quota, false) } - } - // We have to choose either the fork or its subtree - else if fork_size >= subtree_quota { - (HashMap::new(), fork_size, true) } else { - (subtree_map, subtree_quota, false) + (subtree_map, fork_size * subtree_quota, true) } } else { (subtree_map, subtree_quota, false) } } + /* + * If there's a block fork, then thread root forks are it's child forks. If + * not, thread root forks are the root forks. This will be used to begin the + * thread fork tree traversal for codegen. + */ fn get_thread_root_forks( &self, root_forks: &HashSet<NodeID>, @@ -617,6 +649,9 @@ namespace cg = cooperative_groups; } } + /* + * Codegen for all control and data nodes. 
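+     * Generation proceeds in three KernelState phases: OutBlock for control
+     * nodes outside any fork (tracked through the fake fork with NodeID 0),
+     * InBlock for the single block-level fork if one exists, and InThread for
+     * the thread-level fork tree.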
+ */ fn codegen_data_control( &self, block_fork: Option<NodeID>, @@ -627,10 +662,6 @@ namespace cg = cooperative_groups; num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { - // Define the following states: - // 0 is above block fork, 1 is in block fork above any thread fork, 2 is - // in any thread fork, 3 is below block fork - // First emit data and control gen for each control node outside any fork. // Recall that this was tracked through a fake fork node with NodeID 0. let mut state = KernelState::OutBlock; @@ -660,8 +691,7 @@ namespace cg = cooperative_groups; } } } - // Then generate for the thread fork tree by setting state to 2, traverse, - // and update the thread quota. Any traversal is fine, we choose pre-order. + // Then generate for the thread fork tree through Fork node traversal. state = KernelState::InThread; for &root_fork in thread_root_forks { self.codegen_data_control_traverse( @@ -678,6 +708,12 @@ namespace cg = cooperative_groups; Ok(()) } + /* + * The important feature of this traversal is that we update the available + * thread quota, use thread quota, and parallel factor for each Fork node. + * Either this information is in the precomputed map, or we use the parent's + * quota with no parallel factor. + */ fn codegen_data_control_traverse( &self, curr_fork: NodeID, @@ -742,13 +778,6 @@ namespace cg = cooperative_groups; Ok(()) } - // state dictates where we are in the kernel, and affects ThreadID and Write - // use_thread_quota is the number of threads used by the node, and affects - // ThreadID, Read, Write, and associative Binops - // parallel_factor is parallelization degree, and affects ThreadID and associative - // Binops - // nesting_fork is the fork node that the node is nested in, and affects ThreadID - // and Reduce fn codegen_data_node( &self, id: NodeID, @@ -763,7 +792,7 @@ namespace cg = cooperative_groups; let define_variable = self.get_value(id, false, false).to_string(); let tabs = "\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { - // Phi registers were already emitted and the data nodes it uses will + // Phi registers emitted at top and the data nodes it uses will // update the phi Node::Phi { control: _, @@ -785,6 +814,8 @@ namespace cg = cooperative_groups; } KernelState::InThread => { if parallel_factor.is_none() { + // No dependence on threadIdx.x because each used thread + // will run this Fork serially let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; } else { @@ -798,9 +829,10 @@ namespace cg = cooperative_groups; } } } - // Only initialize the reduce, as reduct will update the reduce. If - // serialized, add gate to prevent re-assignment when we hit this reduce - // again + // The Reduce node only generates it's initialization, as reduct will + // perform the update. If serialized, add gate to prevent re-assignment + // when we hit this reduce again due to the control flow loop between + // the Fork and Join. Node::Reduce { control: _, init, @@ -821,18 +853,27 @@ namespace cg = cooperative_groups; } // Parameters emitted at top Node::Parameter { index: _ } => {} + // If the constant is primitive, it's stored in register so we repeat + // for all threads. Otherwise, it's stored in shared memory so we only + // want to "allocate" and initialize it once. 
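+            // As a rough sketch (names are illustrative, not the exact emitted
+            // identifiers), the non-primitive case wraps the shared-memory
+            // allocation and initialization in
+            //   if (cg_tile.thread_rank() == 0) { ... }
+            // while the primitive case is a plain per-thread register assignment.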
Node::Constant { id: cons_id } => { + let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); + if (!is_primitive) { + let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; + *num_tabs += 1; + } self.codegen_constant( - if self.types[self.typing[id.idx()].idx()].is_primitive() { - define_variable - } else { - format!("*{}", define_variable) - }, + define_variable, *cons_id, true, w, *num_tabs, )?; + if (!is_primitive) { + write!(w, "{}}}\n", tabs)?; + *num_tabs -= 1; + } } // Dynamic constants emitted at top Node::DynamicConstant { id: _ } => {} @@ -889,12 +930,17 @@ namespace cg = cooperative_groups; let id_type = self.typing[id.idx()]; if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And | BinaryOperator::Xor) && is_special_reduct { + // For parallelized associative Reduces, use the cooperative + // groups reduce API. Associative multiplication is not + // supported. We need to use CGType::Use not CGType::UsePerId + // because for parallelized reduction we only have one thread + // per ThreadID and the reduction is over Use, not UsePerId. let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[left.idx()] { right_val } else { left_val }; - let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + let cg_tile = self.get_cg_tile(id, CGType::Use); #[allow(unreachable_patterns)] let cg_op = match op { BinaryOperator::Add => "plus", @@ -977,12 +1023,13 @@ namespace cg = cooperative_groups; Node::IntrinsicCall { intrinsic, args } => { let id_type = self.typing[id.idx()]; if matches!(intrinsic, Intrinsic::Max | Intrinsic::Min) && is_special_reduct { + // Similar to associative Binops let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[args[0].idx()] { self.get_value(args[1], false, false) } else { self.get_value(args[0], false, false) }; - let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + let cg_tile = self.get_cg_tile(id, CGType::Use); #[allow(unreachable_patterns)] let cg_op = match intrinsic { Intrinsic::Max => "max", @@ -1004,8 +1051,15 @@ namespace cg = cooperative_groups; )?; } } - // Main difference between read and write is codegen_copy takes the - // returned node's type for read and data node's type for write + // For read, all the cases are: + // 1. Reading collection from/to global to/from shared + // 2. Reading primitive from/to global to/from shared + // 3. Reading primitive from/to global to/from register + // 4. Reading primitive from/to shared to/from register + // The first three can all use cooperative groups memcpy and the last + // one can't. However, the C++/CUDA semantics for the last three are + // identical, so we differentiate the cases by data type instead of + // data source and destination. Node::Read { collect, indices } => { let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); @@ -1025,6 +1079,9 @@ namespace cg = cooperative_groups; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } } + // For write, the cases are the same, but since we're using C++/CUDA + // not-thread-safe write semantics, we need to gate the write with + // a thread rank check. 
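+            // For illustration (the type, offset, and names are hypothetical), a
+            // primitive write is emitted roughly as
+            //   if (cg_tile.thread_rank() == 0) {
+            //       *reinterpret_cast<float*>(collect + offset) = data;
+            //   }
+            // while a collection write becomes a cg::memcpy_async(...) followed
+            // by cg::wait(...) on the same tile.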
Node::Write { collect, data, @@ -1034,16 +1091,18 @@ namespace cg = cooperative_groups; let collect_with_indices = self.codegen_collect(*collect, indices, is_char); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; + let nested_fork = nesting_fork.unwrap(); + let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); if self.types[data_type_id.idx()].is_primitive() { + write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; if is_char { let type_name = self.get_type(data_type_id, true); - write!(w, "{}*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; + write!(w, "{}\t*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; } else { - write!(w, "{}*{} = {};\n", tabs, collect_with_indices, data_variable)?; + write!(w, "{}\t*{} = {};\n", tabs, collect_with_indices, data_variable)?; } + write!(w, "{}}}\n", tabs)?; } else { - let nested_fork = nesting_fork.unwrap(); - let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, collect_with_indices, data_variable, data_size)?; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; @@ -1055,6 +1114,8 @@ namespace cg = cooperative_groups; panic!("Unsupported data node type") } } + // Since the data uses and reducts are responsible for updating Phi and + // Reduce nodes, respectively, we check and emit those for each data node. if let Some(phis) = self.label_data_for_phi.get(&id) { let val = self.get_value(id, false, false); for phi in phis { @@ -1083,12 +1144,8 @@ namespace cg = cooperative_groups; w_term: &mut String, ) -> Result<usize, Error> { let tabs = match &self.function.nodes[id.idx()] { - Node::Start => { - let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; - 1 - } - Node::Region { preds: _ } + Node::Start + | Node::Region { preds: _ } | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; @@ -1110,8 +1167,14 @@ namespace cg = cooperative_groups; 1 } Node::Fork { control: _, factors: _ } => { - // We don't do anything smart to mitigate control flow divergence - // if use_thread_quota < warp size + // We create a cooperative group tile for each of: used threads per + // thread ID- for reads and writes-, used threads across all thread + // IDs- for parallelized reductions-, and available threads- to + // synchronize between used and unused threads. We want to create + // these only once, so we create two goto sections for each fork- + // one run only once for creating groups, and other may be ran + // multiple times if the Fork is serialized and Join jumps back + // to it. let cg_tile = self.get_cg_tile(id, CGType::UsePerId); if use_thread_quota.is_some() { let use_thread_quota = use_thread_quota.unwrap(); @@ -1124,6 +1187,9 @@ namespace cg = cooperative_groups; write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota)?; } write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; + // Fork nodes gate the used vs unused threads out of all available + // threads. If unused, we jump straight to the Join, and if used, + // we jump to successor like normal. 
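+                // For example, with 64 available and 32 used threads (numbers are
+                // illustrative), the emitted gate looks roughly like
+                //   if (threadIdx.x % 64 < 32) { goto bb_succ; } else { goto bb_join; }
+                // so unused threads skip straight to the Join.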
let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { write!(w_post_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; @@ -1140,6 +1206,8 @@ namespace cg = cooperative_groups; } } Node::Join { control: fork } => { + // Join nodes also gate the used vs unused threads with a tile + // sync after the body. let succ = self.control_subgraph.succs(id).next().unwrap(); let has_thread_quota = available_thread_quota.is_some(); if has_thread_quota { @@ -1150,9 +1218,14 @@ namespace cg = cooperative_groups; write!(w_term, "\t}}\n")?; } } + let cg_tile_available = self.get_cg_tile(id, CGType::Available); + write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + // If the Fork was parallelized, each thread or UsedPerId tile of + // threads only runs one ThreadID, so we can jump straight to the + // successor. Else, we jump back to the Fork until all ThreadIDs + // or equivalently the Fork's full factor number of iterations have + // been completed. if parallel_factor.is_some() { - let cg_tile_available = self.get_cg_tile(id, CGType::Available); - write!(w_term, "\t{}.sync();\n", cg_tile_available)?; write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; } else { let Node::Fork { factors, .. } = &self.function.nodes[fork.idx()] else { @@ -1171,6 +1244,8 @@ namespace cg = cooperative_groups; if has_thread_quota { 2 } else { 1 } } Node::Return { control: _, data } => { + // Since we lift originally primitive returns into a parameter, + // we write to that parameter upon return. if self.types[self.typing[data.idx()].idx()].is_primitive() { let return_val = self.get_value(*data, false, false); write!(w_term, "\tif (threadIdx.x == 0) {{\n")?; @@ -1187,7 +1262,13 @@ namespace cg = cooperative_groups; Ok(tabs) } - // Read/writes to global collections consist of global name + pointer offset. + /* + * This function emits collection name + pointer math for the provided indices. + * One nuance is whether the collection is represented as char pointer or + * the original primitive pointer. For Field, it's always char, for Variant, + * it doesn't matter here, and for Array, it depends- so we may need to tack + * on the element size to the index math. + */ fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { let mut index_ptr = "0".to_string(); let type_id = self.typing[collect.idx()]; @@ -1206,15 +1287,7 @@ namespace cg = cooperative_groups; else { panic!("Expected array type") }; - let mut cumulative_offset = "1 * ".to_string() - + extents - .iter() - .enumerate() - .filter(|(i, _)| *i >= array_indices.len()) - .map(|(_, id)| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join(" * ") - .as_str(); + let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); for index in array_indices.iter().rev() { cumulative_offset = format!( "{} * ({} + ", @@ -1238,10 +1311,17 @@ namespace cg = cooperative_groups; format!("{} + {}", name, index_ptr) } - // Standalone function allows us to handle recursive initialization for - // product and summation collections. `allow_allocate` prevents unnecessary - // shared memory allocations for nested product and summation collections. - // Since not initialized, array collections don't need to be recursed into. + /* + * The outlined codegen for constants allows us to handle recursive initialization + * for collections. 
We perform "allocation" by atomically incrementing dynamic + * shared memory and CUDA's support for dynamic is limited to a single extern + * array. Dynamic is required here because not all dynamic constants and therefore + * array sizes are known. This approach will need further work, as currently + * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate` + * prevents unnecessary shared memory allocations for nested product and summation + * collections, since the outermost allocates everything for the full collection. + * Since not initialized, array collections don't need to be recursed into. + */ fn codegen_constant( &self, name: String, @@ -1335,10 +1415,12 @@ namespace cg = cooperative_groups; Ok(()) } - // Emit code to calculate data size. For Product types, setting `field_number` - // gives data size up to but not including that field, so = 2 gives 1st field - // and offset to 2nd field. This is useful for generating constant initialization - // and read/write index math. + /* + * Emit code to calculate data size. For Product types, setting `num_fields` + * gives data size up to but not including that field, so = 2 gives 1st field + * and offset to 2nd field. This is useful for constant initialization and read/write + * index math. + */ fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { @@ -1541,16 +1623,6 @@ namespace cg = cooperative_groups; } } - // matmul detection- only called if einsum detected - fn matmul_detection(&self) -> Result<(), Error> { - Ok(()) - } - - // convolution detection- only called if einsum detected - fn convolution_detection(&self) -> Result<(), Error> { - Ok(()) - } - fn get_cg_tile(&self, fork: NodeID, cg_type: CGType) -> String { format!("cg_{}{}", self.get_value(fork, false, false), if cg_type == CGType::Use { "_use" } else if cg_type == CGType::Available { "_available" } else { "" }) } @@ -1567,8 +1639,11 @@ namespace cg = cooperative_groups; format!("bb_{}{}", self.get_value(id, false, false), if post { "_post" } else { "" }) } - // Setting ty = true will return with type in declaration format. make_pointer - // is only considered if ty = true and only relevant for primitive types. + /* + * Setting `ty = true` will return with type in declaration format. `make_pointer` + * is only considered if `ty = true` and only relevant for primitive types- + * otherwise it makes no difference because collections are already pointers. 
+ */ fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { if ty { @@ -1595,11 +1670,9 @@ namespace cg = cooperative_groups; } } - // Setting make_pointer = true will only affect primitive types- the - // collections are already pointers fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { - // Product and summation collections are char* for byte-addressability + // Product and summation collections are char* for 1 byte-addressability // since we can have variable type fields Type::Product(_) | Type::Summation(_) => "char*".to_string(), Type::Array(element_type, _) => self.get_type(*element_type, true), @@ -1623,7 +1696,6 @@ fn multiply_dcs(dcs: &[DynamicConstantID]) -> String { } } -// TODO: Add float8, float16, bfloat16 dtypes if they come fn convert_type(ty: &Type, make_pointer: bool) -> String { let mut result = match ty { Type::Boolean => "bool".to_string(), -- GitLab From bf8359ac3e340253f1ddfed823905d134e1d1d48 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 6 Jan 2025 09:57:39 -0600 Subject: [PATCH 018/109] comms --- hercules_cg/src/gpu.rs | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 10e7d9e3..38d4a9bb 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -38,10 +38,37 @@ pub fn gpu_codegen<W: Write>( * largest element * - Summation types must be aligned to their largest element * - * Major TODOs: + * Notes on GPU parallelization strategy and tips for IR transformations: + * - The top level block fork and any lower thread forks require a known Fork + * size. Thus for an otherwise parallelizable Fork with unknown size, + * consider splitting it into two Forks with one of known size. For block + * level, the known fork has to be the (only) top-most fork. + * - The thread-level strategy is determined by starting at the most nested + * Forks and working outwards in a greedy manner, with caps by GPU spec. + * Thus, to ensure some outer Fork is parallelized, ensure the inner + * parallelizable Forks aren't too large or consider removing schedule + * annotations. + * - Tight-Associative reductions can only be efficiently implemented if + * different Hercules ThreadIDs correspond to consecutive CUDA threads. But + * this prevents nested parallelization since each parallel group must always + * be a contiguous tile of threads. We use a heuristic of choosing the larger + * factor when this results in a conflict between a Fork and it's subtree, + * but this choice may not be optimal. + * - A given Fork (not talking about its children) can only be parallelized + * if all its Reduces are Parallel-Reduce or Tight-Associative. So if the + * Fork contains expensive parallelizable operations, ensure all reductions + * are parallelizable or if not try pulling those out into a different Fork. + * - We do nothing to mitigate intra-warp divergence. To mitigate this, the + * IR, for example, should ensure the innermost parallelizable Forks either + * have factor >= warp size (32) or remove Fork/Reduce node schedule + * annotations. + * + * Main TODOs: * - Fix dynamic shared memory allocation to reuse old shmem. The main case - * for improvement is when we have serialized forks with unused intermediate - * values from previous iterations. 
+ * for improvement is when we have serialized forks with unused intermediate + * values from previous iterations. + * - Add mapping from Region node to Fork node if there's a reduce whose control + * is a Region not Join. * - Matmul/Conv detection * - Add float8, float16, bfloat16 dtypes if they come */ -- GitLab From d7d36313bf398df8241995703d1cb6cfec7310ae Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 6 Jan 2025 11:16:51 -0600 Subject: [PATCH 019/109] gpu juno --- juno_samples/matmul/src/gpu_matmul.jn | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 juno_samples/matmul/src/gpu_matmul.jn diff --git a/juno_samples/matmul/src/gpu_matmul.jn b/juno_samples/matmul/src/gpu_matmul.jn new file mode 100644 index 00000000..e719ba9b --- /dev/null +++ b/juno_samples/matmul/src/gpu_matmul.jn @@ -0,0 +1,45 @@ +#[entry] +fn tiled_64_matmul_with_n_1024<m : usize, l : usize>(a : i32[1024, m], b : i32[m, l]) -> i32 { + let res = 0; + + for bi = 0 to 16 { + for bk = 0 to l / 64 { + // TODO: make these all the same size, clone analysis should undo GVN's + // combining of these three arrays. + let atile : i32[66, 64]; + let btile : i32[65, 64]; + let ctile : i32[64, 64]; + + for tile_idx = 0 to m / 64 { + for ti = 0 to 64 { + for tk = 0 to 64 { + atile[ti, tk] = a[bi * 64 + ti, tile_idx * 64 + tk]; + btile[ti, tk] = b[tile_idx * 64 + ti, bk * 64 + tk]; + // TODO: remove setting ctile to zero explicitly, clone analysis + // should see a lack of a phi for ctile in the block loops and + // induce a copy of an initial value of ctile (all zeros) on each + // iteration of the block loops. + ctile[ti, tk] = 0; + } + } + for ti = 0 to 64 { + for tk = 0 to 64 { + let c_acc = ctile[ti, tk]; + for inner_idx = 0 to 64 { + c_acc += atile[ti, inner_idx] * btile[inner_idx, tk]; + } + ctile[ti, tk] = c_acc; + } + } + } + + for ti = 0 to 64 { + for tk = 0 to 64 { + res += ctile[ti, tk]; + } + } + } + } + + return res; +} -- GitLab From 6e2a4cd970923775a3043e2725ba8bf3eae01fd4 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 6 Jan 2025 12:30:03 -0600 Subject: [PATCH 020/109] minor --- hercules_cg/src/gpu.rs | 50 ++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 38d4a9bb..25443be5 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -459,13 +459,11 @@ namespace cg = cooperative_groups; } /* - * Emit helper registers that are used throughout the kernel. alignment - * is for proper dynamic shared memory allocation. grid and block are - * from CUDA's cooperative groups API and are used specifically for reads and - * writes. + * Emit helper registers that are used throughout the kernel. grid and block + * are from CUDA's cooperative groups API and are used specifically for reads + * and writes. 
*/ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { - write!(w, "\tsize_t alignment;\n")?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?; Ok(()) @@ -479,9 +477,15 @@ namespace cg = cooperative_groups; fn codegen_gotos(&self, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> { write!(w, "\n")?; - for (_, goto) in gotos.iter() { + for (id, goto) in gotos.iter() { + let goto_block = self.get_block_name(*id, false); + write!(w, "{}:\n", goto_block)?; write!(w, "{}\n", goto.init)?; - write!(w, "{}\n", goto.post_init)?; + if !goto.post_init.is_empty() { + let goto_block = self.get_block_name(*id, true); + write!(w, "{}:\n", goto_block)?; + write!(w, "{}\n", goto.post_init)?; + } write!(w, "{}\n", goto.body)?; write!(w, "{}\n\n", goto.term)?; } @@ -886,7 +890,11 @@ namespace cg = cooperative_groups; Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); if (!is_primitive) { - let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + let cg_tile = match state { + KernelState::OutBlock + | KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(id, CGType::UsePerId), + }; write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; *num_tabs += 1; } @@ -967,6 +975,8 @@ namespace cg = cooperative_groups; } else { left_val }; + // Special reduct is only enabled for thread parallelization + // so don't need to worry about grid and block cases let cg_tile = self.get_cg_tile(id, CGType::Use); #[allow(unreachable_patterns)] let cg_op = match op { @@ -1086,7 +1096,7 @@ namespace cg = cooperative_groups; // The first three can all use cooperative groups memcpy and the last // one can't. However, the C++/CUDA semantics for the last three are // identical, so we differentiate the cases by data type instead of - // data source and destination. + // data src/dest, with only collection type using collective group. Node::Read { collect, indices } => { let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); @@ -1100,15 +1110,19 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); + let cg_tile = match state { + KernelState::OutBlock => "grid".to_string(), + KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(nested_fork, CGType::UsePerId), + }; let data_size = self.get_size(data_type_id, None); write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, define_variable, collect_with_indices, data_size)?; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } } - // For write, the cases are the same, but since we're using C++/CUDA - // not-thread-safe write semantics, we need to gate the write with - // a thread rank check. + // For write, the cases are the same, but when using C++ dereference + // semantics, we need to gate the write with a thread rank check for + // thread safety. 
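+            // Which cooperative group performs the guard and the memcpy depends on
+            // the current state: the whole grid outside any fork, the thread block
+            // inside the block-level fork, and the per-ThreadID tile inside thread
+            // forks.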
Node::Write { collect, data, @@ -1119,7 +1133,11 @@ namespace cg = cooperative_groups; let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; let nested_fork = nesting_fork.unwrap(); - let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); + let cg_tile = match state { + KernelState::OutBlock => "grid".to_string(), + KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(nested_fork, CGType::UsePerId), + }; if self.types[data_type_id.idx()].is_primitive() { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; if is_char { @@ -1244,9 +1262,9 @@ namespace cg = cooperative_groups; write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; write!(w_term, "\t}}\n")?; } + let cg_tile_available = self.get_cg_tile(id, CGType::Available); + write!(w_term, "\t{}.sync();\n", cg_tile_available)?; } - let cg_tile_available = self.get_cg_tile(id, CGType::Available); - write!(w_term, "\t{}.sync();\n", cg_tile_available)?; // If the Fork was parallelized, each thread or UsedPerId tile of // threads only runs one ThreadID, so we can jump straight to the // successor. Else, we jump back to the Fork until all ThreadIDs -- GitLab From 8027b180627f3412710bc2f815d77fd10d8b01ae Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 10 Jan 2025 10:21:09 -0600 Subject: [PATCH 021/109] w host --- hercules_cg/src/device.rs | 2 + hercules_cg/src/gpu.rs | 325 +++++++++++++----- hercules_opt/src/pass.rs | 1 + hercules_samples/matmul/src/matmul.hir | 8 +- juno_samples/test2.jn | 25 ++ .../__pycache__/mobilenet.cpython-310.pyc | Bin 0 -> 582 bytes .../__pycache__/torch_export.cpython-310.pyc | Bin 0 -> 3266 bytes 7 files changed, 264 insertions(+), 97 deletions(-) create mode 100644 juno_samples/test2.jn create mode 100644 torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc create mode 100644 torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc diff --git a/hercules_cg/src/device.rs b/hercules_cg/src/device.rs index 7dbeeeda..50b7dda4 100644 --- a/hercules_cg/src/device.rs +++ b/hercules_cg/src/device.rs @@ -11,6 +11,8 @@ pub fn device_placement(functions: &Vec<Function>, callgraph: &CallGraph) -> Vec let mut devices = vec![]; for (idx, function) in functions.into_iter().enumerate() { + devices.push(Device::CUDA); + continue; if let Some(device) = function.device { devices.push(device); } else if function.entry || callgraph.num_callees(FunctionID::new(idx)) != 0 { diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 25443be5..a153b7ef 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -78,6 +78,12 @@ pub fn gpu_codegen<W: Write>( .map(NodeID::new) .collect(); + + let fork_join_map = &fork_join_map(function, control_subgraph); + let join_fork_map: &HashMap<NodeID, NodeID> = &fork_join_map + .into_iter() + .map(|(fork, join)| (*join, *fork)) + .collect(); // Fork Reduce map should have all reduces contained in some key let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); // Reduct Reduce map should have all non-parallel and non-associative reduces @@ -92,9 +98,8 @@ pub fn gpu_codegen<W: Write>( } = &function.nodes[reduce_node.idx()] { match function.nodes[control.idx()] { - Node::Join { - control: fork_node, .. 
- } => { + Node::Join {..} => { + let fork_node = join_fork_map[control]; fork_reduce_map .entry(fork_node) .or_default() @@ -123,8 +128,7 @@ pub fn gpu_codegen<W: Write>( for idx in 0..function.nodes.len() { if function.nodes[idx].is_fork() && fork_reduce_map - .get(&NodeID::new(idx)) - .map_or(true, |reduces| reduces.is_empty()) + .get(&NodeID::new(idx)).is_none_or(|reduces| reduces.is_empty()) { panic!("Fork node {} has no reduce nodes", idx); } @@ -197,7 +201,7 @@ pub fn gpu_codegen<W: Write>( }; let label_data_for_phi = &label_data_for_phi(); - let fork_join_map = &fork_join_map(function, control_subgraph); + let def_use_map = &def_use(function); let ctx = GPUContext { function, @@ -208,10 +212,12 @@ pub fn gpu_codegen<W: Write>( control_subgraph, bbs, kernel_params, + def_use_map, + fork_join_map, + join_fork_map, fork_reduce_map, reduct_reduce_map, label_data_for_phi, - fork_join_map, return_type_id, }; ctx.codegen_function(w) @@ -232,10 +238,12 @@ struct GPUContext<'a> { control_subgraph: &'a Subgraph, bbs: &'a BasicBlocks, kernel_params: &'a GPUKernelParams, + def_use_map: &'a ImmutableDefUseMap, + fork_join_map: &'a HashMap<NodeID, NodeID>, + join_fork_map: &'a HashMap<NodeID, NodeID>, fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>, - fork_join_map: &'a HashMap<NodeID, NodeID>, return_type_id: &'a TypeID, } @@ -296,12 +304,12 @@ impl GPUContext<'_> { // Create structures and determine block and thread parallelization strategy let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); - println!("fork join map size: {}", self.fork_join_map.len()); - println!("fork tree size: {}", fork_tree.len()); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); + println!("num_blocks: {}", num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); + let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); // Setup for CUDA's "goto" for control flow between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) @@ -324,17 +332,22 @@ impl GPUContext<'_> { &fork_tree, &fork_control_map, &fork_thread_quota_map, + &extra_dim_collects, num_threads, &mut gotos, )?; - // Emit all code from the previous step - let mut rest = String::new(); - self.codegen_gotos(&mut gotos, &mut rest)?; - write!(w, "{}", rest)?; - + // Emit all GPU kernel code from previous steps + let mut kernel_body = String::new(); + self.codegen_gotos(&mut gotos, &mut kernel_body)?; + write!(w, "{}", kernel_body)?; write!(w, "}}\n")?; + // Emit host launch code + let mut host_launch = String::new(); + self.codegen_launch_code(num_blocks, num_threads, &mut host_launch)?; + write!(w, "{}", host_launch)?; + Ok(()) } @@ -347,7 +360,6 @@ impl GPUContext<'_> { #include <cuda.h> #include <cuda_runtime.h> #include <mma.h> -#include <helper_cuda.h> #include <cooperative_groups.h> #include <cooperative_groups/memcpy_async.h> #include <cooperative_groups/reduce.h> @@ -407,6 +419,8 @@ namespace cg = cooperative_groups; // Casting is later used for conversion to different types like int. 
write!(w, ") {{\n")?; write!(w, "\textern __shared__ char dynamic_shared[];\n")?; + // This will only get used by thread rank 0 in each block, since it + // does all shared memory "allocation" write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; Ok(()) @@ -471,7 +485,7 @@ namespace cg = cooperative_groups; fn codegen_goto_start(&self, w: &mut String) -> Result<(), Error> { let block_start = self.get_block_name(NodeID::new(0), false); - write!(w, "goto {};\n", block_start)?; + write!(w, "\tgoto {};\n", block_start)?; Ok(()) } @@ -480,15 +494,59 @@ namespace cg = cooperative_groups; for (id, goto) in gotos.iter() { let goto_block = self.get_block_name(*id, false); write!(w, "{}:\n", goto_block)?; - write!(w, "{}\n", goto.init)?; + write!(w, "{}", goto.init)?; if !goto.post_init.is_empty() { let goto_block = self.get_block_name(*id, true); write!(w, "{}:\n", goto_block)?; - write!(w, "{}\n", goto.post_init)?; + write!(w, "{}", goto.post_init)?; + } + write!(w, "{}", goto.body)?; + write!(w, "{}\n", goto.term)?; + } + Ok(()) + } + + fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, w: &mut String) -> Result<(), Error> { + write!(w, " +int main(")?; + // The following steps are for host-side C function arguments, but we also + // need to pass arguments to kernel, so we keep track of the arguments here. + let mut pass_args = String::new(); + // The first set of parameters are dynamic constants. + let mut first_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; + } + write!(w, "unsigned long long dc_p{}", idx)?; + write!(pass_args, "dc_p{}", idx)?; + } + // The second set of parameters are normal arguments. + for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; } - write!(w, "{}\n", goto.body)?; - write!(w, "{}\n\n", goto.term)?; + let param_type = self.get_type(*ty, false); + write!(w, "{} p{}", param_type, idx)?; + write!(pass_args, "p{}", idx)?; + } + // Pull primitive return to a pointer parameter + if self.types[self.return_type_id.idx()].is_primitive() { + write!(w, ", ")?; + write!(pass_args, ", ")?; + let ret_type = self.get_type(*self.return_type_id, true); + write!(w, "{} ret", ret_type)?; + write!(pass_args, "ret")?; } + write!(w, ") {{ + {}<<<{}, {}>>>({}); +}}", self.function.name, num_blocks, num_threads, pass_args); Ok(()) } @@ -510,17 +568,19 @@ namespace cg = cooperative_groups; fork_nesting.into_iter().fold( (HashMap::new(), HashMap::new()), |(mut fork_tree, mut fork_control_map), (control, forks)| { - let nested_fork = forks.first().copied().unwrap_or(NodeID::new(0)); if self.function.nodes[control.idx()].is_fork() { - fork_tree.entry(nested_fork).or_insert_with(HashSet::new).insert(control); - } else { - fork_control_map.entry(nested_fork).or_insert_with(HashSet::new).insert(control); - } - if forks.len() > 1 { - for i in 0..forks.len()-1 { - fork_tree.entry(forks[i+1]).or_insert_with(HashSet::new).insert(forks[i]); - } + // If control node is fork make sure it's in the fork_tree even + // if has no nested forks. + fork_tree.entry(control).or_insert_with(HashSet::new); + // Then get it's nesting fork- index = 1 to not count itself! 
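+                    // E.g. for a fork F1 containing a nested fork F2, fork_tree
+                    // gains NodeID(0) -> {F1} and F1 -> {F2}, while fork_control_map
+                    // groups each control node under its innermost fork (a fork
+                    // counts under itself, and nodes outside any fork go under the
+                    // fake NodeID 0).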
+ let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0)); + fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(control); + println!("fork_tree parent: {}, child: {}", nesting_fork.idx(), control.idx()); } + // Here the desired fork is always the first fork + let fork = forks.first().copied().unwrap_or(NodeID::new(0)); + fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(control); + println!("fork_control_map parent: {}, child: {}", fork.idx(), control.idx()); (fork_tree, fork_control_map) }, ) @@ -557,6 +617,25 @@ namespace cg = cooperative_groups; } } + /* + * If there's a block fork, then thread root forks are it's child forks. If + * not, thread root forks are the root forks. This will be used to begin the + * thread fork tree traversal for codegen. + */ + fn get_thread_root_forks( + &self, + root_forks: &HashSet<NodeID>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + num_blocks: usize, + ) -> (NodeID, HashSet<NodeID>) { + if num_blocks > 1 { + let root_fork = root_forks.iter().next().unwrap(); + (*root_fork, fork_tree.get(&root_fork).unwrap().iter().copied().collect()) + } else { + (NodeID::new(0), root_forks.clone()) + } + } + /* * This analysis determines the parallelization strategy within threadblocks. * We run post-order traversal on the fork tree to get the thread quota per @@ -661,23 +740,43 @@ namespace cg = cooperative_groups; } } - /* - * If there's a block fork, then thread root forks are it's child forks. If - * not, thread root forks are the root forks. This will be used to begin the - * thread fork tree traversal for codegen. + /* + * All non reduced-over collections used in fork joins have an extra dimension. + * However, this is only useful if ThreadIDs run in parallel not serially, + * otherwise it's unnecessarily consuming shared memory. This function returns + * the set of collections that have an unnecessary extra dimension. 
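+     * "Unnecessary" here means the fork using the collection was not parallelized
+     * (it has no entry in fork_thread_quota_map), so the per-ThreadID dimension
+     * would only ever be accessed serially.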
*/ - fn get_thread_root_forks( + fn get_extra_dim_collects( &self, - root_forks: &HashSet<NodeID>, - fork_tree: &HashMap<NodeID, HashSet<NodeID>>, - num_blocks: usize, - ) -> (NodeID, HashSet<NodeID>) { - if num_blocks > 1 { - (NodeID::new(0), root_forks.clone()) - } else { - let root_fork = root_forks.iter().next().unwrap(); - (*root_fork, fork_tree.get(&root_fork).unwrap().iter().copied().collect()) - } + fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, + fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, + ) -> HashSet<TypeID> { + // Get all constant collection creations + let collect_consts: HashSet<NodeID> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_constant() && !self.types[self.typing[*idx].idx()].is_primitive()) + .map(|idx| NodeID::new(idx)) + .collect(); + // Reverse fork_control_map + let control_fork_map: HashMap<NodeID, NodeID> = fork_control_map.iter() + .flat_map(|(fork, controls)| { + controls.iter().map(move |control| (*control, *fork)) + }) + .collect(); + // Get all uses of each collection, map each use to basic block, then map each basic block to fork + let collect_fork_users: HashMap<NodeID, HashSet<NodeID>> = collect_consts.iter() + .map(|collect_const| { + (*collect_const, self.def_use_map.get_users(*collect_const)) + }) + .map(|(collect_const, users)| { + (collect_const, users.iter().map(|user| control_fork_map[&self.bbs.0[user.idx()]]).collect()) + }) + .collect(); + // For now assert that each collection is used by a single fork and get + // parallel status, TODO: revisit + collect_fork_users.iter() + .filter(|(_, fork_users)| !fork_thread_quota_map.contains_key(fork_users.iter().next().unwrap())) + .map(|(collect_const, _)| self.typing[collect_const.idx()]) + .collect() } /* @@ -690,6 +789,7 @@ namespace cg = cooperative_groups; fork_tree: &HashMap<NodeID, HashSet<NodeID>>, fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, + extra_dim_collects: &HashSet<TypeID>, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -697,6 +797,7 @@ namespace cg = cooperative_groups; // Recall that this was tracked through a fake fork node with NodeID 0. 
let mut state = KernelState::OutBlock; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { + println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -704,13 +805,14 @@ namespace cg = cooperative_groups; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, None, None, None, false, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists if block_fork.is_some() { state = KernelState::InBlock; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { + println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -718,7 +820,7 @@ namespace cg = cooperative_groups; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, body, &mut tabs)?; } } } @@ -733,6 +835,7 @@ namespace cg = cooperative_groups; fork_thread_quota_map, 1, num_threads, + extra_dim_collects, gotos, )?; } @@ -754,6 +857,7 @@ namespace cg = cooperative_groups; fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, parent_quota: usize, num_threads: usize, + extra_dim_collections: &HashSet<TypeID>, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map @@ -775,6 +879,7 @@ namespace cg = cooperative_groups; HashSet::new() }; for control in fork_control_map.get(&curr_fork).unwrap() { + println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -789,6 +894,7 @@ namespace cg = cooperative_groups; parallel_factor, Some(curr_fork), reducts.contains(data), + extra_dim_collections, body, &mut tabs, )?; @@ -803,6 +909,7 @@ namespace cg = cooperative_groups; fork_thread_quota_map, use_thread_quota, num_threads, + extra_dim_collections, gotos, )?; } @@ -817,6 +924,7 @@ namespace cg = cooperative_groups; parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, is_special_reduct: bool, + extra_dim_collects: &HashSet<TypeID>, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { @@ -890,10 +998,12 @@ namespace cg = cooperative_groups; Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); if (!is_primitive) { - let cg_tile = match state { - KernelState::OutBlock - | KernelState::InBlock => "block".to_string(), - KernelState::InThread => self.get_cg_tile(id, CGType::UsePerId), + let cg_tile = { + let KernelState::OutBlock = state else { + panic!("Expected constant to be in start basic block + outside any fork"); + }; + "block".to_string() }; write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; *num_tabs += 1; @@ -902,6 +1012,7 @@ namespace cg = cooperative_groups; define_variable, 
*cons_id, true, + Some(extra_dim_collects), w, *num_tabs, )?; @@ -1099,14 +1210,14 @@ namespace cg = cooperative_groups; // data src/dest, with only collection type using collective group. Node::Read { collect, indices } => { let is_char = self.is_char(self.typing[collect.idx()]); - let collect_with_indices = self.codegen_collect(*collect, indices, is_char); + let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_type_id = self.typing[id.idx()]; if self.types[data_type_id.idx()].is_primitive() { if is_char { let type_name = self.get_type(data_type_id, true); write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?; } else { - write!(w, "{}{} = *{};\n", tabs, define_variable, collect_with_indices)?; + write!(w, "{}{} = *({});\n", tabs, define_variable, collect_with_indices)?; } } else { let nested_fork = nesting_fork.unwrap(); @@ -1115,7 +1226,7 @@ namespace cg = cooperative_groups; KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nested_fork, CGType::UsePerId), }; - let data_size = self.get_size(data_type_id, None); + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, define_variable, collect_with_indices, data_size)?; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } @@ -1129,7 +1240,7 @@ namespace cg = cooperative_groups; indices, } => { let is_char = self.is_char(self.typing[collect.idx()]); - let collect_with_indices = self.codegen_collect(*collect, indices, is_char); + let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; let nested_fork = nesting_fork.unwrap(); @@ -1144,11 +1255,11 @@ namespace cg = cooperative_groups; let type_name = self.get_type(data_type_id, true); write!(w, "{}\t*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; } else { - write!(w, "{}\t*{} = {};\n", tabs, collect_with_indices, data_variable)?; + write!(w, "{}\t*({}) = {};\n", tabs, collect_with_indices, data_variable)?; } write!(w, "{}}}\n", tabs)?; } else { - let data_size = self.get_size(data_type_id, None); + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, collect_with_indices, data_variable, data_size)?; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } @@ -1169,7 +1280,7 @@ namespace cg = cooperative_groups; } } if let Some(reduces) = self.reduct_reduce_map.get(&id) { - let val = self.get_value(id, true, false); + let val = self.get_value(id, false, false); for reduce in reduces { let reduce_val = self.get_value(*reduce, false, false); write!(w, "{}{} = {};\n", tabs, reduce_val, val)?; @@ -1223,21 +1334,29 @@ namespace cg = cooperative_groups; let cg_tile = self.get_cg_tile(id, CGType::UsePerId); if use_thread_quota.is_some() { let use_thread_quota = use_thread_quota.unwrap(); - let use_thread_per_id = use_thread_quota / parallel_factor.unwrap(); + let use_thread_per_id = if parallel_factor.is_some() { + use_thread_quota / parallel_factor.unwrap() + } else { + use_thread_quota + }; write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id)?; let cg_tile_use 
= self.get_cg_tile(id, CGType::Use); write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota)?; let available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota)?; + if parallel_factor.is_none() { + write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?; + write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; + } } - write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; // Fork nodes gate the used vs unused threads out of all available // threads. If unused, we jump straight to the Join, and if used, // we jump to successor like normal. let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { - write!(w_post_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + let w_target = if parallel_factor.is_none() { w_post_init } else { w_init }; + write!(w_target, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; @@ -1246,23 +1365,30 @@ namespace cg = cooperative_groups; write!(w_term, "\t}}\n")?; 2 } else { + // Make sure post-init isn't empty so it goto header generated + if use_thread_quota.is_some() && parallel_factor.is_none() { + write!(w_post_init, " ")?; + } write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; 1 } } - Node::Join { control: fork } => { + Node::Join { control: _ } => { // Join nodes also gate the used vs unused threads with a tile // sync after the body. let succ = self.control_subgraph.succs(id).next().unwrap(); let has_thread_quota = available_thread_quota.is_some(); + let mut tabs = 1; if has_thread_quota { let available_thread_quota = available_thread_quota.unwrap(); let use_thread_quota = use_thread_quota.unwrap(); if use_thread_quota < available_thread_quota { write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; write!(w_term, "\t}}\n")?; + tabs += 1; } - let cg_tile_available = self.get_cg_tile(id, CGType::Available); + let fork = self.join_fork_map.get(&id).unwrap(); + let cg_tile_available = self.get_cg_tile(*fork, CGType::Available); write!(w_term, "\t{}.sync();\n", cg_tile_available)?; } // If the Fork was parallelized, each thread or UsedPerId tile of @@ -1273,8 +1399,9 @@ namespace cg = cooperative_groups; if parallel_factor.is_some() { write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; } else { + let fork = self.join_fork_map.get(&id).unwrap(); let Node::Fork { factors, .. 
} = &self.function.nodes[fork.idx()] else { - panic!("Expected join node to use a fork node"); + panic!("Expected join_fork_map to point to a fork node"); }; let fork_size = multiply_dcs(factors); let fork_iter = self.get_fork_iter(*fork, false); @@ -1286,7 +1413,7 @@ namespace cg = cooperative_groups; write!(w_term, "\t\tgoto {};\n", self.get_block_name(*fork, true))?; write!(w_term, "\t}}\n")?; } - if has_thread_quota { 2 } else { 1 } + tabs } Node::Return { control: _, data } => { // Since we lift originally primitive returns into a parameter, @@ -1314,13 +1441,13 @@ namespace cg = cooperative_groups; * it doesn't matter here, and for Array, it depends- so we may need to tack * on the element size to the index math. */ - fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { + fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool, has_extra_dim: bool) -> String { let mut index_ptr = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { match index { Index::Field(field) => { - self.get_size(type_id, Some(*field)); + self.get_size(type_id, Some(*field), None); } // Variants of summations have zero offset Index::Variant(_) => {} @@ -1333,11 +1460,12 @@ namespace cg = cooperative_groups; panic!("Expected array type") }; let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); - for index in array_indices.iter().rev() { + for (i, index) in array_indices.iter().skip(if has_extra_dim { 1 } else { 0 }).rev().enumerate() { cumulative_offset = format!( - "{} * ({} + ", + "{} * ({} + {}", cumulative_offset, - self.get_value(*index, false, false) + self.get_value(*index, false, false), + format!("dc{}", extents[i].idx()) ); } index_ptr.push_str(&format!( @@ -1346,7 +1474,7 @@ namespace cg = cooperative_groups; ")".repeat(array_indices.len()) )); if is_char { - let element_size = self.get_size(*element_type, None); + let element_size = self.get_size(*element_type, None, None); index_ptr.push_str(&format!(" * {}", element_size)); } } @@ -1372,6 +1500,7 @@ namespace cg = cooperative_groups; name: String, cons_id: ConstantID, allow_allocate: bool, + extra_dim_collects: Option<&HashSet<TypeID>>, w: &mut String, num_tabs: usize, ) -> Result<(), Error> { @@ -1388,12 +1517,12 @@ namespace cg = cooperative_groups; Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?, Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, val)?, Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, - // All three followign collections involve align then allocate from the + // All three following collections involve align then allocate from the // single dynamic shared memory buffer by using and updating the offset. 
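+            // Illustrative shape of each such "allocation" (the 8s stand in for
+            // the real alignment and size expressions):
+            //   dynamic_shared_offset = ((dynamic_shared_offset + 8 - 1) / 8) * 8;
+            //   x = dynamic_shared + dynamic_shared_offset;
+            //   dynamic_shared_offset += 8;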
Constant::Product(type_id, constant_fields) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None); + let size = self.get_size(*type_id, None, extra_dim_collects); write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; @@ -1404,20 +1533,26 @@ namespace cg = cooperative_groups; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call let field_type = self.get_type(type_fields[i], true); - let offset = self.get_size(type_fields[i], Some(i)); - self.codegen_constant( - format!("*reinterpret_cast<{}>({}{})", field_type, name, offset), - constant_fields[i], - false, - w, - num_tabs, - ); + let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects); + let field_constant = &self.constants[constant_fields[i].idx()]; + if field_constant.is_scalar() { + self.codegen_constant( + format!("*reinterpret_cast<{}>({}+{})", field_type, name, offset), + constant_fields[i], + false, + extra_dim_collects, + w, + num_tabs, + ); + } else if !field_constant.is_array() { + self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, w, num_tabs); + } } } Constant::Summation(type_id, variant, field) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None); + let size = self.get_size(*type_id, None, extra_dim_collects); write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; @@ -1431,26 +1566,30 @@ namespace cg = cooperative_groups; let variant_constant = &self.constants[field.idx()]; if variant_constant.is_scalar() { self.codegen_constant( - format!("*reinterpret_cast<{}>{}", variant_type, name), - cons_id, + format!("*reinterpret_cast<{}>({})", variant_type, name), + *field, false, + extra_dim_collects, w, num_tabs, ); } else if !variant_constant.is_array() { - self.codegen_constant(name, cons_id, false, w, num_tabs); + self.codegen_constant(name, *field, false, extra_dim_collects, w, num_tabs); }; } Constant::Array(type_id) => { let Type::Array(element_type, _) = &self.types[type_id.idx()] else { panic!("Expected array type") }; + if !allow_allocate { + panic!("Nested array constant should not be re-allocated"); + } let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None); - let element_type = self.get_type(*element_type, false); + let size = self.get_size(*type_id, None, extra_dim_collects); + let element_type = self.get_type(*element_type, true); write!( w, - "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", + "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {};\n", tabs, alignment, alignment, alignment )?; write!(w, "{}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n", tabs, name, element_type)?; @@ -1466,10 +1605,10 @@ namespace cg = cooperative_groups; * and offset to 2nd field. This is useful for constant initialization and read/write * index math. 
*/ - fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { + fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { - let array_size = multiply_dcs(extents); + let array_size = multiply_dcs(if extra_dim_collects.is_some() && extra_dim_collects.unwrap().contains(&type_id) { &extents[1..] } else { extents }); format!("{} * {}", self.get_alignment(*element_type), array_size) } Type::Product(fields) => { @@ -1478,7 +1617,7 @@ namespace cg = cooperative_groups; .iter() .enumerate() .filter(|(i, _)| i < num_fields) - .map(|(_, id)| (self.get_size(*id, None), self.get_alignment(*id))) + .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects), self.get_alignment(*id))) .fold(String::from("0"), |acc, (size, align)| { if acc == "0" { size @@ -1493,7 +1632,7 @@ namespace cg = cooperative_groups; format!( "{} - {}", with_field, - self.get_size(fields[*num_fields], None) + self.get_size(fields[*num_fields], None, extra_dim_collects) ) } else { with_field @@ -1503,7 +1642,7 @@ namespace cg = cooperative_groups; // The argmax variant by size is not guaranteed to be same as // argmax variant by alignment, eg product of 3 4-byte primitives // vs 1 8-byte primitive, so we need to calculate both. - let max_size = variants.iter().map(|id| self.get_size(*id, None)).fold( + let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects)).fold( String::from("0"), |acc, x| { if acc == "0" { diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index 217d5996..08faa5b0 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -980,6 +980,7 @@ impl PassManager { _ => todo!(), } } + println!("{}", cuda_ir); println!("{}", llvm_ir); println!("{}", rust_rt); diff --git a/hercules_samples/matmul/src/matmul.hir b/hercules_samples/matmul/src/matmul.hir index ab0f384a..400ab5e1 100644 --- a/hercules_samples/matmul/src/matmul.hir +++ b/hercules_samples/matmul/src/matmul.hir @@ -1,9 +1,9 @@ -fn matmul<3>(a: array(i32, #0, #1), b: array(i32, #1, #2)) -> array(i32, #0, #2) - c = constant(array(i32, #0, #2), []) - i_j_ctrl = fork(start, #0, #2) +fn matmul(a: array(i32, 16, 64), b: array(i32, 64, 32)) -> array(i32, 16, 32) + c = constant(array(i32, 16, 32), []) + i_j_ctrl = fork(start, 16, 32) i_idx = thread_id(i_j_ctrl, 0) j_idx = thread_id(i_j_ctrl, 1) - k_ctrl = fork(i_j_ctrl, #1) + k_ctrl = fork(i_j_ctrl, 64) k_idx = thread_id(k_ctrl, 0) k_join_ctrl = join(k_ctrl) i_j_join_ctrl = join(k_join_ctrl) diff --git a/juno_samples/test2.jn b/juno_samples/test2.jn new file mode 100644 index 00000000..a1fc6e65 --- /dev/null +++ b/juno_samples/test2.jn @@ -0,0 +1,25 @@ +#[entry] +fn main<m, n : usize>() -> i32[m, n, 64, 64] { + let res : i32[m, n, 64, 64]; + for bi = 0 to m { + for bj = 0 to n { + let tile : i32[64, 64]; + for ti = 0 to 64 { + for tj = 0 to 64 { + tile[ti, tj] = (ti as i32) + (tj as i32) + (bi as i32) + (bj as i32); + } + } + for si = 1 to 63 { + for sj = 1 to 63 { + tile[si, sj] = tile[si-1, sj-1] + tile[si+1, sj+1]; + } + } + for ri = 0 to 64 { + for rj = 0 to 64 { + res[bi, bj, ri, rj] = tile[ri, rj]; + } + } + } + } + return res; +} diff --git a/torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc b/torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb6ac34cdbe90728338b32e7b1eae9b0edb1b69b GIT binary patch literal 582 
[base85 payload for mobilenet.cpython-310.pyc omitted]
literal 0
HcmV?d00001

diff --git a/torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc b/torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ca624dd252afba2297a33d14c41710767938ad1
GIT binary patch
literal 3266
[base85 payload for torch_export.cpython-310.pyc omitted]
literal 0
HcmV?d00001

-- 
GitLab


From 52a768fa4bda7ae648b28e759338a1ad965e3d9e Mon Sep 17 00:00:00 2001
From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 10 Jan 2025 10:22:18 -0600 Subject: [PATCH 022/109] no torch frontend --- .../__pycache__/mobilenet.cpython-310.pyc | Bin 582 -> 0 bytes .../__pycache__/torch_export.cpython-310.pyc | Bin 3266 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc delete mode 100644 torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc diff --git a/torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc b/torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc deleted file mode 100644 index cb6ac34cdbe90728338b32e7b1eae9b0edb1b69b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 582 zcmYjN!HN?>5bf^iOlBvO!0tg%Jczd(+<`qWB8Z;Ei+HKa(3`3x9Vb0Q_w2APd-Eo{ z|Bze+e<@c_^9zDtbp*AVe$`b})q7pdbeapcZ>M*wuS$p?G5Fsy4qovM_Y5dd)&ine zwcILDGQN{K*(Q)kK?*gF9wcZIo6#o46vvq1344wCLrQuxM)8<?E=8=^Wt=&|$zHL~ zG3Bh@P4<E(iCw{6DHd4l^-{u=64Erq=@;ff`9aKQ11l2m=kic&n=4zBqps>NV%#-; zv99Q*ZG883E+KC{b>3J<I7n_9o<&F^@Snw8!{ifrM|BnKEkPPOZ;@c!8bs?>ps%b6 zLtNDGl;L*vm+;(l*3V}!I#R;f(e*ai#+`2)q&oP>;3F4|ju}wSLHfFJ{RKj{B-34c z3Xr~IdB#Igo6uE@wQ;Nm@&$*%E-m?krry*r^2VWqf^N*Vt*J786}}5Ui7XL*z9~Nj z@}X?Gkf<w>`f^Qvaa~h*anx7-+0W89t|NzKI8F-XzhAYV57{NF<(`*uVc`iwCJI@| SJbA7ksZ5<EN+}sn_UktrSe*C( diff --git a/torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc b/torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc deleted file mode 100644 index 3ca624dd252afba2297a33d14c41710767938ad1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3266 zcma)8&2QVt6`vs~ijpW<{!X0jE}Il+<YeW<Uvk`hG-$fpJ=9$^>854mvW#XVQ>ICI zII?4b0#Z1|p0?QgVgc!xKc)Acc->RZEl|K{o4y&+T5mT%OM#;|kMHC6-n{q7uv{)8 zxR$GNXH-S#8?m`~8L;^&+|eIkA&6if+2UXE@quD1SkS7k9%!~EY0b|a=(a9t?C0$~ z$x+lbx&@+B4X7f?x0TC?7_>wR1h)-RB&AoHZ4&$#)yy~G1?|*uqSF_>ka3`LPx?Kd z0-=5C^1UQ?5EAMq75c(G0KVe|y*`iXNh$1epq#en(?n;K_gT<Vg}LlnfPWF|&)|;! 
zsvs1j5gH?mK8C1>@(N<&xT|#4Q+24uYK%_tP~%#x#rRYi=Hgs8H`HUjs}q%IU*KWB zgNBBOhJ`pkf+)xM5!(IRuoxFd#u$lzwh1)EJ;tnYSR?tkc!by^;N?kvjEQj>rIMr| zpZt~=M<oJ!V`W&1OYCV}8kwX7lI9}V3uZ}K%I!<J%0zDeyK-Q^Or}T`tOzV+GEHVC z^emYZG?|x{UXulhW<Eu8gxTwhdx_4Ra7rm&h-eWa0?7{ImDo7NZ8a{$=3N9aO1ocT zwn#2b^s+dQkl;_anYCe=sKZKJ7)^~4<nNQFGBlg&aB5f`PQ~RB*d>=1(HMj7%wUCA z<1+X(PcbPiB69gpP)Tq~(-W)bIh_Hi>M4$=_c41LS4Y!iX^gSJn{lJR$1`y?p1zC7 zm0{&6R6$VE5TE%HpG;%eVe~t6_?Kh2<BGt2Sc{Dj_z_kPYkW3Ed@9cTW`=ndFi-P2 z@*erYD->5B0kTtpdxyL~nvbhv{1qY}#ItbvBHvm3U!kv6I45JY5DV+jPa>E1AXkUi zNDlOjF&@tG#aMwj=i~YArS4^N1%B2SLUz8p6wi?#lB;4zmY{>Kxg893Zo&1~J)+h_ zP6O+cUt8^vS+4cz$&-+^_AJ>$77u|X`lT-A<?@hqqHReD6H;#b`)X-(PQ<?3i`$m$ zSSxH>*#k>9sRdA?kVO{XbGdckasa~jXzJw0k$`yYGGa~4Ug+p;>*4b#^!uDzJ(vOl zdL+W*;Y6C81nz;?a-u!AN26`)S3T~9f$LkuYe|ZofYV;j3pyDdkf(qKObHc>i#Zcn z$9t4fYbOjSh=H!lfteWxTL%yX*p+Tu*=U*ksTmdh2ndR7OpvX7j@npG7{gY@?0AK2 z?Q^6dI|t{L8(ih$NSrjK^}Bx9a{cIj{g-0QJ!JrxvKCCM$t7rU&%zze!NSog8euWX zVjLq!A?PuBjdyBll8gGFofr;GNsl`Y%Yk@JNwkRiZM*o~ji@7!_%%)nj??npC~}<F z=<F9OzlkV|R(i}0DCH}J9<4xHT75{+hZ65fin`6h0FITcL~<HO89NK6cfv~GaY-Q6 zU@77v9^Clf@z&1)x1g3M*LmPiffI>I9-}UV{AUQ46<}M#S$xcdU81>A%4`bAbb+@U zO7dBey$_p67z9Ae{|BH<t)`1)in?)}r09s`K}vu&9p|v``souHG@B9r%2`%h)b}C( z-=ZnQQzjvq7f<@Kz@@`tAQM+Lwg}`$@a{8v%nP`ac0_`UXZYF~{s|N58C;S0>tQdd zL#+n&cHj5wgtpzj&j*jLH>2g|@r`Y(y|&tX*4$mT?m@{z#WjUSQ>rwjPV-!;dA<3p zzWkuMd((RM`<uHrYHx&}Gh7>_n%}x|p*}ja9&wj(C*rg>*qQhPr{Iq)H+SFZF;#6| z&~09q-mX8`KKJ{eX3ySPU0qvitlnDN+}c=gG&Z(wHSFT*?e)g`*4pNJ!=AgnwZ6XD zSYN-@SZmz6v$1h|^Y(^aTwlGjerL0>dS~NZCblk^*h=udGrT>}(=?vpXK&uZwe#eA z#V5Ku689xo{s`r_n;+sBdc2Q6gYG+&ohM6C9yBzXQoohpqYK9_PrPuqLj!0LcApAs zqBE423uy4cy9;q5Jy3310~N9fmh(NA(SS#ZL0@<gRL1@w*XQjIx1d-I8ni+JbFOwJ z(dd!uCpl?3QGy`Rq1>QmnJE6m?7)E8%gQ*Zq(#Q4OWf!16_ty6zQ^H@Gq0Bv<tazU z9(;r;44%pfp5!_#?Dvv9qdnhk(M0uWkmNj0528ecnoV>u!UB>MKXd&){W)VHOEe!a zC#viFiPjB0K;;E6a5xjWNc1!|@UTxjmK2W}m~%LN!D|b$#hK_3mKTd?5W9RV=Ojso znGN5D;@gpCdv;UERI(O^GKE&^*yWZ_VYm^;_W~N()10}jeYu5f9=R+MAOp%)g>oJk zqPG*3?L|qh9rht0?AS&5&Pc^okGx0>5R~)5E`-eMNaOa@xvLC<eg3T~US<DX9OlI$ z5Fth}EXLHFY7~lCS52&`8m?eNsbB>g>I~e#1CH|YoHBu9;w5-?3D1DkRiHGyfU7_Y zjtO(vd}}Bg)<I9h%3I|hdP%{ivZT)8tMFXM+RLJ1s8x_=9|M3|nbkxlQh4<=!)X!e z{6xG|nP?ADmweCnp39SDw}Hi$#d1SD)St*NfJB8ekXA1QFGr(n594k+R_}|c6TJtE Z7>43!U`5dZt*I#gR#fF*idOmOe*hW`e6j!l -- GitLab From fc2a77f3b940165c441a9b78db47f7205f12488e Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 10 Jan 2025 18:53:09 -0600 Subject: [PATCH 023/109] runs --- hercules_cg/src/gpu.rs | 212 +++++++++++++++++++++++++++++------------ 1 file changed, 149 insertions(+), 63 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a153b7ef..499ecce8 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -296,6 +296,7 @@ impl GPUContext<'_> { // Emit all code up to the "goto" to Start's block let mut top = String::new(); self.codegen_kernel_begin(&mut top)?; + let mut dynamic_shared_offset = "0".to_string(); self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; self.codegen_helpers(&mut top)?; @@ -306,10 +307,11 @@ impl GPUContext<'_> { let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); - println!("num_blocks: {}", num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); - let 
extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); + // TODO: Uncomment and adjust once we know logic of extra dim + // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); + let extra_dim_collects = HashSet::new(); // Setup for CUDA's "goto" for control flow between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) @@ -333,19 +335,20 @@ impl GPUContext<'_> { &fork_control_map, &fork_thread_quota_map, &extra_dim_collects, + &mut dynamic_shared_offset, num_threads, &mut gotos, )?; // Emit all GPU kernel code from previous steps let mut kernel_body = String::new(); - self.codegen_gotos(&mut gotos, &mut kernel_body)?; + self.codegen_gotos(false, &mut gotos, &mut kernel_body)?; write!(w, "{}", kernel_body)?; write!(w, "}}\n")?; // Emit host launch code let mut host_launch = String::new(); - self.codegen_launch_code(num_blocks, num_threads, &mut host_launch)?; + self.codegen_launch_code(true, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; write!(w, "{}", host_launch)?; Ok(()) @@ -420,8 +423,9 @@ namespace cg = cooperative_groups; write!(w, ") {{\n")?; write!(w, "\textern __shared__ char dynamic_shared[];\n")?; // This will only get used by thread rank 0 in each block, since it - // does all shared memory "allocation" - write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; + // does all shared memory "allocation". The actual state is preserved + // in Rust string and this offset is assigned to for ease of readability. + write!(w, "\tuint64_t dynamic_shared_offset;\n")?; Ok(()) } @@ -489,11 +493,14 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_gotos(&self, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> { + fn codegen_gotos(&self, goto_debug: bool, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> { write!(w, "\n")?; for (id, goto) in gotos.iter() { let goto_block = self.get_block_name(*id, false); write!(w, "{}:\n", goto_block)?; + if goto_debug { + write!(w, "\tprintf(\"goto {}\\n\");\n", goto_block)?; + } write!(w, "{}", goto.init)?; if !goto.post_init.is_empty() { let goto_block = self.get_block_name(*id, true); @@ -506,47 +513,119 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, w: &mut String) -> Result<(), Error> { + fn codegen_launch_code(&self, run_debug: bool, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { write!(w, " -int main(")?; +int main() {{ +")?; // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let mut pass_args = String::new(); - // The first set of parameters are dynamic constants. - let mut first_param = true; - for idx in 0..self.function.num_dynamic_constants { - if first_param { - first_param = false; - } else { - write!(w, ", ")?; + if run_debug { + // The first set of parameters are dynamic constants. + let mut first_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(pass_args, ", ")?; + } + write!(w, "\tunsigned long long dc_p{} = 1ull;\n", idx)?; + write!(pass_args, "dc_p{}", idx)?; + } + self.codegen_dynamic_constants(w)?; + // The second set of parameters are normal arguments. 
+ for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(pass_args, ", ")?; + } + let param_type = self.get_type(*ty, false); + if self.types[ty.idx()].is_primitive() { + write!(w, "\t{} p{} = 1;\n", param_type, idx)?; + } else { + let param_size = self.get_size(*ty, None, None); + write!(w, "\t{} p{};\n", param_type, idx); + write!(w, "\tif (cudaMalloc(&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?; + write!(w, "\t\tprintf(\"Error allocating memory for parameter %d\\n\", {});\n", idx)?; + write!(w, "\t\treturn -1;\n"); + write!(w, "\t}}\n"); + } + write!(pass_args, "p{}", idx)?; + } + // Pull primitive return to a pointer parameter + if self.types[self.return_type_id.idx()].is_primitive() { write!(pass_args, ", ")?; + let ret_type_no_pnt = self.get_type(*self.return_type_id, false); + let ret_type = self.get_type(*self.return_type_id, true); + write!(w, "\t{} ret;\n", ret_type)?; + write!(w, "\tif (cudaMalloc(&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?; + write!(w, "\t\tprintf(\"Error allocating memory for return value\\n\");\n")?; + write!(w, "\t\treturn -1;\n")?; + write!(w, "\t}}\n"); + write!(pass_args, "ret")?; } - write!(w, "unsigned long long dc_p{}", idx)?; - write!(pass_args, "dc_p{}", idx)?; - } - // The second set of parameters are normal arguments. - for (idx, ty) in self.function.param_types.iter().enumerate() { - if first_param { - first_param = false; - } else { + write!(w, "\t{}<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args); + write!(w, "\tbool skip = false;\n")?; + write!(w, "\tcudaError_t err = cudaGetLastError();\n")?; + write!(w, "\tif (err != cudaSuccess) {{\n")?; + write!(w, "\t\tprintf(\"Error launching kernel: %s\\n\", cudaGetErrorString(err));\n")?; + write!(w, "\t\tskip = true;\n")?; + write!(w, "\t}}\n"); + write!(w, "\tif (cudaDeviceSynchronize() != cudaSuccess && !skip) {{\n")?; + write!(w, "\t\tprintf(\"Error synchronizing device\\n\");\n")?; + write!(w, "\t\tskip = true;\n")?; + write!(w, "\t}}\n"); + for (idx, ty) in self.function.param_types.iter().enumerate() { + if !self.types[ty.idx()].is_primitive() { + write!(w, "\tcudaFree(p{});\n", idx)?; + } + } + if self.types[self.return_type_id.idx()].is_primitive() { + write!(w, "\tcudaFree(ret);\n"); + } + write!(w, "\treturn 0;\n"); + write!(w, "}}\n"); + } + + else { + // The first set of parameters are dynamic constants. + let mut first_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; + } + write!(w, "unsigned long long dc_p{}", idx)?; + write!(pass_args, "dc_p{}", idx)?; + } + // The second set of parameters are normal arguments. 
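For reference, the run_debug branch above assembles a standalone host harness along these lines. The sketch below is a hand-written approximation for a hypothetical kernel named example_kernel with one dynamic constant, one array parameter, and a primitive return lifted to a pointer; the real generator derives every size, type, and launch dimension from the Hercules signature, so none of these concrete values come from the patch.

    #include <cstdio>
    #include <cuda_runtime.h>

    // Hypothetical generated kernel, present only so the harness is self-contained.
    __global__ void example_kernel(unsigned long long dc_p0, float* p0, float* ret) {
        if (threadIdx.x == 0 && blockIdx.x == 0) *ret = p0[0] + (float)dc_p0;
    }

    int main() {
        unsigned long long dc_p0 = 1ull;           // dynamic constants default to 1
        float* p0;
        if (cudaMalloc(&p0, dc_p0 * sizeof(float)) != cudaSuccess) {
            printf("Error allocating memory for parameter %d\n", 0);
            return -1;
        }
        float* ret;                                // primitive return lifted to a pointer
        if (cudaMalloc(&ret, sizeof(float)) != cudaSuccess) {
            printf("Error allocating memory for return value\n");
            return -1;
        }
        example_kernel<<<1, 1024, 0>>>(dc_p0, p0, ret);  // third arg: dynamic shared bytes
        bool skip = false;
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("Error launching kernel: %s\n", cudaGetErrorString(err));
            skip = true;
        }
        if (cudaDeviceSynchronize() != cudaSuccess && !skip) {
            printf("Error synchronizing device\n");
        }
        cudaFree(p0);
        cudaFree(ret);
        return 0;
    }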
+ for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; + } + let param_type = self.get_type(*ty, false); + write!(w, "{} p{}", param_type, idx)?; + write!(pass_args, "p{}", idx)?; + } + // Pull primitive return to a pointer parameter + if self.types[self.return_type_id.idx()].is_primitive() { write!(w, ", ")?; write!(pass_args, ", ")?; + let ret_type = self.get_type(*self.return_type_id, true); + write!(w, "{} ret", ret_type)?; + write!(pass_args, "ret")?; } - let param_type = self.get_type(*ty, false); - write!(w, "{} p{}", param_type, idx)?; - write!(pass_args, "p{}", idx)?; - } - // Pull primitive return to a pointer parameter - if self.types[self.return_type_id.idx()].is_primitive() { - write!(w, ", ")?; - write!(pass_args, ", ")?; - let ret_type = self.get_type(*self.return_type_id, true); - write!(w, "{} ret", ret_type)?; - write!(pass_args, "ret")?; + write!(w, ") {{ + {}<<<{}, {}, {}>>>({}); +}}", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args); } - write!(w, ") {{ - {}<<<{}, {}>>>({}); -}}", self.function.name, num_blocks, num_threads, pass_args); + Ok(()) } @@ -575,12 +654,10 @@ int main(")?; // Then get it's nesting fork- index = 1 to not count itself! let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0)); fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(control); - println!("fork_tree parent: {}, child: {}", nesting_fork.idx(), control.idx()); } // Here the desired fork is always the first fork let fork = forks.first().copied().unwrap_or(NodeID::new(0)); fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(control); - println!("fork_control_map parent: {}, child: {}", fork.idx(), control.idx()); (fork_tree, fork_control_map) }, ) @@ -676,7 +753,7 @@ int main(")?; .iter() .map(|child| (child, self.recurse_thread_quotas(*child, fork_tree, false))) .fold( - (HashMap::new(), HashMap::new(), 0), + (HashMap::new(), HashMap::new(), 1), |(mut subsubtree_map, mut children_quota_map, subtree_quota), (child, (curr_map, curr_quota, use_curr))| { subsubtree_map.extend(curr_map); if use_curr { @@ -771,8 +848,6 @@ int main(")?; (collect_const, users.iter().map(|user| control_fork_map[&self.bbs.0[user.idx()]]).collect()) }) .collect(); - // For now assert that each collection is used by a single fork and get - // parallel status, TODO: revisit collect_fork_users.iter() .filter(|(_, fork_users)| !fork_thread_quota_map.contains_key(fork_users.iter().next().unwrap())) .map(|(collect_const, _)| self.typing[collect_const.idx()]) @@ -790,6 +865,7 @@ int main(")?; fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -797,7 +873,6 @@ int main(")?; // Recall that this was tracked through a fake fork node with NodeID 0. 
let mut state = KernelState::OutBlock; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { - println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -805,14 +880,13 @@ int main(")?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists if block_fork.is_some() { state = KernelState::InBlock; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { - println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -820,7 +894,7 @@ int main(")?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } } @@ -836,6 +910,7 @@ int main(")?; 1, num_threads, extra_dim_collects, + dynamic_shared_offset, gotos, )?; } @@ -858,6 +933,7 @@ int main(")?; parent_quota: usize, num_threads: usize, extra_dim_collections: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map @@ -879,7 +955,6 @@ int main(")?; HashSet::new() }; for control in fork_control_map.get(&curr_fork).unwrap() { - println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -895,6 +970,7 @@ int main(")?; Some(curr_fork), reducts.contains(data), extra_dim_collections, + dynamic_shared_offset, body, &mut tabs, )?; @@ -910,6 +986,7 @@ int main(")?; use_thread_quota, num_threads, extra_dim_collections, + dynamic_shared_offset, gotos, )?; } @@ -925,6 +1002,7 @@ int main(")?; nesting_fork: Option<NodeID>, is_special_reduct: bool, extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { @@ -1013,6 +1091,7 @@ int main(")?; *cons_id, true, Some(extra_dim_collects), + dynamic_shared_offset, w, *num_tabs, )?; @@ -1460,18 +1539,23 @@ int main(")?; panic!("Expected array type") }; let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); + let max_left_array_index = array_indices.len() - 1 - if has_extra_dim { 1 } else { 0 }; for (i, index) in array_indices.iter().skip(if has_extra_dim { 1 } else { 0 }).rev().enumerate() { cumulative_offset = format!( - "{} * ({} + {}", + "{} * ({}{}", cumulative_offset, self.get_value(*index, false, false), - format!("dc{}", extents[i].idx()) + if i != max_left_array_index { + format!(" + dc{}", extents[max_left_array_index - i].idx()) + } else { + "".to_string() + } ); } index_ptr.push_str(&format!( " + 
{}{}", cumulative_offset, - ")".repeat(array_indices.len()) + ")".repeat(array_indices.len() - if has_extra_dim { 1 } else { 0 }) )); if is_char { let element_size = self.get_size(*element_type, None, None); @@ -1501,6 +1585,7 @@ int main(")?; cons_id: ConstantID, allow_allocate: bool, extra_dim_collects: Option<&HashSet<TypeID>>, + dynamic_shared_offset: &mut String, w: &mut String, num_tabs: usize, ) -> Result<(), Error> { @@ -1523,9 +1608,10 @@ int main(")?; if allow_allocate { let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None, extra_dim_collects); - write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; - write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } let Type::Product(type_fields) = &self.types[type_id.idx()] else { panic!("Product constant should have product type") @@ -1541,11 +1627,12 @@ int main(")?; constant_fields[i], false, extra_dim_collects, + dynamic_shared_offset, w, num_tabs, ); } else if !field_constant.is_array() { - self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, w, num_tabs); + self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, dynamic_shared_offset, w, num_tabs); } } } @@ -1553,9 +1640,10 @@ int main(")?; if allow_allocate { let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None, extra_dim_collects); - write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; - write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } // No offset updating needed since all variants start at 0 let Type::Summation(variants) = &self.types[type_id.idx()] else { @@ -1570,11 +1658,12 @@ int main(")?; *field, false, extra_dim_collects, + dynamic_shared_offset, w, num_tabs, ); } else if !variant_constant.is_array() { - self.codegen_constant(name, *field, false, extra_dim_collects, w, num_tabs); + self.codegen_constant(name, *field, false, extra_dim_collects, dynamic_shared_offset, w, num_tabs); }; } Constant::Array(type_id) => { @@ -1587,13 +1676,10 @@ int main(")?; let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None, extra_dim_collects); let element_type = self.get_type(*element_type, true); - write!( - w, - "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {};\n", - tabs, alignment, alignment, alignment - )?; + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n", tabs, name, element_type)?; - write!(w, 
"{}dynamic_shared_offset += {};\n", tabs, size)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } } Ok(()) -- GitLab From 259bcfb4a12b173eebc8d25344cbf9ef1d7a8f0a Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Wed, 25 Dec 2024 08:37:33 -0800 Subject: [PATCH 024/109] manual rebase --- hercules_cg/src/gpu.rs | 181 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 hercules_cg/src/gpu.rs diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs new file mode 100644 index 00000000..c54e5810 --- /dev/null +++ b/hercules_cg/src/gpu.rs @@ -0,0 +1,181 @@ +extern crate bitvec; +extern crate hercules_ir; + +use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::fmt::{Error, Write}; +use std::iter::{zip, FromIterator}; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use self::bitvec::prelude::*; + +use self::hercules_ir::*; + +use crate::*; + +static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0); + +/* + * The top level function to compile a Hercules IR function into NVVM IR kernel for + * execution on the GPU. We generate NVVM IR textually, copying from the CPU LLVM approach. + */ +pub fn gpu_codegen<W: Write>( + function: &Function, + types: &Vec<Type>, + constants: &Vec<Constant>, + dynamic_constants: &Vec<DynamicConstant>, + reverse_postorder: &Vec<NodeID>, + typing: &Vec<TypeID>, + control_subgraph: &Subgraph, + data_nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>, + bbs: &Vec<NodeID>, + w: &mut W, +) -> Result<(), Error> { + // temporary hardcoded values + let kernel = GPUKernel { + max_num_blocks: 1024, + max_num_threads: 1024, + threads_per_warp: 32, + }; + let ctx = GPUContext { + function, + types, + constants, + dynamic_constants, + reverse_postorder, + typing, + control_subgraph, + bbs, + structs: HashSet::new(), + w, + kernel, + }; + ctx.codegen_function() +} + +struct GPUContext<'a, W: Write> { + function: &'a Function, + types: &'a Vec<Type>, + constants: &'a Vec<Constant>, + dynamic_constants: &'a Vec<DynamicConstant>, + reverse_postorder: &'a Vec<NodeID>, + typing: &'a Vec<TypeID>, + control_subgraph: &'a Subgraph, + bbs: &'a Vec<NodeID>, + structs: HashSet<usize>, + w: &'a mut W, + kernel: GPUKernel, +} + +struct GPUKernel { + max_num_blocks: usize, + max_num_threads: usize, + threads_per_warp: usize, +} + +#[derive(Default, Debug)] +struct CudaBlock { + label: String, + body: String, +} + +impl<'a, W: Write> GPUContext<'a, W> { + fn codegen_function(&self) -> Result<(), Error> { + // Static content and function signature + write!( + self.w, + " +#include <assert.h> +#include <stdio.h> +#include <cuda.h> +#include <cuda_runtime.h> +#include <mma.h> +#include <helper_cuda.h> +", + )?; + + let mut function_signature = String::new(); + write!(&mut function_signature, "template <")?; + // The dynamic constants become template parameters. + let mut first_template_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(&mut function_signature, ", ")?; + } + write!(&mut function_signature, "long long int dc_p{}", idx)?; + } + write!(&mut function_signature, ">\n")?; + + write!(&mut function_signature, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_blocks, self.function.name)?; + // The second set of parameters are normal arguments. 
+ let mut first_param = true; + for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(&mut function_signature, ", ")?; + } + write!(&mut function_signature, "{} p{}", self.get_type(*ty)?, idx)?; + } + write!(&mut function_signature, ") {\n")?; + + // do actual stuff + // step 1. determine number of outermost fork joins at block level. we greedily accept while: a) total number of blocks < max_num_blocks, b) each fork join is strictly nested meaning no other neighbor fork joins, and c) each fork join's + + // finish kernel + write!(&mut function_signature, "}\n")?; + } + + // matmul detection- only called if einsum detected + fn matmul_detection(&self, w: &mut W) -> Result<(), Error> {} + + // convolution detection- only called if einsum detected + fn convolution_detection(&self, w: &mut W) -> Result<(), Error> {} + + fn get_type(&self, id: TypeID) -> Result<String, Error> { + match self.types[id.idx()] { + Type::Product(ref product_ty_ids) => { + if !self.structs.contains(&id.idx()) { + write!(self.w, "\nstruct Product_{} {{\n", id.idx())?; + for (i, product_ty_id) in product_ty_ids.iter().enumerate() { + write!(self.w, "\t{} field_{};\n", self.get_type(*product_ty_id)?, i)?; + } + write!(self.w, "}};\n")?; + self.structs.insert(id.idx()); + } + Ok(format!("Product_{}", id.idx())) + } + Type::Summation(ref summation_ty_ids) => { + if !self.structs.contains(&id.idx()) { + write!(self.w, "\nstruct Summation_{} {{\n\t union {{\n", id.idx())?; + for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { + write!(self.w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id)?, i)?; + } + write!(self.w, "\t}};\n\tuint8_t tag;\n}};\n")?; + self.structs.insert(id.idx()); + } + Ok(format!("Summation_{}", id.idx())) + } + _ => Ok(convert_type(&self.types[id.idx()])), + } + } + + // TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 + fn convert_type(ty: &Type) -> String { + match ty { + Type::Boolean => "bool".to_string(), + Type::Integer8 => "int8_t".to_string(), + Type::UnsignedInteger8 => "uint8_t".to_string(), + Type::Integer16 => "short".to_string(), + Type::UnsignedInteger16 => "unsigned short".to_string(), + Type::Integer32 => "int".to_string(), + Type::UnsignedInteger32 => "unsigned int".to_string(), + Type::Integer64 => "long long".to_string(), + Type::UnsignedInteger64 => "unsigned long long".to_string(), + Type::Float32 => "float".to_string(), + Type::Float64 => "double".to_string(), + _ => panic!("Unsupported type"), + } + } +} \ No newline at end of file -- GitLab From bee47b102c010927cec829dc777df100d52550da Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Thu, 26 Dec 2024 18:15:54 -0800 Subject: [PATCH 025/109] before goto test --- .gitignore | 1 + hercules_cg/src/gpu.rs | 221 ++++++++++++++++++++++++++++++++++------- 2 files changed, 187 insertions(+), 35 deletions(-) diff --git a/.gitignore b/.gitignore index f8a684ce..22c9343e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ *.hrt .*.swp .vscode +*_env diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index c54e5810..e177f420 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -31,11 +31,13 @@ pub fn gpu_codegen<W: Write>( w: &mut W, ) -> Result<(), Error> { // temporary hardcoded values - let kernel = GPUKernel { + let kernel_params = GPUKernelParams { max_num_blocks: 1024, max_num_threads: 1024, threads_per_warp: 32, + num_smps: 60, // ¯\_(ツ)_/¯ 
}; + let mut kernel_attrs = GPUKernelAttrs::default(); let ctx = GPUContext { function, types, @@ -45,13 +47,27 @@ pub fn gpu_codegen<W: Write>( typing, control_subgraph, bbs, - structs: HashSet::new(), + cuda_structs: HashSet::new(), w, - kernel, + kernel_params, + &mut kernel_attrs, }; ctx.codegen_function() } +struct GPUKernelParams { + max_num_blocks: usize, + max_num_threads: usize, + threads_per_warp: usize, + num_smps: usize, +} + +#[derive(Default)] +struct GPUKernelAttrs { + num_blocks: usize, + num_threads: usize, +} + struct GPUContext<'a, W: Write> { function: &'a Function, types: &'a Vec<Type>, @@ -61,20 +77,15 @@ struct GPUContext<'a, W: Write> { typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, bbs: &'a Vec<NodeID>, - structs: HashSet<usize>, + cuda_structs: HashSet<usize>, w: &'a mut W, - kernel: GPUKernel, -} - -struct GPUKernel { - max_num_blocks: usize, - max_num_threads: usize, - threads_per_warp: usize, + kernel_params: GPUKernelParams, + kernel_attrs: &'a mut GPUKernelAttrs, } #[derive(Default, Debug)] -struct CudaBlock { - label: String, +struct CudaGoto { + header: String, body: String, } @@ -93,67 +104,79 @@ impl<'a, W: Write> GPUContext<'a, W> { ", )?; - let mut function_signature = String::new(); - write!(&mut function_signature, "template <")?; + let mut kernel_begin = String::new(); + write!(&mut kernel_begin, "template <")?; // The dynamic constants become template parameters. let mut first_template_param = true; for idx in 0..self.function.num_dynamic_constants { if first_param { first_param = false; } else { - write!(&mut function_signature, ", ")?; + write!(&mut kernel_begin, ", ")?; } - write!(&mut function_signature, "long long int dc_p{}", idx)?; + write!(&mut kernel_begin, "long long int dc_p{}", idx)?; } - write!(&mut function_signature, ">\n")?; + write!(&mut kernel_begin, ">\n")?; - write!(&mut function_signature, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_blocks, self.function.name)?; + write!(&mut kernel_begin, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_threads, self.function.name)?; // The second set of parameters are normal arguments. let mut first_param = true; for (idx, ty) in self.function.param_types.iter().enumerate() { if first_param { first_param = false; } else { - write!(&mut function_signature, ", ")?; + write!(&mut kernel_begin, ", ")?; } - write!(&mut function_signature, "{} p{}", self.get_type(*ty)?, idx)?; + write!(&mut kernel_begin, "{} p{}", self.get_type(*ty)?, idx)?; } - write!(&mut function_signature, ") {\n")?; + write!(&mut kernel_begin, ") {\n")?; - // do actual stuff - // step 1. determine number of outermost fork joins at block level. we greedily accept while: a) total number of blocks < max_num_blocks, b) each fork join is strictly nested meaning no other neighbor fork joins, and c) each fork join's + // Uses CUDA's goto structure; we will not gen for all control nodes, eg block and thread fork joins. + let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_control()) + .map(|idx| (NodeID::new(idx), CudaGoto::default())) + .collect(); - // finish kernel - write!(&mut function_signature, "}\n")?; - } + // step 1. Assign outermost fork joins to block level + let (block_fork_ids, fork_sizes) = self.codegen_block_creation(&mut kernel_begin)?; + // step 2. Sink logic from outer block fork joins. If it's a write, add necessary block-id based qualifier. For now, it's done naively at the top of the kernel. 
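One plausible reading of the block-id qualifier mentioned in step 2, sketched in CUDA: logic sunk from outside the block-level fork joins is recomputed redundantly by every block, so a store to a shared collection is wrapped in a guard so that only one block per stride group commits it. The kernel name, the guard condition, and the indexing below are assumptions made for illustration, not the code this pass emits.

    // Assumed shape of a sunk write with a block-id qualifier.
    __global__ void sunk_logic_example(int* out, int n, int block_stride) {
        const int block_x = blockIdx.x;
        int v = n * 2;                         // sunk computation, done by every block
        if (block_x % block_stride == 0) {     // assumed form of the block-id guard
            out[block_x / block_stride] = v;   // only one block per stride group writes
        }
    }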
+ let mut block_stride = self.kernel_attrs.num_blocks; + gotos[NodeID::new(0)].header = "start_sink".to_string(); + self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut gotos)?; + for (i, fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { + block_stride = block_stride.saturating_div(fork_sizes[i]); + gotos[fork_id].header = format!("block_sink_{}", fork_id.idx()); + self.codegen_block_sink(fork_id, block_fork_ids[i + 1], block_stride, &mut gotos)?; + } - // matmul detection- only called if einsum detected - fn matmul_detection(&self, w: &mut W) -> Result<(), Error> {} - // convolution detection- only called if einsum detected - fn convolution_detection(&self, w: &mut W) -> Result<(), Error> {} + // step 3. determine fork joins at warp/thread level. a) rather than strict nest, neighbors are allowed and we get sequence of edge (aka innermost at thread level) fork joins rather than single. b) if innermost is = threads_per_warp, we can use warp-level features. c) for now punt on implementation but can likely run einsum -> matmul/conv detector on hierarhical fork joins between block edge and given thread edge. + + // finish kernel + write!(&mut kernel_begin, "}\n")?; + } fn get_type(&self, id: TypeID) -> Result<String, Error> { match self.types[id.idx()] { Type::Product(ref product_ty_ids) => { - if !self.structs.contains(&id.idx()) { + if !self.cuda_structs.contains(&id.idx()) { write!(self.w, "\nstruct Product_{} {{\n", id.idx())?; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { write!(self.w, "\t{} field_{};\n", self.get_type(*product_ty_id)?, i)?; } write!(self.w, "}};\n")?; - self.structs.insert(id.idx()); + self.cuda_structs.insert(id.idx()); } Ok(format!("Product_{}", id.idx())) } Type::Summation(ref summation_ty_ids) => { - if !self.structs.contains(&id.idx()) { + if !self.cuda_structs.contains(&id.idx()) { write!(self.w, "\nstruct Summation_{} {{\n\t union {{\n", id.idx())?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { write!(self.w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id)?, i)?; } write!(self.w, "\t}};\n\tuint8_t tag;\n}};\n")?; - self.structs.insert(id.idx()); + self.cuda_structs.insert(id.idx()); } Ok(format!("Summation_{}", id.idx())) } @@ -178,4 +201,132 @@ impl<'a, W: Write> GPUContext<'a, W> { _ => panic!("Unsupported type"), } } + + // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. 
+ fn codegen_block_creation(&self, kernel_body: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { + // a) + let mut fork_forward_adjacency: HashMap::<NodeID, Vec<NodeID>> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_fork()) + .map(|idx| (NodeID::new(idx), vec![])) + .collect(); + let mut unhandled_fork_nodes = VecDeque::new(); + + for (fork_node, children) in fork_forward_adjacency.iter_mut() { + unhandled_fork_nodes.push_back(*fork_node); + while !unhandled_fork_nodes.is_empty() { + let fork_node = unhandled_fork_nodes.pop_front().unwrap(); + let fork_node_children = self.function.nodes[fork_node.idx()].children(); + for child in fork_node_children { + if self.function.nodes[child.idx()].is_fork() { + children.push(child); + } else if !self.function.nodes[child.idx()].is_join() { + unhandled_fork_nodes.push_back(child); + } + } + } + } + + let mut root_forks: HashSet<NodeID> = fork_forward_adjacency.keys().copied().collect(); + for (fork_node, children) in fork_forward_adjacency.iter() { + for child in children { + root_forks.remove(child); + } + } + let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); + if root_forks.len() != 1 { + return vec![]; + } + + let mut strict_forks = vec![root_forks[0]]; + let mut curr_fork = root_forks[0]; + while fork_join_map.get(&curr_fork).is_some() { + let children = &fork_forward_adjacency[&curr_fork]; + if children.len() != 1 { + break; + } + curr_fork = children[0]; + strict_forks.push(curr_fork); + } + + // b, (stronger version of) c, and d + let mut valid_forks = 0; + let mut cumulative_blocks = 1usize; + let mut fork_sizes = Vec::new(); + + for fork in strict_forks.iter() { + if !self.function.schedules[fork.idx()].contains(&Schedule::Vectorizable) { + break; + } + let factors = match &self.function.nodes[fork.idx()] { + Node::Fork { factors, .. } => factors, + _ => return Err(Error::new("Expected Fork node in strict_forks")) + }; + let fork_size = factors.iter() + .try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, &self.dynamic_constants) + .ok_or_else(|| Error::new("Fork factors must be evaluatable to constants")) + .map(|val| acc.saturating_mul(val)) + })?; + let new_blocks = cumulative_blocks.saturating_mul(fork_size); + if new_blocks > self.kernel_params.max_num_blocks { + break; + } + cumulative_blocks = new_blocks; + fork_sizes.push(fork_size); + valid_forks += 1; + } + + self.kernel_attrs.num_blocks = cumulative_blocks; + let valid_forks = strict_forks.into_iter() + .take(valid_forks) + .collect::<Vec<_>>(); + + // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected by this. For now, keep it simple. + if valid_forks.len() != 0 { + write!(&mut kernel_body, "\tconst int block_x = blockIdx.x;\n")?; + } + Ok((valid_forks, fork_sizes)) + } + + fn codegen_control_node(&self, id: NodeID, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} + + fn codegen_data_node(&self, id: NodeID, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} + + fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} + + fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { + // 1. 
Get control nodes including fork_id that are dominated by fork_id + // and that dominate next_fork_id + let dom = dominator(self.control_subgraph, fork_id); + assert!(dom.does_dom(fork_id, next_fork_id)); + let mut control_nodes_between = vec![fork_id]; + for node_id in self.control_subgraph.iter() { + if dom.does_dom(fork_id, node_id) && dom.does_dom(node_id, next_fork_id) { + control_nodes_between.push(node_id); + } + } + // 2. Call regular data codegen for blocks corresponding to + // control nodes, with extra if surrounding index-dependent write + // (TODO: consider shared memory optimization) + for node_id in control_nodes_between.iter() { + self.codegen_data_node(node_id, &mut gotos[node_id].body)?; + } + // 3. call regular control codegen using goto structure + } + + fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> usize { + factors.iter() + .map(|&factor_id| { + evaluate_dynamic_constant(factor_id, &self.dynamic_constants) + .unwrap_or_else(|| panic!("Fork factors must be evaluatable to constants")) + }) + .product() + } + + // matmul detection- only called if einsum detected + fn matmul_detection(&self, w: &mut W) -> Result<(), Error> {} + + // convolution detection- only called if einsum detected + fn convolution_detection(&self, w: &mut W) -> Result<(), Error> {} + } \ No newline at end of file -- GitLab From 9dcf93fe4b03dc945886c8bc986c8adcbe71c982 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 27 Dec 2024 14:31:46 -0800 Subject: [PATCH 026/109] post goto & lint --- hercules_cg/src/gpu.rs | 417 ++++++++++++++++++++++++++--------------- hercules_cg/src/lib.rs | 2 + 2 files changed, 272 insertions(+), 147 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index e177f420..3e86db22 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -3,16 +3,15 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; -use std::iter::{zip, FromIterator}; -use std::sync::atomic::{AtomicUsize, Ordering}; -use self::bitvec::prelude::*; +// use std::iter::{zip, FromIterator}; +// use std::sync::atomic::{AtomicUsize, Ordering}; -use self::hercules_ir::*; +// use self::bitvec::prelude::*; -use crate::*; +use self::hercules_ir::*; -static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0); +// use crate::*; /* * The top level function to compile a Hercules IR function into NVVM IR kernel for @@ -26,19 +25,48 @@ pub fn gpu_codegen<W: Write>( reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, - data_nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>, bbs: &Vec<NodeID>, w: &mut W, ) -> Result<(), Error> { - // temporary hardcoded values + // Temporary hardcoded values let kernel_params = GPUKernelParams { max_num_blocks: 1024, max_num_threads: 1024, threads_per_warp: 32, - num_smps: 60, // ¯\_(ツ)_/¯ + num_smps: 60, }; let mut kernel_attrs = GPUKernelAttrs::default(); - let ctx = GPUContext { + + // Create fork forward adjacency and join map upfront as part of context + let make_fork_structures = || -> (HashMap::<NodeID, Vec<NodeID>>, HashMap::<NodeID, NodeID>) { + let mut fork_forward_adjacency: HashMap::<NodeID, Vec<NodeID>> = (0..function.nodes.len()) + .filter(|idx| function.nodes[*idx].is_fork()) + .map(|idx| (NodeID::new(idx), vec![])) + .collect(); + let mut fork_join_map = HashMap::new(); + let mut queued_nodes = VecDeque::new(); + + for (fork_node, children) in fork_forward_adjacency.iter_mut() { + 
queued_nodes.push_back(*fork_node); + while !queued_nodes.is_empty() { + let node = queued_nodes.pop_front().unwrap(); + for child in control_subgraph.succs(node) { + if function.nodes[child.idx()].is_fork() { + children.push(child); + } else if function.nodes[child.idx()].is_join() { + fork_join_map.insert(*fork_node, child); + } else { + queued_nodes.push_back(child); + } + } + } + } + (fork_forward_adjacency, fork_join_map) + }; + + let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); + + let mut ctx = GPUContext { function, types, constants, @@ -48,11 +76,12 @@ pub fn gpu_codegen<W: Write>( control_subgraph, bbs, cuda_structs: HashSet::new(), - w, kernel_params, - &mut kernel_attrs, + kernel_attrs: &mut kernel_attrs, + fork_forward_adjacency, + fork_join_map, }; - ctx.codegen_function() + ctx.codegen_function(w) } struct GPUKernelParams { @@ -68,7 +97,7 @@ struct GPUKernelAttrs { num_threads: usize, } -struct GPUContext<'a, W: Write> { +struct GPUContext<'a> { function: &'a Function, types: &'a Vec<Type>, constants: &'a Vec<Constant>, @@ -78,22 +107,25 @@ struct GPUContext<'a, W: Write> { control_subgraph: &'a Subgraph, bbs: &'a Vec<NodeID>, cuda_structs: HashSet<usize>, - w: &'a mut W, kernel_params: GPUKernelParams, kernel_attrs: &'a mut GPUKernelAttrs, + fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, + fork_join_map: HashMap<NodeID, NodeID>, } #[derive(Default, Debug)] struct CudaGoto { header: String, body: String, + term: String, + handled: bool, } -impl<'a, W: Write> GPUContext<'a, W> { - fn codegen_function(&self) -> Result<(), Error> { - // Static content and function signature +impl<'a> GPUContext<'a> { + fn codegen_function<W: Write>(&mut self, w: &mut W) -> Result<(), Error> { + // Include all possible imports write!( - self.w, + w, " #include <assert.h> #include <stdio.h> @@ -104,143 +136,137 @@ impl<'a, W: Write> GPUContext<'a, W> { ", )?; - let mut kernel_begin = String::new(); - write!(&mut kernel_begin, "template <")?; - // The dynamic constants become template parameters. - let mut first_template_param = true; - for idx in 0..self.function.num_dynamic_constants { - if first_param { - first_param = false; - } else { - write!(&mut kernel_begin, ", ")?; - } - write!(&mut kernel_begin, "long long int dc_p{}", idx)?; - } - write!(&mut kernel_begin, ">\n")?; + let mut top = String::new(); - write!(&mut kernel_begin, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_threads, self.function.name)?; - // The second set of parameters are normal arguments. - let mut first_param = true; - for (idx, ty) in self.function.param_types.iter().enumerate() { - if first_param { - first_param = false; - } else { - write!(&mut kernel_begin, ", ")?; - } - write!(&mut kernel_begin, "{} p{}", self.get_type(*ty)?, idx)?; - } - write!(&mut kernel_begin, ") {\n")?; + // Create all possible structs + self.codegen_structs(&mut top)?; + // Kernel template, signature, and arguments + self.codegen_kernel_begin(&mut top)?; - // Uses CUDA's goto structure; we will not gen for all control nodes, eg block and thread fork joins. + // Uses CUDA's goto structure; some control nodes' gen may be moved, eg + // block and thread fork joins. 
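To make the goto structure concrete, here is a hedged sketch of the flavor of CUDA that the per-node label, body, and terminator strings are ultimately stitched into. The function name, label names, and loop shape are invented; the point is only that each control node becomes a label whose terminator jumps to its successor, so arbitrary control flow needs no structured nesting in the emitted source.

    // Invented example: each control node is a label; its body runs the data
    // nodes placed in that basic block, and its terminator jumps onward.
    __global__ void goto_structure_example(int* out, int n) {
        int i = 0;
        int acc = 0;
    node_start:
        goto node_loop_header;
    node_loop_header:
        if (i >= n) goto node_exit;
        goto node_loop_body;
    node_loop_body:
        acc += i;
        i += 1;
        goto node_loop_header;
    node_exit:
        if (threadIdx.x == 0 && blockIdx.x == 0) out[0] = acc;
    }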
let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) - .map(|idx| (NodeID::new(idx), CudaGoto::default())) + .map(|idx| { + let node_id = NodeID::new(idx); + let mut goto = CudaGoto::default(); + goto.header = format!("{}{}", self.function.nodes[idx].upper_case_name(), idx); + (node_id, goto) + }) .collect(); - // step 1. Assign outermost fork joins to block level - let (block_fork_ids, fork_sizes) = self.codegen_block_creation(&mut kernel_begin)?; - // step 2. Sink logic from outer block fork joins. If it's a write, add necessary block-id based qualifier. For now, it's done naively at the top of the kernel. + // Generate phi registers at top, later can consider smarter scoping + self.codegen_phi_registers(&mut top)?; + + // Assign outermost fork joins to block level + let (block_fork_ids, block_fork_sizes) = self.codegen_block_creation(&mut top)?; + // Sink logic from outer block fork joins. If it's a write, add + // necessary block-id based condition. For now, it's done naively at the + // top of the kernel. let mut block_stride = self.kernel_attrs.num_blocks; - gotos[NodeID::new(0)].header = "start_sink".to_string(); - self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut gotos)?; - for (i, fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { - block_stride = block_stride.saturating_div(fork_sizes[i]); - gotos[fork_id].header = format!("block_sink_{}", fork_id.idx()); - self.codegen_block_sink(fork_id, block_fork_ids[i + 1], block_stride, &mut gotos)?; + gotos.get_mut(&NodeID::new(0)).unwrap().header = "start_sink".to_string(); + self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut top, &mut gotos)?; + for (i, &fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { + block_stride = block_stride.saturating_div(block_fork_sizes[i]); + self.codegen_block_sink(fork_id, block_fork_ids[i + 1], block_stride, &mut top, &mut gotos)?; } - - // step 3. determine fork joins at warp/thread level. a) rather than strict nest, neighbors are allowed and we get sequence of edge (aka innermost at thread level) fork joins rather than single. b) if innermost is = threads_per_warp, we can use warp-level features. c) for now punt on implementation but can likely run einsum -> matmul/conv detector on hierarhical fork joins between block edge and given thread edge. + // Assign inner fork joins to thread level, with labels for warp + let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1], &mut top)?; + // Punting on implementation but can likely run einsum -> matmul/conv + // detector on hierarhical fork joins between block edge and given + // thread edge. 
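The warp-level features alluded to for innermost fork joins whose factor equals threads_per_warp would presumably resemble a shuffle-based reduction such as the standard idiom below. This is not something the backend emits yet (the comment above explicitly punts on it), and warp_reduce_sum is an invented name used only to show what a warp-sized reduce could avoid: no shared memory traffic and no block-wide synchronization.

    // Standard CUDA warp shuffle reduction, shown as a possible target shape
    // for a 32-wide innermost fork join's reduce; lane 0 holds the warp's sum.
    __device__ float warp_reduce_sum(float val) {
        for (int offset = 16; offset > 0; offset >>= 1) {
            val += __shfl_down_sync(0xffffffff, val, offset);
        }
        return val;
    }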
// finish kernel - write!(&mut kernel_begin, "}\n")?; + write!(w, "{}", top)?; + write!(w, "}}\n")?; + + Ok(()) } - fn get_type(&self, id: TypeID) -> Result<String, Error> { - match self.types[id.idx()] { - Type::Product(ref product_ty_ids) => { - if !self.cuda_structs.contains(&id.idx()) { - write!(self.w, "\nstruct Product_{} {{\n", id.idx())?; + fn codegen_structs(&self, w: &mut String) -> Result<(), Error> { + for (id, ty) in self.types.iter().enumerate() { + match ty { + Type::Product(ref product_ty_ids) => { + write!(w, "\nstruct Product_{} {{\n", id)?; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { - write!(self.w, "\t{} field_{};\n", self.get_type(*product_ty_id)?, i)?; + write!(w, "\t{} field_{};\n", self.get_type(*product_ty_id), i)?; } - write!(self.w, "}};\n")?; - self.cuda_structs.insert(id.idx()); + write!(w, "}};\n")?; } - Ok(format!("Product_{}", id.idx())) - } - Type::Summation(ref summation_ty_ids) => { - if !self.cuda_structs.contains(&id.idx()) { - write!(self.w, "\nstruct Summation_{} {{\n\t union {{\n", id.idx())?; + Type::Summation(ref summation_ty_ids) => { + write!(w, "\nstruct Summation_{} {{\n\t union {{\n", id)?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { - write!(self.w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id)?, i)?; + write!(w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id), i)?; } - write!(self.w, "\t}};\n\tuint8_t tag;\n}};\n")?; - self.cuda_structs.insert(id.idx()); + write!(w, "\t}};\n\tuint8_t tag;\n}};\n")?; } - Ok(format!("Summation_{}", id.idx())) + _ => {} } - _ => Ok(convert_type(&self.types[id.idx()])), - } + } + Ok(()) } - // TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 - fn convert_type(ty: &Type) -> String { - match ty { - Type::Boolean => "bool".to_string(), - Type::Integer8 => "int8_t".to_string(), - Type::UnsignedInteger8 => "uint8_t".to_string(), - Type::Integer16 => "short".to_string(), - Type::UnsignedInteger16 => "unsigned short".to_string(), - Type::Integer32 => "int".to_string(), - Type::UnsignedInteger32 => "unsigned int".to_string(), - Type::Integer64 => "long long".to_string(), - Type::UnsignedInteger64 => "unsigned long long".to_string(), - Type::Float32 => "float".to_string(), - Type::Float64 => "double".to_string(), - _ => panic!("Unsupported type"), + + fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { + write!(w, "template <")?; + // The dynamic constants become template parameters. + let mut first_template_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_template_param { + first_template_param = false; + } else { + write!(w, ", ")?; + } + write!(w, "long long int dc_p{}", idx)?; } + write!(w, ">\n")?; + + write!(w, "__global__ void __launch_bounds__({}) {}(", self.kernel_params.max_num_threads, self.function.name)?; + // The second set of parameters are normal arguments. + let mut first_param = true; + for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + } + write!(w, "{} p{}", self.get_type(*ty), idx)?; + } + write!(w, ") {{\n")?; + + Ok(()) } - // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. 
smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. - fn codegen_block_creation(&self, kernel_body: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { - // a) - let mut fork_forward_adjacency: HashMap::<NodeID, Vec<NodeID>> = (0..self.function.nodes.len()) - .filter(|idx| self.function.nodes[*idx].is_fork()) - .map(|idx| (NodeID::new(idx), vec![])) - .collect(); - let mut unhandled_fork_nodes = VecDeque::new(); - for (fork_node, children) in fork_forward_adjacency.iter_mut() { - unhandled_fork_nodes.push_back(*fork_node); - while !unhandled_fork_nodes.is_empty() { - let fork_node = unhandled_fork_nodes.pop_front().unwrap(); - let fork_node_children = self.function.nodes[fork_node.idx()].children(); - for child in fork_node_children { - if self.function.nodes[child.idx()].is_fork() { - children.push(child); - } else if !self.function.nodes[child.idx()].is_join() { - unhandled_fork_nodes.push_back(child); - } - } + fn codegen_phi_registers(&self, kernel_body: &mut String) -> Result<(), Error> { + for id in (0..self.function.nodes.len()).map(NodeID::new) { + if let Node::Phi { control: _, data } = &self.function.nodes[id.idx()] { + let ty = self.get_type(self.typing[data[0].idx()]); + write!(kernel_body, "\t{} {}{}_value;\n", ty, self.function.nodes[id.idx()].upper_case_name(), id.idx())?; } } - - let mut root_forks: HashSet<NodeID> = fork_forward_adjacency.keys().copied().collect(); - for (fork_node, children) in fork_forward_adjacency.iter() { + Ok(()) + } + + // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. + fn codegen_block_creation(&mut self, kernel_body: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { + // a) + let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); + for (_, children) in self.fork_forward_adjacency.iter() { for child in children { root_forks.remove(child); } } let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); if root_forks.len() != 1 { - return vec![]; + return Err(Error); } let mut strict_forks = vec![root_forks[0]]; let mut curr_fork = root_forks[0]; - while fork_join_map.get(&curr_fork).is_some() { - let children = &fork_forward_adjacency[&curr_fork]; + while self.fork_join_map.get(&curr_fork).is_some() { + let children = &self.fork_forward_adjacency[&curr_fork]; if children.len() != 1 { break; } @@ -249,9 +275,9 @@ impl<'a, W: Write> GPUContext<'a, W> { } // b, (stronger version of) c, and d - let mut valid_forks = 0; + let mut valid_block_forks = 0; let mut cumulative_blocks = 1usize; - let mut fork_sizes = Vec::new(); + let mut block_fork_sizes = Vec::new(); for fork in strict_forks.iter() { if !self.function.schedules[fork.idx()].contains(&Schedule::Vectorizable) { @@ -259,12 +285,12 @@ impl<'a, W: Write> GPUContext<'a, W> { } let factors = match &self.function.nodes[fork.idx()] { Node::Fork { factors, .. 
} => factors, - _ => return Err(Error::new("Expected Fork node in strict_forks")) + _ => return Err(Error) }; let fork_size = factors.iter() .try_fold(1usize, |acc, &factor_id| { evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .ok_or_else(|| Error::new("Fork factors must be evaluatable to constants")) + .ok_or_else(|| Error) .map(|val| acc.saturating_mul(val)) })?; let new_blocks = cumulative_blocks.saturating_mul(fork_size); @@ -272,46 +298,121 @@ impl<'a, W: Write> GPUContext<'a, W> { break; } cumulative_blocks = new_blocks; - fork_sizes.push(fork_size); - valid_forks += 1; + block_fork_sizes.push(fork_size); + valid_block_forks += 1; } self.kernel_attrs.num_blocks = cumulative_blocks; - let valid_forks = strict_forks.into_iter() - .take(valid_forks) + let valid_block_forks = strict_forks.into_iter() + .take(valid_block_forks) .collect::<Vec<_>>(); - // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected by this. For now, keep it simple. - if valid_forks.len() != 0 { - write!(&mut kernel_body, "\tconst int block_x = blockIdx.x;\n")?; - } - Ok((valid_forks, fork_sizes)) + // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. + write!(kernel_body, "\tconst int block_x = blockIdx.x;\n")?; + Ok((valid_block_forks, block_fork_sizes)) } - fn codegen_control_node(&self, id: NodeID, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} - - fn codegen_data_node(&self, id: NodeID, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} - - fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, kernel_body: &mut String, num_indents: usize) -> Result<(), Error> {} - - fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { + fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, kernel_body: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { // 1. Get control nodes including fork_id that are dominated by fork_id - // and that dominate next_fork_id + // and not dominated by next_fork_id and not dominated by fork_id's join let dom = dominator(self.control_subgraph, fork_id); assert!(dom.does_dom(fork_id, next_fork_id)); - let mut control_nodes_between = vec![fork_id]; + let mut control_nodes_between = vec![]; for node_id in self.control_subgraph.iter() { - if dom.does_dom(fork_id, node_id) && dom.does_dom(node_id, next_fork_id) { - control_nodes_between.push(node_id); + if dom.does_dom(fork_id, *node_id) && !dom.does_dom(next_fork_id, *node_id) && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) { + control_nodes_between.push(*node_id); } } // 2. Call regular data codegen for blocks corresponding to // control nodes, with extra if surrounding index-dependent write // (TODO: consider shared memory optimization) - for node_id in control_nodes_between.iter() { - self.codegen_data_node(node_id, &mut gotos[node_id].body)?; + for &node_id in control_nodes_between.iter() { + self.codegen_data_node(node_id, &mut gotos.get_mut(&node_id).unwrap().body)?; } // 3. 
call regular control codegen using goto structure + + Ok(()) + } + + // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. + fn codegen_thread_creation(&mut self, inner_block_fork: NodeID, kernel_body: &mut String) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { + let fork_forward_adjacency = self.fork_forward_adjacency.as_ref().unwrap(); + + let mut thread_fork_parents = HashMap::new(); + let mut thread_fork_sizes = HashMap::new(); + let mut thread_fork_cumulative_sizes = HashMap::new(); + thread_fork_cumulative_sizes.insert(inner_block_fork, 1); + let mut thread_fork_edges = vec![]; + let mut max_thread_size = 1; + let mut stack = vec![inner_block_fork]; + let mut visited = HashSet::new(); + visited.insert(inner_block_fork); + while let Some(pop) = stack.pop() { + let children = &fork_forward_adjacency[&pop]; + + // Reverse child order due to use of stack for DFS + for &child in children.iter().rev() { + if !visited.contains(&child) { + visited.insert(child); + thread_fork_parents.insert(child, pop); + let fork_size = match &self.function.nodes[child.idx()] { + Node::Fork { factors, .. } => factors.iter() + .try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, &self.dynamic_constants) + .ok_or_else(|| Error) + .map(|val| acc.saturating_mul(val)) + })?, + _ => return Err(Error) + }; + thread_fork_sizes.insert(child, fork_size); + + let new_cumulative_size = (thread_fork_cumulative_sizes[&pop] as usize).saturating_mul(fork_size as usize); + if new_cumulative_size > self.kernel_params.max_num_threads { + // Expanding to child fork exceeds thread limit, so + // current fork is an edge fork + thread_fork_edges.push(pop); + max_thread_size = max_thread_size.max(thread_fork_cumulative_sizes[&pop]); + } else { + // Recurse into child fork + thread_fork_cumulative_sizes.insert(child, new_cumulative_size); + stack.push(child); + } + } else { + panic!("Fork child shouldn't have multiple fork parents"); + } + } + } + + // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. + self.kernel_attrs.num_threads = max_thread_size; + // If limit on number of threads in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. 
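A condensed sketch of the edge-fork selection just described, using plain usize ids in place of NodeID and ignoring the warp-level special case:

    use std::collections::HashMap;

    // Depth-first walk over the fork tree: multiply fork sizes going down; when
    // descending into a child would exceed the thread budget, the current fork
    // becomes an "edge" fork and that child subtree is not expanded.
    fn select_edge_forks(
        children: &HashMap<usize, Vec<usize>>,
        fork_size: &HashMap<usize, usize>,
        root: usize,
        max_num_threads: usize,
    ) -> (Vec<usize>, usize) {
        let mut edge_forks = Vec::new();
        let mut max_cumulative_size = 1usize;
        let mut stack = vec![(root, 1usize)];
        while let Some((fork, cumulative)) = stack.pop() {
            for &child in children.get(&fork).into_iter().flatten() {
                let new_cumulative = cumulative.saturating_mul(fork_size[&child]);
                if new_cumulative > max_num_threads {
                    edge_forks.push(fork);
                    max_cumulative_size = max_cumulative_size.max(cumulative);
                } else {
                    stack.push((child, new_cumulative));
                }
            }
        }
        (edge_forks, max_cumulative_size)
    }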
+ write!(kernel_body, "\tconst int thread_x = threadIdx.x;\n")?; + + Ok((thread_fork_parents, thread_fork_sizes, thread_fork_edges)) + } + + fn codegen_control_node(&self, id: NodeID, kernel_body: &mut String) -> Result<(), Error> { + Ok(()) + } + + fn codegen_data_node(&self, id: NodeID, kernel_body: &mut String) -> Result<(), Error> { + Ok(()) + } + + fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, kernel_body: &mut String) -> Result<(), Error> { + Ok(()) + } + + fn get_type(&self, id: TypeID) -> String { + match self.types[id.idx()] { + Type::Product(ref product_ty_ids) => { + format!("Product_{} *", id.idx()) + } + Type::Summation(ref summation_ty_ids) => { + format!("Summation_{} *", id.idx()) + } + _ => convert_type(&self.types[id.idx()]), + } } fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> usize { @@ -324,9 +425,31 @@ impl<'a, W: Write> GPUContext<'a, W> { } // matmul detection- only called if einsum detected - fn matmul_detection(&self, w: &mut W) -> Result<(), Error> {} + fn matmul_detection(&self, w: &mut W) -> Result<(), Error> { + Ok(()) + } // convolution detection- only called if einsum detected - fn convolution_detection(&self, w: &mut W) -> Result<(), Error> {} + fn convolution_detection(&self, w: &mut W) -> Result<(), Error> { + Ok(()) + } + +} -} \ No newline at end of file +// TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 +fn convert_type(ty: &Type) -> String { + match ty { + Type::Boolean => "bool".to_string(), + Type::Integer8 => "int8_t".to_string(), + Type::UnsignedInteger8 => "uint8_t".to_string(), + Type::Integer16 => "short".to_string(), + Type::UnsignedInteger16 => "unsigned short".to_string(), + Type::Integer32 => "int".to_string(), + Type::UnsignedInteger32 => "unsigned int".to_string(), + Type::Integer64 => "long long".to_string(), + Type::UnsignedInteger64 => "unsigned long long".to_string(), + Type::Float32 => "float".to_string(), + Type::Float64 => "double".to_string(), + _ => panic!("Unsupported type"), + } +} diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs index a70effb5..e41f0205 100644 --- a/hercules_cg/src/lib.rs +++ b/hercules_cg/src/lib.rs @@ -1,10 +1,12 @@ #![feature(if_let_guard, let_chains)] pub mod cpu; +pub mod gpu; pub mod device; pub mod rt; pub use crate::cpu::*; +pub use crate::gpu::*; pub use crate::device::*; pub use crate::rt::*; -- GitLab From c7752add3e7fdf51e03dca7e91c02e5300f5abb7 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 27 Dec 2024 17:25:29 -0800 Subject: [PATCH 027/109] consts --- hercules_cg/src/gpu.rs | 232 ++++++++++++++++++++++++++++++----------- 1 file changed, 171 insertions(+), 61 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 3e86db22..7c56fb41 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -4,7 +4,7 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; -// use std::iter::{zip, FromIterator}; +use std::iter::FromIterator; // zip // use std::sync::atomic::{AtomicUsize, Ordering}; // use self::bitvec::prelude::*; @@ -63,9 +63,21 @@ pub fn gpu_codegen<W: Write>( } (fork_forward_adjacency, fork_join_map) }; - let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); + let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { + let mut label_data_for_phi = HashMap::new(); + for (idx, node) in function.nodes.iter().enumerate() { + if let 
Node::Phi { control: _, data } = node { + for &data_id in data.iter() { + label_data_for_phi.entry(data_id).or_insert(vec![]).push(NodeID::new(idx)); + } + } + } + label_data_for_phi + }; + let label_data_for_phi = label_data_for_phi(); + let mut ctx = GPUContext { function, types, @@ -80,6 +92,7 @@ pub fn gpu_codegen<W: Write>( kernel_attrs: &mut kernel_attrs, fork_forward_adjacency, fork_join_map, + label_data_for_phi, }; ctx.codegen_function(w) } @@ -111,6 +124,7 @@ struct GPUContext<'a> { kernel_attrs: &'a mut GPUKernelAttrs, fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, fork_join_map: HashMap<NodeID, NodeID>, + label_data_for_phi: HashMap<NodeID, Vec<NodeID>>, } #[derive(Default, Debug)] @@ -150,7 +164,7 @@ impl<'a> GPUContext<'a> { .map(|idx| { let node_id = NodeID::new(idx); let mut goto = CudaGoto::default(); - goto.header = format!("{}{}", self.function.nodes[idx].upper_case_name(), idx); + goto.header = self.get_value(node_id, false); (node_id, goto) }) .collect(); @@ -161,10 +175,8 @@ impl<'a> GPUContext<'a> { // Assign outermost fork joins to block level let (block_fork_ids, block_fork_sizes) = self.codegen_block_creation(&mut top)?; // Sink logic from outer block fork joins. If it's a write, add - // necessary block-id based condition. For now, it's done naively at the - // top of the kernel. + // necessary block-id based condition. let mut block_stride = self.kernel_attrs.num_blocks; - gotos.get_mut(&NodeID::new(0)).unwrap().header = "start_sink".to_string(); self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut top, &mut gotos)?; for (i, &fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { block_stride = block_stride.saturating_div(block_fork_sizes[i]); @@ -239,18 +251,17 @@ impl<'a> GPUContext<'a> { } - fn codegen_phi_registers(&self, kernel_body: &mut String) -> Result<(), Error> { + fn codegen_phi_registers(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if let Node::Phi { control: _, data } = &self.function.nodes[id.idx()] { - let ty = self.get_type(self.typing[data[0].idx()]); - write!(kernel_body, "\t{} {}{}_value;\n", ty, self.function.nodes[id.idx()].upper_case_name(), id.idx())?; + if let Node::Phi { control: _, data: _ } = &self.function.nodes[id.idx()] { + write!(w, "\t{};\n", self.get_value(id, true))?; } } Ok(()) } // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. 
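A minimal sketch of the block-budget check from that comment (criterion b in the list), with the nesting, independence, and reduction checks elided; names are illustrative only:

    // Accept strictly nested forks outermost-first until the product of their
    // sizes would exceed max_num_blocks; the product becomes the grid size.
    fn accept_block_forks(strict_fork_sizes: &[usize], max_num_blocks: usize) -> (usize, usize) {
        let mut accepted = 0;
        let mut cumulative_blocks = 1usize;
        for &fork_size in strict_fork_sizes {
            let new_blocks = cumulative_blocks.saturating_mul(fork_size);
            if new_blocks > max_num_blocks {
                break;
            }
            cumulative_blocks = new_blocks;
            accepted += 1;
        }
        // (how many forks map onto blockIdx.x, how many blocks get launched)
        (accepted, cumulative_blocks)
    }

For fork sizes [8, 16, 32] and a 1024-block limit this accepts the first two forks and launches 128 blocks.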
- fn codegen_block_creation(&mut self, kernel_body: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { + fn codegen_block_creation(&mut self, w: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { // a) let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); for (_, children) in self.fork_forward_adjacency.iter() { @@ -260,7 +271,7 @@ impl<'a> GPUContext<'a> { } let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); if root_forks.len() != 1 { - return Err(Error); + panic!("Exactly one root fork is required for outermost GPU block fork"); } let mut strict_forks = vec![root_forks[0]]; @@ -285,14 +296,9 @@ impl<'a> GPUContext<'a> { } let factors = match &self.function.nodes[fork.idx()] { Node::Fork { factors, .. } => factors, - _ => return Err(Error) + _ => panic!("Expected fork node") }; - let fork_size = factors.iter() - .try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .ok_or_else(|| Error) - .map(|val| acc.saturating_mul(val)) - })?; + let fork_size = self.multiply_fork_factors(factors)?; let new_blocks = cumulative_blocks.saturating_mul(fork_size); if new_blocks > self.kernel_params.max_num_blocks { break; @@ -308,36 +314,61 @@ impl<'a> GPUContext<'a> { .collect::<Vec<_>>(); // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. - write!(kernel_body, "\tconst int block_x = blockIdx.x;\n")?; + write!(w, "\tconst int block_x = blockIdx.x;\n")?; Ok((valid_block_forks, block_fork_sizes)) } - fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, kernel_body: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { + fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, w: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { // 1. Get control nodes including fork_id that are dominated by fork_id // and not dominated by next_fork_id and not dominated by fork_id's join let dom = dominator(self.control_subgraph, fork_id); assert!(dom.does_dom(fork_id, next_fork_id)); - let mut control_nodes_between = vec![]; + let mut control_nodes_between = HashSet::new(); for node_id in self.control_subgraph.iter() { if dom.does_dom(fork_id, *node_id) && !dom.does_dom(next_fork_id, *node_id) && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) { - control_nodes_between.push(*node_id); + control_nodes_between.insert(*node_id); + } + } + + // 2. Emit data flow for nodes assigned to those basic blocks. Phi + // registers were already emitted at top. + // TEMPORARY: ignoring the special write case for now + let mut worklist = VecDeque::from_iter( + self.reverse_postorder + .into_iter() + .filter(|id| !self.function.nodes[id.idx()].is_control() + && control_nodes_between.contains(&self.bbs[id.idx()]) + && !self.function.nodes[id.idx()].is_phi() + ), + ); + let mut visited = HashSet::new(); + while let Some(id) = worklist.pop_front() { + let node = &self.function.nodes[id.idx()]; + if node.is_reduce() { + panic!("Reduce nodes should not be in block sink"); + } + if get_uses(node) + .as_ref() + .into_iter() + .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) + { + self.codegen_data_node(*id, w)?; + visited.insert(id); + } else { + worklist.push_back(id); } } - // 2. 
Call regular data codegen for blocks corresponding to - // control nodes, with extra if surrounding index-dependent write - // (TODO: consider shared memory optimization) - for &node_id in control_nodes_between.iter() { - self.codegen_data_node(node_id, &mut gotos.get_mut(&node_id).unwrap().body)?; + + // 3. Emit control flow + for control_node in control_nodes_between { + self.codegen_control_node(control_node, w)?; } - // 3. call regular control codegen using goto structure Ok(()) } // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. - fn codegen_thread_creation(&mut self, inner_block_fork: NodeID, kernel_body: &mut String) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { - let fork_forward_adjacency = self.fork_forward_adjacency.as_ref().unwrap(); - + fn codegen_thread_creation(&mut self, inner_block_fork: NodeID, w: &mut String) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { let mut thread_fork_parents = HashMap::new(); let mut thread_fork_sizes = HashMap::new(); let mut thread_fork_cumulative_sizes = HashMap::new(); @@ -348,7 +379,7 @@ impl<'a> GPUContext<'a> { let mut visited = HashSet::new(); visited.insert(inner_block_fork); while let Some(pop) = stack.pop() { - let children = &fork_forward_adjacency[&pop]; + let children = &self.fork_forward_adjacency[&pop]; // Reverse child order due to use of stack for DFS for &child in children.iter().rev() { @@ -356,13 +387,8 @@ impl<'a> GPUContext<'a> { visited.insert(child); thread_fork_parents.insert(child, pop); let fork_size = match &self.function.nodes[child.idx()] { - Node::Fork { factors, .. } => factors.iter() - .try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .ok_or_else(|| Error) - .map(|val| acc.saturating_mul(val)) - })?, - _ => return Err(Error) + Node::Fork { factors, .. } => self.multiply_fork_factors(factors)?, + _ => panic!("Expected fork node") }; thread_fork_sizes.insert(child, fork_size); @@ -386,52 +412,136 @@ impl<'a> GPUContext<'a> { // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. self.kernel_attrs.num_threads = max_thread_size; // If limit on number of threads in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. 
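The "extra id-based condition" mentioned for edges smaller than the maximum is not implemented in this commit; one hypothetical shape for such a guard, stated purely as an assumption, is:

    // Hypothetical: an edge fork whose cumulative size is below the launched
    // thread count gets its body wrapped so the excess threads skip it.
    fn edge_thread_guard(edge_cumulative_size: usize, num_threads: usize) -> Option<String> {
        if edge_cumulative_size < num_threads {
            Some(format!("\tif (thread_x < {}) {{\n", edge_cumulative_size))
        } else {
            None
        }
    }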
- write!(kernel_body, "\tconst int thread_x = threadIdx.x;\n")?; + write!(w, "\tconst int thread_x = threadIdx.x;\n")?; Ok((thread_fork_parents, thread_fork_sizes, thread_fork_edges)) } - fn codegen_control_node(&self, id: NodeID, kernel_body: &mut String) -> Result<(), Error> { + fn codegen_control_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { + Ok(()) + } + + fn codegen_data_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { + match &self.function.nodes[id.idx()] { + Node::Phi { control: _, data: _ } => {} + Node::Parameter { index } => { + write!(w, "\t{} = p{};\n", self.get_value(id, true), index)?; + } + Node::Constant { id: cons_id } => { + write_constant() + + } + Node::DynamicConstant { id: _ } => {} + Node::Unary { op: _, input: _ } => {} + Node::Binary { op: _, left: _, right: _ } => {} + Node::Ternary { op: _, first: _, second: _, third: _ } => {} + Node::IntrinsicCall { intrinsic: _, args: _ } => {} + Node::Read { collect: _, indices: _ } => {} + Node::Write { collect: _, data: _, indices: _ } => {} + Node::Projection { control: _, selection: _ } => {} + Node::Undef { ty: _ } => {} + _ => {} + } Ok(()) } - fn codegen_data_node(&self, id: NodeID, kernel_body: &mut String) -> Result<(), Error> { + fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, w: &mut String) -> Result<(), Error> { Ok(()) } - fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, kernel_body: &mut String) -> Result<(), Error> { + // matmul detection- only called if einsum detected + fn matmul_detection(&self) -> Result<(), Error> { Ok(()) } - fn get_type(&self, id: TypeID) -> String { - match self.types[id.idx()] { - Type::Product(ref product_ty_ids) => { - format!("Product_{} *", id.idx()) + // convolution detection- only called if einsum detected + fn convolution_detection(&self) -> Result<(), Error> { + Ok(()) + } + + fn write_constant(&self, name: String, type_name: String, cons_id: ConstantID, w: &mut String) -> Result<(), Error> { + write!(w, "\t{} {}", type_name, name)?; + match self.constants[cons_id.idx()] { + Constant::Boolean(val) => write!(w, " = {}\n", val)?, + Constant::Integer8(val) => write!(w, " = {}\n", val)?, + Constant::UnsignedInteger8(val) => write!(w, " = {}\n", val)?, + Constant::Integer16(val) => write!(w, " = {}\n", val)?, + Constant::UnsignedInteger16(val) => write!(w, " = {}\n", val)?, + Constant::Integer32(val) => write!(w, " = {}\n", val)?, + Constant::UnsignedInteger32(val) => write!(w, " = {}ul\n", val)?, + Constant::Integer64(val) => write!(w, " = {}ll\n", val)?, + Constant::UnsignedInteger64(val) => write!(w, " = {}ull\n", val)?, + Constant::Float32(val) => write!(w, " = {}f\n", val)?, + Constant::Float64(val) => write!(w, " = {}\n", val)?, + Constant::Product(_, fields) => { + write!(w, ";\n")?; + for (i, field) in fields.iter().enumerate() { + self.write_constant(format!("{}_field_{}", name, i), self.constant_to_type_name(*field), *field, w)?; + write!(w, "\t{}.field_{} = {}_field_{};\n", name, i, name, i)?; + } } - Type::Summation(ref summation_ty_ids) => { - format!("Summation_{} *", id.idx()) + Constant::Summation(_, variant, field) => { + write!(w, ";\n")?; + self.write_constant(format!("{}_field_{}", name, variant), self.constant_to_type_name(field), field, w)?; + write!(w, "\t{}.tag = {};\n\t{}.field_{} = {}_field_{};\n", name, variant, name, variant, name, variant)?; } - _ => convert_type(&self.types[id.idx()]), + Constant::Array(_) => { + write!(w, ";\n")?; + for (i, element) in 
elements.iter().enumerate() { + self.write_constant(format!("{}_element_{}", name, i), self.constant_to_type_name(*element), *element, w)?; + write!(w, "\t{}[{}] = {}_element_{};\n", name, i, name, i)?; + } + } + } + Ok(()) + } + + fn constant_to_type_name(&self, cons_id: ConstantID) -> String { + match self.constants[cons_id.idx()] { + Constant::Boolean(_) => "bool".to_string(), + Constant::Integer8(_) => "int8_t".to_string(), + Constant::UnsignedInteger8(_) => "uint8_t".to_string(), + Constant::Integer16(_) => "short".to_string(), + Constant::UnsignedInteger16(_) => "unsigned short".to_string(), + Constant::Integer32(_) => "int".to_string(), + Constant::UnsignedInteger32(_) => "unsigned int".to_string(), + Constant::Integer64(_) => "long long".to_string(), + Constant::UnsignedInteger64(_) => "unsigned long long".to_string(), + Constant::Float32(_) => "float".to_string(), + Constant::Float64(_) => "double".to_string(), + Constant::Product(type_id, _) => self.get_type(type_id), + Constant::Summation(type_id, _, _) => self.get_type(type_id), + Constant::Array(type_id) => self.get_type(type_id), } } - fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> usize { + fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Result<usize, Error> { factors.iter() - .map(|&factor_id| { + .try_fold(1usize, |acc, &factor_id| { evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .unwrap_or_else(|| panic!("Fork factors must be evaluatable to constants")) + .ok_or_else(|| Error) + .map(|val| acc.saturating_mul(val)) }) - .product() } - // matmul detection- only called if einsum detected - fn matmul_detection(&self, w: &mut W) -> Result<(), Error> { - Ok(()) + fn get_value(&self, id: NodeID, ty: bool) -> String { + if ty { + format!("{} {}{}", self.get_type(self.typing[id.idx()]), self.function.nodes[id.idx()].lower_case_name(), id.idx()) + } else { + format!("{}{}", self.function.nodes[id.idx()].lower_case_name(), id.idx()) + } } - // convolution detection- only called if einsum detected - fn convolution_detection(&self, w: &mut W) -> Result<(), Error> { - Ok(()) + fn get_type(&self, id: TypeID) -> String { + match self.types[id.idx()] { + Type::Product(_) => { + format!("Product_{}", id.idx()) + } + Type::Summation(_) => { + format!("Summation_{}", id.idx()) + } + _ => convert_type(&self.types[id.idx()]), + } } } -- GitLab From c78dc6de3d5f4a2eb96ed0b7dee6fe2d4a43ee5d Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 30 Dec 2024 02:41:57 -0800 Subject: [PATCH 028/109] backward and forward --- hercules_cg/src/gpu.rs | 977 ++++++++++++++++++++++++++++++++--------- hercules_ir/src/ir.rs | 238 +++++++++- 2 files changed, 1007 insertions(+), 208 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 7c56fb41..aa5908a2 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -3,15 +3,35 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; - -use std::iter::FromIterator; // zip -// use std::sync::atomic::{AtomicUsize, Ordering}; - -// use self::bitvec::prelude::*; +use std::hash::{Hash, Hasher}; +use std::iter::FromIterator; use self::hercules_ir::*; -// use crate::*; +#[derive(Debug, Clone)] +struct HashableIndex<'a>(Vec<&'a str>); +impl<'a> FromIterator<&'a String> for HashableIndex<'a> { + fn from_iter<I: IntoIterator<Item = &'a String>>(iter: I) -> Self { + HashableIndex(iter.into_iter().map(|s| s.as_str()).collect()) + } +} +impl<'a> 
PartialEq for HashableIndex<'a> { + fn eq(&self, other: &Self) -> bool { + self.0 == other.0 + } +} +impl<'a> Eq for HashableIndex<'a> {} +impl<'a> Hash for HashableIndex<'a> { + fn hash<H: Hasher>(&self, state: &mut H) { + self.0.hash(state); + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +enum MemoryType { + Shared, + Register, +} /* * The top level function to compile a Hercules IR function into NVVM IR kernel for @@ -33,13 +53,13 @@ pub fn gpu_codegen<W: Write>( max_num_blocks: 1024, max_num_threads: 1024, threads_per_warp: 32, - num_smps: 60, + num_smps: 60, }; - let mut kernel_attrs = GPUKernelAttrs::default(); + let kernel_attrs = GPUKernelAttrs::default(); // Create fork forward adjacency and join map upfront as part of context - let make_fork_structures = || -> (HashMap::<NodeID, Vec<NodeID>>, HashMap::<NodeID, NodeID>) { - let mut fork_forward_adjacency: HashMap::<NodeID, Vec<NodeID>> = (0..function.nodes.len()) + let make_fork_structures = || -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, NodeID>) { + let mut fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>> = (0..function.nodes.len()) .filter(|idx| function.nodes[*idx].is_fork()) .map(|idx| (NodeID::new(idx), vec![])) .collect(); @@ -65,12 +85,40 @@ pub fn gpu_codegen<W: Write>( }; let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); + let map_join_reduce = || -> HashMap<NodeID, Vec<NodeID>> { + let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) + .filter(|idx| function.nodes[*idx].is_reduce()) + .map(NodeID::new) + .collect(); + let mut map_join_reduce = HashMap::new(); + for (_, join) in fork_join_map.iter() { + let reduce_nodes_for_join = reduce_nodes + .iter() + .filter(|reduce_node| match &function.nodes[reduce_node.idx()] { + Node::Reduce { + control, + init: _, + reduct: _, + } => control.idx() == join.idx(), + _ => false, + }) + .copied() + .collect(); + map_join_reduce.insert(*join, reduce_nodes_for_join); + } + map_join_reduce + }; + let join_reduce_map = map_join_reduce(); + let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { let mut label_data_for_phi = HashMap::new(); for (idx, node) in function.nodes.iter().enumerate() { if let Node::Phi { control: _, data } = node { for &data_id in data.iter() { - label_data_for_phi.entry(data_id).or_insert(vec![]).push(NodeID::new(idx)); + label_data_for_phi + .entry(data_id) + .or_insert(vec![]) + .push(NodeID::new(idx)); } } } @@ -78,6 +126,27 @@ pub fn gpu_codegen<W: Write>( }; let label_data_for_phi = label_data_for_phi(); + // Since global memory traffic is expensive, we use shared memory and + // registers basically as write-back caches, but we write back due to + // end of scope rather than due to synchronization (which is solved by + // shmem). + // param_cache tracks cache for each parameter by accessed index and + // memory type. Note that indexing is hierarchical, so [a, b] contains + // [a, b, c] and will give a hit upon query of the latter. param_cache is + // only added to for copies from global -> shared or global -> register. + // Writes update the cache, but we track specific indices written in + // param_cache_writes to know what to write back (and avoid redundant + // writes). + let param_cache = vec![ + HashMap::<(HashableIndex<'static>, MemoryType), String>::new(); + function.param_types.len() + ]; + let param_cache_writes = + vec![HashSet::<(HashableIndex<'static>, MemoryType)>::new(); function.param_types.len()]; + // Statically unknown shared memory buffers need to use dynamic offsets from. 
+ // the dynamic shared memory buffer + let mut dynamic_shared_offset = "0".to_string(); + let mut ctx = GPUContext { function, types, @@ -87,16 +156,19 @@ pub fn gpu_codegen<W: Write>( typing, control_subgraph, bbs, - cuda_structs: HashSet::new(), kernel_params, - kernel_attrs: &mut kernel_attrs, + kernel_attrs, fork_forward_adjacency, fork_join_map, label_data_for_phi, + join_reduce_map, + param_cache, + param_cache_writes, }; ctx.codegen_function(w) } +// Fixed prior to codegen struct GPUKernelParams { max_num_blocks: usize, max_num_threads: usize, @@ -104,10 +176,12 @@ struct GPUKernelParams { num_smps: usize, } +// Set during codegen #[derive(Default)] struct GPUKernelAttrs { num_blocks: usize, num_threads: usize, + extern_shmem_offset: String, } struct GPUContext<'a> { @@ -119,12 +193,14 @@ struct GPUContext<'a> { typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, bbs: &'a Vec<NodeID>, - cuda_structs: HashSet<usize>, kernel_params: GPUKernelParams, - kernel_attrs: &'a mut GPUKernelAttrs, + kernel_attrs: GPUKernelAttrs, fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, fork_join_map: HashMap<NodeID, NodeID>, label_data_for_phi: HashMap<NodeID, Vec<NodeID>>, + join_reduce_map: HashMap<NodeID, Vec<NodeID>>, + param_cache: Vec<HashMap<(HashableIndex, MemoryType), String>>, + param_cache_writes: Vec<HashSet<(HashableIndex, MemoryType)>>, } #[derive(Default, Debug)] @@ -135,11 +211,11 @@ struct CudaGoto { handled: bool, } -impl<'a> GPUContext<'a> { +impl GPUContext<'_> { fn codegen_function<W: Write>(&mut self, w: &mut W) -> Result<(), Error> { - // Include all possible imports + // Include all possible includes then macros write!( - w, + w, " #include <assert.h> #include <stdio.h> @@ -147,46 +223,63 @@ impl<'a> GPUContext<'a> { #include <cuda_runtime.h> #include <mma.h> #include <helper_cuda.h> + +#define uabs(a) (a) +#define umin(a, b) ((a) < (b) ? (a) : (b)) +#define umax(a, b) ((a) > (b) ? (a) : (b)) +#define powi(a, b) ({{ int res = 1; for(int i = 0; i < b; ++i) res *= a; res; }}) +#define roundi(a) (a) +#define isqrt(a) ((int)sqrtf((float)(a))) + ", )?; let mut top = String::new(); - // Create all possible structs + // Emit all possible structs self.codegen_structs(&mut top)?; - // Kernel template, signature, and arguments + // Emit kernel template, signature, and arguments self.codegen_kernel_begin(&mut top)?; + // Need to emit dynamic offsets for extern shmem, we do this by strings. + self.kernel_attrs.extern_shmem_offset = "0".to_string(); + + // Emit calculation of all dynamic constants + self.codegen_dynamic_constants(&mut top)?; - // Uses CUDA's goto structure; some control nodes' gen may be moved, eg + // Uses CUDA's goto structure; some control nodes' gen may be moved, eg // block and thread fork joins. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) .map(|idx| { let node_id = NodeID::new(idx); - let mut goto = CudaGoto::default(); - goto.header = self.get_value(node_id, false); + let goto = CudaGoto { + header: self.get_value(node_id, false, false), + ..Default::default() + }; (node_id, goto) }) .collect(); - // Generate phi registers at top, later can consider smarter scoping - self.codegen_phi_registers(&mut top)?; - - // Assign outermost fork joins to block level - let (block_fork_ids, block_fork_sizes) = self.codegen_block_creation(&mut top)?; + // Assign outermost fork joins to block level. 
TODO: remove block_sizes + // if still not needed later + let (block_fork_ids, _) = self.codegen_block_creation()?; + // Assign inner fork joins to thread level. We do this before block sink + // because we need thread size for shared memory optimizations + let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = + self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1])?; // Sink logic from outer block fork joins. If it's a write, add // necessary block-id based condition. - let mut block_stride = self.kernel_attrs.num_blocks; - self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], block_stride, &mut top, &mut gotos)?; - for (i, &fork_id) in block_fork_ids.iter().enumerate().take(block_fork_ids.len() - 1) { - block_stride = block_stride.saturating_div(block_fork_sizes[i]); - self.codegen_block_sink(fork_id, block_fork_ids[i + 1], block_stride, &mut top, &mut gotos)?; + self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], &mut top, &mut gotos)?; + for (i, &fork_id) in block_fork_ids + .iter() + .enumerate() + .take(block_fork_ids.len() - 1) + { + self.codegen_block_sink(fork_id, block_fork_ids[i + 1], &mut top, &mut gotos)?; } - // Assign inner fork joins to thread level, with labels for warp - let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1], &mut top)?; - // Punting on implementation but can likely run einsum -> matmul/conv - // detector on hierarhical fork joins between block edge and given + // Punting on implementation but can likely run einsum -> matmul/conv + // detector on hierarhical fork joins between block edge and given // thread edge. // finish kernel @@ -202,67 +295,137 @@ impl<'a> GPUContext<'a> { Type::Product(ref product_ty_ids) => { write!(w, "\nstruct Product_{} {{\n", id)?; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { - write!(w, "\t{} field_{};\n", self.get_type(*product_ty_id), i)?; + write!( + w, + "\t{} field_{};\n", + self.get_type(*product_ty_id, false), + i + )?; } write!(w, "}};\n")?; } Type::Summation(ref summation_ty_ids) => { write!(w, "\nstruct Summation_{} {{\n\t union {{\n", id)?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { - write!(w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id), i)?; + write!( + w, + "\t\t{} field_{};\n", + self.get_type(*summation_ty_id, false), + i + )?; } write!(w, "\t}};\n\tuint8_t tag;\n}};\n")?; } _ => {} } - } + } Ok(()) } - fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { - write!(w, "template <")?; - // The dynamic constants become template parameters. - let mut first_template_param = true; + write!( + w, + "__global__ void __launch_bounds__({}) {}(", + self.kernel_params.max_num_threads, self.function.name + )?; + // The first set of parameters are dynamic constants. + let mut first_param = true; for idx in 0..self.function.num_dynamic_constants { - if first_template_param { - first_template_param = false; + if first_param { + first_param = false; } else { write!(w, ", ")?; } - write!(w, "long long int dc_p{}", idx)?; + write!(w, "unsigned long long dc_p{}", idx)?; } - write!(w, ">\n")?; - - write!(w, "__global__ void __launch_bounds__({}) {}(", self.kernel_params.max_num_threads, self.function.name)?; // The second set of parameters are normal arguments. 
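As an illustration of the signature being assembled here (a hypothetical helper, not the real builder): dynamic constant parameters come first as unsigned long long values, followed by the translated normal parameters.

    // For one dynamic constant and param types ["int", "float*"], this returns
    // "unsigned long long dc_p0, int p0, float* p1".
    fn example_param_list(num_dynamic_constants: usize, param_types: &[&str]) -> String {
        let mut parts = Vec::new();
        for idx in 0..num_dynamic_constants {
            parts.push(format!("unsigned long long dc_p{}", idx));
        }
        for (idx, ty) in param_types.iter().enumerate() {
            parts.push(format!("{} p{}", ty, idx));
        }
        parts.join(", ")
    }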
- let mut first_param = true; for (idx, ty) in self.function.param_types.iter().enumerate() { if first_param { first_param = false; } else { write!(w, ", ")?; } - write!(w, "{} p{}", self.get_type(*ty), idx)?; + write!(w, "{} p{}", self.get_type(*ty, true), idx)?; } - write!(w, ") {{\n")?; + // We convert originally non-void functions to void functions by adding a + // return parameter. For now we ignore the case where return was derived + // from a parameter through reads and writes, and instead always memcpy. + let return_index = self.function.nodes.iter().position(|node| node.is_return()); + if let Some(return_index) = return_index { + if let Node::Return { + control: _, + data: return_data, + } = &self.function.nodes[return_index] + { + write!( + w, + ", {} return_val", + self.get_type(self.typing[return_data.idx()], true) + )?; + } else { + panic!("Expected return node"); + } + } + + // Type is char since it's simplest to use single bytes for indexing, + // casting will be needed for use with different types. + write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n")?; Ok(()) } + fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> { + for dc in dynamic_constants_bottom_up(self.dynamic_constants) { + let dc_val = format!("unsigned long long dc{}", dc.idx()); + match self.dynamic_constants[dc.idx()] { + DynamicConstant::Constant(val) => write!(w, "\t{} = {}ull;\n", dc_val, val)?, + DynamicConstant::Parameter(idx) => { + if idx < self.function.num_dynamic_constants as usize { + write!(w, "\t{} = dc_p{};\n", dc_val, idx)? + } else { + write!(w, "\t{} = 0;\n", dc_val)? + } + } + DynamicConstant::Add(left, right) => { + write!(w, "\t{} = dc{} + dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Sub(left, right) => { + write!(w, "\t{} = dc{} - dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Mul(left, right) => { + write!(w, "\t{} = dc{} * dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Div(left, right) => { + write!(w, "\t{} = dc{} / dc{};\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Rem(left, right) => { + write!(w, "\t{} = dc{} % dc{};\n", dc_val, left.idx(), right.idx())? + } + } + } + Ok(()) + } - fn codegen_phi_registers(&self, w: &mut String) -> Result<(), Error> { + fn codegen_phi_registers<F>(&self, w: &mut String, should_process: F) -> Result<(), Error> + where + F: Fn(NodeID) -> bool, + { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if let Node::Phi { control: _, data: _ } = &self.function.nodes[id.idx()] { - write!(w, "\t{};\n", self.get_value(id, true))?; + if let Node::Phi { + control: _, + data: _, + } = &self.function.nodes[id.idx()] + { + if should_process(id) { + write!(w, "\t{};\n", self.get_value(id, true, true))?; + } } } Ok(()) } - // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) total number of blocks < max_num_blocks, c) each fork join's bounds are independent of outer fork joins, and d) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. 
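Earlier in this hunk, codegen_dynamic_constants lowers every dynamic constant to a single preamble declaration; a trimmed sketch of that mapping, with only three variants shown and the enum shape simplified:

    // Each dynamic constant becomes one "unsigned long long dcN = ...;" line in
    // the kernel preamble; compound constants refer to dc values emitted earlier.
    enum Dc {
        Constant(u64),
        Parameter(usize),
        Mul(usize, usize),
    }

    fn lower_dc(idx: usize, dc: &Dc) -> String {
        match dc {
            Dc::Constant(val) => format!("\tunsigned long long dc{} = {}ull;\n", idx, val),
            Dc::Parameter(p) => format!("\tunsigned long long dc{} = dc_p{};\n", idx, p),
            Dc::Mul(left, right) => format!("\tunsigned long long dc{} = dc{} * dc{};\n", idx, left, right),
        }
    }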
- fn codegen_block_creation(&mut self, w: &mut String) -> Result<(Vec<NodeID>, Vec<usize>), Error> { - // a) + // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the join has no user reduce nodes, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. + fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> { let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); for (_, children) in self.fork_forward_adjacency.iter() { for child in children { @@ -273,19 +436,20 @@ impl<'a> GPUContext<'a> { if root_forks.len() != 1 { panic!("Exactly one root fork is required for outermost GPU block fork"); } - + + // a and b let mut strict_forks = vec![root_forks[0]]; let mut curr_fork = root_forks[0]; - while self.fork_join_map.get(&curr_fork).is_some() { + while let Some(join) = self.fork_join_map.get(&curr_fork) { let children = &self.fork_forward_adjacency[&curr_fork]; - if children.len() != 1 { + if children.len() != 1 || !self.join_reduce_map.contains_key(join) { break; } curr_fork = children[0]; strict_forks.push(curr_fork); } - // b, (stronger version of) c, and d + // c, (stronger version of) d, and e let mut valid_block_forks = 0; let mut cumulative_blocks = 1usize; let mut block_fork_sizes = Vec::new(); @@ -296,7 +460,7 @@ impl<'a> GPUContext<'a> { } let factors = match &self.function.nodes[fork.idx()] { Node::Fork { factors, .. } => factors, - _ => panic!("Expected fork node") + _ => panic!("Expected fork node"), }; let fork_size = self.multiply_fork_factors(factors)?; let new_blocks = cumulative_blocks.saturating_mul(fork_size); @@ -308,67 +472,21 @@ impl<'a> GPUContext<'a> { valid_block_forks += 1; } + // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. self.kernel_attrs.num_blocks = cumulative_blocks; - let valid_block_forks = strict_forks.into_iter() + let valid_block_forks = strict_forks + .into_iter() .take(valid_block_forks) .collect::<Vec<_>>(); - // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. - write!(w, "\tconst int block_x = blockIdx.x;\n")?; Ok((valid_block_forks, block_fork_sizes)) } - fn codegen_block_sink(&self, fork_id: NodeID, next_fork_id: NodeID, block_stride: usize, w: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>) -> Result<(), Error> { - // 1. Get control nodes including fork_id that are dominated by fork_id - // and not dominated by next_fork_id and not dominated by fork_id's join - let dom = dominator(self.control_subgraph, fork_id); - assert!(dom.does_dom(fork_id, next_fork_id)); - let mut control_nodes_between = HashSet::new(); - for node_id in self.control_subgraph.iter() { - if dom.does_dom(fork_id, *node_id) && !dom.does_dom(next_fork_id, *node_id) && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) { - control_nodes_between.insert(*node_id); - } - } - - // 2. Emit data flow for nodes assigned to those basic blocks. Phi - // registers were already emitted at top. 
- // TEMPORARY: ignoring the special write case for now - let mut worklist = VecDeque::from_iter( - self.reverse_postorder - .into_iter() - .filter(|id| !self.function.nodes[id.idx()].is_control() - && control_nodes_between.contains(&self.bbs[id.idx()]) - && !self.function.nodes[id.idx()].is_phi() - ), - ); - let mut visited = HashSet::new(); - while let Some(id) = worklist.pop_front() { - let node = &self.function.nodes[id.idx()]; - if node.is_reduce() { - panic!("Reduce nodes should not be in block sink"); - } - if get_uses(node) - .as_ref() - .into_iter() - .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) - { - self.codegen_data_node(*id, w)?; - visited.insert(id); - } else { - worklist.push_back(id); - } - } - - // 3. Emit control flow - for control_node in control_nodes_between { - self.codegen_control_node(control_node, w)?; - } - - Ok(()) - } - - // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. - fn codegen_thread_creation(&mut self, inner_block_fork: NodeID, w: &mut String) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { + // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. + fn codegen_thread_creation( + &mut self, + inner_block_fork: NodeID, + ) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { let mut thread_fork_parents = HashMap::new(); let mut thread_fork_sizes = HashMap::new(); let mut thread_fork_cumulative_sizes = HashMap::new(); @@ -381,18 +499,19 @@ impl<'a> GPUContext<'a> { while let Some(pop) = stack.pop() { let children = &self.fork_forward_adjacency[&pop]; - // Reverse child order due to use of stack for DFS + // Reverse child order due to use of stack for DFS for &child in children.iter().rev() { if !visited.contains(&child) { visited.insert(child); thread_fork_parents.insert(child, pop); let fork_size = match &self.function.nodes[child.idx()] { Node::Fork { factors, .. } => self.multiply_fork_factors(factors)?, - _ => panic!("Expected fork node") + _ => panic!("Expected fork node"), }; thread_fork_sizes.insert(child, fork_size); - let new_cumulative_size = (thread_fork_cumulative_sizes[&pop] as usize).saturating_mul(fork_size as usize); + let new_cumulative_size = (thread_fork_cumulative_sizes[&pop] as usize) + .saturating_mul(fork_size as usize); if new_cumulative_size > self.kernel_params.max_num_threads { // Expanding to child fork exceeds thread limit, so // current fork is an edge fork @@ -409,43 +528,260 @@ impl<'a> GPUContext<'a> { } } - // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. - self.kernel_attrs.num_threads = max_thread_size; + // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. // If limit on number of threads in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. 
- write!(w, "\tconst int thread_x = threadIdx.x;\n")?; + self.kernel_attrs.num_threads = max_thread_size; Ok((thread_fork_parents, thread_fork_sizes, thread_fork_edges)) } + fn codegen_block_sink( + &self, + fork_id: NodeID, + next_fork_id: NodeID, + w: &mut String, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + // 1. Get control nodes including fork_id that are dominated by fork_id + // and not dominated by next_fork_id and not dominated by fork_id's join + let dom = dominator(self.control_subgraph, fork_id); + assert!(dom.does_dom(fork_id, next_fork_id)); + let mut control_nodes_between = HashSet::new(); + for node_id in self.control_subgraph.iter() { + if dom.does_dom(fork_id, *node_id) + && !dom.does_dom(next_fork_id, *node_id) + && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) + { + control_nodes_between.insert(*node_id); + } + } + + // 2. Emit data flow for nodes assigned to those basic blocks + // 2a. All phi registers first + self.codegen_phi_registers(w, |id| control_nodes_between.contains(&self.bbs[id.idx()]))?; + // 2b. All other data nodes + let mut worklist = VecDeque::from_iter(self.reverse_postorder.iter().filter(|id| { + !self.function.nodes[id.idx()].is_control() + && control_nodes_between.contains(&self.bbs[id.idx()]) + && !self.function.nodes[id.idx()].is_phi() + })); + let mut visited = HashSet::new(); + while let Some(id) = worklist.pop_front() { + let node = &self.function.nodes[id.idx()]; + if node.is_reduce() { + panic!("Reduce nodes should not be in block sink"); + } + if get_uses(node) + .as_ref() + .iter() + .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) + { + self.codegen_data_node( + *id, + &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body, + )?; + visited.insert(id); + } else { + worklist.push_back(id); + } + } + + // 3. Emit control flow + for control_node in control_nodes_between { + self.codegen_control_node(control_node, w)?; + } + + Ok(()) + } + fn codegen_control_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { Ok(()) } - fn codegen_data_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { + fn codegen_data_node( + &self, + id: NodeID, + w: &mut String, + ) -> Result<(), Error> { + // For now only used shared memory when creating an array + let declare_variable = self.get_value(id, true, false).to_string(); match &self.function.nodes[id.idx()] { - Node::Phi { control: _, data: _ } => {} - Node::Parameter { index } => { - write!(w, "\t{} = p{};\n", self.get_value(id, true), index)?; - } + // Phi registers were already emitted. 
+ Node::Phi { + control: _, + data: _, + } => {} + // No SSA requirement for CUDA + Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { - write_constant() - + self.codegen_constant( + declare_variable, + self.get_value(id, false, false), + *cons_id, + w, + )?; } + // No SSA requirement for CUDA Node::DynamicConstant { id: _ } => {} - Node::Unary { op: _, input: _ } => {} - Node::Binary { op: _, left: _, right: _ } => {} - Node::Ternary { op: _, first: _, second: _, third: _ } => {} - Node::IntrinsicCall { intrinsic: _, args: _ } => {} - Node::Read { collect: _, indices: _ } => {} - Node::Write { collect: _, data: _, indices: _ } => {} - Node::Projection { control: _, selection: _ } => {} - Node::Undef { ty: _ } => {} - _ => {} + Node::Unary { op, input } => match op { + UnaryOperator::Not => match &self.types[self.typing[input.idx()].idx()] { + Type::Boolean => { + write!( + w, + "\t{} = !{};\n", + declare_variable, + self.get_value(*input, false, false), + )?; + } + ty if ty.is_fixed() => { + write!( + w, + "\t{} = ~{};\n", + declare_variable, + self.get_value(*input, false, false), + )?; + } + _ => panic!("Unsupported type for not operator"), + }, + UnaryOperator::Neg => match &self.types[self.typing[input.idx()].idx()] { + ty if ty.is_signed() || ty.is_float() => { + write!( + w, + "\t{} = -{};\n", + declare_variable, + self.get_value(*input, false, false), + )?; + } + _ => { + panic!("Unsupported type for neg operator") + } + }, + UnaryOperator::Cast(dst_ty_id) => { + write!( + w, + "\t{} = static_cast<{}>({});\n", + declare_variable, + self.get_type(*dst_ty_id, false), + self.get_value(*input, false, false), + )?; + } + }, + Node::Binary { op, left, right } => { + let left_val = self.get_value(*left, false, false); + let right_val = self.get_value(*right, false, false); + match (op, &self.types[self.typing[left.idx()].idx()]) { + (BinaryOperator::Rem, Type::Float32) => write!( + w, + "\t{} = fmodf({}, {});\n", + declare_variable, + left_val, + right_val, + )?, + (BinaryOperator::Rem, Type::Float64) => write!( + w, + "\t{} = fmod({}, {});\n", + declare_variable, + left_val, + right_val, + )?, + // Doesn't need special syntax but bool type + (BinaryOperator::Or, Type::Boolean) => write!( + w, + "\t{} = {} || {};\n", + declare_variable, + left_val, + right_val, + )?, + (BinaryOperator::And, Type::Boolean) => write!( + w, + "\t{} = {} && {};\n", + declare_variable, + left_val, + right_val, + )?, + (op, _) => write!( + w, + "\t{} = {} {} {};\n", + declare_variable, + left_val, + match op { + BinaryOperator::Add => "+", + BinaryOperator::Sub => "-", + BinaryOperator::Mul => "*", + BinaryOperator::Div => "/", + BinaryOperator::Rem => "%", + BinaryOperator::LT => "<", + BinaryOperator::LTE => "<=", + BinaryOperator::GT => ">", + BinaryOperator::GTE => ">=", + BinaryOperator::EQ => "==", + BinaryOperator::NE => "!=", + BinaryOperator::Or => "|", + BinaryOperator::And => "&", + BinaryOperator::Xor => "^", + BinaryOperator::LSh => "<<", + BinaryOperator::RSh => ">>", + }, + right_val, + )?, + }; + } + Node::Ternary {op, first, second, third} => match op { + TernaryOperator::Select => { + write!( + w, + "\t{} = {} ? 
{} : {};\n", + declare_variable, + self.get_value(*first, false, false), + self.get_value(*second, false, false), + self.get_value(*third, false, false), + )?; + } + }, + Node::IntrinsicCall { intrinsic, args } => { + let ty = &self.types[self.typing[args[0].idx()].idx()]; + let func_name = self.codegen_intrinsic(intrinsic, ty); + write!( + w, + "\t{} = {}({});\n", + declare_variable, + func_name, + self.get_value(args[0], false, false), + )?; + } + Node::Read { collect, indices } => { + let index_ptr_name = self.codegen_indices(*collect, indices); + // If it's a parameter node then copy from global memory, else + // reference from shared memory or registers. + if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { + // We parallelize copies from global memory across threads for + // array types, either immediate or nested in the collection. + if self.types[self.typing[id.idx()].idx()].is_primitive() { + write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?; + } else { + self.codegen_global_to_shared(id, declare_variable, index_ptr_name, indices.len(), true, w)?; + } + } else { + write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?; + } + } + Node::Write {collect: _, data: _, indices: _} => { + // TODO + } + _ => { + panic!("Unsupported node type") + } + } + if let Some(phis) = self.label_data_for_phi.get(&id) { + for phi in phis { + write!( + w, + "\t{} = {};\n", + self.get_value(*phi, false, false), + self.get_value(id, false, false) + )?; + } } - Ok(()) - } - - fn codegen_data_node_with_write_if(&self, id: NodeID, block_stride: usize, w: &mut String) -> Result<(), Error> { Ok(()) } @@ -459,96 +795,329 @@ impl<'a> GPUContext<'a> { Ok(()) } - fn write_constant(&self, name: String, type_name: String, cons_id: ConstantID, w: &mut String) -> Result<(), Error> { - write!(w, "\t{} {}", type_name, name)?; - match self.constants[cons_id.idx()] { - Constant::Boolean(val) => write!(w, " = {}\n", val)?, - Constant::Integer8(val) => write!(w, " = {}\n", val)?, - Constant::UnsignedInteger8(val) => write!(w, " = {}\n", val)?, - Constant::Integer16(val) => write!(w, " = {}\n", val)?, - Constant::UnsignedInteger16(val) => write!(w, " = {}\n", val)?, - Constant::Integer32(val) => write!(w, " = {}\n", val)?, - Constant::UnsignedInteger32(val) => write!(w, " = {}ul\n", val)?, - Constant::Integer64(val) => write!(w, " = {}ll\n", val)?, - Constant::UnsignedInteger64(val) => write!(w, " = {}ull\n", val)?, - Constant::Float32(val) => write!(w, " = {}f\n", val)?, - Constant::Float64(val) => write!(w, " = {}\n", val)?, + // Standalone function allows us to handle recursive initialization for + // product and summation collections + fn codegen_constant( + &self, + declare_variable: String, + name: String, + cons_id: ConstantID, + w: &mut String, + ) -> Result<(), Error> { + write!(w, "\t{}", declare_variable)?; + match &self.constants[cons_id.idx()] { + Constant::Boolean(val) => write!(w, " = {};\n", val)?, + Constant::Integer8(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger8(val) => write!(w, " = {};\n", val)?, + Constant::Integer16(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger16(val) => write!(w, " = {};\n", val)?, + Constant::Integer32(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger32(val) => write!(w, " = {}ul;\n", val)?, + Constant::Integer64(val) => write!(w, " = {}ll;\n", val)?, + Constant::UnsignedInteger64(val) => write!(w, " = {}ull;\n", val)?, + Constant::Float32(val) => write!(w, " = {}f;\n", val)?, + 
Constant::Float64(val) => write!(w, " = {};\n", val)?, Constant::Product(_, fields) => { write!(w, ";\n")?; for (i, field) in fields.iter().enumerate() { - self.write_constant(format!("{}_field_{}", name, i), self.constant_to_type_name(*field), *field, w)?; - write!(w, "\t{}.field_{} = {}_field_{};\n", name, i, name, i)?; + // We don't emit array fields and size was set by struct definition + if !self.constants[field.idx()].is_array() { + // Don't need type declaration for the fields + self.codegen_constant( + format!("{}.field_{}", name, i), + format!("{}.field_{}", name, i), + *field, + w, + )?; + } } } Constant::Summation(_, variant, field) => { - write!(w, ";\n")?; - self.write_constant(format!("{}_field_{}", name, variant), self.constant_to_type_name(field), field, w)?; - write!(w, "\t{}.tag = {};\n\t{}.field_{} = {}_field_{};\n", name, variant, name, variant, name, variant)?; + write!(w, ";\n\t{}.tag = {};\n", name, variant)?; + // See two comments in Constant::Product + if !self.constants[field.idx()].is_array() { + self.codegen_constant( + format!("\t{}.field_{}", name, variant), + format!("\t{}.field_{}", name, variant), + *field, + w, + )?; + } } - Constant::Array(_) => { - write!(w, ";\n")?; - for (i, element) in elements.iter().enumerate() { - self.write_constant(format!("{}_element_{}", name, i), self.constant_to_type_name(*element), *element, w)?; - write!(w, "\t{}[{}] = {}_element_{};\n", name, i, name, i)?; + Constant::Array(type_id) => { + let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { + panic!("Expected array type") + }; + // For now we do element-wise alignment, later could consider (n-1)d array + // alignment. Then we "allocate" from the single dynamic shared memory buffer + // by using and updating the offset. 
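For reference, the bump-allocation this Array arm emits into the kernel is meant to read roughly like the CUDA fragment below. The buffer name dynamic_shared, the dcN extents, the float element type, and the variable const_5 are illustrative placeholders, not literal output of this backend.

    template <long long dc0, long long dc1>
    __global__ void alloc_sketch() {
        // Single dynamically sized shared buffer declared once per kernel.
        extern __shared__ char dynamic_shared[];
        unsigned long long dynamic_shared_offset = 0;
        // Round the running offset up to the element alignment.
        unsigned long long alignment = sizeof(float);
        dynamic_shared_offset =
            (dynamic_shared_offset + alignment - 1) / alignment * alignment;
        // Carve a dc0 x dc1 array out of the shared buffer, then advance the offset.
        float (*const_5)[dc1] =
            reinterpret_cast<float (*)[dc1]>(&dynamic_shared[dynamic_shared_offset]);
        dynamic_shared_offset += sizeof(float) * dc0 * dc1;
        (void)const_5; // subsequently indexed as const_5[i][j]
    }

The details (offset type, per-element vs. whole-array alignment) may change in later revisions; the point is only the carve-and-advance pattern.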
+ let element_size = format!("sizeof({})", self.get_type(*element_type, false)); + let array_size = extents + .iter() + .map(|id| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join("*"); + write!(w, ";\n\talignment = {};\n\tdynamic_shared_offset = + (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = + reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t + dynamic_shared_offset += {}", element_size, name, self.get_type(*element_type, false), array_size)?; + } + } + Ok(()) + } + + fn codegen_global_to_shared(&self, id: NodeID, declare_variable: String, index_ptr_name: String, array_depth: Option<usize>, outermost: bool, w: &mut String) -> Result<(), Error> { + match &self.types[self.typing[id.idx()].idx()] { + Type::Array(_, extents) => { + let array_depth = array_depth.unwrap(); + let rem_array_size = extents + .iter() + .enumerate() + .filter(|(i, _)| *i >= array_depth) + .map(|(_, id)| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join("*"); + let mut running_div_factor = "1".to_string(); + write!(w, "\tfor (int i = threadIdx.x; i < {}; i += {}) {{\n", rem_array_size, self.kernel_attrs.num_threads)?; + let mut indices = vec![]; + for i in (array_depth..extents.len()).rev() { + indices.push(format!("[(({}) / ({})) % dc{}]", rem_array_size, running_div_factor, extents[i].idx())); + running_div_factor = format!("{} * {}", running_div_factor, format!("dc{}", extents[i].idx())); } + let indices_str = indices.join(""); + // TODO: condition by primitive vs collection, if latter then recurse + // with outermost = false + write!(w, "\t\t{}{} = {}{};\n", declare_variable, indices_str, index_ptr_name, indices_str)?; } + // TODO: handle product and summation collections } Ok(()) } - fn constant_to_type_name(&self, cons_id: ConstantID) -> String { - match self.constants[cons_id.idx()] { - Constant::Boolean(_) => "bool".to_string(), - Constant::Integer8(_) => "int8_t".to_string(), - Constant::UnsignedInteger8(_) => "uint8_t".to_string(), - Constant::Integer16(_) => "short".to_string(), - Constant::UnsignedInteger16(_) => "unsigned short".to_string(), - Constant::Integer32(_) => "int".to_string(), - Constant::UnsignedInteger32(_) => "unsigned int".to_string(), - Constant::Integer64(_) => "long long".to_string(), - Constant::UnsignedInteger64(_) => "unsigned long long".to_string(), - Constant::Float32(_) => "float".to_string(), - Constant::Float64(_) => "double".to_string(), - Constant::Product(type_id, _) => self.get_type(type_id), - Constant::Summation(type_id, _, _) => self.get_type(type_id), - Constant::Array(type_id) => self.get_type(type_id), + fn codegen_indices(&self, collect: NodeID, indices: &[Index]) -> String { + let mut index_ptr_name = format!("{}", self.get_value(collect, false, false)); + for index in indices { + match index { + Index::Field(field) => { + index_ptr_name.push_str(&format!(".field_{}", field)); + } + Index::Variant(variant) => { + index_ptr_name.push_str(&format!(".field_{}", variant)); + } + Index::Position(indices) => { + index_ptr_name.push_str(&indices + .iter() + .map(|index| format!("[{}]", self.get_value(*index, false, false))) + .collect::<Vec<_>>() + .join("")); + } + } } + index_ptr_name + } + + fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { + let func_name = match intrinsic { + Intrinsic::Abs => match ty { + Type::Float32 => "__fabsf", + Type::Float64 => "__fabs", + ty if ty.is_signed() => "abs", + ty if ty.is_unsigned() => "uabs", + _ => panic!("Unsupported type for Abs"), + }, + 
Intrinsic::ACos => match ty { + ty if ty.is_float() => "__acosf", + _ => "acos", + }, + Intrinsic::ASin => match ty { + ty if ty.is_float() => "__asinf", + _ => "asin", + }, + Intrinsic::ATan => match ty { + ty if ty.is_float() => "__atanf", + _ => "atan", + }, + Intrinsic::ATan2 => match ty { + ty if ty.is_float() => "__atan2f", + _ => "atan2", + }, + Intrinsic::Ceil => match ty { + ty if ty.is_float() => "__ceilf", + _ => "ceil", + }, + Intrinsic::Cos => match ty { + ty if ty.is_float() => "__cosf", + _ => "cos", + }, + Intrinsic::Cosh => match ty { + ty if ty.is_float() => "coshf", + _ => "cosh", + }, + Intrinsic::Exp => match ty { + ty if ty.is_float() => "__expf", + _ => "exp", + }, + Intrinsic::Exp2 => match ty { + ty if ty.is_float() => "__exp2f", + _ => "exp2", + }, + Intrinsic::Floor => match ty { + ty if ty.is_float() => "__floorf", + _ => "floor", + }, + Intrinsic::Ln => match ty { + ty if ty.is_float() => "__logf", + _ => "log", + }, + Intrinsic::Log10 => match ty { + ty if ty.is_float() => "__log10f", + _ => "log10", + }, + Intrinsic::Log2 => match ty { + ty if ty.is_float() => "__log2f", + _ => "log2", + }, + Intrinsic::Max => match ty { + Type::Float32 => "fmaxf", + Type::Float64 => "fmax", + ty if ty.is_signed() => "smax", + ty if ty.is_unsigned() => "umax", + _ => "max", + }, + Intrinsic::Min => match ty { + Type::Float32 => "__fminf", + Type::Float64 => "__fmin", + ty if ty.is_signed() => "smin", + ty if ty.is_unsigned() => "umin", + _ => "min", + }, + Intrinsic::Pow | Intrinsic::Powf => match ty { + Type::Float32 => "__powf", + Type::Float64 => "pow", + _ => panic!("Unsupported type for Pow"), + }, + Intrinsic::Powi => match ty { + ty if ty.is_signed() || ty.is_unsigned() => "powi", + _ => panic!("Unsupported type for Powi"), + }, + Intrinsic::Round => match ty { + ty if ty.is_float() => "__roundf", + ty if ty.is_signed() || ty.is_unsigned() => "roundi", + _ => "round", + }, + Intrinsic::Sin => match ty { + ty if ty.is_float() => "__sinf", + _ => "sin", + }, + Intrinsic::Sinh => match ty { + ty if ty.is_float() => "sinhf", + _ => "sinh", + }, + Intrinsic::Sqrt => match ty { + ty if ty.is_float() => "__sqrtf", + ty if ty.is_signed() || ty.is_unsigned() => "isqrt", + _ => "sqrt", + }, + Intrinsic::Tan => match ty { + ty if ty.is_float() => "__tanf", + _ => "tan", + }, + Intrinsic::Tanh => match ty { + ty if ty.is_float() => "tanhf", + _ => "tanh", + }, + _ => panic!("Unsupported intrinsic {:?}", intrinsic), + }; + func_name.to_string() } fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Result<usize, Error> { - factors.iter() - .try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, &self.dynamic_constants) - .ok_or_else(|| Error) - .map(|val| acc.saturating_mul(val)) - }) + factors.iter().try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, self.dynamic_constants) + .ok_or(Error) + .map(|val| acc.saturating_mul(val)) + }) } - fn get_value(&self, id: NodeID, ty: bool) -> String { - if ty { - format!("{} {}{}", self.get_type(self.typing[id.idx()]), self.function.nodes[id.idx()].lower_case_name(), id.idx()) + fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { + if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { + if ty { + panic!("Dynamic constants shouldn't be re-initialized") + } + format!("dc{}", dc_id.idx()) + } else if let Node::Parameter { index } = &self.function.nodes[id.idx()] { + if ty { + panic!("Parameters shouldn't be re-initialized") + } + 
format!("p{}", index) + } else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { + let mut declare_array = format!( + "{} (*{}{})", + self.get_type(*element_type, false), + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + ); + for extent in extents.iter().skip(1) { + declare_array.push_str(&format!("[dc{}]", extent.idx())); + } + declare_array + } else if ty { + format!( + "{} {}{}", + self.get_type(self.typing[id.idx()], make_pointer), + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + ) } else { - format!("{}{}", self.function.nodes[id.idx()].lower_case_name(), id.idx()) + format!( + "{}{}", + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + ) } } - fn get_type(&self, id: TypeID) -> String { - match self.types[id.idx()] { + // make_pointer enforces static pointer and not recursive or array pointer: + // multi-d arrays are single pointers with custom indexing. + fn get_type(&self, id: TypeID, make_pointer: bool) -> String { + match &self.types[id.idx()] { Type::Product(_) => { - format!("Product_{}", id.idx()) + format!( + "Product_{}{}", + id.idx(), + if make_pointer { "*" } else { "" } + ) } Type::Summation(_) => { - format!("Summation_{}", id.idx()) + format!( + "Summation_{}{}", + id.idx(), + if make_pointer { "*" } else { "" } + ) } - _ => convert_type(&self.types[id.idx()]), + Type::Array(element_type, extents) => { + // This suffix lets us work with references of dynamic shared memory + // and use n-d array indexing. + let mut suffix = "(*)".to_string(); + if extents.len() > 1 { + for extent in extents.iter().skip(1) { + suffix.push_str(&format!("[dc{}]", extent.idx())); + } + } + format!( + "{}{}", + self.get_type(*element_type, false), + if make_pointer { "*" } else { &suffix } + ) + } + _ => convert_type(&self.types[id.idx()], make_pointer), } } - } // TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 -fn convert_type(ty: &Type) -> String { - match ty { +fn convert_type(ty: &Type, make_pointer: bool) -> String { + let mut result = match ty { Type::Boolean => "bool".to_string(), Type::Integer8 => "int8_t".to_string(), Type::UnsignedInteger8 => "uint8_t".to_string(), @@ -561,5 +1130,9 @@ fn convert_type(ty: &Type) -> String { Type::Float32 => "float".to_string(), Type::Float64 => "double".to_string(), _ => panic!("Unsupported type"), + }; + if make_pointer { + result.push('*'); } + result } diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 4fd0cf0b..956e25a9 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -1,13 +1,19 @@ -use std::fmt::Write; +extern crate bitvec; +extern crate ordered_float; +extern crate serde; + +use self::bitvec::prelude::*; +use self::serde::Deserialize; +use self::serde::Serialize; +use std::cmp::Ordering; +use std::cmp::{max, min}; +use std::collections::HashMap; +use std::convert::TryInto; +use std::fmt::{Error, Write}; use std::ops::Coroutine; use std::ops::CoroutineState; use std::pin::Pin; -use bitvec::prelude::*; -use ordered_float::OrderedFloat; -use serde::Deserialize; -use serde::Serialize; - use crate::*; /* @@ -829,6 +835,14 @@ impl Type { } } + pub fn is_summation(&self) -> bool { + if let Type::Summation(_) = self { + true + } else { + false + } + } + pub fn is_array(&self) -> bool { if let Type::Array(_, _) = self { true @@ -992,6 +1006,218 @@ impl DynamicConstant { } } +#[derive(Default, Clone)] +struct DynamicConstantRange { + min: isize, + max: isize, +} + +// The ith element is 
the exponent of the ith parameter, all together giving a +// unique key for each combination of parameters aka term. +#[derive(Eq, PartialEq, Hash)] +struct ParamKey(Vec<isize>); + +pub fn dynamic_constant_cmp( + a: DynamicConstantID, + b: DynamicConstantID, + dcs: &Vec<DynamicConstant>, + num_params: usize, +) -> Result<Option<Ordering>, Error> { + fn dynamic_constant_evaluation_iter( + a: DynamicConstantID, + dcs: &Vec<DynamicConstant>, + num_params: usize, + ) -> Result<HashMap<ParamKey, DynamicConstantRange>, Error> { + // We evaluate each dynamic constant by constructing range for each "term", + // aka unique combination of parameter exponents (eg param1^0 * param2^0 + // aka scalar represented by [0, 0] or param1^1 * param2^2 by [1, 2]). + // Range instead of single value is needed due to use of modulo. + let mut ranges = HashMap::new(); + match dcs[a.idx()] { + DynamicConstant::Parameter(idx) => { + let mut param_vec = vec![0; num_params]; + param_vec[idx] = 1; + ranges.insert(ParamKey(param_vec), DynamicConstantRange { min: 1, max: 1 }); + } + DynamicConstant::Constant(cons) => { + let param_vec = vec![0; num_params]; + ranges.insert( + ParamKey(param_vec), + DynamicConstantRange { + min: cons.try_into().map_err(|_| Error)?, + max: cons.try_into().map_err(|_| Error)?, + }, + ); + } + DynamicConstant::Add(left, right) => { + // Add same-form terms by adding their values. + let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + ranges.extend(left_ranges); + for r in right_ranges { + if let Some(l) = ranges.get_mut(&r.0) { + l.min += r.1.min; + l.max += r.1.max; + } else { + ranges.insert(r.0, r.1); + } + } + } + DynamicConstant::Sub(left, right) => { + // Subtract same-form terms by subtracting their values. + let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + ranges.extend(left_ranges); + for r in right_ranges { + if let Some(l) = ranges.get_mut(&r.0) { + l.min -= r.1.max; + l.max -= r.1.min; + } else { + ranges.insert( + r.0, + DynamicConstantRange { + min: -r.1.max, + max: -r.1.min, + }, + ); + } + } + } + DynamicConstant::Mul(left, right) => { + // Pairwise multiply each term by elementwise adding the two + // exponent keys and multiplying the values. + let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + for l in left_ranges { + for r in right_ranges.iter() { + let mut param_vec = l.0 .0.clone(); + for (idx, r_val) in r.0 .0.iter().enumerate() { + param_vec[idx] += r_val; + } + ranges.insert( + ParamKey(param_vec), + DynamicConstantRange { + min: l.1.min * r.1.min, + max: l.1.max * r.1.max, + }, + ); + } + } + } + DynamicConstant::Div(left, right) => { + // Pairwise divide each term by elementwise subtracting the two + // exponent keys and dividing the values. 
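A concrete, made-up example of these term maps: with two parameters p0 and p1, the dynamic constant 2*p0*p1 + 3 evaluates to {[1,1] -> (min 2, max 2), [0,0] -> (min 3, max 3)}, while p0*p1 evaluates to {[1,1] -> (min 1, max 1)}. Checking the first against the second, the second's only term [1,1] is matched with min 2 >= max 1, so the first is at least as large; the reverse check fails both because 1 < 2 on the shared term and because the leftover [0,0] term has min 3 > 0. The comparison therefore reports Ordering::Greater.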
+ let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + for l in left_ranges { + for r in right_ranges.iter() { + let mut param_vec = l.0 .0.clone(); + for (idx, r_val) in r.0 .0.iter().enumerate() { + param_vec[idx] -= r_val; + } + ranges.insert( + ParamKey(param_vec), + DynamicConstantRange { + min: l.1.min / r.1.min, + max: l.1.max / r.1.max, + }, + ); + } + } + } + DynamicConstant::Rem(left, right) => { + // We do simplest check for 0 or scalar multiple, and ignore all + // other cases of pure multiple. If check fails, the remainder is + // somewhere between 0 and the right value. + let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; + let mut is_zero = true; + for l in left_ranges.iter() { + if l.1.min != 0 || l.1.max != 0 { + is_zero = false; + break; + } + } + if is_zero { + return Ok(ranges); + } + + // Scalar multiple requires both that all right terms have left + // term with same positive multiplier, and there are no + // outstanding left terms after matching. + let mut is_scalar_multiple = true; + let mut scalar_factor = 0; + let mut remaining_left_terms = left_ranges.len(); + let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; + for r in right_ranges.iter() { + if let Some(l_range) = left_ranges.get(r.0) { + if l_range.min != l_range.max || r.1.min != r.1.max || l_range.min % r.1.min != 0 || (scalar_factor != 0 && l_range.min / r.1.min != scalar_factor) { + is_scalar_multiple = false; + break; + } + scalar_factor = l_range.min / r.1.min; + remaining_left_terms -= 1; + } + } + if is_scalar_multiple && scalar_factor >= 0 && remaining_left_terms == 0 { + return Ok(ranges); + } + + for r in right_ranges { + ranges.insert( + r.0, + DynamicConstantRange { + min: min(0, r.1.min), + max: max(0, r.1.max), + }, + ); + } + } + } + Ok(ranges) + } + + let a_ranges = dynamic_constant_evaluation_iter(a, dcs, num_params)?; + let b_ranges = dynamic_constant_evaluation_iter(b, dcs, num_params)?; + // a >= b iff a's min >= b's max. >= requires all terms in b to satisfy: + // if also in a, then a's coef >= b's coef; if not in a, have b's coef <= 0. + let mut a_is_greater = true; + for b in b_ranges.iter() { + if let Some(a) = a_ranges.get(b.0) { + if a.min < b.1.max { + a_is_greater = false; + break; + } + } else if b.1.min > 0 { + a_is_greater = false; + break; + } + } + + // Now check if b >= a. 
+ let mut b_is_greater = true; + for a in a_ranges.iter() { + if let Some(b) = b_ranges.get(a.0) { + if b.min < a.1.max { + b_is_greater = false; + break; + } + } else if a.1.min > 0 { + b_is_greater = false; + break; + } + } + + if a_is_greater && b_is_greater { + Ok(Some(Ordering::Equal)) + } else if a_is_greater { + Ok(Some(Ordering::Greater)) + } else if b_is_greater { + Ok(Some(Ordering::Less)) + } else { + Ok(None) + } +} + pub fn evaluate_dynamic_constant( cons: DynamicConstantID, dcs: &Vec<DynamicConstant>, -- GitLab From b2ca2967540f27adb7bdae7f02930766ed5d5668 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 30 Dec 2024 15:45:04 -0800 Subject: [PATCH 029/109] indexing --- hercules_cg/src/gpu.rs | 310 +++++++++++++++++++++++++++++++---------- 1 file changed, 240 insertions(+), 70 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index aa5908a2..29135195 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -85,6 +85,7 @@ pub fn gpu_codegen<W: Write>( }; let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); + // Maybe can delete let map_join_reduce = || -> HashMap<NodeID, Vec<NodeID>> { let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) .filter(|idx| function.nodes[*idx].is_reduce()) @@ -219,6 +220,7 @@ impl GPUContext<'_> { " #include <assert.h> #include <stdio.h> +#include <stddef.h> #include <cuda.h> #include <cuda_runtime.h> #include <mma.h> @@ -293,7 +295,7 @@ impl GPUContext<'_> { for (id, ty) in self.types.iter().enumerate() { match ty { Type::Product(ref product_ty_ids) => { - write!(w, "\nstruct Product_{} {{\n", id)?; + write!(w, "\ntypedef struct Product_{} {{\n", id)?; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { write!( w, @@ -302,10 +304,10 @@ impl GPUContext<'_> { i )?; } - write!(w, "}};\n")?; + write!(w, "}} Product_{};\n", id)?; } Type::Summation(ref summation_ty_ids) => { - write!(w, "\nstruct Summation_{} {{\n\t union {{\n", id)?; + write!(w, "\ntypedef struct Summation_{} {{\n\t union {{\n", id)?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { write!( w, @@ -314,7 +316,7 @@ impl GPUContext<'_> { i )?; } - write!(w, "\t}};\n\tuint8_t tag;\n}};\n")?; + write!(w, "\t}};\n\tuint8_t tag;\n}} Summation_{};\n", id)?; } _ => {} } @@ -348,7 +350,7 @@ impl GPUContext<'_> { write!(w, "{} p{}", self.get_type(*ty, true), idx)?; } // We convert originally non-void functions to void functions by adding a - // return parameter. For now we ignore the case where return was derived + // return parameter. For now we ignore the case where return was derived // from a parameter through reads and writes, and instead always memcpy. let return_index = self.function.nodes.iter().position(|node| node.is_return()); if let Some(return_index) = return_index { @@ -424,7 +426,7 @@ impl GPUContext<'_> { Ok(()) } - // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the join has no user reduce nodes, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. 
+ // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the forks are parallel reduce forks, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> { let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); for (_, children) in self.fork_forward_adjacency.iter() { @@ -442,7 +444,7 @@ impl GPUContext<'_> { let mut curr_fork = root_forks[0]; while let Some(join) = self.fork_join_map.get(&curr_fork) { let children = &self.fork_forward_adjacency[&curr_fork]; - if children.len() != 1 || !self.join_reduce_map.contains_key(join) { + if children.len() != 1 || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) { break; } curr_fork = children[0]; @@ -556,7 +558,7 @@ impl GPUContext<'_> { } } - // 2. Emit data flow for nodes assigned to those basic blocks + // 2. Emit data flow for nodes assigned to basic blocks in block sink // 2a. All phi registers first self.codegen_phi_registers(w, |id| control_nodes_between.contains(&self.bbs[id.idx()]))?; // 2b. All other data nodes @@ -578,6 +580,7 @@ impl GPUContext<'_> { { self.codegen_data_node( *id, + 1, &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body, )?; visited.insert(id); @@ -598,13 +601,10 @@ impl GPUContext<'_> { Ok(()) } - fn codegen_data_node( - &self, - id: NodeID, - w: &mut String, - ) -> Result<(), Error> { + fn codegen_data_node(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> { // For now only used shared memory when creating an array let declare_variable = self.get_value(id, true, false).to_string(); + let tabs = "\t".repeat(num_tabs); match &self.function.nodes[id.idx()] { // Phi registers were already emitted. 
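The phi convention referenced here (registers declared up front by codegen_phi_registers, then stored into from whichever predecessor produced the value, as driven by label_data_for_phi) amounts to the following CUDA shape. The block labels, node names, and condition are invented for illustration.

    __device__ int phi_sketch(bool cond_3, int p0) {
        int phi_10;                  // declared before any block body
        if (cond_3) { goto bb_1; } else { goto bb_2; }
    bb_1: {
        int add_7 = p0 + 1;
        phi_10 = add_7;              // predecessor stores into the phi register
        goto bb_3;
    }
    bb_2: {
        int mul_8 = p0 * 2;
        phi_10 = mul_8;
        goto bb_3;
    }
    bb_3:
        return phi_10;               // the Phi node simply names this register
    }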
Node::Phi { @@ -628,7 +628,8 @@ impl GPUContext<'_> { Type::Boolean => { write!( w, - "\t{} = !{};\n", + "{}{} = !{};\n", + tabs, declare_variable, self.get_value(*input, false, false), )?; @@ -636,7 +637,8 @@ impl GPUContext<'_> { ty if ty.is_fixed() => { write!( w, - "\t{} = ~{};\n", + "{}{} = ~{};\n", + tabs, declare_variable, self.get_value(*input, false, false), )?; @@ -647,7 +649,8 @@ impl GPUContext<'_> { ty if ty.is_signed() || ty.is_float() => { write!( w, - "\t{} = -{};\n", + "{}{} = -{};\n", + tabs, declare_variable, self.get_value(*input, false, false), )?; @@ -659,7 +662,8 @@ impl GPUContext<'_> { UnaryOperator::Cast(dst_ty_id) => { write!( w, - "\t{} = static_cast<{}>({});\n", + "{}{} = static_cast<{}>({});\n", + tabs, declare_variable, self.get_type(*dst_ty_id, false), self.get_value(*input, false, false), @@ -672,36 +676,29 @@ impl GPUContext<'_> { match (op, &self.types[self.typing[left.idx()].idx()]) { (BinaryOperator::Rem, Type::Float32) => write!( w, - "\t{} = fmodf({}, {});\n", - declare_variable, - left_val, - right_val, + "{}{} = fmodf({}, {});\n", + tabs, declare_variable, left_val, right_val, )?, (BinaryOperator::Rem, Type::Float64) => write!( w, - "\t{} = fmod({}, {});\n", - declare_variable, - left_val, - right_val, + "{}{} = fmod({}, {});\n", + tabs, declare_variable, left_val, right_val, )?, // Doesn't need special syntax but bool type (BinaryOperator::Or, Type::Boolean) => write!( w, - "\t{} = {} || {};\n", - declare_variable, - left_val, - right_val, + "{}{} = {} || {};\n", + tabs, declare_variable, left_val, right_val, )?, (BinaryOperator::And, Type::Boolean) => write!( w, - "\t{} = {} && {};\n", - declare_variable, - left_val, - right_val, + "{}{} = {} && {};\n", + tabs, declare_variable, left_val, right_val, )?, (op, _) => write!( w, - "\t{} = {} {} {};\n", + "{}{} = {} {} {};\n", + tabs, declare_variable, left_val, match op { @@ -726,11 +723,17 @@ impl GPUContext<'_> { )?, }; } - Node::Ternary {op, first, second, third} => match op { + Node::Ternary { + op, + first, + second, + third, + } => match op { TernaryOperator::Select => { write!( w, - "\t{} = {} ? {} : {};\n", + "{}{} = {} ? {} : {};\n", + tabs, declare_variable, self.get_value(*first, false, false), self.get_value(*second, false, false), @@ -743,30 +746,53 @@ impl GPUContext<'_> { let func_name = self.codegen_intrinsic(intrinsic, ty); write!( w, - "\t{} = {}({});\n", + "{}{} = {}({});\n", + tabs, declare_variable, func_name, self.get_value(args[0], false, false), )?; } Node::Read { collect, indices } => { - let index_ptr_name = self.codegen_indices(*collect, indices); - // If it's a parameter node then copy from global memory, else - // reference from shared memory or registers. + // If it's a parameter node then copy from global memory, else + // from shared memory or registers. if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { - // We parallelize copies from global memory across threads for - // array types, either immediate or nested in the collection. 
- if self.types[self.typing[id.idx()].idx()].is_primitive() { - write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?; - } else { - self.codegen_global_to_shared(id, declare_variable, index_ptr_name, indices.len(), true, w)?; - } + let index_ptr_name = self.codegen_indices(*collect, indices, true); + self.codegen_copy_from_global( + true, + self.typing[id.idx()], + &declare_variable, + &index_ptr_name, + Some(indices.len()), + true, + num_tabs, + w, + )?; } else { - write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?; + let index_ptr_name = self.codegen_indices(*collect, indices,false); + write!(w, "{}{} = {};\n", tabs, declare_variable, index_ptr_name)?; } } - Node::Write {collect: _, data: _, indices: _} => { - // TODO + Node::Write {collect, data, indices} => { + let data_variable = self.get_value(*data, false, false); + // If it's a parameter node then copy to global memory, else + // to shared memory or registers + if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { + let index_ptr_name = self.codegen_indices(*collect, indices, true); + self.codegen_copy_to_from_global( + false, + self.typing[id.idx()], + &data_variable, + &index_ptr_name, + Some(indices.len()), + true, + num_tabs, + w, + )?; + } else { + let index_ptr_name = self.codegen_indices(*collect, indices, false); + write!(w, "{}{} = {};\n", tabs, index_ptr_name, data_variable)?; + } } _ => { panic!("Unsupported node type") @@ -848,7 +874,7 @@ impl GPUContext<'_> { let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { panic!("Expected array type") }; - // For now we do element-wise alignment, later could consider (n-1)d array + // For now we do element-wise alignment, later could consider (n-1)d array // alignment. Then we "allocate" from the single dynamic shared memory buffer // by using and updating the offset. let element_size = format!("sizeof({})", self.get_type(*element_type, false)); @@ -857,18 +883,41 @@ impl GPUContext<'_> { .map(|id| format!("dc{}", id.idx())) .collect::<Vec<_>>() .join("*"); - write!(w, ";\n\talignment = {};\n\tdynamic_shared_offset = + write!( + w, + ";\n\talignment = {};\n\tdynamic_shared_offset = (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t - dynamic_shared_offset += {}", element_size, name, self.get_type(*element_type, false), array_size)?; + dynamic_shared_offset += {}", + element_size, + name, + self.get_type(*element_type, false), + array_size + )?; } } Ok(()) } - fn codegen_global_to_shared(&self, id: NodeID, declare_variable: String, index_ptr_name: String, array_depth: Option<usize>, outermost: bool, w: &mut String) -> Result<(), Error> { - match &self.types[self.typing[id.idx()].idx()] { - Type::Array(_, extents) => { + // Used for reads and writes due to identical logic. data_variable is the + // resulting reference for reads, and is the source for writes. Writes don't + // emit a new reference. 
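A sketch of the loop this helper emits when an array is copied between a global parameter and shared memory, using the blocked thread-stride and trailing barrier described above. The dcN extents, variable names, and the use of blockDim.x for the stride are placeholder assumptions; the generator substitutes its own fixed thread count.

    template <long long dc0, long long dc1>
    __global__ void copy_sketch(const float* p0 /* global source parameter */) {
        extern __shared__ char dynamic_shared[];
        float (*local_6)[dc1] = reinterpret_cast<float (*)[dc1]>(dynamic_shared);
        // Each thread strides over the flattened element range.
        for (long long i = threadIdx.x; i < dc0 * dc1; i += blockDim.x) {
            local_6[(i / dc1) % dc0][i % dc1] = p0[i];
        }
        // The whole block waits before anyone reads the shared copy.
        __syncthreads();
    }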
+ fn codegen_copy_from_global( + &self, + is_read: bool, + type_id: TypeID, + data_variable: &String, + index_ptr_name: &String, + array_depth: Option<usize>, + parallelize: bool, + num_tabs: usize, + w: &mut String, + ) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); + let lhs = if is_read { data_variable } else { index_ptr_name }; + let rhs = if is_read { index_ptr_name } else { data_variable }; + match &self.types[type_id.idx()] { + Type::Array(element_type_id, extents) => { let array_depth = array_depth.unwrap(); let rem_array_size = extents .iter() @@ -878,23 +927,95 @@ impl GPUContext<'_> { .collect::<Vec<_>>() .join("*"); let mut running_div_factor = "1".to_string(); - write!(w, "\tfor (int i = threadIdx.x; i < {}; i += {}) {{\n", rem_array_size, self.kernel_attrs.num_threads)?; let mut indices = vec![]; for i in (array_depth..extents.len()).rev() { - indices.push(format!("[(({}) / ({})) % dc{}]", rem_array_size, running_div_factor, extents[i].idx())); - running_div_factor = format!("{} * {}", running_div_factor, format!("dc{}", extents[i].idx())); + indices.push(format!( + "[(({}) / ({})) % dc{}]", + rem_array_size, + running_div_factor, + extents[i].idx() + )); + running_div_factor = format!( + "{} * {}", + running_div_factor, + format!("dc{}", extents[i].idx()) + ); } let indices_str = indices.join(""); - // TODO: condition by primitive vs collection, if latter then recurse - // with outermost = false - write!(w, "\t\t{}{} = {}{};\n", declare_variable, indices_str, index_ptr_name, indices_str)?; + // Parallelizing only affects loop bounds + let begin_copy = if parallelize { + format!( + "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", + tabs, rem_array_size, self.kernel_attrs.num_threads + ) + } else { + format!("{}for (int i = 0; i < {}; i++) {{\n", tabs, rem_array_size) + }; + write!(w, "{}", begin_copy)?; + self.codegen_copy_to_from_global( + is_read, + *element_type_id, + &format!("{}{}", data_variable, indices_str), + &format!("{}{}", index_ptr_name, indices_str), + None, + false, + num_tabs + 1, + w, + )?; + let end_copy = if parallelize { + format!("{}}}\n{}__syncthreads();\n", tabs, tabs) + } else { + format!("{}}}\n", tabs) + }; + write!(w, "{}", end_copy)?; + } + Type::Product(fields) => { + for field in fields { + self.codegen_copy_to_from_global( + is_read, + *field, + &format!("{}{}", data_variable, field.idx()), + &format!("{}{}", index_ptr_name, field.idx()), + None, + false, + num_tabs + 1, + w, + )?; + } + } + Type::Summation(fields) => { + // First copy the tag + write!(w, "{}{}.tag = {}.tag;\n", tabs, lhs, rhs)?; + // Then copy the active field based on the tag + write!(w, "{}switch({}.tag) {{\n", tabs, rhs)?; + for (variant_idx, field) in fields.iter().enumerate() { + write!(w, "{}\tcase {}: {{\n", tabs, variant_idx)?; + // Recursively copy the field's contents + self.codegen_copy_to_from_global( + is_read, + *field, + &format!("{}.field_{}", data_variable, variant_idx), + &format!("{}.field_{}", index_ptr_name, variant_idx), + None, + false, + num_tabs + 2, + w + )?; + write!(w, "{}\t\tbreak;\n", tabs)?; + write!(w, "{}\t}}\n", tabs)?; + } + write!(w, "{}}}\n", tabs)?; + } + // Primitive types + _ => { + write!(w, "{}{} = {};\n", tabs, lhs, rhs)?; } - // TODO: handle product and summation collections } Ok(()) } - fn codegen_indices(&self, collect: NodeID, indices: &[Index]) -> String { + // Use normal indexing for local collections + fn codegen_indices_local(&self, collect: NodeID, indices: &[Index]) -> String { let mut index_ptr_name = 
format!("{}", self.get_value(collect, false, false)); for index in indices { match index { @@ -905,15 +1026,54 @@ impl GPUContext<'_> { index_ptr_name.push_str(&format!(".field_{}", variant)); } Index::Position(indices) => { - index_ptr_name.push_str(&indices + index_ptr_name.push_str( + &indices + .iter() + .map(|index| format!("[{}]", self.get_value(*index, false, false))) + .collect::<Vec<_>>() + .join(""), + ); + } + } + } + index_ptr_name + } + + // Use arithmetic for global collections as they're accessed as pointers + fn codegen_indices_global(&self, collect: NodeID, indices: &[Index]) -> String { + let mut index_ptr_name = format!("{}[0", self.get_value(collect, false, false)); + let type_id = self.typing[collect.idx()]; + for index in indices { + match index { + Index::Field(field) => { + let offset = (0..*field) + .map(|i| format!("offsetof({}, field_{})", self.get_type(type_id, false), i)) + .collect::<Vec<_>>() + .join(" + "); + index_ptr_name.push_str(&format!(" + {}", offset)); + } + // Variants of summations have zero offset + Index::Variant(_) => {} + Index::Position(array_indices) => { + let Type::Array(_, extents) = &self.types[self.typing[collect.idx()].idx()] else { + panic!("Expected array type") + }; + let mut cumulative_offset = "1 * ".to_string() + extents .iter() - .map(|index| format!("[{}]", self.get_value(*index, false, false))) + .enumerate() + .filter(|(i, _)| *i >= array_indices.len()) + .map(|(_, id)| format!("dc{}", id.idx())) .collect::<Vec<_>>() - .join("")); + .join(" * ") + .as_str(); + for index in array_indices.iter().rev() { + cumulative_offset = format!("{} * ({} + ", cumulative_offset, self.get_value(*index, false, false)); + } + index_ptr_name.push_str(&format!(" + {}{}", cumulative_offset, ")".repeat(array_indices.len()))); } } } - index_ptr_name + format!("{}]", index_ptr_name) } fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { @@ -1050,7 +1210,17 @@ impl GPUContext<'_> { panic!("Parameters shouldn't be re-initialized") } format!("p{}", index) - } else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { + } else if let Node::Write { collect, data: _, indices: _ } = &self.function.nodes[id.idx()] { + if ty { + panic!("Writes shouldn't be initialized, they're replaced with the referenced collection") + } + if make_pointer { + panic!("Writes shouldn't be called as pointer") + } + self.get_value(*collect, false, false) + } else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] + { + // Shmem/register arrays have special formatting let mut declare_array = format!( "{} (*{}{})", self.get_type(*element_type, false), @@ -1077,7 +1247,7 @@ impl GPUContext<'_> { } } - // make_pointer enforces static pointer and not recursive or array pointer: + // make_pointer enforces static pointer and not recursive or array pointer: // multi-d arrays are single pointers with custom indexing. 
fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { -- GitLab From 7cccb612086d3a8663fbafa997e74a6327a4daf2 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Tue, 31 Dec 2024 12:44:10 -0800 Subject: [PATCH 030/109] rw finish --- hercules_cg/src/gpu.rs | 965 +++++++++++++++++++++++++---------------- 1 file changed, 594 insertions(+), 371 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 29135195..3ad9297b 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -3,39 +3,14 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; -use std::hash::{Hash, Hasher}; use std::iter::FromIterator; use self::hercules_ir::*; -#[derive(Debug, Clone)] -struct HashableIndex<'a>(Vec<&'a str>); -impl<'a> FromIterator<&'a String> for HashableIndex<'a> { - fn from_iter<I: IntoIterator<Item = &'a String>>(iter: I) -> Self { - HashableIndex(iter.into_iter().map(|s| s.as_str()).collect()) - } -} -impl<'a> PartialEq for HashableIndex<'a> { - fn eq(&self, other: &Self) -> bool { - self.0 == other.0 - } -} -impl<'a> Eq for HashableIndex<'a> {} -impl<'a> Hash for HashableIndex<'a> { - fn hash<H: Hasher>(&self, state: &mut H) { - self.0.hash(state); - } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -enum MemoryType { - Shared, - Register, -} - /* - * The top level function to compile a Hercules IR function into NVVM IR kernel for - * execution on the GPU. We generate NVVM IR textually, copying from the CPU LLVM approach. + * The top level function to compile a Hercules IR function into CUDA kernel for + * execution on the GPU. We generate CUDA C textually, based on the CPU LLVM + * approach. */ pub fn gpu_codegen<W: Write>( function: &Function, @@ -57,6 +32,37 @@ pub fn gpu_codegen<W: Write>( }; let kernel_attrs = GPUKernelAttrs::default(); + // GPU backend assertions + for ty in types.iter() { + if let Type::Array(type_id, _) = ty { + if let Type::Array(..) 
= types[type_id.idx()] { + panic!("Array element type can't be another array"); + } + } + } + + let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) + .filter(|idx| function.nodes[*idx].is_reduce()) + .map(NodeID::new) + .collect(); + for idx in 0..function.nodes.len() { + if function.nodes[idx].is_join() && reduce_nodes + .iter() + .filter(|reduce_node| match &function.nodes[reduce_node.idx()] { + Node::Reduce { + control, + init: _, + reduct: _, + } => control.idx() == idx, + _ => false, + }) + .count() + == 0 + { + panic!("Join node {} has no reduce nodes", idx); + } + } + // Create fork forward adjacency and join map upfront as part of context let make_fork_structures = || -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, NodeID>) { let mut fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>> = (0..function.nodes.len()) @@ -85,32 +91,6 @@ pub fn gpu_codegen<W: Write>( }; let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); - // Maybe can delete - let map_join_reduce = || -> HashMap<NodeID, Vec<NodeID>> { - let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) - .filter(|idx| function.nodes[*idx].is_reduce()) - .map(NodeID::new) - .collect(); - let mut map_join_reduce = HashMap::new(); - for (_, join) in fork_join_map.iter() { - let reduce_nodes_for_join = reduce_nodes - .iter() - .filter(|reduce_node| match &function.nodes[reduce_node.idx()] { - Node::Reduce { - control, - init: _, - reduct: _, - } => control.idx() == join.idx(), - _ => false, - }) - .copied() - .collect(); - map_join_reduce.insert(*join, reduce_nodes_for_join); - } - map_join_reduce - }; - let join_reduce_map = map_join_reduce(); - let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { let mut label_data_for_phi = HashMap::new(); for (idx, node) in function.nodes.iter().enumerate() { @@ -127,27 +107,6 @@ pub fn gpu_codegen<W: Write>( }; let label_data_for_phi = label_data_for_phi(); - // Since global memory traffic is expensive, we use shared memory and - // registers basically as write-back caches, but we write back due to - // end of scope rather than due to synchronization (which is solved by - // shmem). - // param_cache tracks cache for each parameter by accessed index and - // memory type. Note that indexing is hierarchical, so [a, b] contains - // [a, b, c] and will give a hit upon query of the latter. param_cache is - // only added to for copies from global -> shared or global -> register. - // Writes update the cache, but we track specific indices written in - // param_cache_writes to know what to write back (and avoid redundant - // writes). - let param_cache = vec![ - HashMap::<(HashableIndex<'static>, MemoryType), String>::new(); - function.param_types.len() - ]; - let param_cache_writes = - vec![HashSet::<(HashableIndex<'static>, MemoryType)>::new(); function.param_types.len()]; - // Statically unknown shared memory buffers need to use dynamic offsets from. 
- // the dynamic shared memory buffer - let mut dynamic_shared_offset = "0".to_string(); - let mut ctx = GPUContext { function, types, @@ -162,9 +121,6 @@ pub fn gpu_codegen<W: Write>( fork_forward_adjacency, fork_join_map, label_data_for_phi, - join_reduce_map, - param_cache, - param_cache_writes, }; ctx.codegen_function(w) } @@ -182,7 +138,6 @@ struct GPUKernelParams { struct GPUKernelAttrs { num_blocks: usize, num_threads: usize, - extern_shmem_offset: String, } struct GPUContext<'a> { @@ -199,9 +154,6 @@ struct GPUContext<'a> { fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, fork_join_map: HashMap<NodeID, NodeID>, label_data_for_phi: HashMap<NodeID, Vec<NodeID>>, - join_reduce_map: HashMap<NodeID, Vec<NodeID>>, - param_cache: Vec<HashMap<(HashableIndex, MemoryType), String>>, - param_cache_writes: Vec<HashSet<(HashableIndex, MemoryType)>>, } #[derive(Default, Debug)] @@ -214,7 +166,8 @@ struct CudaGoto { impl GPUContext<'_> { fn codegen_function<W: Write>(&mut self, w: &mut W) -> Result<(), Error> { - // Include all possible includes then macros + // All possible includes followed by macros for intrinsic calls on + // types with no library support write!( w, " @@ -238,18 +191,16 @@ impl GPUContext<'_> { let mut top = String::new(); - // Emit all possible structs - self.codegen_structs(&mut top)?; - // Emit kernel template, signature, and arguments + // Emit kernel signature, arguments, and dynamic shared memory declaration self.codegen_kernel_begin(&mut top)?; - // Need to emit dynamic offsets for extern shmem, we do this by strings. - self.kernel_attrs.extern_shmem_offset = "0".to_string(); - // Emit calculation of all dynamic constants self.codegen_dynamic_constants(&mut top)?; + // Emit all possible struct definitions and dummy pointers for each type. + // These may depend on dynamic constants, for example an array field with + // dynamic constant dims. + self.codegen_type_init(&mut top)?; - // Uses CUDA's goto structure; some control nodes' gen may be moved, eg - // block and thread fork joins. + // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) .map(|idx| { @@ -262,15 +213,14 @@ impl GPUContext<'_> { }) .collect(); - // Assign outermost fork joins to block level. TODO: remove block_sizes + // Assign outermost valid fork joins to block level. TODO: remove block_sizes // if still not needed later let (block_fork_ids, _) = self.codegen_block_creation()?; // Assign inner fork joins to thread level. We do this before block sink // because we need thread size for shared memory optimizations let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1])?; - // Sink logic from outer block fork joins. If it's a write, add - // necessary block-id based condition. + // Sink logic from outer block fork joins. 
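Putting codegen_kernel_begin, codegen_dynamic_constants, and the goto-based basic blocks together, the emitted kernel skeleton is intended to look roughly like the following. The parameter types, dynamic-constant expressions, and block labels are invented for illustration; the real signature and launch bound come from the kernel parameters above.

    template <long long dc_p0, long long dc_p1>
    __global__ void __launch_bounds__(1024) sketch_kernel(char* p0, float p1) {
        extern __shared__ char dynamic_shared[];   // codegen_kernel_begin
        long long dc0 = dc_p0;                     // codegen_dynamic_constants
        long long dc1 = dc_p0 * dc_p1;
        if (dc0 < dc1) { goto bb_1; } else { goto bb_2; }   // If node lowering
    bb_1: {
        goto bb_3;
    }
    bb_2: {
        goto bb_3;
    }
    bb_3:
        return;                                    // Return node lowering
    }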
self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], &mut top, &mut gotos)?; for (i, &fork_id) in block_fork_ids .iter() @@ -291,39 +241,6 @@ impl GPUContext<'_> { Ok(()) } - fn codegen_structs(&self, w: &mut String) -> Result<(), Error> { - for (id, ty) in self.types.iter().enumerate() { - match ty { - Type::Product(ref product_ty_ids) => { - write!(w, "\ntypedef struct Product_{} {{\n", id)?; - for (i, product_ty_id) in product_ty_ids.iter().enumerate() { - write!( - w, - "\t{} field_{};\n", - self.get_type(*product_ty_id, false), - i - )?; - } - write!(w, "}} Product_{};\n", id)?; - } - Type::Summation(ref summation_ty_ids) => { - write!(w, "\ntypedef struct Summation_{} {{\n\t union {{\n", id)?; - for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { - write!( - w, - "\t\t{} field_{};\n", - self.get_type(*summation_ty_id, false), - i - )?; - } - write!(w, "\t}};\n\tuint8_t tag;\n}} Summation_{};\n", id)?; - } - _ => {} - } - } - Ok(()) - } - fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { write!( w, @@ -347,26 +264,7 @@ impl GPUContext<'_> { } else { write!(w, ", ")?; } - write!(w, "{} p{}", self.get_type(*ty, true), idx)?; - } - // We convert originally non-void functions to void functions by adding a - // return parameter. For now we ignore the case where return was derived - // from a parameter through reads and writes, and instead always memcpy. - let return_index = self.function.nodes.iter().position(|node| node.is_return()); - if let Some(return_index) = return_index { - if let Node::Return { - control: _, - data: return_data, - } = &self.function.nodes[return_index] - { - write!( - w, - ", {} return_val", - self.get_type(self.typing[return_data.idx()], true) - )?; - } else { - panic!("Expected return node"); - } + write!(w, "{} p{}", self.get_type(*ty, true, true), idx)?; } // Type is char since it's simplest to use single bytes for indexing, @@ -408,6 +306,78 @@ impl GPUContext<'_> { Ok(()) } + // Emit struct definitions for each typeid of product or summation type. If + // multiple typeids have the same type, they're separately emitted. Might + // not be most elegant, but using typeid is more convenient when instantiating + // than eg searching for index of type in types vector. 
Also emit dummy pointers + // for struct and primitive type ids for possible future use when moving to/from + // global memory + fn codegen_type_init(&self, w: &mut String) -> Result<(), Error> { + for type_id in self.typing.iter() { + let type_id_idx = type_id.idx(); + let ty = &self.types[type_id_idx]; + match ty { + Type::Product(ref product_ty_ids) => { + write!(w, "\ttypedef struct Product_{} {{\n", type_id_idx)?; + for (i, product_ty_id) in product_ty_ids.iter().enumerate() { + write!( + w, + "\t\t{} field_{};\n", + self.get_type(*product_ty_id, false, false), + i + )?; + } + write!(w, "}} Product_{};\n", type_id_idx)?; + write!( + w, + "\tProduct_{}* product_{}_dummy;\n", + type_id_idx, type_id_idx + )?; + } + Type::Summation(ref summation_ty_ids) => { + write!( + w, + "\ttypedef struct Summation_{} {{\n\t\t union {{\n", + type_id_idx + )?; + for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { + write!( + w, + "\t\t\t{} field_{};\n", + self.get_type(*summation_ty_id, false, false), + i + )?; + } + write!( + w, + "\t\t}};\n\t\tuint8_t tag;\n\t}} Summation_{};\n", + type_id_idx + )?; + write!( + w, + "\tSummation_{}* summation_{}_dummy;\n", + type_id_idx, type_id_idx + )?; + } + // Arrays are decomposed into their element type during transfer + // so no need to emit dummy pointers + Type::Array(_, _) => {} + // Primitive types + _ => { + write!( + w, + "\t{} {}_{}_dummy;\n", + convert_type(ty, true), + convert_type(ty, false), + type_id_idx + )?; + } + } + } + + Ok(()) + } + fn codegen_phi_registers<F>(&self, w: &mut String, should_process: F) -> Result<(), Error> where F: Fn(NodeID) -> bool, @@ -426,7 +396,12 @@ impl GPUContext<'_> { Ok(()) } - // Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the forks are parallel reduce forks, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred. + /* + Construct block forks by greedily accepting while: a) each fork join is strictly + nested meaning no other neighbor fork joins, b) the forks are parallel forks, + c) total number of blocks < max_num_blocks, and d) each fork's factor is statically + known. 
+ */ fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> { let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); for (_, children) in self.fork_forward_adjacency.iter() { @@ -442,16 +417,18 @@ impl GPUContext<'_> { // a and b let mut strict_forks = vec![root_forks[0]]; let mut curr_fork = root_forks[0]; - while let Some(join) = self.fork_join_map.get(&curr_fork) { + while self.fork_join_map.contains_key(&curr_fork) { let children = &self.fork_forward_adjacency[&curr_fork]; - if children.len() != 1 || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) { + if children.len() != 1 + || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) + { break; } curr_fork = children[0]; strict_forks.push(curr_fork); } - // c, (stronger version of) d, and e + // c and d let mut valid_block_forks = 0; let mut cumulative_blocks = 1usize; let mut block_fork_sizes = Vec::new(); @@ -474,7 +451,8 @@ impl GPUContext<'_> { valid_block_forks += 1; } - // If limit on number of blocks in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. + // If limit on number of blocks in 1D grid is reached, we could consider 2D + // or 3D grids. Performance is not affected so for now keep it simple with 1D. self.kernel_attrs.num_blocks = cumulative_blocks; let valid_block_forks = strict_forks .into_iter() @@ -578,11 +556,8 @@ impl GPUContext<'_> { .iter() .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) { - self.codegen_data_node( - *id, - 1, - &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body, - )?; + let body = &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body; + self.codegen_data_node(*id, body, 1)?; visited.insert(id); } else { worklist.push_back(id); @@ -591,17 +566,64 @@ impl GPUContext<'_> { // 3. 
Emit control flow for control_node in control_nodes_between { - self.codegen_control_node(control_node, w)?; + let term = &mut gotos.get_mut(&self.bbs[control_node.idx()]).unwrap().term; + self.codegen_control_node(control_node, term, 1)?; } Ok(()) } - fn codegen_control_node(&self, id: NodeID, w: &mut String) -> Result<(), Error> { + fn codegen_control_node( + &self, + id: NodeID, + w: &mut String, + num_tabs: usize, + ) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); + match &self.function.nodes[id.idx()] { + Node::Start + | Node::Region { preds: _ } + | Node::Projection { + control: _, + selection: _, + } => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + write!(w, "{}goto {}\n", tabs, self.get_block_name(succ))?; + } + Node::If { control: _, cond } => { + let mut succs = self.control_subgraph.succs(id); + let succ1 = succs.next().unwrap(); + let succ2 = succs.next().unwrap(); + write!( + w, + "{}if ({}) {{\n", + tabs, + self.get_value(*cond, false, false) + )?; + write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; + write!(w, "{}}} else {{\n", tabs)?; + write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ2))?; + write!(w, "{}}}\n", tabs)?; + } + Node::Fork { + control: _, + factors: _, + } => {} + Node::Join { control: _ } => {} + Node::Return { + control: _, + data: _, + } => { + write!(w, "{}return;\n", tabs)?; + } + _ => { + panic!("Unsupported control node type") + } + } Ok(()) } - fn codegen_data_node(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> { + fn codegen_data_node(&self, id: NodeID, w: &mut String, num_tabs: usize) -> Result<(), Error> { // For now only used shared memory when creating an array let declare_variable = self.get_value(id, true, false).to_string(); let tabs = "\t".repeat(num_tabs); @@ -665,7 +687,7 @@ impl GPUContext<'_> { "{}{} = static_cast<{}>({});\n", tabs, declare_variable, - self.get_type(*dst_ty_id, false), + self.get_type(*dst_ty_id, false, false), self.get_value(*input, false, false), )?; } @@ -757,45 +779,54 @@ impl GPUContext<'_> { // If it's a parameter node then copy from global memory, else // from shared memory or registers. 
if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { - let index_ptr_name = self.codegen_indices(*collect, indices, true); - self.codegen_copy_from_global( - true, + write!(w, "{};\n", declare_variable); + let is_char = self.is_parameter_char(self.typing[collect.idx()]); + let global_collect = self.codegen_global_collect(*collect, indices, is_char); + self.codegen_copy_from_to_global( + false, self.typing[id.idx()], &declare_variable, - &index_ptr_name, - Some(indices.len()), + &global_collect, + indices, true, - num_tabs, + is_char, w, + num_tabs, )?; } else { - let index_ptr_name = self.codegen_indices(*collect, indices,false); - write!(w, "{}{} = {};\n", tabs, declare_variable, index_ptr_name)?; + let local_collect = self.codegen_local_collect(*collect, indices); + write!(w, "{}{} = {};\n", tabs, declare_variable, local_collect)?; } } - Node::Write {collect, data, indices} => { + Node::Write { + collect, + data, + indices, + } => { let data_variable = self.get_value(*data, false, false); // If it's a parameter node then copy to global memory, else // to shared memory or registers if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { - let index_ptr_name = self.codegen_indices(*collect, indices, true); - self.codegen_copy_to_from_global( - false, + let is_char = self.is_parameter_char(self.typing[collect.idx()]); + let global_collect = self.codegen_global_collect(*collect, indices, is_char); + self.codegen_copy_from_to_global( + true, self.typing[id.idx()], &data_variable, - &index_ptr_name, - Some(indices.len()), + &global_collect, + &indices, true, - num_tabs, + is_char, w, + num_tabs, )?; } else { - let index_ptr_name = self.codegen_indices(*collect, indices, false); - write!(w, "{}{} = {};\n", tabs, index_ptr_name, data_variable)?; + let local_collect = self.codegen_local_collect(*collect, indices); + write!(w, "{}{} = {};\n", tabs, local_collect, data_variable)?; } } _ => { - panic!("Unsupported node type") + panic!("Unsupported data node type") } } if let Some(phis) = self.label_data_for_phi.get(&id) { @@ -811,125 +842,50 @@ impl GPUContext<'_> { Ok(()) } - // matmul detection- only called if einsum detected - fn matmul_detection(&self) -> Result<(), Error> { - Ok(()) - } - - // convolution detection- only called if einsum detected - fn convolution_detection(&self) -> Result<(), Error> { - Ok(()) - } - - // Standalone function allows us to handle recursive initialization for - // product and summation collections - fn codegen_constant( + // Handles reads/writes from global memory aka parameter node. We tack local + // (shmem + reg) array indexing and struct field access onto data, and tack + // global pointer offset onto global. Thread parallelization is used only for + // shared memory arrays. is_char indicates the global is a char type and we + // need to multiply the global index by the element size. 
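To make the is_char path concrete: a read of one struct field out of a char* parameter is meant to lower to byte arithmetic on the parameter pointer followed by a reinterpret_cast through the per-type dummy pointer emitted by codegen_type_init. The struct layout, names, and the flat i * sizeof(...) element offset below are illustrative assumptions, not literal backend output.

    #include <stddef.h>   // offsetof, as in the emitted preamble

    typedef struct Product_3 {
        float field_0;
        int   field_1;
    } Product_3;

    __device__ int read_field_sketch(char* p0, long long i) {
        // Byte offset: element i of an array of Product_3, then field_1.
        char* addr = p0 + i * sizeof(Product_3) + offsetof(Product_3, field_1);
        // Primitive leaf: reinterpret through the dummy pointer and dereference.
        int* int_5_dummy = reinterpret_cast<int*>(addr);
        return *int_5_dummy;
    }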
+ fn codegen_copy_from_to_global( &self, - declare_variable: String, - name: String, - cons_id: ConstantID, - w: &mut String, - ) -> Result<(), Error> { - write!(w, "\t{}", declare_variable)?; - match &self.constants[cons_id.idx()] { - Constant::Boolean(val) => write!(w, " = {};\n", val)?, - Constant::Integer8(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger8(val) => write!(w, " = {};\n", val)?, - Constant::Integer16(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger16(val) => write!(w, " = {};\n", val)?, - Constant::Integer32(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger32(val) => write!(w, " = {}ul;\n", val)?, - Constant::Integer64(val) => write!(w, " = {}ll;\n", val)?, - Constant::UnsignedInteger64(val) => write!(w, " = {}ull;\n", val)?, - Constant::Float32(val) => write!(w, " = {}f;\n", val)?, - Constant::Float64(val) => write!(w, " = {};\n", val)?, - Constant::Product(_, fields) => { - write!(w, ";\n")?; - for (i, field) in fields.iter().enumerate() { - // We don't emit array fields and size was set by struct definition - if !self.constants[field.idx()].is_array() { - // Don't need type declaration for the fields - self.codegen_constant( - format!("{}.field_{}", name, i), - format!("{}.field_{}", name, i), - *field, - w, - )?; - } - } - } - Constant::Summation(_, variant, field) => { - write!(w, ";\n\t{}.tag = {};\n", name, variant)?; - // See two comments in Constant::Product - if !self.constants[field.idx()].is_array() { - self.codegen_constant( - format!("\t{}.field_{}", name, variant), - format!("\t{}.field_{}", name, variant), - *field, - w, - )?; - } - } - Constant::Array(type_id) => { - let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { - panic!("Expected array type") - }; - // For now we do element-wise alignment, later could consider (n-1)d array - // alignment. Then we "allocate" from the single dynamic shared memory buffer - // by using and updating the offset. - let element_size = format!("sizeof({})", self.get_type(*element_type, false)); - let array_size = extents - .iter() - .map(|id| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join("*"); - write!( - w, - ";\n\talignment = {};\n\tdynamic_shared_offset = - (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = - reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t - dynamic_shared_offset += {}", - element_size, - name, - self.get_type(*element_type, false), - array_size - )?; - } - } - Ok(()) - } - - // Used for reads and writes due to identical logic. data_variable is the - // resulting reference for reads, and is the source for writes. Writes don't - // emit a new reference. 
- fn codegen_copy_from_global( - &self, - is_read: bool, + is_write: bool, type_id: TypeID, - data_variable: &String, - index_ptr_name: &String, - array_depth: Option<usize>, + data: &String, + global: &String, + indices: &[Index], parallelize: bool, - num_tabs: usize, + is_char: bool, w: &mut String, + num_tabs: usize, ) -> Result<(), Error> { let tabs = "\t".repeat(num_tabs); - let lhs = if is_read { data_variable } else { index_ptr_name }; - let rhs = if is_read { index_ptr_name } else { data_variable }; match &self.types[type_id.idx()] { Type::Array(element_type_id, extents) => { - let array_depth = array_depth.unwrap(); - let rem_array_size = extents - .iter() - .enumerate() - .filter(|(i, _)| *i >= array_depth) - .map(|(_, id)| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join("*"); + let Index::Position(array_indices) = &indices[0] else { + panic!("Expected position index for array access") + }; + if matches!(self.types[element_type_id.idx()], Type::Array(..)) { + panic!("Nested arrays are not supported"); + } + let rem_array_size = { + let s = extents + .iter() + .enumerate() + .filter(|(i, _)| *i >= array_indices.len()) + .map(|(_, id)| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join(" * "); + if s.is_empty() { + "1".to_string() + } else { + s + } + }; let mut running_div_factor = "1".to_string(); - let mut indices = vec![]; - for i in (array_depth..extents.len()).rev() { - indices.push(format!( + let mut level_indices_str = "".to_string(); + for i in (array_indices.len()..extents.len()).rev() { + level_indices_str.push_str(&format!( "[(({}) / ({})) % dc{}]", rem_array_size, running_div_factor, @@ -941,7 +897,6 @@ impl GPUContext<'_> { format!("dc{}", extents[i].idx()) ); } - let indices_str = indices.join(""); // Parallelizing only affects loop bounds let begin_copy = if parallelize { format!( @@ -952,15 +907,25 @@ impl GPUContext<'_> { format!("{}for (int i = 0; i < {}; i++) {{\n", tabs, rem_array_size) }; write!(w, "{}", begin_copy)?; - self.codegen_copy_to_from_global( - is_read, + let new_global = if is_char { + format!( + "{} + i * sizeof({})", + global, + self.get_type(*element_type_id, false, false) + ) + } else { + format!("{} + i", global) + }; + self.codegen_copy_from_to_global( + is_write, *element_type_id, - &format!("{}{}", data_variable, indices_str), - &format!("{}{}", index_ptr_name, indices_str), - None, + &format!("{}{}", data, level_indices_str), + &new_global, + &indices[1..], false, - num_tabs + 1, + is_char, w, + num_tabs + 1, )?; let end_copy = if parallelize { format!("{}}}\n{}__syncthreads();\n", tabs, tabs) @@ -970,53 +935,124 @@ impl GPUContext<'_> { write!(w, "{}", end_copy)?; } Type::Product(fields) => { - for field in fields { - self.codegen_copy_to_from_global( - is_read, - *field, - &format!("{}{}", data_variable, field.idx()), - &format!("{}{}", index_ptr_name, field.idx()), - None, + if !is_char { + panic!("Product type must be char addressed") + } + if indices.is_empty() { + let dummy_var = format!("product_{}_dummy", type_id.idx()); + let type_name = self.get_type(type_id, false, false); + write!( + w, + "{}{} = reinterpret_cast<{}*>({});\n", + tabs, dummy_var, type_name, global + )?; + let dummy_ptr = format!("*{}", dummy_var); + write!( + w, + "{}{} = {};\n", + tabs, + if is_write { &dummy_ptr } else { data }, + if is_write { data } else { &dummy_ptr } + )?; + } else { + let Index::Field(field_index) = &indices[0] else { + panic!("Expected field index for product access") + }; + let offset = (0..*field_index) + .map(|i| 
self.get_size(fields[i])) + .sum::<usize>(); + let new_global = format!("{} + {}", global, offset); + let new_data = format!("{}.field_{}", data, *field_index); + self.codegen_copy_from_to_global( + is_write, + fields[*field_index], + &new_data, + &new_global, + &indices[1..], false, - num_tabs + 1, + is_char, w, + num_tabs + 1, )?; } } Type::Summation(fields) => { - // First copy the tag - write!(w, "{}{}.tag = {}.tag;\n", tabs, lhs, rhs)?; - // Then copy the active field based on the tag - write!(w, "{}switch({}.tag) {{\n", tabs, rhs)?; - for (variant_idx, field) in fields.iter().enumerate() { - write!(w, "{}\tcase {}: {{\n", tabs, variant_idx)?; - // Recursively copy the field's contents - self.codegen_copy_to_from_global( - is_read, - *field, - &format!("{}.field_{}", data_variable, variant_idx), - &format!("{}.field_{}", index_ptr_name, variant_idx), - None, + if !is_char { + panic!("Summation type must be char addressed") + } + if indices.is_empty() { + let dummy_var = format!("summation_{}_dummy", type_id.idx()); + let type_name = self.get_type(type_id, false, false); + write!( + w, + "{}{} = reinterpret_cast<{}*>({});\n", + tabs, dummy_var, type_name, global + )?; + let dummy_ptr = format!("*{}", dummy_var); + write!( + w, + "{}{} = {};\n", + tabs, + if is_write { &dummy_ptr } else { data }, + if is_write { data } else { &dummy_ptr } + )?; + } else { + // Since all variants are 0-byte offset, the global index + // remains unchanged. + let Index::Variant(variant_index) = &indices[0] else { + panic!("Expected variant index for summation access") + }; + let new_data = format!("{}.field_{}", data, *variant_index); + self.codegen_copy_from_to_global( + is_write, + fields[*variant_index], + &new_data, + &global, + &indices[1..], false, - num_tabs + 2, - w + is_char, + w, + num_tabs + 1, )?; - write!(w, "{}\t\tbreak;\n", tabs)?; - write!(w, "{}\t}}\n", tabs)?; } - write!(w, "{}}}\n", tabs)?; } // Primitive types _ => { - write!(w, "{}{} = {};\n", tabs, lhs, rhs)?; + if is_char { + let type_name = self.get_type(type_id, false, false); + let dummy_var = format!("{}_{}_dummy", type_name, type_id.idx()); + write!( + w, + "{}{} = reinterpret_cast<{}*>({});\n", + tabs, dummy_var, type_name, global + )?; + let dummy_ptr = format!("*{}", dummy_var); + write!( + w, + "{}{} = {};\n", + tabs, + if is_write { &dummy_ptr } else { data }, + if is_write { data } else { &dummy_ptr } + )?; + } else { + let global_ptr = format!("*({})", global); + write!( + w, + "{}{} = {};\n", + tabs, + if is_write { &global_ptr } else { data }, + if is_write { data } else { &global_ptr } + )?; + } } } Ok(()) } - // Use normal indexing for local collections - fn codegen_indices_local(&self, collect: NodeID, indices: &[Index]) -> String { - let mut index_ptr_name = format!("{}", self.get_value(collect, false, false)); + // Read/writes to local collections consist of local name + array indexing + // and struct field access. 
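To make the local/global split concrete: a local collection is addressed with ordinary indexing and field selection, while a global one is addressed as a byte offset built from offsetof-style field offsets plus a flattened element index scaled by the element size. A hedged sketch, with invented names (local_arr, p0, i, j, dc1) and the local index flattened here for simplicity:

    // Local (shared memory / registers): plain indexing plus field access.
    float a = local_arr[i * dc1 + j].field_0;

    // Global (char* parameter): the same access expressed as byte arithmetic.
    float b = *reinterpret_cast<float*>(
        p0 + (i * dc1 + j) * sizeof(Product_3) + offsetof(Product_3, field_0));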
+ fn codegen_local_collect(&self, collect: NodeID, indices: &[Index]) -> String { + let mut index_ptr_name = "".to_string(); for index in indices { match index { Index::Field(field) => { @@ -1036,44 +1072,152 @@ impl GPUContext<'_> { } } } - index_ptr_name + let name = self.get_value(collect, false, false); + format!("{} + {}", name, index_ptr_name) } - // Use arithmetic for global collections as they're accessed as pointers - fn codegen_indices_global(&self, collect: NodeID, indices: &[Index]) -> String { - let mut index_ptr_name = format!("{}[0", self.get_value(collect, false, false)); + // Read/writes to global collections consist of global name + pointer offset. + fn codegen_global_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { + let mut index_ptr_name = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { match index { - Index::Field(field) => { + // Sum the offset of prior fields in bytes + Index::Field(field) => { let offset = (0..*field) - .map(|i| format!("offsetof({}, field_{})", self.get_type(type_id, false), i)) + .map(|i| { + format!( + "offsetof({}, field_{})", + self.get_type(type_id, false, false), + i + ) + }) .collect::<Vec<_>>() .join(" + "); - index_ptr_name.push_str(&format!(" + {}", offset)); + if *field > 0 { + index_ptr_name.push_str(&format!(" + {}", offset)); + } } // Variants of summations have zero offset Index::Variant(_) => {} + // Convert multi-d array index to 1-d index, and optionally + // convert to single-byte index by multiplying by element size Index::Position(array_indices) => { - let Type::Array(_, extents) = &self.types[self.typing[collect.idx()].idx()] else { + let Type::Array(element_type, extents) = + &self.types[self.typing[collect.idx()].idx()] + else { panic!("Expected array type") }; - let mut cumulative_offset = "1 * ".to_string() + extents - .iter() - .enumerate() - .filter(|(i, _)| *i >= array_indices.len()) - .map(|(_, id)| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join(" * ") - .as_str(); + let mut cumulative_offset = "1 * ".to_string() + + extents + .iter() + .enumerate() + .filter(|(i, _)| *i >= array_indices.len()) + .map(|(_, id)| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join(" * ") + .as_str(); for index in array_indices.iter().rev() { - cumulative_offset = format!("{} * ({} + ", cumulative_offset, self.get_value(*index, false, false)); + cumulative_offset = format!( + "{} * ({} + ", + cumulative_offset, + self.get_value(*index, false, false) + ); + } + index_ptr_name.push_str(&format!( + " + {}{}", + cumulative_offset, + ")".repeat(array_indices.len()) + )); + if is_char { + let element_size = + format!("sizeof({})", self.get_type(*element_type, false, false)); + index_ptr_name.push_str(&format!(" * {}", element_size)); } - index_ptr_name.push_str(&format!(" + {}{}", cumulative_offset, ")".repeat(array_indices.len()))); } } } - format!("{}]", index_ptr_name) + let name = self.get_value(collect, false, false); + format!("{} + {}", name, index_ptr_name) + } + + // Standalone function allows us to handle recursive initialization for + // product and summation collections + fn codegen_constant( + &self, + declare_variable: String, + name: String, + cons_id: ConstantID, + w: &mut String, + ) -> Result<(), Error> { + write!(w, "\t{}", declare_variable)?; + match &self.constants[cons_id.idx()] { + Constant::Boolean(val) => write!(w, " = {};\n", val)?, + Constant::Integer8(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger8(val) => write!(w, " = 
{};\n", val)?, + Constant::Integer16(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger16(val) => write!(w, " = {};\n", val)?, + Constant::Integer32(val) => write!(w, " = {};\n", val)?, + Constant::UnsignedInteger32(val) => write!(w, " = {}ul;\n", val)?, + Constant::Integer64(val) => write!(w, " = {}ll;\n", val)?, + Constant::UnsignedInteger64(val) => write!(w, " = {}ull;\n", val)?, + Constant::Float32(val) => write!(w, " = {}f;\n", val)?, + Constant::Float64(val) => write!(w, " = {};\n", val)?, + Constant::Product(_, fields) => { + write!(w, ";\n")?; + for (i, field) in fields.iter().enumerate() { + // We don't emit array fields and size was set by struct definition + if !self.constants[field.idx()].is_array() { + // Don't need type declaration for the fields + self.codegen_constant( + format!("{}.field_{}", name, i), + format!("{}.field_{}", name, i), + *field, + w, + )?; + } + } + } + Constant::Summation(_, variant, field) => { + write!(w, ";\n\t{}.tag = {};\n", name, variant)?; + // See two comments in Constant::Product + if !self.constants[field.idx()].is_array() { + self.codegen_constant( + format!("\t{}.field_{}", name, variant), + format!("\t{}.field_{}", name, variant), + *field, + w, + )?; + } + } + Constant::Array(type_id) => { + let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { + panic!("Expected array type") + }; + // For now we do element-wise alignment, later could consider (n-1)d array + // alignment. Then we "allocate" from the single dynamic shared memory buffer + // by using and updating the offset. + let element_size = + format!("sizeof({})", self.get_type(*element_type, false, false)); + let array_size = extents + .iter() + .map(|id| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join("*"); + write!( + w, + ";\n\talignment = {};\n\tdynamic_shared_offset = + (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = + reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t + dynamic_shared_offset += {}", + element_size, + name, + self.get_type(*element_type, false, false), + array_size + )?; + } + } + Ok(()) } fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { @@ -1191,6 +1335,66 @@ impl GPUContext<'_> { func_name.to_string() } + // Check if a parameter should be represented as char*. Must be a product, + // summation, or array of product/summation types. This should only be + // called on parameters. 
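Before the char* parameter helper below, one note on the Constant::Array arm above: array constants are not initialized element-wise, they are carved out of the kernel's single dynamic shared-memory buffer by an alignment-rounded bump allocation. A sketch of that pattern, assuming a dc0 x dc1 float array; the preamble names mirror the kernel-begin code, and the offset bump here includes the element size for clarity:

    extern __shared__ char dynamic_shared[];
    uint64_t dynamic_shared_offset = 0;
    size_t alignment;

    alignment = sizeof(float);
    dynamic_shared_offset =
        (dynamic_shared_offset + alignment - 1) / alignment * alignment;
    float* arr = reinterpret_cast<float*>(&dynamic_shared[dynamic_shared_offset]);
    dynamic_shared_offset += sizeof(float) * dc0 * dc1;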
+ fn is_parameter_char(&self, type_id: TypeID) -> bool { + match &self.types[type_id.idx()] { + Type::Product(_) | Type::Summation(_) => true, + Type::Array(element_type, _) => self.is_parameter_char(*element_type), + _ => false, + } + } + + // matmul detection- only called if einsum detected + fn matmul_detection(&self) -> Result<(), Error> { + Ok(()) + } + + // convolution detection- only called if einsum detected + fn convolution_detection(&self) -> Result<(), Error> { + Ok(()) + } + + fn get_size(&self, type_id: TypeID) -> usize { + match &self.types[type_id.idx()] { + Type::Array(element_type, extents) => { + let element_alignment = self.get_alignment(*element_type); + extents + .iter() + .try_fold(element_alignment, |acc, &extent| { + evaluate_dynamic_constant(extent, self.dynamic_constants) + .map(|val| acc.saturating_mul(val)) + }) + .unwrap_or(0) + } + _ => self.get_alignment(type_id), + } + } + + fn get_alignment(&self, type_id: TypeID) -> usize { + match &self.types[type_id.idx()] { + Type::Array(element_type, _) => self.get_alignment(*element_type), + Type::Product(fields) => fields + .iter() + .map(|field| self.get_alignment(*field)) + .sum::<usize>(), + Type::Summation(fields) => { + fields + .iter() + .map(|field| self.get_alignment(*field)) + .max() + .unwrap_or(0) + + 1 + } + Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, + Type::Integer16 | Type::UnsignedInteger16 => 2, + Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, + Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8, + _ => panic!("Unsupported type for alignment"), + } + } + fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Result<usize, Error> { factors.iter().try_fold(1usize, |acc, &factor_id| { evaluate_dynamic_constant(factor_id, self.dynamic_constants) @@ -1199,6 +1403,10 @@ impl GPUContext<'_> { }) } + fn get_block_name(&self, id: NodeID) -> String { + format!("bb_{}", id.idx()) + } + fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { if ty { @@ -1210,7 +1418,12 @@ impl GPUContext<'_> { panic!("Parameters shouldn't be re-initialized") } format!("p{}", index) - } else if let Node::Write { collect, data: _, indices: _ } = &self.function.nodes[id.idx()] { + } else if let Node::Write { + collect, + data: _, + indices: _, + } = &self.function.nodes[id.idx()] + { if ty { panic!("Writes shouldn't be initialized, they're replaced with the referenced collection") } @@ -1218,12 +1431,13 @@ impl GPUContext<'_> { panic!("Writes shouldn't be called as pointer") } self.get_value(*collect, false, false) - } else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] + } else if ty + && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { // Shmem/register arrays have special formatting let mut declare_array = format!( "{} (*{}{})", - self.get_type(*element_type, false), + self.get_type(*element_type, false, false), self.function.nodes[id.idx()].lower_case_name(), id.idx() ); @@ -1234,7 +1448,7 @@ impl GPUContext<'_> { } else if ty { format!( "{} {}{}", - self.get_type(self.typing[id.idx()], make_pointer), + self.get_type(self.typing[id.idx()], make_pointer, false), self.function.nodes[id.idx()].lower_case_name(), id.idx() ) @@ -1247,37 +1461,46 @@ impl GPUContext<'_> { } } - // make_pointer enforces static pointer and not recursive or array pointer: - // multi-d arrays are single pointers with custom 
indexing. - fn get_type(&self, id: TypeID, make_pointer: bool) -> String { + fn get_type(&self, id: TypeID, make_pointer: bool, is_global: bool) -> String { match &self.types[id.idx()] { + // Product and summation collections are char* for byte-addressability + // since we can have variable type fields Type::Product(_) => { - format!( - "Product_{}{}", - id.idx(), - if make_pointer { "*" } else { "" } - ) + if make_pointer { + "char*".to_string() + } else if is_global { + "char".to_string() + } else { + format!("Product_{}", id.idx()) + } } Type::Summation(_) => { - format!( - "Summation_{}{}", - id.idx(), - if make_pointer { "*" } else { "" } - ) + if make_pointer { + "char*".to_string() + } else if is_global { + "char".to_string() + } else { + format!("Summation_{}", id.idx()) + } } Type::Array(element_type, extents) => { // This suffix lets us work with references of dynamic shared memory // and use n-d array indexing. - let mut suffix = "(*)".to_string(); - if extents.len() > 1 { - for extent in extents.iter().skip(1) { - suffix.push_str(&format!("[dc{}]", extent.idx())); - } - } format!( "{}{}", - self.get_type(*element_type, false), - if make_pointer { "*" } else { &suffix } + self.get_type(*element_type, false, is_global), + if make_pointer { + "*".to_string() + } else { + format!( + "(*){}", + extents + .iter() + .skip(1) + .map(|extent| format!("[dc{}]", extent.idx())) + .collect::<String>() + ) + } ) } _ => convert_type(&self.types[id.idx()], make_pointer), @@ -1285,7 +1508,7 @@ impl GPUContext<'_> { } } -// TODO: run this at end and add const qualifier where applicable; moar dtypes float8, float16, bfloat16 +// TODO: Add float8, float16, bfloat16 dtypes if they come fn convert_type(ty: &Type, make_pointer: bool) -> String { let mut result = match ty { Type::Boolean => "bool".to_string(), -- GitLab From dd116c215799ee5a8ce7195dac52ccebe6c6eb91 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 3 Jan 2025 13:55:43 -0800 Subject: [PATCH 031/109] before i blow it up --- .gitignore | 2 + hercules_cg/src/gpu.rs | 1427 ++++++++++++++++----------- hercules_ir/src/ir.rs | 1 - juno_samples/matmul/src/matmul.hbin | Bin 0 -> 1323 bytes juno_samples/matmul/src/matmul.pdf | Bin 0 -> 88675 bytes 5 files changed, 856 insertions(+), 574 deletions(-) create mode 100644 juno_samples/matmul/src/matmul.hbin create mode 100644 juno_samples/matmul/src/matmul.pdf diff --git a/.gitignore b/.gitignore index 22c9343e..45f2e61b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ .*.swp .vscode *_env + +juno_samples/matmul/src/matmul_indented.jn diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 3ad9297b..768324ca 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -4,13 +4,16 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; use std::iter::FromIterator; +use std::os::unix::thread; + +use bitvec::field; use self::hercules_ir::*; /* - * The top level function to compile a Hercules IR function into CUDA kernel for - * execution on the GPU. We generate CUDA C textually, based on the CPU LLVM - * approach. + * The top level function to compile a Hercules IR function into CUDA + * kernel for execution on the GPU. We generate CUDA C textually, based + * on the CPU LLVM approach. 
*/ pub fn gpu_codegen<W: Write>( function: &Function, @@ -20,24 +23,41 @@ pub fn gpu_codegen<W: Write>( reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, + antideps: &Vec<(NodeID, NodeID)>, bbs: &Vec<NodeID>, + collection_objects: &FunctionCollectionObjects, w: &mut W, ) -> Result<(), Error> { - // Temporary hardcoded values - let kernel_params = GPUKernelParams { - max_num_blocks: 1024, - max_num_threads: 1024, - threads_per_warp: 32, - num_smps: 60, - }; - let kernel_attrs = GPUKernelAttrs::default(); + /* + * We assert the following: + * - Array element type can't be another array + * - Any array field in a struct must have known size + * - Fork node must have >= 1 reduce nodes + * - If the returned data type is a collection, it must have + * originated from a parameter. Technically could extend to + * multiple parameters but we aren't going to. + * + * We don't assert but assume the following: + * - Global memory can't be used in a phi or select node + * - max_num_blocks is within constraint of 1D grid size. This can be + * relaxed if we want to support larger grids. + */ - // GPU backend assertions for ty in types.iter() { - if let Type::Array(type_id, _) = ty { - if let Type::Array(..) = types[type_id.idx()] { - panic!("Array element type can't be another array"); + match ty { + Type::Array(type_id, _) => { + if let Type::Array(..) = types[type_id.idx()] { + panic!("Array element type can't be another array"); + } + } + Type::Product(type_ids) | Type::Summation(type_ids) => { + for type_id in type_ids.iter() { + if let Type::Array(_, extents) = &types[type_id.idx()] && multiply_dynamic_constants(dynamic_constants, &extents).is_none() { + panic!("Array field in product msut have known size") + } + } } + _ => {} } } @@ -45,51 +65,91 @@ pub fn gpu_codegen<W: Write>( .filter(|idx| function.nodes[*idx].is_reduce()) .map(NodeID::new) .collect(); + + let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); + for reduce_node in &reduce_nodes { + if let Node::Reduce { + control, + init: _, + reduct: _, + } = &function.nodes[reduce_node.idx()] + { + match function.nodes[control.idx()] { + Node::Join { + control: fork_node, .. 
+ } => { + fork_reduce_map + .entry(fork_node) + .or_default() + .push(*reduce_node); + } + Node::Region { preds: _ } => { + // TODO: map region node to fork node + } + _ => { + panic!("Reduce's control must be a join or region node"); + } + } + } + } for idx in 0..function.nodes.len() { - if function.nodes[idx].is_join() && reduce_nodes - .iter() - .filter(|reduce_node| match &function.nodes[reduce_node.idx()] { - Node::Reduce { - control, - init: _, - reduct: _, - } => control.idx() == idx, - _ => false, - }) - .count() - == 0 + if function.nodes[idx].is_fork() + && fork_reduce_map + .get(&NodeID::new(idx)) + .map_or(true, |reduces| reduces.is_empty()) { panic!("Join node {} has no reduce nodes", idx); } } - // Create fork forward adjacency and join map upfront as part of context - let make_fork_structures = || -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, NodeID>) { - let mut fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>> = (0..function.nodes.len()) - .filter(|idx| function.nodes[*idx].is_fork()) - .map(|idx| (NodeID::new(idx), vec![])) - .collect(); - let mut fork_join_map = HashMap::new(); - let mut queued_nodes = VecDeque::new(); - - for (fork_node, children) in fork_forward_adjacency.iter_mut() { - queued_nodes.push_back(*fork_node); - while !queued_nodes.is_empty() { - let node = queued_nodes.pop_front().unwrap(); - for child in control_subgraph.succs(node) { - if function.nodes[child.idx()].is_fork() { - children.push(child); - } else if function.nodes[child.idx()].is_join() { - fork_join_map.insert(*fork_node, child); - } else { - queued_nodes.push_back(child); + let (return_node_id, data_node_id) = { + let pos = function + .nodes + .iter() + .position(|node| { + matches!( + node, + Node::Return { + control: _, + data: _ } - } + ) + }) + .expect("Function must have a return node"); + let Node::Return { control: _, data } = &function.nodes[pos] else { + panic!("Return node must be a return node"); + }; + (NodeID::new(pos), *data) + }; + + let return_type_id = &typing[return_node_id.idx()]; + let return_type = &types[return_type_id.idx()]; + if return_type.is_array() || return_type.is_product() || return_type.is_summation() { + let objects = &collection_objects.objects(data_node_id); + if objects.len() > 1 { + let origin = collection_objects.origin(objects[0]); + if !objects + .iter() + .all(|obj| collection_objects.origin(*obj) == origin) + { + panic!( + "Returned data node {} has multiple collection objects with different origins", + data_node_id.idx() + ); + } + if !matches!(origin, CollectionObjectOrigin::Parameter(..)) { + panic!("Returns collection object that did not originate from a parameter"); } } - (fork_forward_adjacency, fork_join_map) + } + + // Temporary hardcoded values + let kernel_params = &GPUKernelParams { + max_num_blocks: 1024, + max_num_threads: 1024, + threads_per_warp: 32, + greedy_associative_thresh: 32, }; - let (fork_forward_adjacency, fork_join_map) = make_fork_structures(); let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { let mut label_data_for_phi = HashMap::new(); @@ -105,9 +165,9 @@ pub fn gpu_codegen<W: Write>( } label_data_for_phi }; - let label_data_for_phi = label_data_for_phi(); + let label_data_for_phi = &label_data_for_phi(); - let mut ctx = GPUContext { + let ctx = GPUContext { function, types, constants, @@ -115,29 +175,23 @@ pub fn gpu_codegen<W: Write>( reverse_postorder, typing, control_subgraph, + antideps, bbs, kernel_params, - kernel_attrs, - fork_forward_adjacency, - fork_join_map, + fork_reduce_map, 
label_data_for_phi, + return_type_id, }; ctx.codegen_function(w) } -// Fixed prior to codegen +// Kernel parameters that are fixed prior to codegen. See description of +// greedy_associative_thresh in codegen_function. struct GPUKernelParams { max_num_blocks: usize, max_num_threads: usize, threads_per_warp: usize, - num_smps: usize, -} - -// Set during codegen -#[derive(Default)] -struct GPUKernelAttrs { - num_blocks: usize, - num_threads: usize, + greedy_associative_thresh: usize, } struct GPUContext<'a> { @@ -148,24 +202,23 @@ struct GPUContext<'a> { reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, + antideps: &'a Vec<(NodeID, NodeID)>, bbs: &'a Vec<NodeID>, - kernel_params: GPUKernelParams, - kernel_attrs: GPUKernelAttrs, - fork_forward_adjacency: HashMap<NodeID, Vec<NodeID>>, - fork_join_map: HashMap<NodeID, NodeID>, - label_data_for_phi: HashMap<NodeID, Vec<NodeID>>, + kernel_params: &'a GPUKernelParams, + fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, + label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>, + return_type_id: &'a TypeID, } #[derive(Default, Debug)] struct CudaGoto { - header: String, + init: String, body: String, term: String, - handled: bool, } impl GPUContext<'_> { - fn codegen_function<W: Write>(&mut self, w: &mut W) -> Result<(), Error> { + fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { // All possible includes followed by macros for intrinsic calls on // types with no library support write!( @@ -191,44 +244,45 @@ impl GPUContext<'_> { let mut top = String::new(); - // Emit kernel signature, arguments, and dynamic shared memory declaration self.codegen_kernel_begin(&mut top)?; - // Emit calculation of all dynamic constants self.codegen_dynamic_constants(&mut top)?; - // Emit all possible struct definitions and dummy pointers for each type. - // These may depend on dynamic constants, for example an array field with - // dynamic constant dims. - self.codegen_type_init(&mut top)?; + self.codegen_struct_def(&mut top)?; + self.codegen_reused_locals(&mut top)?; + let (fork_tree, fork_control_map) = self.make_fork_structures(); + let (root_forks, num_blocks) = + self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); + let (cumul_factors, num_threads) = self.get_cumulative_factors(&fork_tree, &root_forks); + let start = NodeID::new(0); + let ret = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_return()) + .map(NodeID::new) + .next() + .unwrap(); + let (begin_control, end_control) = self.get_begin_end_control(start, ret); + let global_refs = self.get_global_refs(); // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) .map(|idx| { let node_id = NodeID::new(idx); - let goto = CudaGoto { - header: self.get_value(node_id, false, false), - ..Default::default() - }; + let goto = CudaGoto::default(); (node_id, goto) }) .collect(); - // Assign outermost valid fork joins to block level. TODO: remove block_sizes - // if still not needed later - let (block_fork_ids, _) = self.codegen_block_creation()?; - // Assign inner fork joins to thread level. We do this before block sink - // because we need thread size for shared memory optimizations - let (_thread_fork_parents, _thread_fork_sizes, _thread_fork_edges) = - self.codegen_thread_creation(block_fork_ids[block_fork_ids.len() - 1])?; - // Sink logic from outer block fork joins. 
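The per-control-node CudaGoto init/body/term strings built here are eventually concatenated and joined by labels (get_block_name produces the bb_ names), so the kernel body is a flat list of labelled blocks connected by gotos. Roughly, with invented value names and declarations hoisted earlier:

    bb_1:
        i_4 = phi_3 + 1;
        goto bb_2;
    bb_2:
        if (cond_7) {
            goto bb_3;
        } else {
            goto bb_4;
        }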
- self.codegen_block_sink(NodeID::new(0), block_fork_ids[0], &mut top, &mut gotos)?; - for (i, &fork_id) in block_fork_ids - .iter() - .enumerate() - .take(block_fork_ids.len() - 1) - { - self.codegen_block_sink(fork_id, block_fork_ids[i + 1], &mut top, &mut gotos)?; - } + self.codegen_data_control( + &root_forks, + &fork_tree, + &fork_control_map, + &begin_control, + &end_control, + &global_refs, + &cumul_factors, + num_threads, + num_blocks, + &mut gotos, + )?; // Punting on implementation but can likely run einsum -> matmul/conv // detector on hierarhical fork joins between block edge and given @@ -241,6 +295,7 @@ impl GPUContext<'_> { Ok(()) } + // Emit kernel signature, arguments, and dynamic shared memory declaration fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { write!( w, @@ -264,16 +319,31 @@ impl GPUContext<'_> { } else { write!(w, ", ")?; } - write!(w, "{} p{}", self.get_type(*ty, true, true), idx)?; + let param_type = if self.types[ty.idx()].is_primitive() { + self.get_type(*ty, false, false) + } else { + format!("{} __restrict__", self.get_type(*ty, true, true)) + }; + write!(w, "{} p{}", param_type, idx)?; + } + // Pull primitive return to a pointer parameter + if self.types[self.return_type_id.idx()].is_primitive() { + write!(w, ", ")?; + write!( + w, + "{} __restrict__ ret", + self.get_type(*self.return_type_id, true, true) + )?; } // Type is char since it's simplest to use single bytes for indexing, // casting will be needed for use with different types. - write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n")?; + write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n\tsize_t alignment;\n")?; Ok(()) } + // Emit calculation of all dynamic constants fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> { for dc in dynamic_constants_bottom_up(self.dynamic_constants) { let dc_val = format!("unsigned long long dc{}", dc.idx()); @@ -307,37 +377,44 @@ impl GPUContext<'_> { } // Emit struct definitions for each typeid of product or summation type. If - // multiple typeids have the same type, they're separately emitted. Might - // not be most elegant, but using typeid is more convenient when instantiating - // than eg searching for index of type in types vector. Also emit dummy pointers - // for struct and primitive type ids for possible future use when moving to/from - // global memory - fn codegen_type_init(&self, w: &mut String) -> Result<(), Error> { + // multiple typeids have the same type, they're separately emitted. Lastly emit + // dummy alignment for later use in dynamic shared memory slices. 
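As a concrete picture of what the struct-definition emission below aims to produce, a product of (i32, f64, i8) would come out roughly as the following definition; the alignas value and the pad width are illustrative only, not copied from the patch:

    typedef struct alignas(8) Product_4 {
        int field_0;
        char pad1[4];      // brings field_1 to an 8-byte boundary
        double field_1;
        char field_2;
    } __attribute__((packed)) Product_4;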
+ fn codegen_struct_def(&self, w: &mut String) -> Result<(), Error> { for type_id in self.typing.iter() { let type_id_idx = type_id.idx(); - let ty = &self.types[type_id_idx]; - match ty { + match &self.types[type_id_idx] { Type::Product(ref product_ty_ids) => { - write!(w, "\ttypedef struct Product_{} {{\n", type_id_idx)?; + let product_size = self.get_size(*type_id); + write!(w, "\ttypedef struct alignas({}) Product_{} {{\n", product_size, type_id_idx)?; + let mut cumul_size = 0; for (i, product_ty_id) in product_ty_ids.iter().enumerate() { + let field_alignment = self.get_alignment(*product_ty_id); + if (cumul_size % field_alignment) != 0 { + let padding = field_alignment - cumul_size % field_alignment; + cumul_size += padding; + write!( + w, + "\t\tchar[{}] pad{};\n", + padding, + i, + )?; + } write!( w, "\t\t{} field_{};\n", self.get_type(*product_ty_id, false, false), i )?; + cumul_size += self.get_size(*product_ty_id); } - write!(w, "}} Product_{};\n", type_id_idx)?; - write!( - w, - "\tProduct_{}* product_{}_dummy;\n", - type_id_idx, type_id_idx - )?; + write!(w, "\t}} __attribute__((packed)) Product_{};\n", type_id_idx)?; } Type::Summation(ref summation_ty_ids) => { + let summation_size = self.get_size(*type_id); write!( w, - "\ttypedef struct Summation_{} {{\n\t\t union {{\n", + "\ttypedef struct alignas({}) Summation_{} {{\n\t\t union {{\n", + summation_size, type_id_idx )?; for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { @@ -350,300 +427,394 @@ impl GPUContext<'_> { } write!( w, - "\t\t}};\n\t\tuint8_t tag;\n\t}} Summation_{};\n", - type_id_idx - )?; - write!( - w, - "\tSummation_{}* summation_{}_dummy;\n", - type_id_idx, type_id_idx - )?; - } - // Arrays are decomposed into their element type during transfer - // so no need to emit dummy pointers - Type::Array(_, _) => {} - // Primitive types - _ => { - write!( - w, - "\t{} {}_{}_dummy;\n", - convert_type(ty, true), - convert_type(ty, false), + "\t\t}};\n\t}} __attribute__((packed)) Summation_{};\n", type_id_idx )?; } + _ => {} } } Ok(()) } - fn codegen_phi_registers<F>(&self, w: &mut String, should_process: F) -> Result<(), Error> - where - F: Fn(NodeID) -> bool, - { + // We generate all phi values and all flags for phi and select upfront that + // indicate if collection, whether their current value is global + fn codegen_reused_locals(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if let Node::Phi { - control: _, - data: _, - } = &self.function.nodes[id.idx()] - { - if should_process(id) { - write!(w, "\t{};\n", self.get_value(id, true, true))?; - } + match &self.function.nodes[id.idx()] { + Node::Phi {..} => { + write!(w, "\t{};\n", self.get_value(id, true, true, false))?; + } + _ => {} + } + let global_flag = self.get_global_flag(id, true); + if global_flag.is_some() { + write!(w, "\t{};\n", global_flag.unwrap())?; } } Ok(()) } - /* - Construct block forks by greedily accepting while: a) each fork join is strictly - nested meaning no other neighbor fork joins, b) the forks are parallel forks, - c) total number of blocks < max_num_blocks, and d) each fork's factor is statically - known. 
- */ - fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> { - let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect(); - for (_, children) in self.fork_forward_adjacency.iter() { - for child in children { - root_forks.remove(child); + /* Create two fork structures: + * First, fork_forward_adjacency is a map from each fork node F to all forks satisfying: + * a) domination by F + * b) no domination by F's join + * c) no domination by any other fork that's also dominated by F, where we don't count self-domination + * Second, fork_control_map is a map from fork node to all control nodes (including itself) satisfying: + * a) domination by F + * b) no domination by F's join + * c) no domination by any other fork that's also dominated by F, where we do count self-domination + */ + fn make_fork_structures(&self) -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, Vec<NodeID>>) { + let mut fork_tree: HashMap<NodeID, Vec<NodeID>> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_fork()) + .map(|idx| (NodeID::new(idx), vec![])) + .collect(); + let mut fork_control_map = HashMap::new(); + let mut queued_nodes = VecDeque::new(); + + for (fork_node, fork_children) in fork_tree.iter_mut() { + let mut control_vec = vec![]; + queued_nodes.push_back(*fork_node); + while !queued_nodes.is_empty() { + let node = queued_nodes.pop_front().unwrap(); + control_vec.push(node); + for child in self.control_subgraph.succs(node) { + if self.function.nodes[child.idx()].is_fork() { + fork_children.push(child); + } else if self.function.nodes[child.idx()].is_join() { + control_vec.push(child); + } else { + queued_nodes.push_back(child); + } + } } + fork_control_map.insert(*fork_node, control_vec); } - let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); - if root_forks.len() != 1 { - panic!("Exactly one root fork is required for outermost GPU block fork"); - } + (fork_tree, fork_control_map) + } - // a and b - let mut strict_forks = vec![root_forks[0]]; - let mut curr_fork = root_forks[0]; - while self.fork_join_map.contains_key(&curr_fork) { - let children = &self.fork_forward_adjacency[&curr_fork]; - if children.len() != 1 - || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) - { - break; + // Get control nodes succeeding the start and preceding all forks, and + // control nodes preceding the return and succeeding all joins + fn get_begin_end_control( + &self, + start: NodeID, + ret: NodeID, + ) -> (HashSet<NodeID>, HashSet<NodeID>) { + let mut begin_visited = HashSet::new(); + let mut begin_worklist = VecDeque::new(); + begin_worklist.push_back(start); + + while let Some(node) = begin_worklist.pop_front() { + if begin_visited.contains(&node) { + continue; + } + if self.function.nodes[node.idx()].is_fork() { + continue; + } + begin_visited.insert(node); + for pred in self.control_subgraph.preds(node) { + begin_worklist.push_back(pred); } - curr_fork = children[0]; - strict_forks.push(curr_fork); } - // c and d - let mut valid_block_forks = 0; - let mut cumulative_blocks = 1usize; - let mut block_fork_sizes = Vec::new(); + let mut end_visited = HashSet::new(); + let mut end_worklist = VecDeque::new(); + end_worklist.push_back(ret); - for fork in strict_forks.iter() { - if !self.function.schedules[fork.idx()].contains(&Schedule::Vectorizable) { - break; + while let Some(node) = end_worklist.pop_front() { + if end_visited.contains(&node) { + continue; } - let factors = match 
&self.function.nodes[fork.idx()] { - Node::Fork { factors, .. } => factors, - _ => panic!("Expected fork node"), - }; - let fork_size = self.multiply_fork_factors(factors)?; - let new_blocks = cumulative_blocks.saturating_mul(fork_size); - if new_blocks > self.kernel_params.max_num_blocks { - break; + if self.function.nodes[node.idx()].is_join() { + continue; + } + end_visited.insert(node); + for succ in self.control_subgraph.preds(node) { + end_worklist.push_back(succ); } - cumulative_blocks = new_blocks; - block_fork_sizes.push(fork_size); - valid_block_forks += 1; } - // If limit on number of blocks in 1D grid is reached, we could consider 2D - // or 3D grids. Performance is not affected so for now keep it simple with 1D. - self.kernel_attrs.num_blocks = cumulative_blocks; - let valid_block_forks = strict_forks - .into_iter() - .take(valid_block_forks) - .collect::<Vec<_>>(); - - Ok((valid_block_forks, block_fork_sizes)) + (begin_visited, end_visited) } - // Construct thread/warp forks by: a) same as block but rather than strict nest, neighbors are allowed and we get sequence of "edge" (aka innermost at thread level) fork joins rather than single, and b) if innermost is = threads_per_warp, we can use warp-level features. - fn codegen_thread_creation( - &mut self, - inner_block_fork: NodeID, - ) -> Result<(HashMap<NodeID, NodeID>, HashMap<NodeID, usize>, Vec<NodeID>), Error> { - let mut thread_fork_parents = HashMap::new(); - let mut thread_fork_sizes = HashMap::new(); - let mut thread_fork_cumulative_sizes = HashMap::new(); - thread_fork_cumulative_sizes.insert(inner_block_fork, 1); - let mut thread_fork_edges = vec![]; - let mut max_thread_size = 1; - let mut stack = vec![inner_block_fork]; - let mut visited = HashSet::new(); - visited.insert(inner_block_fork); - while let Some(pop) = stack.pop() { - let children = &self.fork_forward_adjacency[&pop]; - - // Reverse child order due to use of stack for DFS - for &child in children.iter().rev() { - if !visited.contains(&child) { - visited.insert(child); - thread_fork_parents.insert(child, pop); - let fork_size = match &self.function.nodes[child.idx()] { - Node::Fork { factors, .. } => self.multiply_fork_factors(factors)?, - _ => panic!("Expected fork node"), - }; - thread_fork_sizes.insert(child, fork_size); - - let new_cumulative_size = (thread_fork_cumulative_sizes[&pop] as usize) - .saturating_mul(fork_size as usize); - if new_cumulative_size > self.kernel_params.max_num_threads { - // Expanding to child fork exceeds thread limit, so - // current fork is an edge fork - thread_fork_edges.push(pop); - max_thread_size = max_thread_size.max(thread_fork_cumulative_sizes[&pop]); - } else { - // Recurse into child fork - thread_fork_cumulative_sizes.insert(child, new_cumulative_size); - stack.push(child); - } - } else { - panic!("Fork child shouldn't have multiple fork parents"); + // Get all globals and global references, where for GPU purposes global = + // collection parameter + fn get_global_refs(&self) -> HashSet<NodeID> { + // We start with collection parameters, and follow any reduce or write users. 
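Collections reachable from these parameters are treated as global for the rest of codegen; when a phi or select makes the origin dynamic, the later read/write emission wraps both addressing modes in a runtime flag check. Something like the sketch below, where all names are invented, col_global is a char* into a kernel parameter, col_local is a shared-memory copy, and a scalar float element is assumed (the real codegen reuses a single name for the collection in both branches):

    float v;
    if (col_is_global) {
        // byte-addressed access through the parameter-backed pointer
        v = *reinterpret_cast<float*>(col_global + i * sizeof(float));
    } else {
        // ordinary access to the local / shared-memory copy
        v = col_local[i];
    }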
+ let mut queued_nodes: VecDeque<NodeID> = (0..self.function.nodes.len()) + .filter(|idx| { + self.function.nodes[*idx].is_parameter() + && !self.types[self.typing[*idx].idx()].is_primitive() + }) + .map(NodeID::new) + .collect(); + + let def_use = def_use(&self.function); + let mut global_nodes = HashSet::new(); + + while !queued_nodes.is_empty() { + let node_id = queued_nodes.pop_front().unwrap(); + global_nodes.insert(node_id); + let node_users = def_use.get_users(node_id); + for user in node_users { + match self.function.nodes[user.idx()] { + Node::Write { .. } | Node::Reduce { .. } => queued_nodes.push_back(*user), + _ => {} } } } - // We take max cumulative size seen among fork edges as thread size. Any edge with less than max may need extra id-based condition. - // If limit on number of threads in 1D grid is reached, we could consider 2D or 3D grid. Performance is not affected so for now keep it simple with 1D. - self.kernel_attrs.num_threads = max_thread_size; - - Ok((thread_fork_parents, thread_fork_sizes, thread_fork_edges)) + global_nodes } - fn codegen_block_sink( + /* + * If tree has a single root fork of known size s <= max_num_blocks + * with parallel-fork schedule, then set num_blocks to s, else set num_blocks to 1. + */ + fn get_root_forks_and_num_blocks( &self, - fork_id: NodeID, - next_fork_id: NodeID, - w: &mut String, - gotos: &mut BTreeMap<NodeID, CudaGoto>, - ) -> Result<(), Error> { - // 1. Get control nodes including fork_id that are dominated by fork_id - // and not dominated by next_fork_id and not dominated by fork_id's join - let dom = dominator(self.control_subgraph, fork_id); - assert!(dom.does_dom(fork_id, next_fork_id)); - let mut control_nodes_between = HashSet::new(); - for node_id in self.control_subgraph.iter() { - if dom.does_dom(fork_id, *node_id) - && !dom.does_dom(next_fork_id, *node_id) - && !dom.does_dom(self.fork_join_map[&fork_id], *node_id) - { - control_nodes_between.insert(*node_id); + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + max_num_blocks: usize, + ) -> (Vec<NodeID>, usize) { + let mut root_forks: HashSet<NodeID> = fork_tree.keys().copied().collect(); + for (_, children) in fork_tree.iter() { + for child in children { + root_forks.remove(child); } } - - // 2. Emit data flow for nodes assigned to basic blocks in block sink - // 2a. All phi registers first - self.codegen_phi_registers(w, |id| control_nodes_between.contains(&self.bbs[id.idx()]))?; - // 2b. All other data nodes - let mut worklist = VecDeque::from_iter(self.reverse_postorder.iter().filter(|id| { - !self.function.nodes[id.idx()].is_control() - && control_nodes_between.contains(&self.bbs[id.idx()]) - && !self.function.nodes[id.idx()].is_phi() - })); - let mut visited = HashSet::new(); - while let Some(id) = worklist.pop_front() { - let node = &self.function.nodes[id.idx()]; - if node.is_reduce() { - panic!("Reduce nodes should not be in block sink"); - } - if get_uses(node) - .as_ref() - .iter() - .all(|u| self.function.nodes[u.idx()].is_control() || visited.contains(u)) - { - let body = &mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body; - self.codegen_data_node(*id, body, 1)?; - visited.insert(id); - } else { - worklist.push_back(id); - } + let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); + if root_forks.len() != 1 { + return (root_forks, 1); } - // 3. 
Emit control flow - for control_node in control_nodes_between { - let term = &mut gotos.get_mut(&self.bbs[control_node.idx()]).unwrap().term; - self.codegen_control_node(control_node, term, 1)?; + let root_fork = root_forks[0]; + let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else { + panic!("Expected fork node"); + }; + let fork_size = multiply_dynamic_constants(self.dynamic_constants, factors); + if let Some(fork_size) = fork_size + && fork_size <= max_num_blocks + && self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) + { + (root_forks, fork_size) + } else { + (root_forks, 1) } + } - Ok(()) + /* + * Once inside the block-level forks, we initiate a cumul_factor at 1. If + * encountering a child fork with known size s < max_num_threads / cumul_factor, + * with all reduces being parallel or associative, then we parallelize along + * s, else we serialize. Then step into child and update cumul_factor if needed. + * One exception is if fork factor is a multiple of greedy_associative_thresh + * and at least one reduce is associative, in which case we use warp reduction + * and disable cumul_factor change for its subtree. At end, we've mapped + * each fork to its cumulative factor, and if not present fork uses it's parent's + * factor. + */ + fn get_cumulative_factors( + &self, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + root_forks: &Vec<NodeID>, + ) -> (HashMap<NodeID, usize>, usize) { + let mut cumul_factors = HashMap::new(); + for root_fork in root_forks { + cumul_factors.insert(*root_fork, 1); + self.recurse_cumul_factors(*root_fork, fork_tree, 1, &mut cumul_factors); + } + let num_threads = *cumul_factors.values().max().unwrap(); + (cumul_factors, num_threads) } - fn codegen_control_node( + fn recurse_cumul_factors( &self, - id: NodeID, - w: &mut String, - num_tabs: usize, - ) -> Result<(), Error> { - let tabs = "\t".repeat(num_tabs); - match &self.function.nodes[id.idx()] { - Node::Start - | Node::Region { preds: _ } - | Node::Projection { - control: _, - selection: _, - } => { - let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w, "{}goto {}\n", tabs, self.get_block_name(succ))?; + curr_fork: NodeID, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + cumul_factor: usize, + cumul_factors: &mut HashMap<NodeID, usize>, + ) { + let reduces = &self.fork_reduce_map[&curr_fork]; + if reduces.iter().all(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) + || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) + }) && let Node::Fork { factors, .. 
} = &self.function.nodes[curr_fork.idx()] + && let Some(fork_size) = multiply_dynamic_constants(self.dynamic_constants, factors) + && fork_size <= self.kernel_params.max_num_threads / cumul_factor + { + if fork_size % self.kernel_params.greedy_associative_thresh == 0 + && reduces.iter().any(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::Associative) + }) { + cumul_factors.insert(curr_fork, cumul_factor * fork_size); + } else { + let mut max_factor = cumul_factor * fork_size; + for child in fork_tree[&curr_fork].iter() { + self.recurse_cumul_factors(*child, fork_tree, cumul_factor * fork_size, cumul_factors); + max_factor = max_factor.max(cumul_factors[child]); + } + cumul_factors.insert(curr_fork, max_factor); } - Node::If { control: _, cond } => { - let mut succs = self.control_subgraph.succs(id); - let succ1 = succs.next().unwrap(); - let succ2 = succs.next().unwrap(); - write!( - w, - "{}if ({}) {{\n", - tabs, - self.get_value(*cond, false, false) - )?; - write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; - write!(w, "{}}} else {{\n", tabs)?; - write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ2))?; - write!(w, "{}}}\n", tabs)?; + } else { + let mut max_factor = cumul_factor; + for child in fork_tree[&curr_fork].iter() { + self.recurse_cumul_factors(*child, fork_tree, cumul_factor, cumul_factors); + max_factor = max_factor.max(cumul_factors[child]); } - Node::Fork { - control: _, - factors: _, - } => {} - Node::Join { control: _ } => {} - Node::Return { - control: _, - data: _, - } => { - write!(w, "{}return;\n", tabs)?; + cumul_factors.insert(curr_fork, max_factor); + } + } + + // /* + // * For each parallel reduce with a reduct write, meaning it's at the end of + // * a potential parallel reduction chain, we walk back to beginning of chain + // * and update the write's collect to be the beginning's init. + // */ + // fn update_write_collects(&self) -> HashMap<NodeID, NodeID> { + // let mut write_collect_map = HashMap::new(); + // let mut parallel_reduces: HashSet<NodeID> = (0..self.function.nodes.len()) + // .map(NodeID::new) + // .filter(|&node_id| { + // self.function.schedules[node_id.idx()].contains(&Schedule::ParallelReduce) + // }) + // .collect(); + // for reduce in parallel_reduces.clone() { + // if let Node::Reduce { + // control: _, + // init, + // reduct, + // } = &self.function.nodes[reduce.idx()] + // && let Node::Write { .. 
} = &self.function.nodes[reduct.idx()] + // { + // parallel_reduces.remove(&reduce); + // while parallel_reduces.contains(&init) { + // let Node::Reduce { + // control: _, + // init, + // reduct: _, + // } = &self.function.nodes[init.idx()] + // else { + // panic!("Expected reduce node"); + // }; + // parallel_reduces.remove(&init); + // } + // write_collect_map.insert(*reduct, *init); + // } + // } + // write_collect_map + // } + + fn codegen_data_control( + &self, + root_forks: &Vec<NodeID>, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + fork_control_map: &HashMap<NodeID, Vec<NodeID>>, + begin_control: &HashSet<NodeID>, + end_control: &HashSet<NodeID>, + global_refs: &HashSet<NodeID>, + cumul_factors: &HashMap<NodeID, usize>, + num_threads: usize, + num_blocks: usize, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + let control_to_data = + (0..self.bbs.len()).fold(HashMap::<NodeID, Vec<NodeID>>::new(), |mut map, id| { + if let Some(control) = self.bbs.get(id) { + map.entry(*control).or_default().push(NodeID::new(id)); + }; + map + }); + + // Define the following states: + // 0 is above block fork, 1 is in block fork above any thread fork, 2 is + // in any thread fork, 3 is below block fork + + // If num_blocks > 1, initialize state to 0, else 1 + let mut state = if num_blocks > 1 { 0 } else { 1 }; + // Then generate data and control for each control in begin_control + for control in begin_control { + let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; + for data in control_to_data.get(control).unwrap() { + self.codegen_data_node(*data, state, num_threads, body, &mut 1, global_refs)?; } - _ => { - panic!("Unsupported control node type") + let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; + self.codegen_control_node(*control, term, 1)?; + } + // Then if num_blocks > 1, set state to 1 and generate data and control + // for the single root fork + if num_blocks > 1 { + state = 1; + for control in fork_control_map.get(&root_forks[0]).unwrap() { + for data in control_to_data.get(control).unwrap() { + let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; + self.codegen_data_node(*data, state, num_threads, body, &mut 1, global_refs)?; + } + let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; + self.codegen_control_node(*control, term, 1)?; } } + // Set state to 2 and begin DFS through fork_tree (after root_fork if + // visited in previous step), updating thread_quota + // If num_blocks > 1, set state to 3, else 1 + // Then generate data and control for each control in end_control Ok(()) } - fn codegen_data_node(&self, id: NodeID, w: &mut String, num_tabs: usize) -> Result<(), Error> { - // For now only used shared memory when creating an array - let declare_variable = self.get_value(id, true, false).to_string(); - let tabs = "\t".repeat(num_tabs); + fn codegen_data_node( + &self, + id: NodeID, + state: usize, + thread_quota: usize, + w: &mut String, + num_tabs: &mut usize, + global_refs: &HashSet<NodeID>, + ) -> Result<(), Error> { + let declare_variable = self.get_value(id, true, false, false).to_string(); + let tabs = "\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { // Phi registers were already emitted. Node::Phi { control: _, data: _, } => {} - // No SSA requirement for CUDA + Node::ThreadID { + control, + dimension, + } => { + let Node::Fork { factors, .. 
} = &self.function.nodes[control.idx()] else { + panic!("Expected ThreadID's control to be a fork node"); + }; + match state { + 1 => { + // Violating DRY with the naming but unsure how to map + // DynamicConstantID to NodeID to use `get_value` + let divide = factors.iter().skip(dimension + 1).map(|f| format!("dc{}", f.idx())).collect::<Vec<_>>().join(" * "); + let modulo = format!("dc{}", factors[*dimension].idx()); + write!(w, "{}{} = (blockIdx.x / ({})) % {};\n", tabs, declare_variable, divide, modulo)?; + } + 2 => {} + _ => { panic!("Unsupported state for ThreadID") } + } + } + Node::Reduce { control: _, init, reduct: _ } => { + let init_val = self.get_value(*init, false, false, false); + write!(w, "{}{} = {};\n", tabs, declare_variable, init_val)?; + } + // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { + write!(w, "{}{};\n", tabs, declare_variable)?; + let define_variable = self.get_value(id, false, false, false); self.codegen_constant( - declare_variable, - self.get_value(id, false, false), + if self.types[self.typing[id.idx()].idx()].is_primitive() { define_variable } else { format!("*{}", define_variable)}, *cons_id, w, + *num_tabs, )?; } - // No SSA requirement for CUDA + // Dynamic constants emitted at top Node::DynamicConstant { id: _ } => {} Node::Unary { op, input } => match op { UnaryOperator::Not => match &self.types[self.typing[input.idx()].idx()] { @@ -653,7 +824,7 @@ impl GPUContext<'_> { "{}{} = !{};\n", tabs, declare_variable, - self.get_value(*input, false, false), + self.get_value(*input, false, false, false), )?; } ty if ty.is_fixed() => { @@ -662,7 +833,7 @@ impl GPUContext<'_> { "{}{} = ~{};\n", tabs, declare_variable, - self.get_value(*input, false, false), + self.get_value(*input, false, false, false), )?; } _ => panic!("Unsupported type for not operator"), @@ -674,7 +845,7 @@ impl GPUContext<'_> { "{}{} = -{};\n", tabs, declare_variable, - self.get_value(*input, false, false), + self.get_value(*input, false, false, false), )?; } _ => { @@ -688,13 +859,13 @@ impl GPUContext<'_> { tabs, declare_variable, self.get_type(*dst_ty_id, false, false), - self.get_value(*input, false, false), + self.get_value(*input, false, false, false), )?; } }, Node::Binary { op, left, right } => { - let left_val = self.get_value(*left, false, false); - let right_val = self.get_value(*right, false, false); + let left_val = self.get_value(*left, false, false, false); + let right_val = self.get_value(*right, false, false, false); match (op, &self.types[self.typing[left.idx()].idx()]) { (BinaryOperator::Rem, Type::Float32) => write!( w, @@ -754,12 +925,17 @@ impl GPUContext<'_> { TernaryOperator::Select => { write!( w, - "{}{} = {} ? {} : {};\n", + "{}{} = {} ? {} : {};\n{}{} = {} ? 
{} : {};\n", tabs, declare_variable, - self.get_value(*first, false, false), - self.get_value(*second, false, false), - self.get_value(*third, false, false), + self.get_value(*first, false, false, false), + self.get_value(*second, false, false, false), + self.get_value(*third, false, false, false), + tabs, + self.get_value(id, false, false, false), + self.get_value(*first, false, false, false), + global_refs.contains(second), + global_refs.contains(third) )?; } }, @@ -772,30 +948,54 @@ impl GPUContext<'_> { tabs, declare_variable, func_name, - self.get_value(args[0], false, false), + self.get_value(args[0], false, false, false), )?; } Node::Read { collect, indices } => { - // If it's a parameter node then copy from global memory, else - // from shared memory or registers. - if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { - write!(w, "{};\n", declare_variable); + // Copy from global memory or from shared memory or registers. + // Generate if-else for phi and select where we don't statically know + // the case. + write!(w, "{}{};\n", tabs, declare_variable); + let define_variable = self.get_value(id, false, false, false); + let global_flag = self.get_global_flag(*collect, false); + let has_global_flag = global_flag.is_some(); + if has_global_flag { + write!(w, "{}if ({}) {{\n{}\t", tabs, global_flag.unwrap(), tabs); + *num_tabs += 1; + } + if global_refs.contains(collect) || has_global_flag { let is_char = self.is_parameter_char(self.typing[collect.idx()]); - let global_collect = self.codegen_global_collect(*collect, indices, is_char); + let global_collect = self.codegen_collect(*collect, indices, true, has_global_flag, is_char); + let type_id = self.typing[id.idx()]; + let is_array = self.types[type_id.idx()].is_array(); self.codegen_copy_from_to_global( false, - self.typing[id.idx()], - &declare_variable, + type_id, + &define_variable, &global_collect, indices, - true, + if is_array { + Some(thread_quota) + } else { + None + }, + !is_array, + false, is_char, w, - num_tabs, + *num_tabs, )?; - } else { - let local_collect = self.codegen_local_collect(*collect, indices); - write!(w, "{}{} = {};\n", tabs, declare_variable, local_collect)?; + } + if has_global_flag { + write!(w, "{}}} else {{\n", tabs); + } + if !global_refs.contains(collect) || has_global_flag { + let local_collect = self.codegen_collect(*collect, indices, false, has_global_flag, false); + write!(w, "{}{} = {};\n", tabs, define_variable, local_collect)?; + } + if has_global_flag { + write!(w, "{}}}\n", tabs); + *num_tabs -= 1; } } Node::Write { @@ -803,27 +1003,50 @@ impl GPUContext<'_> { data, indices, } => { - let data_variable = self.get_value(*data, false, false); - // If it's a parameter node then copy to global memory, else - // to shared memory or registers - if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] { + // Only difference vs read is the LHS vs RHS, and creating write- + // labeled reference after + write!(w, "{}{};\n", tabs, declare_variable); + let global_flag = self.get_global_flag(*collect, false); + let has_global_flag = global_flag.is_some(); + if has_global_flag { + write!(w, "{}if ({}) {{\n", tabs, global_flag.unwrap()); + *num_tabs += 1; + } + let data_variable = self.get_value(*data, false, false, global_refs.contains(collect)); + if global_refs.contains(collect) || has_global_flag { let is_char = self.is_parameter_char(self.typing[collect.idx()]); - let global_collect = self.codegen_global_collect(*collect, indices, is_char); + let global_collect = 
self.codegen_collect(*collect, indices, true, has_global_flag, is_char); + let type_id = self.typing[id.idx()]; + let is_array = self.types[type_id.idx()].is_array(); self.codegen_copy_from_to_global( true, - self.typing[id.idx()], + type_id, &data_variable, &global_collect, - &indices, - true, + indices, + if is_array { + Some(thread_quota) + } else { + None + }, + !is_array, + state == 0, is_char, w, - num_tabs, + *num_tabs, )?; - } else { - let local_collect = self.codegen_local_collect(*collect, indices); + } + if has_global_flag { + write!(w, "{}}} else {{\n", tabs); + } + if !global_refs.contains(collect) || has_global_flag { + let local_collect = self.codegen_collect(*collect, indices, false, has_global_flag, false); write!(w, "{}{} = {};\n", tabs, local_collect, data_variable)?; } + if has_global_flag { + write!(w, "{}}}\n", tabs); + *num_tabs -= 1; + } } _ => { panic!("Unsupported data node type") @@ -833,10 +1056,69 @@ impl GPUContext<'_> { for phi in phis { write!( w, - "\t{} = {};\n", - self.get_value(*phi, false, false), - self.get_value(id, false, false) + "{}{} = {};\n{}{} = {};\n", + tabs, + self.get_value(*phi, false, false, false), + self.get_value(id, false, false, false), + tabs, + self.get_global_flag(*phi, false).unwrap(), + global_refs.contains(&id) + )?; + } + } + Ok(()) + } + + fn codegen_control_node( + &self, + id: NodeID, + w: &mut String, + num_tabs: usize, + ) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); + match &self.function.nodes[id.idx()] { + Node::Start + | Node::Region { preds: _ } + | Node::Projection { + control: _, + selection: _, + } => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + write!(w, "{}goto {}\n", tabs, self.get_block_name(succ))?; + } + Node::If { control: _, cond } => { + let mut succs = self.control_subgraph.succs(id); + let succ1 = succs.next().unwrap(); + let succ2 = succs.next().unwrap(); + write!( + w, + "{}if ({}) {{\n", + tabs, + self.get_value(*cond, false, false, false) )?; + write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; + write!(w, "{}}} else {{\n", tabs)?; + write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ2))?; + write!(w, "{}}}\n", tabs)?; + } + Node::Fork { + control: _, + factors: _, + } => {} + Node::Join { control: _ } => {} + Node::Return { control: _, data } => { + if self.types[self.typing[data.idx()].idx()].is_primitive() { + let return_val = self.get_value(*data, false, false, false); + write!( + w, + "{}if (threadIdx.x == 0) {{\n{}\t*ret = {};\n{}}}\n", + tabs, tabs, return_val, tabs + )?; + } + write!(w, "{}return;\n", tabs)?; + } + _ => { + panic!("Unsupported control node type") } } Ok(()) @@ -854,7 +1136,9 @@ impl GPUContext<'_> { data: &String, global: &String, indices: &[Index], - parallelize: bool, + thread_quota: Option<usize>, + thread_restrict: bool, + block_restrict: bool, is_char: bool, w: &mut String, num_tabs: usize, @@ -882,29 +1166,19 @@ impl GPUContext<'_> { s } }; - let mut running_div_factor = "1".to_string(); - let mut level_indices_str = "".to_string(); - for i in (array_indices.len()..extents.len()).rev() { - level_indices_str.push_str(&format!( - "[(({}) / ({})) % dc{}]", - rem_array_size, - running_div_factor, - extents[i].idx() - )); - running_div_factor = format!( - "{} * {}", - running_div_factor, - format!("dc{}", extents[i].idx()) - ); - } - // Parallelizing only affects loop bounds - let begin_copy = if parallelize { + // If we parallelize over threads, then we index by threadIdx.x, + // else we gate the loop by threadIdx.x == 
0 + let has_thread_quota = thread_quota.is_some(); + let begin_copy = if has_thread_quota { format!( "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", - tabs, rem_array_size, self.kernel_attrs.num_threads + tabs, rem_array_size, thread_quota.unwrap() ) } else { - format!("{}for (int i = 0; i < {}; i++) {{\n", tabs, rem_array_size) + format!( + "{}if (threadIdx.x == 0) {{\n{}\tfor (int i = 0; i < {}; i++) {{\n", + tabs, tabs, rem_array_size + ) }; write!(w, "{}", begin_copy)?; let new_global = if is_char { @@ -919,42 +1193,57 @@ impl GPUContext<'_> { self.codegen_copy_from_to_global( is_write, *element_type_id, - &format!("{}{}", data, level_indices_str), + &format!("{} + i", data), &new_global, &indices[1..], + None, + false, false, is_char, w, - num_tabs + 1, + num_tabs + if has_thread_quota { 1 } else { 2 }, )?; - let end_copy = if parallelize { - format!("{}}}\n{}__syncthreads();\n", tabs, tabs) - } else { - format!("{}}}\n", tabs) - }; - write!(w, "{}", end_copy)?; + if !has_thread_quota { + write!(w, "{}\t}}\n", tabs)?; + } + write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; } - Type::Product(fields) => { + Type::Product(fields) | Type::Summation(fields) => { if !is_char { - panic!("Product type must be char addressed") + panic!("Global product or summation must be char addressed") } + let is_product = matches!(self.types[type_id.idx()], Type::Product(..)); if indices.is_empty() { - let dummy_var = format!("product_{}_dummy", type_id.idx()); - let type_name = self.get_type(type_id, false, false); - write!( - w, - "{}{} = reinterpret_cast<{}*>({});\n", - tabs, dummy_var, type_name, global - )?; - let dummy_ptr = format!("*{}", dummy_var); + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; + extra_tab = "\t"; + } + if thread_restrict { + write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; + extra_tab2 = "\t"; + } + let reinterpret = format!("*reinterpret_cast<{}_{}*>", if is_product { "Product" } else { "Summation" }, type_id.idx()); + let reinterpret_global = format!("{}({})", reinterpret, global); + let reinterpret_data = format!("{}({})", reinterpret, data); write!( w, - "{}{} = {};\n", + "{}{}{}{} = {};\n", tabs, - if is_write { &dummy_ptr } else { data }, - if is_write { data } else { &dummy_ptr } + extra_tab, + extra_tab2, + if is_write { &reinterpret_global } else { &reinterpret_data }, + if is_write { &reinterpret_data } else { &reinterpret_global } )?; - } else { + if thread_restrict { + write!(w, "{}{}}}\n", tabs, extra_tab)?; + } + if block_restrict { + write!(w, "{}}}\n", tabs)?; + } + } else if is_product { + // Iterate over fields in product to find offset let Index::Field(field_index) = &indices[0] else { panic!("Expected field index for product access") }; @@ -962,54 +1251,34 @@ impl GPUContext<'_> { .map(|i| self.get_size(fields[i])) .sum::<usize>(); let new_global = format!("{} + {}", global, offset); - let new_data = format!("{}.field_{}", data, *field_index); + let new_data = format!("{} + {}", data, offset); self.codegen_copy_from_to_global( is_write, fields[*field_index], &new_data, &new_global, &indices[1..], - false, + None, + thread_restrict, + block_restrict, is_char, w, num_tabs + 1, )?; - } - } - Type::Summation(fields) => { - if !is_char { - panic!("Summation type must be char addressed") - } - if indices.is_empty() { - let dummy_var = format!("summation_{}_dummy", type_id.idx()); - let type_name = self.get_type(type_id, false, false); - write!( - w, - "{}{} = 
reinterpret_cast<{}*>({});\n", - tabs, dummy_var, type_name, global - )?; - let dummy_ptr = format!("*{}", dummy_var); - write!( - w, - "{}{} = {};\n", - tabs, - if is_write { &dummy_ptr } else { data }, - if is_write { data } else { &dummy_ptr } - )?; } else { - // Since all variants are 0-byte offset, the global index - // remains unchanged. + // All variants of summations have zero offset let Index::Variant(variant_index) = &indices[0] else { panic!("Expected variant index for summation access") }; - let new_data = format!("{}.field_{}", data, *variant_index); self.codegen_copy_from_to_global( is_write, fields[*variant_index], - &new_data, + &data, &global, &indices[1..], - false, + None, + thread_restrict, + block_restrict, is_char, w, num_tabs + 1, @@ -1018,66 +1287,84 @@ impl GPUContext<'_> { } // Primitive types _ => { + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; + extra_tab = "\t"; + } + if thread_restrict { + write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; + extra_tab2 = "\t"; + } if is_char { - let type_name = self.get_type(type_id, false, false); - let dummy_var = format!("{}_{}_dummy", type_name, type_id.idx()); - write!( - w, - "{}{} = reinterpret_cast<{}*>({});\n", - tabs, dummy_var, type_name, global - )?; - let dummy_ptr = format!("*{}", dummy_var); + let type_name = self.get_type(type_id, true, false); + let reinterpret = format!("*reinterpret_cast<{}>", type_name); + let reinterpret_global = format!("{}({})", reinterpret, global); + let reinterpret_data = format!("{}({})", reinterpret, data); write!( w, - "{}{} = {};\n", + "{}{}{}{} = {};\n", tabs, - if is_write { &dummy_ptr } else { data }, - if is_write { data } else { &dummy_ptr } + extra_tab, + extra_tab2, + if is_write { &reinterpret_global } else { &reinterpret_data }, + if is_write { &reinterpret_data } else { &reinterpret_global } )?; } else { - let global_ptr = format!("*({})", global); write!( w, - "{}{} = {};\n", + "{}*{} = *{};\n", tabs, - if is_write { &global_ptr } else { data }, - if is_write { data } else { &global_ptr } + if is_write { &global } else { data }, + if is_write { data } else { &global } )?; } - } - } - Ok(()) - } - - // Read/writes to local collections consist of local name + array indexing - // and struct field access. - fn codegen_local_collect(&self, collect: NodeID, indices: &[Index]) -> String { - let mut index_ptr_name = "".to_string(); - for index in indices { - match index { - Index::Field(field) => { - index_ptr_name.push_str(&format!(".field_{}", field)); + if thread_restrict { + write!(w, "{}{}}}\n", tabs, extra_tab)?; } - Index::Variant(variant) => { - index_ptr_name.push_str(&format!(".field_{}", variant)); - } - Index::Position(indices) => { - index_ptr_name.push_str( - &indices - .iter() - .map(|index| format!("[{}]", self.get_value(*index, false, false))) - .collect::<Vec<_>>() - .join(""), - ); + if block_restrict { + write!(w, "{}}}\n", tabs)?; } } } - let name = self.get_value(collect, false, false); - format!("{} + {}", name, index_ptr_name) + Ok(()) } + // // Read/writes to local collections consist of local name + array indexing + // // and struct field access. 
+ // fn codegen_local_collect(&self, collect: NodeID, indices: &[Index], has_global_flag: bool) -> String { + // let mut index_ptr_name = "".to_string(); + // for index in indices { + // match index { + // Index::Field(field) => { + // index_ptr_name.push_str(&format!(".field_{}", field)); + // } + // Index::Variant(variant) => { + // index_ptr_name.push_str(&format!(".field_{}", variant)); + // } + // Index::Position(indices) => { + // index_ptr_name.push_str( + // &indices + // .iter() + // .map(|index| format!("[{}]", self.get_value(*index, false, false, false))) + // .collect::<Vec<_>>() + // .join(""), + // ); + // } + // } + // } + // let name = self.get_value(collect, false, false, false); + // let full_name = if has_global_flag { + // format!("reinterpret_cast<{}>({})", self.get_type(self.typing[collect.idx()], false, false), name) + // } else { + // name + // }; + // format!("{} + {}", full_name, index_ptr_name) + // } + // Read/writes to global collections consist of global name + pointer offset. - fn codegen_global_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { + fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_global: bool, has_global_flag: bool, is_char: bool) -> String { let mut index_ptr_name = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { @@ -1121,7 +1408,7 @@ impl GPUContext<'_> { cumulative_offset = format!( "{} * ({} + ", cumulative_offset, - self.get_value(*index, false, false) + self.get_value(*index, false, false, false) ); } index_ptr_name.push_str(&format!( @@ -1137,20 +1424,26 @@ impl GPUContext<'_> { } } } - let name = self.get_value(collect, false, false); - format!("{} + {}", name, index_ptr_name) + let name = self.get_value(collect, false, false, false); + let full_name = if is_global && has_global_flag { + format!("reinterpret_cast<{}>({})", self.get_type(type_id, true, true), name) + } else if has_global_flag { + format!("reinterpret_cast<{}>({})", self.get_type(type_id, false, false), name) + } else { + name + }; + format!("{} + {}", full_name, index_ptr_name) } // Standalone function allows us to handle recursive initialization for // product and summation collections fn codegen_constant( &self, - declare_variable: String, name: String, cons_id: ConstantID, w: &mut String, + num_tabs: usize, ) -> Result<(), Error> { - write!(w, "\t{}", declare_variable)?; match &self.constants[cons_id.idx()] { Constant::Boolean(val) => write!(w, " = {};\n", val)?, Constant::Integer8(val) => write!(w, " = {};\n", val)?, @@ -1166,15 +1459,16 @@ impl GPUContext<'_> { Constant::Product(_, fields) => { write!(w, ";\n")?; for (i, field) in fields.iter().enumerate() { - // We don't emit array fields and size was set by struct definition + // Array size was set by struct definition and we don't emit array content if !self.constants[field.idx()].is_array() { - // Don't need type declaration for the fields - self.codegen_constant( - format!("{}.field_{}", name, i), - format!("{}.field_{}", name, i), - *field, - w, - )?; + // // Don't need type declaration for the fields + // self.codegen_constant( + // format!("{}.field_{}", name, i), + // format!("{}.field_{}", name, i), + // *field, + // w, + // )?; + } } } @@ -1207,8 +1501,8 @@ impl GPUContext<'_> { write!( w, ";\n\talignment = {};\n\tdynamic_shared_offset = - (dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} = - reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t + ((dynamic_shared_offset + alignment - 1) / 
alignment) * alignment; + \n\t{} = reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t dynamic_shared_offset += {}", element_size, name, @@ -1366,7 +1660,7 @@ impl GPUContext<'_> { evaluate_dynamic_constant(extent, self.dynamic_constants) .map(|val| acc.saturating_mul(val)) }) - .unwrap_or(0) + .unwrap_or_else(|| panic!("Queried size for array with unknown size")) } _ => self.get_alignment(type_id), } @@ -1375,17 +1669,24 @@ impl GPUContext<'_> { fn get_alignment(&self, type_id: TypeID) -> usize { match &self.types[type_id.idx()] { Type::Array(element_type, _) => self.get_alignment(*element_type), - Type::Product(fields) => fields - .iter() - .map(|field| self.get_alignment(*field)) - .sum::<usize>(), + Type::Product(fields) => { + let product_size = fields + .iter() + .map(|field| self.get_alignment(*field)) + .sum::<usize>(); + let field_alignment = fields + .iter() + .map(|field| self.get_alignment(*field)) + .max() + .unwrap_or(1); + field_alignment * ((product_size + (field_alignment - 1)) / field_alignment) + } , Type::Summation(fields) => { fields .iter() .map(|field| self.get_alignment(*field)) .max() .unwrap_or(0) - + 1 } Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, Type::Integer16 | Type::UnsignedInteger16 => 2, @@ -1395,19 +1696,11 @@ impl GPUContext<'_> { } } - fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Result<usize, Error> { - factors.iter().try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, self.dynamic_constants) - .ok_or(Error) - .map(|val| acc.saturating_mul(val)) - }) - } - fn get_block_name(&self, id: NodeID) -> String { format!("bb_{}", id.idx()) } - fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { + fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool, global_pointer: bool) -> String { if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { if ty { panic!("Dynamic constants shouldn't be re-initialized") @@ -1418,23 +1711,9 @@ impl GPUContext<'_> { panic!("Parameters shouldn't be re-initialized") } format!("p{}", index) - } else if let Node::Write { - collect, - data: _, - indices: _, - } = &self.function.nodes[id.idx()] - { - if ty { - panic!("Writes shouldn't be initialized, they're replaced with the referenced collection") - } - if make_pointer { - panic!("Writes shouldn't be called as pointer") - } - self.get_value(*collect, false, false) } else if ty - && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] - { - // Shmem/register arrays have special formatting + && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { + // Dynamic shared memory arrays have special formatting let mut declare_array = format!( "{} (*{}{})", self.get_type(*element_type, false, false), @@ -1448,7 +1727,7 @@ impl GPUContext<'_> { } else if ty { format!( "{} {}{}", - self.get_type(self.typing[id.idx()], make_pointer, false), + self.get_type(self.typing[id.idx()], make_pointer, global_pointer), self.function.nodes[id.idx()].lower_case_name(), id.idx() ) @@ -1461,53 +1740,55 @@ impl GPUContext<'_> { } } - fn get_type(&self, id: TypeID, make_pointer: bool, is_global: bool) -> String { + fn get_global_flag(&self, id: NodeID, ty: bool) -> Option<String> { + let node = &self.function.nodes[id.idx()]; + if (!node.is_phi() && !matches!(node, Node::Ternary { op: TernaryOperator::Select, ..})) || self.types[self.typing[id.idx()].idx()].is_primitive() { + None + } else if ty { + Some(format!( + "bool 
{}{}_is_global", + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + )) + } else { + Some(format!( + "{}{}_is_global", + self.function.nodes[id.idx()].lower_case_name(), + id.idx() + )) + } + } + + fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { // Product and summation collections are char* for byte-addressability - // since we can have variable type fields - Type::Product(_) => { + // since we can have variable type fields. is_global can only be true + // if make_pointer is true, with the exception of recursive call + // from array match arm + Type::Product(_) | Type::Summation(_) => { if make_pointer { "char*".to_string() - } else if is_global { - "char".to_string() } else { - format!("Product_{}", id.idx()) - } - } - Type::Summation(_) => { - if make_pointer { - "char*".to_string() - } else if is_global { "char".to_string() - } else { - format!("Summation_{}", id.idx()) } } - Type::Array(element_type, extents) => { + Type::Array(element_type, _) => { // This suffix lets us work with references of dynamic shared memory // and use n-d array indexing. - format!( - "{}{}", - self.get_type(*element_type, false, is_global), - if make_pointer { - "*".to_string() - } else { - format!( - "(*){}", - extents - .iter() - .skip(1) - .map(|extent| format!("[dc{}]", extent.idx())) - .collect::<String>() - ) - } - ) + self.get_type(*element_type, true) } _ => convert_type(&self.types[id.idx()], make_pointer), } } } +fn multiply_dynamic_constants(dcs: &Vec<DynamicConstant>, factors: &[DynamicConstantID]) -> Option<usize> { + factors.iter().try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, dcs) + .map(|val| acc.saturating_mul(val)) + }) +} + // TODO: Add float8, float16, bfloat16 dtypes if they come fn convert_type(ty: &Type, make_pointer: bool) -> String { let mut result = match ty { diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 956e25a9..05b8bf3e 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -1369,7 +1369,6 @@ impl Node { selection: _ } ); - define_pattern_predicate!(is_undef, Node::Undef { ty: _ }); pub fn try_region(&self) -> Option<&[NodeID]> { diff --git a/juno_samples/matmul/src/matmul.hbin b/juno_samples/matmul/src/matmul.hbin new file mode 100644 index 0000000000000000000000000000000000000000..046898dddc96e417ac560697d7030bf9583f77d6 GIT binary patch literal 1323 zcmd6n>sJ&<5XI}(%=El_VWts$ASj|M3Jbz6WHAaFHRk2pm>9D-!3YX!A`nah8sG1e zKRML{=jh3|d`o|*>iykYy)*apuB#TWFD))FFxYY6Q$!I$*8`}{Wru`<RZKo@uFBH< z!j0KhW32V@oJ`p{KZsTA$AO*p9XqYSuBXCS^baf87;O#9*^bzn<#fae>8k57c}Bq~ z(u4a+%d>DjGBHd{3G1r%c&uXDVA@CziJl7>H>LxogA(X!m&c8*W*f{l(lyb$1x)NT zJup3#4S;!YD1&XaOkSC5L%MGd>)rj759}?~7%{5}KCn+U_~5V_V`oH4HI$FlT!d=3 z8n<fQs-vQUgB0v1;H(qez6HEv7kGPbIC`l0;hq2UuKiy@w?-t(R99?Ze`7PT$cfy@ zlid7SM;C!)7n%qrB7N&8(y4%_(53=YvEgOblzaRHn}O-O;_4D|X)`iOu-lq;di)GM zrWdA{^ldTq37Ohs`eFJ>&k3l`dOWh<48RPKo)<7E;1z~U4W>r=>Ky6DJe5%&8qIHv zNw#zI`t~gAhDg^$vhj~k<O`paQhx9HhGkjKIxJ_M_|zPMIYN3xz_frbkD22z$0@rn z#^n{gD*Tl2xA@$ghB-}H-3qH&Q`+i^{_1PizkNgb{WBZ>HSwPl|M%z31(*w@*9BY_ z@E+fqD==3iG<m$MYozZ=zqU@<Lz!G#&mXj;|L|iR+Pe{Z>J|9sx_Sw|SywOQ3YYt$ zYI!lJo=8nCFZwK5Y+C1?rX_AuJrmw#`7e`}|2k>8cUHXtFP&Ad!INbz4qH-8tGD2j zXKXD_srTUDXVg3JA2arxKWFTBsYkG=IA`VJvU*+~_q3EBmohkM2ghyT&K=+_+rhi{ zfp^Jue}Mzwmly^gJPLk|&%kf+1^78Gg5Tl0@G<^^T(<w)+Ujazf)udb$v;QJjvKP$ zDL)Y6Sc7Ma>k<+;vhz7XZhM%ZLpn+COj0MH0I^H4{CJ#TDos<JRvd~md2J{V0H$L! 
HMbrNPPRfa#

literal 0
HcmV?d00001

diff --git a/juno_samples/matmul/src/matmul.pdf b/juno_samples/matmul/src/matmul.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..529c95335bbad3971558e34beb66f9b091600b7f
GIT binary patch
literal 88675
z86JnB)C=WN$4>}09amyj(6rDO2&JkmR#~m##w98i+ZpWfKs`;UjtdCPj*62uAt$gA zU{1aU8nB@<?R1Qg`7m;K(J^qM(s8=SkX2V9D9NG{jFgKyeVE+G^9UJBe;!#Y>%2iM zp{SX4{(nq;V~}J`w{_dLZF}0bZB*Mfrfu7{ZClf}ZFAZ)x1Z;|_j|t|RdpgxL`B9q znYq_qYwwk@HX+AmF8U)G>>lVZD0SZPh@RUed3*b5K4tB?F8YM)-F@}3ML*>Y{DEOV zNDJu7rDWRXm<B$HRe;Z~aC0$=vs2|BylpiV1V<A3^0gxAyB;hlyo`<{ED~c(Vzg8& zNiG;57=kJhR_TA<d>R3!Q3xPbT!+U-*z6Z2X422GG#zMeAu!ycJ_20v@BjHY8kTiH zq}CpQen$cL>Pv=y(KPNrk@}yffM;fFV736#A9QF`iux9wzhV3-=9Io|elcxl@#N7+ z1MxXqqp*7gqnYbDRBAyVxf`%B%Y|aI#Dng`W}=u*xtFwNjJcx9dSA$e0YB`vw1q@3 zr5;8>62PAIf~$ZEjRNKymnf}&%A}|-Z?I|}{O0Pz<OUiLzvRVv#5RIT&u$^x0jHWo zbCZAFG^6@?^Fd;xKFA9v+uE9vZq5Tg6@50t!IU=PVh%{OJ-vur;b2;JqMi57gq~Q` zhbaA~1S&rs|A7v1dc=NahQ7b_{S3tNJn$8aDvsaEOiCtF^QEQb*GZ+-m#07#6{PR4 zCR595mL8%PZz__L<#qVTdKj(zj`6SmnTubJ6vi*_R0_{(-e2Is{vQs9lCY{z9~Il$ zo}hxplGDvs1pB^GjTDz{(YxT|Aq8J6x(fOAv3eQcDF~$mSGrUn@u5UmrtD_<zWX$n zvpEJz-a@`yYGuETs;(`S$h1l?p4fB0Sfw^Ai)ZL|^Y#}F^wul+i`HoPi@M7mN5TJ2 zVIO0*^gYB*vSZgR`sVDx1Fl@{|01<$d~}l$px8+oSZkZ~)UtJ4Qkqo7_s?7=(S<!@ zu=7%xwA&i*)l;mAp{`|4KtO1sh_0ZDH<dg24_Enw=QQi&soO(ruC9)FEadsnFKH<V zo=#C^a`r2uAczGjt98F^y)7%UOr)}p*ot+;qEX7TtfwWb0lH5D{7w|y57++MWU{P6 z*OGeHG5V6d2^$WwQse3c)gFYWT@rwKbo_51|F(7eN#QvDCB-pCsmfI!(6C<0&<WT3 zBIO_g(#^ubn=p`&9{%0vG2$h_Enh$-`S%S-h6H&!(8~CVG~b^O9}%ZfT(dIOTvAKP zXO|UW_NT#{K07a5-DP+jg1RN)P*ifSw@1UwIi9|%cOS0)Hz}sSn!v~H)kCP?Ie7xJ z{sNFC`S{P&=4lmrTMSzrV}J7CMwWl)>PB)IT3a8gvZn6{#Z<h&3oszm7a_(a{2>q% z3GKVZ-0g1#X%G}gX8i1R^`R(2sOFd#EPzs=VW{<Ua%q>I#3=)B$~rEpBoY^>ur@;R z{Q-PJJ+g-b0{h}t%$IBr6;B{M2Mf8M4A%ZY1&1Kt<KlxRcRTpB!$j&g@+$fS(rhUv z@iDRq0izyDKYxhqgTM>X(!6Wn&tWx@Skj^ULhfmGig`8NjH6D-B#Mrk%@V19sK$jE zx$$Dg$fVD|_!&=Sse50mq)9uca7g}I9L3<#%PeojTVx{Xo!wO!L4M){f)63w4#rdO zXO-&C+zPF-oaYx4HnBOa9{nEQt7MYr;h$CVQ&PFI&@aF3>C0yemQ1v*{xh@u+P%O? zUBf6H6%Dq5?xa;6Ko1FM7pez|ANaatwRnU{ZFDV1acM!IkE)lz4}k28g_4d3VhWb~ z81)uXumjuHba@R#J%qnZfXMv|*^3$OATOwEtKNgzV;~OpzCT!b>}%1wf$MC@3M1bN zn^j{orneSdm|I1w^VJB9AmH*+ea?ODX80oZ{J*Ij53&aB>)UwPmz8L~0A=pYYp+CD zWz*x&CbgU-Wi)qcw#TVJKZ?H<CuoR!jtrR3XWyz0Xm-sNTj~yL%4M_V<IfMVq$Yx# zf8yQfAzIze#S7y7#q;VLu6EdbBWc$8QY7ms+PH)e!Jq15t$@8`LyFknKqwLKy`?~- z$%*urKY*bhRY(&jk^ta50?`Ow58lNmN#svkZz-gZ=nWQ?8!OVe*%n@eN+kpx5Pv^^ zcy1VO04MgIHqq+LJLA}Z&zft_GHvqPrpqUteh0dTShv+nyFO+p7-Nuo(2#4!VX*hE zqhX;J_!2g(L_=CTbN>xX&QOZnZe7|GcNCbOb{bNcKr9p$t8>^=^14HOAaQnWM^PtK zCQRc=L9=6sK8<!g3tiq9Uv`Eq`-maUsan!Ssqw83De*bP&EJu>Hi%Pyz6)13DBEMB zPn3WK7B*wSCPAar$A_y#8J#!&KQbH7=du6bQ$=&?{ZtA@6t{>XXYC#<V9KeHR5yYJ zZJvmUa*)hINeD=iqXY^)`vov)9A0pQxL>$7+&8Wm$Z4?JaJ;IabR^6VGY>`;TXuaY z@Zx)X)b{&d;!gMs8eaqsxWYQT#`M`YJb~iP?wjTtl}DTkBQ`7R2*s*v3|s4k#5MXq z5SLnt{pX4JcY!`-nn6rG0B3Y-fm`E_R}vsZ!)fDmSp!KiX9&Ij7F;)u#hL&jwYK~- z9qYa^wb*!SfSlO47n7vl=$FpKtJ*CjCgF2%08((bIkX8fb}CSm{KZDsfKeLC-cC#q zrL&NH{92a5_)$POq?J`wP}Pz{k3C!+c1W)!6<~*mFtKws;dF*kc4y_$u&ClVR2_A1 z!j=u^WmDXGP}&x^#_`wsH>AOptLfg(_1LiBzHXiWEp}b^#daI}4Y0<NZMELUB|n^E zy!})pfnw1H2a#j_5$GCgV~7jE9&D&q8eoDi$!utYO;fpYu~W7B5+cFv<=A~3?*^~} zb?Rj5IDzQaj{f@da@~7MY&bsn<$C{h?7A4AN>%j6dG64cWx8-~IgnU7z2s399d3)h z-_pM^<F)s3|G+)T;(PL9m$l}rmTcKVE4FvnN`Pmn*o?Gc-qlKQ$*Zfi=|HbY%?jLE zyb{g*-AM9P_~-vbm{3cb){mb63Ey9=-utUVn<p1S+@nT@C=oV`fyEN@jU&H=Zy%_% zOG$;3Zb-ulO$|R+FmZ>TPf9L7n-X&m!&ZRoEb8&dmjV**wm#4$K%wB`!5?kEI9c%7 zU^}`TZ7)$*H#Xlhq1EXCeguY<NC5lGRpi~3I?`U+P0L!_9F2%9=>A(FiP)}YRNg;{ zxBWVqk>LCrtABB6F~VP+`wl|>R*rxWWPhqR^m`!4Zycl?#|FiKpeG3MP2!tE26P30 zfoSTatrPFGnRZ3sr&Iiryk-4ht&ey3X@4pQMhw`M!owvJm>Qa|jdnp+D|lSO<^gT= zY7TIE>IZ^(Wz!z0Z8A=<?kQBUs@bR;c`}=RRCtemG_Ym>Yd{w;aagnr!Ihduo=nnv z^j(IQ>!l6{tqw<o=^*km#Y1|NBuXeF7jY0~BDP-kypiLG7v7=A@l3A~h`Fj8f)pdB zQ*B>WNjirig>8-mfCs1!u*{|x`YCiTC&OwRRMS<8jeuH`grb~ikwpT)23LAx$*q1y 
z{5t7)se+S-2}>^1t{ub<%%4g;Us^b($@O$A$oB+-#bq(xS=OHS#8_w9o~|kzws6_l z0~7x!OuSyg?=U(2mPv@DikFVewR3i1!UR4TFKP7rS*|cqB34s`c;FH|tPNR33Yc`b z<P;P=|0VArG~eEFG+Q{)m#j$iO=F`}3S!^(RntZMc`lqcRQtD`X$037O9z89#-)i_ z#@1Cm=5bYsO*Z^LpGWtjq><qJ;X2lbQ80M}pz14e{s6_UT;C|XceAH2(`d2G0W<$3 zdl$?M=l<nNrgFp(z$f4*2#;ZPYG0VmasqehSeaN4G^ha0e2B8HU_p0Y$Nd-h=0-dd zg8@=16rQHgQ=osP-lSWtFcW24q~$l2;N^+^pVuUQ16UHB%N|yGxCWx=@)jy(cv8|a z?D}f5a)Yox@?nfB5-l5r9A#VJ^YoPYvO;lCBGvF6u$<dhc8LP6zJ^H>du3+zbCKHf zQ)L$Yv?&NjP?{?Yq~W3K2LbHlnggz`rnGC)!z<}#j-ImxWN->RbkNS5|4m+gx<-`7 zW-7AX$88)P9@;>gpRu)aS!0{x-3G;}?1+n=S!XgP4kh1wp>66SNSa*LT4B>?w?wnt zTu?(Al)1KcT>c}2{)fPfLk+@Gcm!?iU^I~#&9Z@Jawz|`*Uxeaaf04`AM9n0!EcOO zW*SUlRtYnJFU!p$FsNTodw5GE6~cnd02Eu6>R`i?@Fp^5F)3G1ufW=@|Ky9@9DNCY zd^Aa5)j=hp`iF)xg>VGplhrIz*?`;OaFMgC@mTx&9^BZC)~M@7qj}1|Kw;zGz{x~Z zaNXJV;(P)7mU!>kOEx;m$x=3o<z+B$6b-D3U})t;?3sW}3iQeBd@dI|5E*X)Y$yfc zN)%jS#O#gkAaqe8{0r!V&Erx!BTxKyz3M=<K7MOnw6P$8X+KktixJ7B(rYyGu(RjV zH&BQs!-~FZaCWl_zYKTg1-zt|he<OjQ~km2>&{B=`ipIkP;h<uom^2yw#RZF2IW9V zpwTX=b!*;sLZ&?Ext~*dypr!?S>_nsT)O>5V}7~!3<bZz%GmQaZ(4I4?wy(!vCmzU zw!+{db<#wk#Occ&Fvi<i9R3bZ1%9P}iE5eHhs}%<P*6ESqGv9rgdI(YcE#BuQoct9 z*YGS#7`o;<ED?@M@s0(<g(m{nddxMvvV(LR_Wj>Ge;mJW<(%Lcbw(mKToSMi*LH1D zw*MtJAT%N_7Wf(dGutqNPFbN6HvcdaSvfoV7ODJv?JbCjSJc?JxF8%7EpV#6<-0G@ z0iG!s7J7d|7lW%`n4OoD#RX+aXBSnE@X>qO1W5n6ceW91lYxtgP7psaaeIn{c>SN> zKrui!iM_wck7ND5#H$n6?jZ6m<c3mF;s^88D+H+ty8}7*#*c5(veYRMy3dxUWlNBB z_1bi^WDa2y;l5|FHR@-s8L^JtW<?7;+b#8_T-kQ(c5nGtiu;jX+&6qu=Ar-fkl^@2 zuhP-Op5hq9#c5ulO~1t_AItc${rk-;n;Q`Kcu|?^oLYH2T=dPRz;R!n&~5*l#B^VN zZ$X~f#Sr8Bppy%AnUjp^TfeEAB1(3Rqo-C_#LcXKj*mzC!>le(*IX%#zoltaIQfW( ziN*EkYcYplbdpmoJB6qn@!YQ!O?tN@iDH_g_ci0UJKpkiRvsTzTXhUx+*sBpwhKJ! zeSF8??xKH{(>=k$sW-%Ctv)NnzRF!rtH_ndP$HDEzCO=*sR9cE2jUgTn%%*?=%wiD zt@M%&dADRnx{7aC*%KpIN&WFuL1smOZV(>bT<1uO(%}?p)UVfDM(;i>%zUH$P<*{W zS3;>7_NaP<wl9-q)F)#%Tkv=&Honi<xznMWc7^)5VfyjTI2%*79Pv?#0sKW=QP>Z@ zvvJ2=-@tF$75)Y)AA`w(f3NauxpZs{9gpf9#Pjb3oe7dY@1Q#|t4aTA1quuTxI>y; zl8~%Jb$yF3FrVsP=a5;5HFC|QQB6G_ha(vbj^{II8Oy`2@g))ev+p_oNEmQy+|@ns z-&;HY)I32fDO&t2Fht+Aum3fvxBs`Y>a%893p`qN1>AZ&0I}x!pTXe2RQ$;RwdtM7 z+lH^FqM)d~@CN@?9Z;-Oi%yq#Wy~idB51TPqJ$u^wW?lE)`yE9&w3b3;R-D#dc;d! zA5jPuvCdkD>^1+f(;7mkZoB4@Z<S6wK?FMKwGZZ$J}8um+P(t`t)kV*#a~xAI~SxF zC#>z&kyVIjB1&oP9_tU#N;h<V`R-#FY^@=>a7%%+?g-F6;Z$YGfAOj?sXn&OrNKrD z;O|-ynHki!%fHEO)OX#m-MopqsOu!X_1e1rfw7V6TOd8^Be$9?bP0vf91c#$nWt=0 ztS<~hwkVeol=Y4cKVPndFakAM&KlZ*us$k^wLnzTn}~HePByJ|^x>N)ie+8yzZ`lB zvIMM!7#(Q{gW(hdkw2D%_IJSxGi})U6mtkNqn$P?KeY^GP|_U3P>Wsy19&~3H3|uf zc1>^nfeu862Yru^UZH<cUkj|S{4ct@i|aoeFLB_8uO5bBFSKuvWF!uf?r$-+sEhe> zF3gG1maU_G61-N2<HWf6pqAuGL*xl+vbL4V%;JkNI7}S|UANX;fgr?enVQU|!btl$ zAg~KKN?9xNaJ`FQjmvx1Ic`@1gCY(l`S4w#UV_~C8g04?ujbg<AJ$K)adiQ$sUKuc zfA_HQ))ZZ4M(1pB@ReI`OJyw9D2sYiO;hu&7d=p_laUjOjZ}Q<G!V(Z6N&Frs5nF1 zGp*U1j<^@_R#rNh$Wzc0G{rtySCsEoAk)czn*-@1zjnzx;8);6UrR)8_He^rX&kO! 
z?f?j!cY}Z+{ujf$b1sscE?&Iuf_~eryvB&V|EObljyudP4P5PdT_jdekLB9X-hQka zIRj6E7WOT+89IP-@ehGu4#eseh6g~zW0K)nFJl)I3soM6!&9OuASe&FNb*RFT{8Jq zG0)W*rY~cP3O>?%g7=L*!%gM(%NhR+7Dlx5=M_rohq`$lhZ&^*wQ1^c^JQdbR@-L> zQ*rzo4rU5b#p3!fjCy$RZU}9%B=;M>t4PwfnY$@e6&+f(Q8)jW0*Y(6#>^6)<?4nj z;GD!9CFWB{EzE0<rQkAH9YuEms#c+I`uy@JMSp#PZWM-Nd<QKTri|f%BQ|^I9$hR> zc6+1h4Y@tG-It80K#YlStwwHq5a~Wk;jzLF-2*B6>wK$G1ev1BBU!zIfHLcXNj9qA z95hT+#1%ALk-G-m;`3DALfW&lIM~m7jZXGEYU6Al|9al}yGn{gujZ?h=-wyC*0>~- zLWRtKTuQq9317^5G`Y~`fX1-ycv47mlz1?Kh7mh-E<c@S$P1t<pp!<NjBt2*|7Zvm zK*OR))A7rktcJ<v2L<@yw`&Cq;$dpH6_vgC_;=Bf#H;H;yf6nz&I+ybg^#HaJ7G05 zso(xo<%&rUu0KB*Wi(>=u1cOT!3>G-QQjcxUP*}-x)7P+l9N%`0A=g@TGWo;R`NdG z)Hvp0zVHSzD|r~zJ>rtJnZ>M=auc{7rw&z*Yu85-zi~u+uX<<c9!rp^U5#83jy1K- zk*sqF;SPTgOQkquc|eIx$EQq!d{VCl4_JoMwfEQe4*TM(wh7dTkbnKM)ff9pMv>Wa zGo@%|`$r$hKOc2A^b33ahm*w>s;<Ou{>C_)Htn(oXurp*tH`{ci0X*VxBrovhQwG$ zn(pU2?{*Pp#aU?H1r#nie<qn5_W}<NdaUB+&cMzKQw~N>tivT$Q$pOLBdNf8vq8=; zcW!@4<-LHZM}gcskY^UQBWVy#ILJu3Qm1QP^P#YTV)J&&xkUM~kHS`IMJFCuiaHub zNJ7q>h~1`<yTmW9wMf}qgA|FwxK-LBU33UE1;?6Xb32fVOsuEW=w%JBFfcXd4pqe! z`l&<bFkJi}8(G8wJwNs)q_@zfxN@!s1Q!n^%5UxdmI4FqbmbF#1Nq)aS1)~3-@4MQ z_iD9$^QU;WXY9~=u91WQ>CSIbHGnm>Jpy8Xs`@?mNoW~Z6djoWzd9dv7-K?c$t$p; ziBia>0DZ}WKWw{k<@~#N#h!5gW&b$C$}`3^O>(-^o__3=*_d&K)H>M1lOpu~rgDX0 z*2H>M(@TN2|H#ta^Q5=@Pvw7DC1V?492S16>CZoFUdRMhBJ7tOt*ZPJT>yL;HIHho z_LSmRx>-@~ZhQD{pv;Vj6j-J!eV6d>TxZJA4qd{+BBq>oi6;%Di#|~wG$d5qqDlNM zpQ^mrnAr=-BS3(=kcs+%Dwn6PmCm8cPzT*XI}j1LXWT9^@cRTnmKO^+m%}DYu74Ia z|F_zcZ1yjqbkel`F9`XUdTC;DH2X`EQvWncSYyp8YLJ2CGPb~$D7{!<FF1@GNfDk` zUQKjb88WF=CSqOh0vf7Nl_I89EoWxSNp{CwRUp<srEEw?bc>NkBjEFs@!>+iH5yn6 z>Q_to2xRAl?$-*W;NRxg?|l^f?o>w#LF{*oM8vNJHX%_P^bQ4{dzGUND@dT8z1*HI zD@gZXRGs<z;T;tpNA4QfOx^EMR?4n+>AQd6`}?UgU)169Gg=MpNAw>bxS8pLCGN>? zs@E4IA!qiiallaoLSc|k(^iRSdn3x;rheF!7T3I#G&LU+V;2$2kCzMN;^->;dgOcl zbJ!|wdTIx^2qnKbyDJSE@7^&1Z+F&t+$er+ubJ=fZC<3a^<}j+BXDO7cKG^7Su_$B z>jcC)sxYP=R4}l2>)`nD$D%AqjeCBxW1a`}<!ruqU^DY*r*@Op#QeP1rw=HqU~g|~ zxX1TC!2$hn8K{Y7f?V?CFcF!otQDLjd-0%71tS6SvY`2>5+52*;Gb~QO|1lS=$)3R z`GYG~Ij}IESDcF^UMiExzxhP7;HmJI@JG=*0DApa?Gs25b1X0zM_9e0U?E@v%Hb98 z0YolLLX@f$=<XR)NVlI^tqVY!)(dK3o)_nMQ-lNHHr`8;7ThmgBgUJPIgXP|T7MGC z0fjA<8fSx6O^-3j8-wwj8Ho-G>_X5L8wpQ>%mj#f#<n6dYa4<I{t}FmH+^10{lQ)G z5FjjXvRea!(O?!lt%%=7*O~vqc;c4KLvxIyivLn8ISVJ`Z%?cm_s~pq*Yf3}dM3vl zNW!nIi}cWO8s+NE;&d?p*20S*h;vU1$2UOSCQ^#_$u&3!YU9#XrA@%E_>1rFwn}Tc zd>3fMzmnsBazF69NU*=seB+o2qQ&h%ffC0~$s&PD^OEEnXp*7>6reUxhCtc|sGyZ6 zRzOH0K9$CK8JI(puqsrJp88$R&inAe&;D6B%TC5m2rF^;0JF?`e2Rs(fflGzP%w~D zvK}c5P6>vMlL(MAd?mHm<NfV;Q8hGuWUJ+(lz0VsgI<mz<H0OZ6xP6WN(_-JK6hWW z=;-?PIB9EmM)8?_tdw~6E^$*}?@2D2Q_e>ktG9C1Do?S77n^<Vnz+e-bJ+is@*d4| zUiqDu`ydAeK%v8GWjzjp-Usp_UGgR<B8>;FQP2P#DbZG&2ceu0KU!J0h|d|9N}XDm zSy?zp%l|k*+9us06%~6~AyM^Pt%gt5C^XsLLyE`;6EU3Vn%u%Fser4-=vNhRg<~ZN zw2y!r0hbLmL}X?3@|%By&o|j#2d0X+2UbKm1gC14#mpClgi~~KF}(y!9=-#jir5E9 zUONR<Z2TL^14Ef_lK1=4A7$$V5|DTs*Muphn=jG46v8*BxGeOpzEmP%DUvYjs##p^ z|JSXIeN|9v7a*L(zRaW?UzW56k6ZNa-gE!j@O(+zw!3#(0N@I0g8ewaNegg*#%K`? z1n`}A+db-`6U>jA+!6^D?75$TfYKh22}QPg7$$=v-1&qJvE_RADL2cFQ8lN+mna`{ z!N_EcQPG)h#SEGT5ERX6bLcKT6f}EVWuG^DquoSp7<fu_eJ4}~2G^6X$3=HyCBhj4 zWc)>oe+HpXzq`BEkhxv805g$?@SmLFl&=_E1ChPC<dy9Emj3+?o^)prwjNYK2qI(E zm^Ao|bA=|K4G|0g95-LSrxw_c3r$|^U{ToYG?13gNeKfo!Z=J~7d)ijL}(sO94kc0 z<b9Ms&3}t=5(OexEEpb=*WErFW=hQV7@(5~+50laTD`%9u>>1C$`a#6k)qt69$*C! 
zZ0Rj;uzy%wqtAG6Jd7>sM-_dm(2+bF|D=`j$FiRxLoB@!YEc7u)<s;9<Z52?(7MD> zU`EtaTEd!Eui9<6ZI6hb?U+1l<s0moctp{S<0NeU{92A+iga#iXaeI0ftCp6NR!)@ z3s{IbU9L>MafH$wr0+eWv(0Oyctxi>x5@Th`}5VNijDNbn=#n}x3ggnwwMilOSa!U zsG7!v>92lcjoGc*MeT!6<=+2C(;IFOC+rIJXZ`e3?a{uv3USuN7PsUY8Kx=`oh&e& zgh$C(LK&xzA4p$G8p9GYORg6dN{$)+`6~DU&y+H3Grn<}$Qb9!MK)e7x)e$?4jt<< zs8(MU<Zgu@3CRg^&g5T5;(#t!PsJ=P#*_-H3_fLkuM9=>3~&@*(gz6E{7eC!4c~)# zB#J2@PSXIk#6uDIwS<a}qU=)DqQ2@nkj4Jv&w&M>pIxq+Y883_eRQhzwkjz&lkp1{ z+dv8xmVzGS-cCONxDzWb+c=_toAX>Vm9+N*m~lsS+{*Q;nPKYilu@*qL#77Q0CbPd zPVvlnOr!hRRDW`D<-vkhL{-NY2V<**7arzK#gqH7=C;<V<X3vvQBhqk%*aJEk7Cz< zu`=jm#PTia^7-v-pGLq2T{7QEUJ4`eibPLFOEoD0;u^Lu|Do7tx5l$i^~Ia^@~VUh z%8NURRVXNOa|)steMds+l~?o|T|AH;%y1Wp@KB{hsMJPhya`=SQFeds6!u+lbl;Ew z0^OaRsF0YTd-Nzs=w|LzG$zW&5?oE9f3mhbk)C8r{$!Q??Y~rG^w$0}dnyf75t(ZT zdt{KB;&gLcK$g{vqYr#K#nyt?ezOtQRmvq3f}=AF*brz$zUeTGWcA_u*N^CC!AzRO zhXoBBYjCW4TN;D{6(SuEFVZs|>)4kXx+&n<8-E~X1oXrC_m69}Qw^15_r4K|`Hjeg zel9$tq>@w-8x(SzvNe+cITu$k>nOR(7Ie~@vR_gwPLXo&|1Im5oqvd!J9=@ZW*kRT z@|Pm1HIBsSiM5VgF^*hu{`dR)dhxE3sXNdLgBIsfiuoi`9FFTF^`vHVJMRhm`ysAs zW{gIvlKo)`DpFI#2!{PRiRy%&{t!WR8H5=mMuOCVrAnyI*@G|g-ECWtU!*ETk}t_i z<5&kpVnwWQild~qGQw`8xX6^W!UV!6DG=G>*($u1EHPJ59wCc?B6?&TDu9&aet%#c z%=5;o_ydUx6V<MwdR;nOPX4sgsX15R_F9L&-WUTVw0PVlen?MqqVb1){&>gDGn$r; ztQuUqLBo*MxhjHn-*)SIdVzM-r$}%&!Zk2jDMu!d`bYpvmNELEElHnXxcRTQQYQdR zHX;-xitn0aJ+_hyc+o!ol7p*gZ@_L)$ynyfgrJ7#Ji7WJAzg?u2suKM6Y|ns$IGv1 z1~Vy+@<i<s8Kszc52_!8LIFSU-jHJ8G9c6EW6U*L8@nQH$m{f<bG5fzHqI1Fe$JnL z3Y-6<!tMVFu*REzFldA6i5w6ocOoW)8K?abEj&h;g^wb`mE;ZUj%ouFk1{pTgWs`j z`jaIK(g)oI6v@6WUq=+!xACgVb`c!GoOY74^2!~4I`pskN6hk9UyIx}SotgqJk@kC z99H30|6u@6*vO}Khqg>=A0W70u}I>@J`kXsjJ_5K>UoEHPS3E*7uKQys*R*{CAWtb zJBKQ=Pk(*b5AUhN){Yl?ue}|54BOyM@}BHpW0>L`J@yW9f}_c=yK7PNh$O@REE(y3 z|4c8dv?6{cKiFt^4*h_F?8)zlYqHYb0e{qe;@AULi=6R?01O)Vt|iqND-MHycV;w2 zM;$hG6G;!um9?a<03%Y`X4K<f=HJeaw{bbZ!=z@a(>!tBO|@aH8d600I^YWM6+Xs5 z+ZTzVjII>a6xbNT<j&L>fFbAsneE>^nT`9<@mR%)8s+Df9Z`dCjuJUk#f?gH?WSgq z0%W4$TIzriTzTwB(duSd`qJDfm;1Z&;wkFgz)?cupLe13EB;{&J!jXzLLXuubl-sL zXSS9r(?PIUYxcZ?W@U0B?vmN07CUY${Gggd3Fq}Xp4xzlAFXG6d-{TuYgDE<Gq@_} z_7Qg^NqQ9$Nfq1HDup|}bbDCKComF=mYpJ43uJKOr?TZb#cClKMb9$<&z>rB7x@+z zfV2T%trfN3c3$b@xXt>q0B~X>3*Xy9SQ;_I$*<<~VU+&$W_V;0vu$Sj1Y|p2*BEGN zgS_h0a8mSp5o{k~6k||TGbpSXXv%>BQ?O*P9oQ>UD|>XA<Q%$+Jo<5W=H&6zxW(cq zCAAerX#Qw7RDZkEG7*u6vbB0k9x%J<zYE|FHZ3MNA;(<B`6M|IX-ACE&6PDB!npQ6 zi;RphyCK#%prQy~JChfiJ1=(tzauzdu9UoO==rSmu~tFe?P{Sa7XgEzp12~$mj!UU zJqQc5zfwnsbSt<@PoK`Tn_|hJW5+ae{f1fxblPKgv2~*!Ji;-h$)7OrQ--?7hnt+Q zZA$P;#h?)jkO3iR25R=;YAv%4BQ)q|<f3Y3Kln~6^Z@S;Q2d&6RRlRGi^RkJ4W2;m z`B91-^k@zcPW7Q^f%SQ9(cN;vT?<U>OE!%>G(3PzYwWGLqInWYy}m;z0>d~^Tvn-G z1>cvHhmQxK%~K6YmpnmZN!C}vmOO1`2{g^;#3h&tJcO=J#z-3>Fb`7Pw-pjfNf5-& z{z-`T7v`r-)NC{nqce0CtCyWWt1UrZz-69mi0JW5QA@6SC&1Xq1F?1C)|%fnp$GXb z+UQKi+G*f&q5!vbltOT>-P{0n5RdG3k{jq#dnPkQw_op_|H`>(ay4d?x=t1%wg9oh zO42mq8EA@@^;<{zj_3!-O87jd|3%gt`V=UlfZP0XBaSjNVq&1c^GRJbhZdEGiH=I@ zh~%pj8Spq;xnSzSoi^Ff8%+{&h67#V7g`t`exYcU>$*~d)>T2>ybzP|IwHk_({HK? z-*hTUHMnXvIoI;#Aw*Nv=n)UFd0ipq6{;XHQVG*NmR40bX0L9+n5wZ-+$<4ic7<g# zLGjo>3>Z&Y>}^B12$V1PkH>e7YE?=)5Vc6r8g=BM`cFB(bh^&gN!_5Z{&ql;i<3Tp zCWt`bW_+%0?$R;De3A@W_^biB1LDlk;2{QwDR|LDWNzDXxXN1Bk&Ek|Rd7K2@mT<& zZv#ke9>5;5uuwx=3O!o%ntm@w?H;GB*%c(8YfH^Qm<Sm7b}L3V)9!VNNSBh*u(l}0 z(>;DEZ#QZuVH#2xNtbGs9i)XCmv4{gNc0#;CGG(U86cH1$jHatl9DH$MsW8kHN^Fe z#P_0$(d9q-?~V;pJ$j{Wi0JGHNTkY^Xwxo$DZVTdZ?~|*i0OAH0}*bvqkY6MZ^*6! 
z6T%s8JfQqp&LB(E+QAp{xF6+&O$F_psG{QDisA8@fxuEwl!dn{I%^Irv+o@|R(|dG zS`r*auy{;c>L%p+cs*CJcqT@%sjLPpR>_3j{{Hc|$fl`mb~Q@Yoss21?B>OzNvB=k z#M>tUYe}<;^o!%1dPl64c2$44;g?J1CP`nW8#ahahP2)dn<>TB%9>!pZ8DSv^5d_m zBzD=$@gr6}b%!ti!C+;lX!#)~H2`R~7Ss%*chKk;qSz|deO5`5)(Y7v0ngzwscAyi z&QIiKMPlSljP{|SB%(i|1q-9v_Z~KAYY8w(uEkCwdO#CRcO3q6HE{S-tO!UE;lr){ zcsS6#W!0*~z+O^t6i+-u`U43qa+KHNLLKqNx>oJKLV{T_oj$2$=4^F-lS=U>c_?kX z)qiRt5moK7JC9#padVC3J#n3Sw8Kz|u0lJ$gUAfu&T-lOapN%X>0PsjShuIsL4`9M z$;T@I2N>}N1{e<JX+jDnB~9#;7TLYQr{qFptR6>WYGdC<`$2iS_%gCY87DwV=MY(j zS_KjIVfU3LoUswwF?#7lbes)x?%1rx!<XE1KvUmp$43J8%CiN+bc+7ECdoye4vAz# z?3tn}@cfMK;Wlk(l|X)QB!rnfG3wLG!WqJLuceIetvtBUo!z5yU=``iGm*l+OmqH9 zaAt!$ndcf3Ee+nQV{+V&P2RKO^9#jFPC$t_hpts-3oHX0>YS(>+JhnR@6?XVvkwb~ z{u_+uN#VB?PoEBv;CjS;Rg6b&DCnoK9vGjmX@Zm&pB8e!9W0XH>W@3N2Lu)sa&@%E zZni#FEBI755F{&jl}RUH$}PNxlWaZ6Xym$tK~3L1c-Ol~%l06dA2=NLee`S$%5@_h z=yQP&-#|LBIwlBn2P`#=m;W$}vA;-^Wupl(|BgC<4c*;;(N-HABGRFrj)`NGBzkbr z?ET}0MBt^FQYbD4LG*<(U?Q4Qs1b}8I=CQU_u_e9sJ^jSFqA^ow3N9#<1lA}47t3r zYlIr7hW~2ZB)NET{t2+sW>4l)jA4C2XeuR0BHa=5+}%jc9j_}ChEs#Y@Dy=rZgw1z z8dX+%DMeI2A_unrTG=>9jT;hj?l2YTs9&8Vb!kZh(rLzyw}lwV8-6^2a>O<cNh92# zs#;?M7Eff|9~~mo$mcqeBiw<rUo=qZF(hOZ5WG2rG7r%Ukt{PHYLVFdx(yIoY0vSS z8=J4YRrIAHU(Ol|I}r{Oe*`?+{lhGg3etS=;IpgJd5P%8FbJfiNWQu<WFwhKQ`r>_ z$=x9-rPOm2v<@F!DW7IM2m8@v;+tR8Xk(>6O;J5Gik@R=$q(Wx5rp)j(ljS9i!^9p zGS(<gBd0@q)ZE$>vx{+hE1rNJewEo6Zg~ag<QDFMSj%crpgY1fCFJrP9&<>+OQ3LE zzbiAyEuI-v?4Vb~QZvvbdtYHgq-%sZW@r2r!zQk#QV=dwYUS9q<Mp*6zOCEx4bqDP zPQ?zv>zlSL2;aEX2cUP7P{6V$h#Agd!D>ck%N%(GU_=yOQ;>PCg5U(Wk`c2b!EhU0 z&Dm%49NZrCXNS<PD3F_VXl)9Q3{%H4x6F`RaW}21*muVfGjCYbJE*VHfaSL1<riXH zNQDIRp=h*=;FGRs;|Rq7(E&2qseC6t0Ye63%ilh$luK4yzC*cT<emP#xX7YQwoHX- z4F1=<>ad^S&2E8#uHca@(Qr&~thn`a^xA?hHMIVeeaqBzTlt!i0a0JcZiY%=tV~Xj zIfL9-SjJ!;nNZJFJ6pbFRAKKpWPii==M+G^$uTy+mv;eLeA}MS^)b%%)ABDTNECH3 z8sVdS5aBB|i>sDfE8d?^CHCxg)*YA1%M*N`p_RIW2=jN9D@FIOj-=h{-^X5Ug_o2& z2?hvwn2~3bnM&fYN@n(3u#`29imD@F>m7IIpyjmtg>vr~BGs}7Sv3MhD3w=k19t-v z47tQ&WgjFwumvOOUo+&+$6}2hns*MgeX+j|IGnSTGMghK>vz+t#_HKI)r<Zdhd<Fz zK|L2l1_4fv;!NhT0S`o{gT+aPJd=(`WrQ4rmi__UZ)hPNDk!-&AVd~TcgmYbarY(E zV2WNO-i}glxdR>wja@WR#B`_tLDEtg&HUDdjE#U1G}3TWQpf1!yrshriz|nop{lGN zBHTM}ZW=GG)WGerR;PbRfSU0TSCQ#2Bl`2!s@N=s@1}0_aO@)Y$ZBgve4Dq37fT1% zpUYjkgzLbdqGhz4_aL5(KYY4~8f7i0%Y}>Z+JTcm;6*#mp8il!TZD-Mm@OO?`fA?p zp}?UbJQ9{S&AnH+wK7}lh+i42xH2Daz~#8kw9YtOmeuZvf%k=jHY2G?fqWryeS>WP zD^x3r;O8<R8G!A%)*~6fgr~$$r>uk`W1XN)ngEQzRx3uMPrEoY)+0I6C03amrOJm{ z;Z-C1LRcqOX_Sz;?137z45*JzB8xNE)8#lCe@MZ<R4Xl;A<_yZr)gXC77s7AP$J!E zx&UA7k8<gYfZwD=+xgIru$w9ucExleo-9Y8cFKN@Dti8W(<~Uot{4I%)do+-)Wucq zhFwarxj|5DixI<Aq#Dc@pT=(c#M|f)7HdcqBT6L!HpP}llI0**W(g#Av@$ls&W@p) zGk44go<xVM3r>_k_7Y#_2`P_0<{@py02lDrYapXMA^Czm!<3qG*2}Lt640CvtDZaE z;n>3-yQtJ=Zi_b@D)0d77KT_3WlrDf3Rwf7$i7h<8B6;B<^}P>QNXTeXwgJC$FU~* zQ1{E-U}tMbuQjJHT7tLzy46|P%$RJUZOW}V7OZA?7bD_?BmL;}h_;BsL8#3sPPH_Z zi!>Z>(lHORX)+=T0aN8+Jwp;D6T(;fG^TtzheaO2cCdrIL7J)Sw|7cPfp^Hil7-Un zrqBu@vE+Tw_OwAn4EcgEUGbcJk~?jDRDTXij6nJ%Z>K5DxgH|Embc^)Cb(FyfAZ?Q z`taalMzm>5lF~#=N#EM>8@CThgrE|-eoEy@)Q+tp1uo$aTGulj5RvL#Va}bU21)&( z1RL(~R({h6C?>uynb}hkrl0oFS*nJV&yWIBZL;VKBeA%g2Zy*v&N~may%_AK=6~E; z2W6W#GeVUa)vi4^JkWv&%}qREQauh^X|lGeG34%N2ip`pAgs?53S4-8%hdYHl>eSJ z8Oy6+gG&d#0Ty(WkiX$MXVxU*Dgs5(;8@iY#UdrB4SRZn3b;5u54hCsoOM=J2L>ld zr!ovHO{5)1JZ6$b=THM7Bp>2Yim)lTGVFMBuesC|(=}`>nd$-yC(l{M?(|&ORZM%5 z$}1L{2S_hUb)5;-U46`u#;tK^)djJfM#dh1$-6fvV^<bWgNZsr?x6`>ZnQx$%(7@K zY*WsQo?qYG<Wo#k#E|_2cgOt^(*P#Ke25j`HXxo>B+WD3(0JY#7Zjv};p})O1^#)G zDyxC2VEI<)cmY-L9$_4&)R0=HJ`BdRcr^Izz1(yy!b%H)qblaoueQk!8w7DNiX-)K zsMdzFbQOh9rfM$GODQnpO79O9tfXwy(yH7qlHwMD*S{yP)X>Tb5QxWlJfP-)#;3p# 
z=fvc;w5p5Etr}BUbv0^{wui>>Ye08tp=>8!N7nS1rNG&09rW6TUA~{Og`Q?v;DlVS z{IFd(g75_8_5_F<Zs&)l%+BL&z8i63P?j#KeA*xp?s|D~tS-}dVmV3TY6suT2rJ`& zm4hUlA&!BE?iOLh<HV5aMKmJzGujCJFJb=N78s+v(<c!*biH=PZaXL?Zl}_{aUDod zN1hXoA}$?)<0{mdXamU0{jk^&02%so<Qw19*P*&%4MLxrtGrt?0N>CBDTD+n>=6n> zC;DBOOY{9G;&|-Vh@%=O?=_Q(-qQ@eIXn`X1r9wPYQK{gH<18lo!5PInuG>2tykTs z$zjxtc$K{}BZ0o$?L@G`x-2Noz)88j5MdZ{ueh(<5DXOXdrwMIwCTtqA{|B$MynKt zh967HOme;OnDM(3ursI}ifm1}G~Zg22SrAja_9nF$XI<fND~8=Xu#=RE}zbe5nNyM zI!2SZTpjKvkTQxVCfdRs{gRU2uR*+kIXK4qrYDp~x*=RN<{L{Pi6E%2%X(OkNd7^Q z8#X&?rn*XkwkSRCdg~wn-zBbpf{Og3`dpVjpP^_X4?;;M8YYMpXTj_KM+8UU0X}R& z0zjP;Q=ta9r-Fd2Tx1YbPi=`AQC$$!ZQ4QI1TI$&b>y2i0ufyd2C9sJzvloVjg6<r zpyvV$&*iDp5$C2<gB{EW#=;OVoYVzLLWcUIn>H|vTs8F#g?nW_!Gx+JV4BC^-nZp* z0%VX;;ZV0{!a$Hpe7`|WZqWh=9=O}+k!C3S1)y3UmPK?p94jFs|7@cbuDk_TaFy<H zFj8ah;Blniex!&h`jr!@a<1UVA-In40IAHN1X@Z4CY*?R^Kq_#<OvIv9G3Y>{C=<@ zL<}@0oS~UB>=f6Bej6MkM#<@p2i>{>PvCXsrpZ~#-b#gy3bbT10EW2`=1*LHad#4u zY9WR_PM&0Yn&8y-60(gBIF5YuBC`B%H$=KSVyJTC@}N}j032z@y;2X8m6u3UNhAzb zi@T~FT}%|u^}_^w5FxB-<Y=gF-~Pw;fI(T45e4&%KApZEvw}^o)AAuGjFpoU#<av- z1&IbdCXXTku>>+H@o2F))za(mqkJ2`L|UK#yyAfQ5aPIKT~5ZVcHsLGP>uIkLU+V( zKV?)ISQ@%>J>`weDs&QFFbPA_I&sUlLe{c`$?~_kClB_}*CdiQ)w`-Mej7`p+7mmR z1Qj}W5a+P)*(eFCbe-TF-N1w58C~FVwv__i^e=r#lpgL>-K30Ah4FP1m`4te$ZE;q zVOqFg#N+ZB)9O`;e~2{SopNI9m?`?cBkD7;LG@q>ZKe__3@PI2KW@0x`PRWrt-a~5 zK=nEr;H@ZBEU+U#V3|~xLa6o_nAq;CcKrM|Xf4R|nf2iKxWUoG1j3M~NuSsosD_AF z-8B*O&OUp3EQdRiPlhh&>3A3-KN{<gTuzu7sE1>4uMetr+AV3QG*g|!hmsP01TFyO z21*Usw~$Dr*b9htf#D^*s{0t}xmGrA%~J?0YZd;;iNdktlJX60n=fS~pUrXnC3Y;T zD@_)wyob3NY1v&Bzow+*Xr$PvxT*%iF$LAvW9s9`l$SE15v)dVSw$CiWDL3jkpova zTU7Tj+v7P4kX~ZgX{sqtg&xdF{9+eRcr&cd4{sdysl8Vkfogpt&d|kc>pihlPGzes z9sAM8o(Szdt2QbGOqZe#J9@`L%49FZ#nC~8URz?H3%ke@6s2!+u}2vzccRyxPM+BH zFu{5rvW++(!k>oO^B0LdocfO|Wl3&&m>0r8F@Z!IIZuY@1XU<EY3|?aWMWG1LjvW% zB$&AF=@CIBkZ8s$O1MP-WBRW5LudP58Bk3UBD~I?DTXF&MAV4`5Z;2uLaGq_W|_%1 z4rdlg_eMH-V5o3wy5B6C0xKhFwnpF)h}bk2L|>$szRBY$nF$qfhyDQyP=8%A1;jPO zE^z%_IDx`_z$7SO`|tk<IK)t{z~>h|q&(z8#gJ|VyT`bGh0RDf)bBLohRpWZ>X#o_ zaN0T(`?omy%A6$#bhCb+wvMH2)RBdKreyKuQUgzZgU>!bt?j(;!4&e|8LcLU(^7(@ z0Xt_D9OlYu#~DVCp&BqZ2lNx#@+wq%KcfEI8>;6kg?J<%3Ji+5;ZhM39f=@FJJ*s| zM<Fln+P=juLh4xXp;NMmC!v@0P-4kNna?~$ruI1kFYWH8^vNk6WCDr?t)EZ}LL1=S z2QQH_h#nd|9TT8<m9EmF$GbAE1-AhXeahZDppo;pT}fCOm*QD?*f1Q3D&Tu76~x;Q zjgW<P)PqTM6XW4uW2y+4pl58{kd7A-L}(o0X*OiBV8HF3iYj{*v)^LFkYd>E*a)34 z-eO9Sk%jxaud~J!AdAf9+A7n0Hy<kX%JITmT-kEk&}1c9X`2prDbWYN%Vgj>N>6Gv zo(zv-+HGuuCn_y2zDpOg+%=htqtw;?mD7Bye;uG4M)cea@HbgVHW>@?cx@&d@mftN z23SrY7}_n}g7nw8Fp}YI)$IZo=ZlTua<C06G)QV5o4EAK)W<~_pi^6&cKOVE)AK`J z@kRH!UKxJgk8i&&y7w~u8FzYR3iOx{yASjH;{BulEUrk!8zVr_h3oge!u$sY3cHu( z&-lPAQ}EiG?<cxwR_^F-)r`jsU)F&5H49&;F1%_=^F1b-J5<&Tj)^FYaYLrGjoP{G z7yY3P%#AkRn*umQ2kU;iTY<1<L8uVPeQfpH>5hYDTGr@*s~CqR7r^dn+{|qHNB$z* z86sV~hazDp2kV*OSr{odHHS!cM%P2E4a4D(ayD-UWl!Emtv9%A%JX!4ot-&c=?lD& zu6jn&CiFcw(>&&)Ucocaa~#1=6&SJqDs>ERp^O%SqOVKTr*tTB(N-)ci9h$9o9nOH z9>X{E$5faD3gJ<O)P_UqXkwH+CGsgGU{90`&fn(xm<%2s3OG_<*!=!Ww#z}E(L0vP z(I78yyl96#{uUVx4_hyBSxoPUo9M1Qx2?45J-fCV3%@@H-9`XALvAC{vqNrTt0$IS zUh}vqWCeV-8k}x@diOe~ZYf(KpEg~At{?4s1&>2+;7}uvZqJ3RcAh=^!#=Q9l{Iq+ z*xYu8m-H3MBS%oAS^_9>&0X-Ysnl22Am<T25g1pH-ANme;n+0u0YszT4WMM;U}T^N zs`eBr#}rnlH18R~7!GRIHweV84_hDLz!hvPfhLXB=qcY`<mmz?8|XHsX&GrTY{8Ir zx^FYJHsGu>>w)Xy7)rMp66<Bl=1B4Xh<fcsRhO=929!k8&8$O=i4&1^rj0PPM$ep7 zQtFCf!D5-uW+B0#PzGSyqr;#N{?&k`>Asj~GuVlH>xE#3@^O3^2=u7HhXFSp+|klw zk6dn2>JZNgoq3o*6|;_O=(){sNey*+p+I8LH1ylZ$4m@Yue!TYMl*rFix1dPYIZ?D z;d*GSP~_mjGhZ_N2ED+y$CHzc5p*0`XSG^j!q2fvAyv^c{NLu$;#SLRz|mX!-xeyS zgvT}|fT3CMh26%*%v;wkhJV^$|FQk(V|LQIPxMO=)$pO8R|CopZ|{;eK(n`^G8a@u 
zes}=Cc{CA4iPbTjbIz_3f))NGC*jL*6uWjw>qo4QBgeg`fSmPOx(!A6jm>||Z4<>n zM;WG2k;H;12#HnA(A^6C*?ks0AN#=WFW1l>oR4RtQ5(id1EzyLdK?Un!D!K;lwmeF zTPAze<y{*?z1m_0EfE8Yw(J#1tu)9F4kae9ExhcG@w<cd20H;ZIAb4FYlBl`gA$tk zrrg3@Bb$f#8LqC3DGiK*)`n-l=WhAE5cnL<y6S}>!bw4vxNXW?JUOV3^rEoy^mM1b zR@RnE2y==}BtgySTJEtM0;WnTeOXt~OI!Cx0a_SKfPIh<#G=j9CR%%xmX%065n2`D z5PvzVb#r?U$5jWbaX`=#UPTB1<}f6`V1qAU1OspBti~ug%?g>yg$LI?@5{QJV{b{R zO41@248Z{r<zUCL_lFnIO92>XuZ3mK#3TjKdZzOvm?|kK;m~78`BT&kTrhg7E;$hL z9JBy`R6LcCel_(K19`S;&3iEr@-J`@G5@z7uI}D7v1<fF0aaSAwh*knBDE1#IV6`s zk9}1AC$oosA|OE`?@>}I3S<1N)O*2WKg!FY`e?}BMa|B2ff<i#cpY+V|69#b&?Tgf zNV6f<Z>NiL&i#~okDHh*j%CIb8;?=Y?1{>J+0=W*0q9zklzUBY3P#``7emn!sl!j) ze(TtXXbO)RM>+R<WX+rHECl}`hh>nV{JUhasV4?dj+_k+lkmJLW<lS?LihlM;@e<d z6g$VSalvb@wN+6FH=}Wf;Lpy1!)ZL_YPAWMEtq_8qhEqs7+{FMxTZ1585>y81`vc= z@WDHa!!L`kq@r-BC0ko)?l6cfAOoZYXJEE6<<Z}aO1PkmowE<dq7gX*Lnk-m>su@Y zUcY+@a9|9fTLcsJYMqiov__GimNqFUcn27I+4wy(l1!L^7dnVa*TRPl6rzzY{pWiK z$?jGnEKIH#e3Xj2g`e=K_(BKHIg8v5NzV%oUe^=1HR0|IX{y@vNE`gMW@gGpl!Goi zen4t0Y`aVbP-k&OJDH=wmbKB%tw@GSN;2c$=1ef49Sma+E9%5kt3b1K!%bBTH(~{? zVO*7Z{HBh#_OF#zmg8$brx}r_v7}sZMK)S8MhEtb;9wZn;K&d@kK|ACJ*G#A-5r`P zu^`VHHyz-C;!U-jB}z*{Uz0~$W~q7=Gmivuqiv(>4OF3uPeOZQH$fpvW9m>wqR1)a zMq5fxseI$3fOK^`+5<P8W&@rn^<QG@<V5j7bmWZJSzToO=Z&$Ek%W<d?d6e-NMr^{ zFH~zI6qaF+BaD>i2^!#~BS@R~M6E;6Hed^veVXxF1}l*UVUlkOIqNA@bbP&5vff}p zF~*93O>y+qkLN}<#G~KQu9KEpxDp#Cv$6=sm7**ly4}r<+@%al)zYPU*<3lCgA6tL zM@2uEAegdJKJFG@(j4M!!}+Q}vVzfT(H#aYry>~}X6PpL>;<C99^@}ndOXXA0T4_V zqQ`$~)enWUZiaPmf4T_2=tNP7@aMCEbsE5k(tg`!(r1^St@6G<FTEkc7RYE5JFOCd zcVDPbwe?4?{tsF20G&zjybH&+oi|Q4cw=vDn;RP&+qP}n+}O5lY;1dPet&%De&^1a z(=$`kGhN+hR8v*Yqvge@j;=+yXu|KgXv%U!Eck1^2$k4*Km9^dk)zPHI)r#}qS=I4 z5zG{e180Q^bxaJ!LME9bKsU{Rzsl3wDwW^)sCJQP+=L;`s{;m<<{9MSbe#jfl3a6u z3tWwz?rUW{WaZlp0>SR=m#{2?Z|FrOE*bmhaKHdCG~<@(N=7og3wL;AxMyLYF}!S1 zpYc<%6$6U2(0a)CtkcIhQZ>Yd>fSF0|A%S2Bu-f{$#4Tpa~5tey~Eq)^zhd4XS4a_ z6k@)Bg1C_mB|N!qgj<k;ulkNB-ig#UfsZ1d4S5vhcdYRXIh1Vk0^9Me#~e1$z2`0M zS2Mw$Qvui>D3caUj8+JV8G|7L$Of38TiG9)%TM}5B}#E7>1e}Z;8@G)Lx(85tYPNC z<^pOh_r}iwmB4eAO1o1@8&ummtMZ(HZCNnvgxj+Q;*W*9?P9@2Xsfp-=g7YoikPZJ z3?<iM_NbP*b;kYD^0_<*G5$D_%nrSveUCy#N$B7T@U|^rujHRNN$kY%K>vP{1}q@V zHZ-LLl=5C@f^cMcao0^iLk_uZZ$d(t5i4v#6<HN#KD9L}15|8r&&c(jr<182GswTX z7&(do)60<4Y3ex2Th1#qUO_FfT%P-SmBK+lZ9b87abr)8bbORcI+&(d>x|6yCVy&1 zV${(i#-Y&>9M&@vYnsgXPikTC2+}IBYrqoOx-_*Zh$Zl@;DG!|0e~L*^Ykcu>9|`% zk8w?b1ri}Q&0WiI6^h#Iub+Z>!6AIA*B_7AZcIae6ucYrP48_zmR5`HVx73MV@f2d z^rFXPhUND@z2cfsU<KYJoKczRgp=k$P_?PeZ4jIfs_NGwKC%H!Rib#d?#6n^hZ~aQ zJZLYC0U;tn>yP2YzFr(vM<ZWkTHCUAL5$EW6_0TNT6QAKukAcK!F5VINbCBOZ-@Th zWz~Ug@n=@jD999Z=NpZHSah&6uU&nW)a4Eld=~Q+>6hIY*^I1kaY)qVdIOx=B323; zch0NGS6I(Upa&TZ$Ah$^cwd$qks^r%R}Ik+W?R*6)h`0_nzUEYPmVUayv3*ZCxY(L zAIIBSpMgUl7!;?`Y+s$Hqo-N<9VA@bSN!tX_A`>!WyXd0(>lN^pMcvTHh|PFlOh7R z3x_q#@IOYi6wEii*tA5ys$GR8t0*yWnK$dGPU7|=CKKsXZHvLu>DGg)&5w3mn%sXF zk(j|pr%L+9Ah@LUVFnF0bL{L&%g0}!nZ9a+KD{%G;A5M0H*pQvx%*D6zwGcm^?4tM z5Wy+sZ)0~~+ehtCeQ?HG+o?gwYW<v{m17Lc4?zYS*td5W4sl7BJVIW?HR$o3C3pKp z_8^^?NaO^SLNlJul=U|4Fj?p!B@<|Rz#ou9omh>2x!_H(qPj<Zo0ww}gX?_14n}K6 z4aHRKEqQ7p2<rBk5)7l<2<RlpOBV!Q-!DbYjx4|xp)Js?O3OpjD0{*@aM23H>QN|G z*9=`?rSuZHuLBoVp)v@!k!4;1nW5$9Ybx8nxWL>HRQV6bHV$6e?uki5k8%8@oECJU zz3hyW9TmS~V9hD<K?=<tkuO%Iesm9?RU>OGiGC)UC(0Jy#IdtZ$Y?T83E$W)uVj2X zid8<#?}?4il9eua({>{h5MYhVHzfpcOYW8vWKh7UxwcsrL-<xDdToPXhsA@N0I9QL z&k4c<8iq=kMljHQPRrl9?XAZZ`PnG%Oifrz!~!wmoiw?Y%71vL$dkO~Iu9u?HL~H& zk(Xzj7$Qvbv47QM3YC*C;MBcLcia+-%8iB_24~Ohf4Ws}eTR-;U?(?to=!(jZ!!!i z#sMHnik{!fZxYbl@>SQ@R4N5bkTkNaXwla6g<4}kc3s6_4;deoSy(D3pt7898J|QI zXQHFie@#*d?&`<$y8K1UF;Qm)FZ;JxA+0DeDv^*Ur2(6^_VTKsKkBfm32|R9VfAnW 
zZw=3BvPm|WFh)-ZKZ;C(7``ijdtJilIfu^??Zi=Ckg-J!%^NkJD$B~jr6l5$Q4MDF zV;ww!;#Lp~J^^_UmyHJfHU*wb2Ct2OXj3a4RZlO1pw(p{R;R5InT&yMNKlrk93f(v zSlxb9DovG9gb6u_GMG<)BaZ0pa@c6Hf8flyj5S`x5os|YKcj9e4sMG?{?8-te}e9n zUsu?ghD}?^C<U&?mF!X91k8PN5T?Nr&T}PMg`l!F9*@c0rtAZ}$vynP_|Rl{XgTAN zL~U$blIu^=KZu+TTPM5&-oG;i?cWlN%Vu75gU)?lt$z5v?)<|$*ZaEHGkTfu6;%57 z@6Afa_m!U=44`5-n~Mtamkn$;*H^ly*;h07-vDCJ@ST}HDio@N-)^JXJ#m70%vd3) z5q&&;e;V8{0P5VfCH3GWcZJPoMYTOjU|B;XT0`M}u9QHzdjOdb|1GDL3(@vt@jZ6q zmP{f_fRM2$tzmvpLX%mej8vP?23E2TB+geTA_&vJtSiSBvHrl7wK%Iq+M-98NEi=F zo@z0ZcOAp_QU5-2`-SlGjZpZb6PIgJfYn^6J<Y;tl1X9zP+)$&g0Wi6E?EsLN8X$G z2fvTFB9=dpk=6kol_i4cz}u-{oRcH0oM;i+dGyWkx?`sgSaqu2hgL|99KsD&(j0Hb zN2VIS>)2&HoL799$g095B-B6q5-@DLiP!2}8{+6j7(Eg@y%#)u$+6Cuj6rl*BzhBJ z0}+9xBW`gy67Ii%6JVRyHg^#OH_8mDPYI|;lSu=D#s~;vs*Hu@K*{OlV<|f>u<2F- zn}o|wMYdsZz4Y(IieM_~!kILF954zOB8~O@_6{99T(48gMfm!yk)qxV*XE(IstR=- z;zFAr#h6*PP>D2^ZsJEw{+i#E^&(wVHU%cv|L15-EG2F<tprbie2uqt)Mk6rSX}-5 zlm!$1sU^j`Xmq6E5LVJ?UpR=?D)W@|p}E$~XdE(~OI(~a6HMGANeIsYjPhTcshWq^ z>lCSDp0{)k->$~wEEB#Mb~)OV=Ca>~Aai3;->MUb1mq#N#HOfj^9Rzklbld-o0Tdd z$8jDbhstSuWspU<BC?KM`lzq((xU;G+~**ix>`{c9#^o=H+=wDZkmtqp=5wbo|TS^ zkf;RAYut>Qvx;K>dw=KUQ5wzRxOi^R7-I4S91=zCmhYK+OHOSt8t*8Ec3@Y(sLTAD z-!42olC{!q5vgB}4|_rZAMKpd4QaqY$TgM?b0i28J#MyQs`Ls|gfc0seai`6u($%y z8y$$rMG3}5#OFK2NHoeh@#A8rDgt`#Pne<7Dj~m9)M=3)_xDGg$JNJ65*XpAWl$Zv zPo1EHCgCqNXw0LRpc{AeVD|%;FMnc=XA(Td>Jvr%XOX(xE8uhpn-efopC|9jg%NZR z!jPXcccGB5vBYtWE+W}}O+=H{o4&eGHX&-+2$;DlJNhb^6sKlN<AsSiU5i<vGpN4F z^^S3?no3RN2P9q1NqE<kzo7dRXTpkKTAU>}z>ZNdnxKsBa9$f!*F4}!U=I<>7DR<l z`J%O)zYeRtiIf0^3!*bJa=Z^rAgkG;&%D(5@y*vtm(-+P#Z~Hlu@9%BqFRmR9o+RD zz*0Xw?B-3M`on#oIRuF&G;e;^2{IupD(ZP!wbDUpiv=3ehwG=5b;aShmfq@MfaV3< zE+;BF$^7P+%D6-iKF!HWmN*)@?E^qNq2Nb(-ASoxV|AXb_@h~?8%|N~l;SSqOT`@M z7%^Q+bw?u=$`mqTp=Q*oyDQa;=rwdE61DR1r6j7Z`2xvr`hM4;(8PPkIFp|F<XF<> zQ9t&n=b@4JP)0b?D_H%J(V_{$3`sP)EM~bj!H6oNj*B?^w~FThhro9Gqjl})6FR6K z8+>Vs%T|WJ_=;Z6QaKRGzD*Kr{xj7JI8FAo+V*uc%~a*@a|M>uKBKZGlkNVGRz2~@ z+s~Os#e9aJh#HlsQ60)dhty_<4f%OD*3q75ABN^NdY>B)25Dj5;Fzm%kaFD5A&`m& z?#XPrN|`j8qNF4%cGJa!Q~Qocx8lXJz_(B#+r=kJN2D>r`SWHTuAJs9rSnW<53P`h zHN(L_<;`OcHOmWTW4D_hR<Lf=LzZKb;S{;3z4kY9In^UG(CIl}PPcNTA+lV{iuCR5 z$opf>aY!sG9o2_kZIdh}na^RKqsFXCF*8u==|UfXABMMND9VL&JsUKms?|E}_h!`G z&x^MP8vdI;Q+2w5;@#`UpZ8}4Z}(`8=0i-=jI7cP`e;J#d+<3;+*QOeq?~PW!NP{P z)tu0?$ox)PZjS?`<NHx=XCC~0^*X)O14fLh&Wn2eIp7OG6UQ>px=D+rStdp;Af{ID z0_;+R&8TVrI{p^%0i_w}QVt%?1f7m}N$M|}l}R#0k1_<~#CSx!)zjZ&2L|Z@pHLd0 zmrP3YQe*O@2d|-=jhPsrZ$C#KzuA7MApge6X$e%$o~z+U3XkGpHN;eX)~Q!_lsRg} z&iO?JK@PE9pIU)8uYy1hDEkkUrpzoiK7ORP^M1!}*hKXzxqQ&=o-@@i25b__yEPH< z@5o$__N2s3`q5~HF!@qBcz}#Zi#MB;GW0so5=m=Z|D}weGSu<Jp^RmsaQ=EX)lxDK ziMk1PSDLhF<6U0A>s7xWIzAIQweE`t4~T7#*^IS=$9#ykDkXwDdnN_zpAkbBs(^8- zMM~iLDCcB@9lFJr$uIFojL}QX%AiQf<=#Q0OcCi5myS-(EqDie=HDk8<#5CnwY@`h zJuBGnufuO|E2~^m2^$q&xSy|r5S9trn<Z-AD}B~aI#(~`@le#@qM?Z1$k$K-E;xQO zx=Ct`kP}g9l^<^d19~-}_kK{-#Pxjg#yT0Xipn61Ss(uhG;Z}Bo-rl5clk^vRldLI zj%6iwewSz_PnkB9zdu@8ja|0&gO1^OxI*sPKL^!^S0hRP=l=4{UvWVj((^&()!yF& zBKFv_xAQ{!u~E|ptprJKR<BB<rbk^EX{?Q`Bu;XV4KCz1G}#2e8EesooqgLZ5oYjn z|3E^QAZ>NCnRI<@mix}0C`<;s+w<6c5IyebYGEAd-0o#PLExfGjFSR%3o+ZaG0TEb z2T`H&P_q%MX$e{@fihxTRfhG7tVp5-uIFjVS7eR*F`VcD?q%Kc^tr{T;RK;t6hO@H zJ9_2*ICxU*!u#hQu1`uamW$VGn3o$<=nF!+ATz$GT;y%hV24Rt{V^u4E}cm=D5Xp< zcI!64)At)Tf(0h4=IQc+Ui}J?pf!(dhcQAi)8`VUl2xR+>CoTKuQ~hKv1c;o_YZ+A z=5CzM9V8nspr7%1@Dtj^dQck&M_vy0ZS~gYXR(|_M3Yod;t#h~ui>&?13`4qePbmd zS!7F~B=m7Tc(^Oo9VH1E>@q%#4=#EsK<+yee_96`PB+0)_yPog|FPBTiME1yzI!O( zmLbw?QVFq(y3E)I3k`bCEd=X|6!To*x*`mjk1V6%Qui9=i1Ic$E<=gGQg7#tY?T_u zmgC9KtBs;a#{xaW&U~Y%V2U7ZHx2M%3PI?lhMad4@rfWcwddCg+QAcc%+6l`JJOVj 
zBP$BN@Br07UC6DkZFw^s(HEkZ%!<ehM@d<fG*$_*54Cs`t^`{e;QdpI`ndVN=PGVN zU}y|Cu05&}b(ZYsCmFp@OnFIDjjI4nsifGx@0BDQ#k4e7q?mU<OwBipFt!SYGLX3w zJ|3LiIgss@VyqD5kuLGFO)f0UW+X$rIcg@&)M%22A)uIKU@A$9>;spSDvvu72k94% z?Z3QcZ36V*kBKM82ECUrt~Wl!U6jQwAlSgfk|;3h(;FbozDEX`mb@A*6V#6I>{Z0b zDxMT>>jaFlmisRs4q(?=D!d{|1Lp5Fre|Q`X?^fo$X~k^FvtGlNyq`NXrWAQh3+b5 z=Vh4-GIDha0OY%*L&fbEOZYiJQ&s@}jS2jP^QI3(QPN?2a79E%6ntlOQdGY_*%-bl zVT6WK9y_@v+MDV^-}w0>tBp{<QFfBzn5ALl2IgYPygqic<)@RBiQ0*>B%KCx0s(X< z`UN4SSx9vZJiiVXkjZQAJ@}J~5d%t$9i`WK@Da|8biyAp;Qlabo6|aGW8;f9JgP0h z?fG1S-Sdh&fiAGNaTU+`-pY-^U<U;Gf9Nh&F8PnF<}PVZ|6OnR;{I!jz2x8heUMQV zb|zNHKQn1$_*LLkqEK9L6Q1U`)Wq$61{k`9ypL6>UnM}j;;<p+WoH&LRGp1b9^&hY zfjO(eJZ`)8J~U}6d*kSKIyH?Yrw~DkVlZ=94~Q0EP{R3{9S<WCR}Nl4PJ7yFkgM$9 zD0O$DIE|{BDY4-oOUlP*W?{CkDA;%@Ii<L2frpxq)StqfrZ4e;C)?{-X%Ch(JW9mr z7>-$*$JV?F{a4f>yi^S11VJJotNck_hA}9OMSvrp$Cq9a7Xz+ITL40SkJRlixwR$1 z%@Bq7500Q24jchVi7Dh40Y+_N1v{K>0WjkNmDlYwUCO+?$4Z8a{Ace6$Th8mTj)b* z$eR#<nA)^(t}2D#1VDPHy;MV$stXZMlnW%EdTWKI3Y?NC34(Qk+F{+3G&-)Ej994+ ztIc=F#URg03Hmg>DSdm@Z>Z5aLt()@&IwWx((&#d*?tACrR5CJ)Y9H8trnn%&)K3% z{P9AFzs^_iv!D@n+0K$+Qwu+4wSlu{F#B|pJp@K-cdE7XzxRo4<m&--V=_N|^pM4q z<$(C=U@{y*Oj7UM8r(4D2=npYnKRubN}v~CPyW(={mSQ5Ou2wn(w~KgdP!$bHHAc! z$tWZwqE0A~tJv^Z9NY0hDG^mErjXtTas(`_1)TUEaYDK?DFG;#VT9)lUSk~Fv^~v> z#!Qt?Z4->)%$sOc9!0~S2+O5BV)juYb#Wp%a#e2R=zJ^SNz`m^KpX7>(I*=OD$)`j zY4YDi`48-_ia&&^7}4V2EGEND0KcUFbfZhJn=MELGQA52!!3#hE(az)cb|Q7-e*co z`MYIAbTao0i#Q|;xCpNMb@&JSNl}k*%d;D4+DnitZv`}27Qm`3=Fbjpr+8|)O7?;1 zQ`SvZkp+MzX16$x56-XdOu&za7Rkk>36g3nlN*kJnQGV}fv13AzbvAnn7>dyL>B4$ zx4^Y~lS0@0g*+AhMqx@Qmi?8DnpC72=k3iZf#@*Bv2Ic8Qt*=$Fi->w5Z13Ist<$V z-M1(4;FPG3$vl}I5uFUIjx_wV{OK)6U8D{YAY1e|zBSKt2oB145|xa8wF)ONAO`+u z<=E;MGsz7AQo6=I+WrHg_;fU0UEvXdm962CsL*@qRkaR{1;>;~A{Dc0Q(U2A)V#20 z5uq*l{qfaS2{o_&%y&vP<YdX47`A)<1vO6wv+7XOT43h_ZzJLsf7bsT$@Mu@?>1WB zWXVw4^;vIB<B}M+QpGi|HYj@Mg{u0)&%*-Zmnns_k=+;rN-H%HQ`nk&VU=SVP=-zW zPq~<~rJ_<6$WP@lF%vwCPmqI&KCF7|uIp-5njq}U2xb1j{5KgB3dP~-rL{4w`d$F| zW`DV<2JS+c*u~U3tCF$OF~riGrU%D}kB|;wNMV4a5N72_Y+N_I2E9VQmo-gVh?avA zbNW%)crc8Ka=)mw8Y16w`iC0P`)DsIk*Fqo5W5L0Kq$}ie2RQT|L>31^>CR#c)O+D zvGOfU`dBPh(&<6J?;t>y;%=5yj7cfcSH_wN{Xt=j%1MnsC6Y(!Xm!2F7LW2SWGO1k zY)kTBMU&w1P+_uAp}>)0kKUu4AR16p5_M{OXi2!(7nBg#;@VSDA1zV)(+=Z56f7mc zsKhYxroK_TKTYAv)ut({#Kip8Ivm_kA91SIdQtBx<7&u%;tCBB2>kdnYjIYdE-Xwg z<Ylr9V<cMlRH*9sK3XM68~%s5uo@l3!FEZ7&q>6^u7nasVi1trj~2*ga{dR$5$@zC zC%Z=uWr^w_f#2hk%VlU8_{uH`b-1K=(Gm44;n0J8uqy<+JGmFhasX0LRMRZNtmaLx zc}NrB2rn@vRB_IDKtuOm+kM~^z6wh0@8NM#g?gM~9bOBmhJNj;V$z7whkq>L%^fJi zxp)_3?W^7{W*`*XX&ingYwh8!SY&&g%MIu$Kb?IDYSK}Rk<<sFIDcVU9@_oAgP_V) zb74*Pe|(dR?J`T<D#kL;PtO@#=+)xOn|a%k+7M&o%mp_6o-k6YAZ0G~;uwq^{X<H< ztFnwyDrwj!Fwvc*Cyg{$k1<drMkBqF84#aDPa4pc3;wfSARPYt7oojkw;)|s)s>*? 
z6VqykOfK~t$h36v&gq%~rAue>&v_b;rjQBx0K8ZVq%L@P5lt?&g93eR<4I+Qk=z=~ z#>_l?k&~thdp|X}1zF%vz?iE3qJcGcGIlnxn50RH-nkLh{-0x;^8+9MvhvZSpop)l zr*o*c)O9^k)=fI4L_X-z*+D3~=Rb+|FnhnzvvAtW{{pzMM1gVBEkC1lc`_%_UaTJO zm^Bx|%z(!mVwy~<nsFJ5!Csbd6noyt$${Cm_<fZp;z53OoOs(e?W@tk`(Ar_TLMu~ z*Gd#`LwD#PBxopO&};K4ZbD^YW0GV(rYV64<n$3jO>vr@p)Kmf^*n$gkt4-C0`wwx zushr^-`x63wN|SV2#Y$g6vJ@z1ZqsOm_D0=r$&Ede#GB}H*jE>YqOcdYSJV6MTBe9 zOJUreZ(EJAz_)E}Kqr6erG6ZvB99vr9jt$T=w+PB!l<Bl5E>4Ccv0-S#0nlgZnPa+ zI2jpkd5HyX!(aEE#>a<OkcWEMKBJ{C<(xno(B#F!)i=`VDU2}Cc|2n#0s{$Uk|2q3 z{Th;89g}Jn7X?np+$rCy!(M5lM?pdoB6-YNzmc?u8AjlkR$!H@4a4x)=Bv+FqNnEm z(^6md=(!OIXg5@mzOhab5Ar3811Orz8sQj(om*H^BXMG4H8}*i;L2bSmp0F_Pg4#b zF_$Y_d2JZKT%ka4C%2g-J{2q>+|?M2x~xUSG&7V)M3mG4ir<?3`IF3?@$IKtB7mUC zBtvQ{mGdk|E6z9|IQ}#YJLCn9W<t<9^aaY^f=T|boy!IJgd)&Is?3>O1wnCzHNkns zLLqkf-b`raxdni!42loDDbY9?II+x4I3s~)bMVse<ImhFD;1br)c95W3{DGWt)_Bm zVqqY7N~9JSglM;+{QbR%)KF5!fZ#u`!iLz_KOAo%f=514$mq^q#f?^h(EJcf^ERVW zO*&VfdIsi@=V7!+KpU#5qRdnhM!&HHsr%rKx(Hx~+yimL%7}WGXvDw^D|n}Tg_!#g zeQ8Sx%Fhe=hbc`6hNb@OUF0!tKunV&w4iB?wEcmnOH=n>M%3okweQeMAl{&5-bRlh zl#LJ0Vx9tKQzNGx1`<d(jpG^!eFO#q1k9?qCk8U@ii{YahauGGeV+)XVM5eAZibe7 z;kdR3))Z38w$>q|>o_ES-nHoUyLT)t-l-WX8&(6JJL=pFdyNCcW`q!Sl_a$l=D@M) zZhZ6}x<wv!Jj{YJNC$k^BPTKuZ$um|&o)HRXp)a;ys<t7ZH1On1vD%vicK$gd{R0= z7)-jpg&yr1`AzNvABH;w@{@Xl?#pM{^{l>B_vMBP98#*xLer=~P)4M1%(vYN<Z83X zV%N+gAR8#*0D&a`B-lFfEf?)5QQ$wapx6W<f&w#)9^BN)FK#v{+k`qSg5I*cF!T;z zP<S3Jo`2U31+}GQvE`J6b>zqpHQJ&MgJ+yzBC%c1d~WT(O{S|V!Vvitwe+r^+b6_9 zGYlq>ckLJ@5w}K}Zc_IYMtuTq)mnXu;4gmu+Ia6#sahGn8Edg9KI}?Jfi1y+>2~nf z(M?;d4ew^bZk!&PTk%10)Bz_tmLicJX;$neJo*HXSvrk80P2hURHH*Ph?60S6D7-Q zrZixnAX^wPsytbUN~Auax9XqgGcM2v5UjkNs^H{T3O6&KOH-rbsM;TW(a-hhSq+?T zTFP9y`^E7TZykTT*bIr_Z-?G|HIXy(2l`bn@e=rbX^^kwI1q+o;jU;}Gqu06)npKN zfD&<D*}3-o!_?aeL|gkc>h@L}0)|9<El7i>`O1`4;1^FIN7vSeY>Ytg@D~9IEuQhk zqa5Y!)lP|oi`dB7#&6L?R5WPXpuixRfU=cWXmxZ(y^HuM&s2Ab@da7lR?=fJ!4a#7 z0Ta>zJ8?63w@H>ba)^`Vj3d2b0o5n8`SXPFxE|7EqC75^;7VL9Oh+3<mo*}f`PC<2 zVi}YTl9)XIPg*6;1GeTD>?j$w&<}sVX*Pi=51$;S&{-6BPsMYDg>*0>!J+z91jvR_ z5VQC|?lPaLhrk?ee!fjh<*YZo_ciuHQz{yy64<FA*hY$Yoq1wqFr_&hE687xc0Ria zU{p*~9!1um-ib5PEFez@3@}P&_ms*LJv`4W{Z9N%>FD8++TzB`8pS!^7h7Yw)Ku^Q z4L*kk7LOD;58R~3n-uvMqz_mx={jqwf5DlJhfPJeE<56l!%c-*GZw7oYf{bk)8p=7 z`)MUZvZ0;<{#pjc`<6_B2M^@)DR}ZHcO5md1z)0$r~(`)!v1|RVk!-qKOEZZ5NGmz z<TNs}L&dHt#xx#}42^+5uFRT#XCo_-JbUE>W^p}qp$qv7@x({iK4abs8APIFaVWq7 zbQFO9`^7WnoeV!%1TGb!mp<B_n^p*I1Vs;HK~WIL0fSa5^Yt6Ik^itzA7Nl|umG1X z9R>ovoDOBeO4a@k``~?_gY2!E+alhO7$Gc8N?$`(dA?9Pz9SaR%@1%+B1Q<f%&bya z8)_V(e8<sWSbTAu+$;sr6Mevc0Y=#!@d{OXg%P|T8724{*iygD!><)wVtU%!#or2^ zjiU5qZYz!yhfq14pjU|k8Npz%0<n$DHnXa6C!;#FS~~b|?^Q{Mv2I(&Dtu8C$jd3y zg>j1EC4Utj(3T_HX*JTNVx~HlX-Fl+krQ@V7H3e0HG~WO`!qQYJx(xDuFh_l4KP;W z`C!oE9J4yisR|1xv-j{%n<3J-Tn31;T{)be2nBS+F8$KS8S#<(vVlR$Uq-$im#1y6 z#!w(@!*SF?-{6r`>T=Q5@;hJ8nRxrQk$_bLHYe+>1Y}->_Hnk@BB?5+$)i8Mq0LeI z8^k8R))D@BI`_nIioN_Rzuc_H5Avs8Z^UEme{0$nT#loZecH)*MunK<YLz^G(TMS% zGcR&?H1jnJ^LnKByE+XL`e$Sw)Ito1zWIV5x%sB?BO=7I3@ZFg6mx5{v30dGu%n2i zpP<560?7a=9}ia!UtAvB(mA(qmj@V|Hr8!uLePLtLEuM{^B|WqxLYk<vZ%8kOFF=5 z?fWu_=Xx>iGL?B*)<l%v70@V<*+*F<C>Xj&tLf8uzkZ|1sy;(F2%fOou@9Nk=UX8% zbFAto_@*LYbt4+~zKD$j+dK)@G7tA_cIIT3wfWV^nTKhtaU;Sa`;IHs4rmgr=)1(G z(KncSlYR|HWU9BD7hR+&NY<>^bdsm;2I7P!ghW}w*3gC1AQ&!HmrAH+etEW(ucw>a zu=Y`*ks<(eWA`{Rqw_9CBP!>pdSG)~Sov>AJq<f%P`1=Rfo?3l6o5gocDsfkwq`-U zK?8vt;aOKYsoq}RU#lB`ln$>4^sZ~ih;skL3Ra11!$$q01rfnbU*Jo!8bt4}`P(hA zcG1dL`JITinp)gwPP$p+Unk|?EQuGcSp32M=0>|V4ek{3XL3038|G;sz!(H7woQPS zZlCfAE^i#KkeeIFS6~%htp_8pt0^wvo68pNSfPzyOaxgOUTC!PUKqPDcVXoLtV@p$ z63)cN*wM+s#NhXT)@%(e;n)C-fd4E3000{&7YFPA%31zL&dd9sT|iMcCvjyb11A%} 
ze+&L2GhqDQ004yr0Zag(k->M#$M>Jg|6KybY;BzWuiB>cjP#5Ej_=bd{io9R1T>{* z`WA8iuju=@@A-f9;`;yE2LM&v?Mwhbc|%KOr{Djjjf{znnUgtyiSd6A1<C=K{%enz zg@dCLfc1aYt({C906;P8Z@ooKjBJff{?ow!^xrLuMVWCo;wa)T7zh+6sLua8H2-z_ z{}~%5_WxCkiTVH1?0*&c&uINuw}4_6|8JxJ>Fa;jWCF0Tb1*Xg|I|q8gw4Vj>f+Q@ z^`72ZaNODAd7i$SyqNwDYp;?a<Cti-;>0G<i~)fgzp9|7Huwo^X>8Q_2rIeh_7|FT zH-M=IHX}{qkYd>UE^3hf)?%Ns^COJQ*zBJw8%vn`_5Huje?qRR>gvxb>fYCdAP^uY zeT@2k4M!2G;MZHN$F?B9ZdE~M@7T0!)PAh(<Ur})(|`o&)#7XT4k72l>{8Vbg8SKO z_E?;^v;5O-$LrbC1Y!Ri2?5q>y}wV7a)7M*K(#dg?nS)Ib@m4FDu&e)Xa8mY=*+K< zh`e{M>*Z~@aKu*Wmv4Xy!h_-Uxr|v^gTJfyw<of8?0&p=HWfO8_-hrSw~i2Gg3d3G z7-V_|lvfMXi9d8V^5oqQ8qDJrWOr5Z;WH3n7QV;<1}t{e5qR(p!^jBhWJlJC(W*RO znvI@~EosKxLB%ZyyHecvvY4P+UBMP1KRXl92*={0cF=1kU~roX22=Io**M8pm-IU1 z2s5S8B;)vK(GZ38Y(a^xb<q$}RL7tjN0`m)IBO^vY5x})nj&0n+8iqraYd;~y`tFL zrEBHF12|t$UbJ#?>R7Viv!FR|#BIyEe-Fm<+6?)4Z1;B|uxKIR4c_i=Zbs;XP99aQ zW?VsR>B&L))7;oXhCgeQ^dXbiXNoCcruls=^7fIq3FUM>=EPfMd(Ni#x^CW8BYAEt zw&Gf$p$NJAOYo6Uip(9l`>WX$po6K>;j6|H_e?D(U*f#Hsdxk~o8D+%FZO1PHIOru ziG8gxNvGRCV_`3zi6CT#PUSc6X2F{{mb6WqsytF0&U6ROQhv5`t8&((Zcpzl!yPg+ z&qO0Sv0}HDAY1G;bL-U1eS3bpfe1!Lr`0K9*D_g8D!G3vUSFi3)H&1-l$-mb^6uop zmaQwR=ht(93`~=0O>ID!S=d2NFx4XIvil2su<?9n?u7C!h;DVra2ZJMS?GYrdZ2F{ z7iBg)Z)*$^mqY%5_4B#ERg%O@9oBBJ>nXRIYe?<87>n(t^D@4P&;B@xTe-CF*ix?W z_2!$(#g=>cTpL4p7PQjqDc?+C?u+Lu3zK3#Ardu1W^bYjq;D<2#ls>GbUfnFNEBS5 zHsZsPqX=A69Cl*l{rR1aO1s{4v+wr&Alf&?GGy%8q9#Kr0uVW-)0D!ZTsL0ChsgOG z#MP+XV3XQC^Y>0_IV?kU*RMfOaGaXXJmtxl8#6uOcPRMf`-UQqg`*2Fn@F<`u(-7m z9nvA*j(U>PuK0X%xN6YBva{7wWs5=YtLtqT>WNYF4YA#RB2&T3xwuAeu*n-pi(XnG zdMw)5&3i8#)p%Gd^3V04?T53Li8|{37^E*DKm0$f;P8}-sOpR^62qpnf<2AFvbX%T zdHt9-130Pd)`4h_ei8+_zCH&%Vrwf~tBGcAou$SFgXWH`=`3OTL#>V5t^hXT*kF7& zK|9UMCH(4K4*1R+cdO?3rr|=a)ebeA?Ko|RK%-9`oqH(NQpp_`!QusWRp(PpJugXv z1F#?h!bVzp_y{Pu-)9!Ibk<KzPO2p-T3y^ECyyiavpGV!Rn3EVD>a=pl~(p6h{^QQ z7-rOH_L2rxz)o*KHRchXQgV&-LMszmy$@_lv$=oCQ-qOwCJ^mbRtEvt)g$?+K?<Ih zLc(b4khK<Up-J27m#$uDDiuCopCC57w1+;gIHxZVStvG-2RTeoObjvMC^rwO_b(+O zn;>5sNeIKC$+m`iR>@rM0CE|F(1$!5T#$V&X$qf>(5GAx9%6JY!WQ@2q)w0%BkLGI z+-h2q^=vIRM-e81XHFy4uVPT2qjKhD@yG6XQ9I+LgdeG7)wjO*DFnXFEf=r5VSTfc zBYfA-PEDJ?{~a{BW$=@%^zc&O^NQUrbT0;sI4+w|$)a`e>;t|P@`KJqZ6E-tx``HW zHWdJdchtx5$ve;~r2ZZi<Bm<oR>iSd(IM&^+1D`Y-iRJ7MJrvVH|g(X#=Vy1i~8n= zio+yw9x)u0Z%)2*bxAETtV=e3Q@%4w-Lz_G9G!hNH|E+@l)rkQd%AsL%6vs=bND4C z5pWpUn5{+CpZjx`cQYyj-fdefeJvxIt!N~}z|j}uP)5v%FP7?yj)JKTd(Q&}RPiqI zh%_vuU|=qux>8hLe9jI-jAEYVjWokwv&6{ZLZIvfC-}TM52$R7%Ic_y>Je<<9}nTC z<<|v<VJeepU(6G%Nj}`4uk!qoew)N9sKC6$6@Qjg@sE$*=*r!6Uqus-m@XPwHVK+Z zl+2q^wPt=3JPI|yVTV(fw&52B$uUOB+dx*ex6}s4;JHXwm|L@;g@9L|f{GuT{yq4b zDQx;j9SLtT?OPuPQ*E!%HsGBisu%HPvCs7l^b3<W3`1rQz3*svco-j6Xh4t=2AWBC zM#CkKe87R2uC#YHN;-wmx=$~H<n9nO94mBc$OY1u$J7Eks)(~37eTbekh=Y5k81Xc z4dc;`W&^!w@jxk7nvlU+H12I*kN9hv8hEo8qCpZXDd|Zfe;P1gQv1iCVZUM?Q-%sg z(mw8~6_bB}=b{yM>Q$@gG9Y;cI+6d=__SY%HH5j|-1J}0Z*1Ae36E4!W^Qii<z}be z_e>8{hV+Kr6xynsOorS!=KJ<<<Lw*|FN1r&`t^u{Y-~2TT${Fbhj)n~jk|P{;~u|m z^VH%DIIRO)Mqrft0bCsG2J^Bj2zj0JqcjRaGKa`S6yxZ#FC2j4!mnPRgxLD6Y9mf- zR<>zCrff_;vx;DtIAmcao_`JE$4gC^%W2Qc;!`!;xPzalJ7Dg?$MVz~1wonpaKTx7 zH83G)u&!O}7B*=>)eH*Gs^K+jpH4X4DE#J_L45bpg2*?js5TWI^z0zkZ_9<hH&m1y zmBeiF>y_&75W9A--oa3sWsw{K74yhoxP!Y4Hv=K9(UG)xjQW}@MA{8V34SE)o>Y*O zKXf&+hptQK1;L}&A?g#vE(7p_=_ySOPd(%7?8vQUfY6miE_CC-1i-cC?MZSlx-2$f zj2h5cF93DvwQ$>gFoQIFYtnIzsE-qzYc3msinoy7`vn3A0;8D{x$61bZWYR3w&AiR zdQEOsZrx40=U=*xNF2R#jl^8NS4os6iI`?oGJ2_0wn$iSi=1F(CCm+p(Fb^-Nff>b zV^TlCOon4nHN@Tex+5>>utpyqgZ}JqtI+!$AD&c&Qq6<Cz4TL)Xg%AWinU~8z48yk zr@?=xnJF?=x{2CtP`?$jQ>UW6+u+pA?=V>UHo*(1wF7C*vc^Y9iyG)yslzj(-2PAs zLs>EJ(@cq(NB}QSlFNC2tbih^e@T89(*&f1e@DOH*t`7=MQenA5hK>7pfe_+1P584 
z<B(iY9s&2dXPmmqGA?zTkCK>ZvFbyjlhk3x8lgkQ0hCXX;G|?{h!K6iV?=abs+ZL9 z@+3fD&R`z|;M5du8xF^%hs0`gn%qqkqi8UmFuH$G4d0mm6UvvT4B&1P{ekgj%tgeb zhc-w>3a%u`$R;LGAJlilnV+r!H5_)Z0LLgN!b7o5OdR$Npz1E{Stgv0pHXS2JGYGP zG%{c*&xnT{Y0s$x6TyNh%nFF<*UXSbE>MgO^@-KAoyA9}5Gw1zy$GEJgWe^2p)}s3 z_K=JmC$beu@%&B3&Kf2E{u=?5rx?l&=faSGRq+^J^%Ozb8^I6f3&g)w2@?ah54cRu zB=ta%yJPsM-RT3+^i~2<wk7r7awDTQI+GEom%`Bb)v6FW=tW0~v54ojU+LzrX{4B) z-`aLpdPL7Lc}8S?*D@2l@GezO+a58I?n&v>W)`!9NA%Hm6YTAZGyq-N@}}>!zqhyj z2U4Q7j8K#*O4l!#7~U1zuD4d*-~q-?)~5mz#%V2@7sE17%9%*QnaG^{T!K0;{zS6~ zR-G>oB%Ib(85Ks|$1{xA=uo#mj%HsNgdHIlDpO}6K9$D~9P3KC8i#ho%BjAjNuo9z zI?6s%Mt@iiHV2VEr1=VOQxOb$?FadCGrW;1VNvn|Mr)K<Y9rLIL+G1Y!|VVw)*X1V z8m1Ks?1O%{J1+~DoFpxAdrP8rq_pKfkL`I!XU>luP`}7(x>^hMWuSsz5gBn04-7mv zS+}s7W(I}P+<y9@Ce}Y?RFo;FWe_v<>ylk?AmNBdc}AB;XWjbU_qJF$!QER`*ramL z2to}m!oVMtV-Qq;_i@3tQwj{6|9s_2R-(CN3*Fof!qjT(;LGebmI9#Nx-ghFg&w;V z-+<)v+ux9O1av{)?EdSXofg2hyDwuehr$X28-$M!B~WWn5>cjpL#jhy86RW=pILK< zx+SR4JqKw54;hNOpmrnynGA-;fs|e8OBVn~>4L}cqV@}BX>J<sLu*Oj=tFd*3uWwr z${Nl`MS61hBsd$Hk~o0kgy0Mo-c)QtVZ7;KrK-E3a%)Q1nJS$6Kq9(Lg_&R&T5u|f zs<-&CPX9nuDjH9M9_h*nhpolO#&K|ouyBpoVj$#EKOIo<hDrH73wrA-gEL)YO7wYw z_yGQLUz|)!n+goIRX@G&AB#<+*o2ye!zKXrpo7AV<ZiIb9^QA_qWiQKRtJACL844l zPSEP+|7ZGT{AKF_dbsDb6?60FVIiI@zVLKq;W~6D_KWf}G$gig*xk?pWBb*Ml*C`r z>FgC<Gwko;2T4d`)+`s?GXb1=xeOm8oEa}LJs})>4@w`J`{AE?h^~o-AlbNX0W)*k z@v~702b{Pa52~AZ#<O8Lb_=l<%>_yvV}wubX*2I;Ch^2KK27j~?yNO%BW@~@2QKi2 zOrcm}tF-8DxAgDr-7c;}KOR}gpWDS31&dDJm|KG^rGh_BlsF688r_13m8fhL4ZPTF zBZeL(Y?F0Ul?Jl`Ruv@OcxTQQd>wcpGX6^%NO=vEuVjXlB)<axk}#%8t*w_CAJ@Ul zIrR5i>#PpU6P4&dR>7nFX8e^FQo+0z5_4?x`ZNz8YeP)b+Hp6cW5rxCV%0T``TLiv z?<&63mTUZcquEvTC$g>n@pb$2UC!TU-gSqUZ@gAME*2tlv^F$bHz#HJl_UW}4+Y;- z`k&{nyox$I)FJ=)!2ZQ8J=pB-D@M>TA1i>=74)KF?C$<U#sGnUbc+D9Mucp2r1oQk zD`JI_wqw}@4P`>#(=hh`i4uRll~!Mu2^%@KTax@@L->WxOLUFH2^$CX)(5i+uL0Y` z0Pg_`-wV<#qD!h2_0kjphe5?C_5nl%GlYfi2Ya1)@_vG9JihqB9^0)Z3G{azcvn-w z3rE@vmj^q7gqrm`T51!7AcOb|)e!CgmUu?_pyQ0bU$ed$74z+q!`~g39EWfUf{oi~ zy=xpOYMDNyR7rnuSkVm5bfWsUm45#iY{fK0aC|JwTq68Y$ZkE{xHTiJOV;HUl@5%< zzCr9nOuT8Szw+#)>r$_|jj5Q96kS>gJ3YZ6xpqb}RHBAfJWcOB)>l`rX7=`IE^uGp zcpbt+2JJ}|KNJU5ScK{m<GWU#lFwN@B{e`d;i43Ii412Zxie86KB$pqyW&2u?;zW| z$3d<pf?h0e?ySI4Yl}O=4e$LL6)+}m;BesXr-|Q&nIEMQ!jZDF%1Ke?t}t=@qZLxv zv^Y;>yPb`&e#DWMAn5L+5$#z*E9hJ@XVetED^CM9tB+yb;3ER<EDFW%pfBc+kx{Io z&yE*O8B4RjtR%`|qWzoj6_w+{dBEq{&vUSA#e03K4LNAWNc%vfh%0b{WW1}cB9dSB zEh}8H0%Ss?iSkz!rm}y=u-SX=Zz0AtPG?IaqvOR|H?0zN(@BZ<vTQ1Y5SJ?khr-5s zdrQ%5g%pg>q=`P&XLg!nYYzCjtgg7VSV;@NHQmG}^aAuqI5BuATK$sX^bGyU7`h`( zh-MTJG(J@7x;1;?-KAP3=89h*b~Btn98CF8%h*DZL)Zhl2G~|n2ra+6|L9-q$UXrW z+WLkR=>w-eCKsh%FCbeA7hlNVB|JJPf&rY4XYddN!2xqwXvI&__>V9wk~(tOh&nxe zzgZvxyeFhuYvw0bqy_pb4X7_36XOs}*xF@2;7d+vNH5Yq){CBDQf}UuJv2ZSWRQDH z1M{oQRGm+ap;YTs<5r@a_cC6+eImza*48(3k2ivt_VcSi`5s%?Smh1({@&0W3afRD z;>gZ@y|a!f`}vk=i+Q7XI=SZW6{apIGq>(+xY#sP>`w62p(~zkstuwS{?7wGV^8=l ztJh{Hs879N%4c|up<cZf>F0<WqX`Q^<9z*gNT&9}L@H2zccsjolFaC%6Az)tS<)>_ z)*zQP7k8qjpCz6${6C21FIWD-d{S{+@D{d2QbE83p@ZR(Q>1sG@>;w@>aI?&3of4m zgqz1i-HVMvjgZxtG(!iT;j#8HdF^|3jl=r}#3Edq+8p91Zz=dB)I{8bohZFTBhhVK zuI#Q{n5S7>-s@l*_rt;(_HAyx<KqV_8k+QYY3}wHOrhajP+JW%Sh=j0C%a6c(kL1Y zP31E4)1yQ}kLP*GI13{iP8WA)zkO%`RRTuQRK`*{$pg3O5EzN#kh&itbR3z4b(`W4 zVn5IkF==FTnS)PiSyBiO>j^@j-l=8tExGHj2|&TV*o0fn7c(L$y-<GN2ozAOzzaPi zy;P`opLMM#cY74q>Ykn?n=WG#CX}8iR*9nX)q#L0>(#wJJU*_ujqh#fh?w<F>P#CP z##SV4aoqHTSgl<B^09Ipy-kEB`gh<<=yTE>o_!-JcUNVh?Z%SvHoT->+cw>O?uyd= zE>8XNa2jJ0PqQBF_QOmr$B=6>RVWYFdFh5HCI3%4`8B1+RAiE<!U8;UlAp4?RyI`d zZg}$9s@R#M{mR36kCj-3t+H;(OZXiDAVkUw6J{(3QIcKOOiW|1x4l5l+|saY2=kow zKqI#1oL0m(WtiHVU*707Au1Ov!rx%D@XH@$5$qEb0|xiT3PCQSO0I(T09C13ET0?U 
z76nz5D^YHq@8N*acc10-$61Nxq%nfy_pR0ws)&71S1Iv`yMr3nSH@)~$0xje-mk*F zliTLs<=>G_IDIh@WdcFZk|hW@u>J-@bc`GlEq{kFr~yfq>F|m13I76p;)(WT>DM@N zf}?>u@`XT0#6dT6{_%6Xc%1u_(j>D1`a&@WD+Z>P4bQNE-X@Cy`Zo)qf2(6VKTaBM ze=P1QC3JSP0=l&&98(AiW0_cm;Y5tjah+psA}p79*68bqF;IR?l3V`uJqwJV>}O$x zLaEZ$0lKhBfiyZ(>7pb`fwA)v4q_RWO3>r5QWZ|@@L&;`5*;5!P#GA08zrOyqpadk z24`qp6dcsvtbV$mRZ?vO1wwI+kWSZlcB#AeLfAK}6VshIOUUt#>)kc~=DLyM=5`XS zR%ZRvr4_r1T<PmHTe?Z3+Qm+j;pYWBpXZ}0XX$&nZ>ID^-OTno<x-zs==CH4^^^6> zt-Oz@@xmCtx@n`sUy^+TJ}?w4M%mteKlUo-YO~|Xp{%FcY*l;0gElP%&DX}~RXdX> z(*EldgRZ|^RV}T8U|CCewR6Ac^sKripKTtDK12~Vr>%`zxNM#4vke1|)9R@9_Ha## zVMME33DgW3u^M|7Uz(4%G+^<AKkl*6Q!MkL`ezHqpn&F!c=5}tJFP9Z#$+2OTyUkJ z#BHk0UdF#Oeh>axp<LyaEch)@@jHv&RX(dd>;!-7z-iO13&s(bxZ*AU*U%3K0@fQo zB7X;BomNU;6)NZtLH_+}MP<4Qc%w_9yK1Nk+!fNdRm+pq<RuNSJ}M#WX&M5REk7g{ zGL&l!MV5ZV+cY+y__{+}m4@sRz$vutI_28S)5!Sn+_kg*D}>`p<MFd2|L-+E)pp$R z<n)ww`+4Ptk#rlPqVi0OPm|Tu*r)u?>jTBiK(7I3OM|uH(RA=<PY2`uD)`KYa()98 zAB*_S=D9!Yhm<s@>@1jRIxBpkrhXwRH$<4*Bq&#vBMu^EFP<!2Y`i<h8=EG|WdI-o zf%ZlWHArUmfz%7a>KLk3oW(wFC7c)NP&GH&4RdTTin7N&mA9aY(;B)q0x-q+l64$F zONr>}1$(FWq^F{f3SNPrcAFI6(`hc|3`~cad*}Z4FZtZ%i$oz58<tpxB~XTNSBug< zB3{WTfBLD=W5FwCPH0%-2gKB|S*yY`hQf~$aDBWCBG0m^^XZ27vbd@e1)PVZXHIX2 zcXJ6|9>YwI+`NG0pu>vv&Vg5ofT=)pP7A^%j`2w8iyuQW(T6l?_*&au7)M2GN;bMv ztRlGhZE2A2`%%4daT@$}?u3$Ee3s2O0gVbGLqr`U8{M_QD)7<k#eQJwq&E%6Gqj1F z2y{zsu7M)i?!a50YCU4z*McxaT&CJjq~lTpkKL}%aA1Iwjean<?wm_W-6+qnY$k(~ zy;MrhY}PCvmXZL?@I79dcqeY#+pGODd_=bC++VHQ*WXy`b7EJLahar>oGKLO5BRB< z(S5u$-Yx$l^!oFY*0<8@VY90;Bqw+%mZ<wjrEXo_9n6xfyb<UAZ5CJGXNHdBvk}vK zQ6{9#RP-Oe4G?DemcEHyr@TN<LGWc%Nu9w*MxU2!w?`nj+$Pwr9xKaWmP{xh)P!MB zU*rq_icdaq$~4lH-k6?;jtoahBXYzULv*)%{h@@rqxe=?Y3U<qg;;CGJ%Wh<<*-3j zG9O<0TtmJV{=9V0YiZeC67o^r{w%nR!-M3#(AMXVqq!7{Xd(4%1LEh%&uA$|k#2jp zJJ&Q@TL|S@B|4q$jUY@tcxJ}leoBD^>BnB8qDGgf{<Zb+KQzB}hev9hY9Fc|Y94O0 z!dU``>ye%;2bM}Ir?lz7`}}7GOaUFzol|uSO$%O;CT3@nPh(l+nGPkL<tU?&-=sKD zrV1>T5XlP6Ygw64rTJ)?kH&#(C^>H*kil&wZPAek(lXM;6k`<S0x%(QK<pnc>sW2r z?pW^cTEH=#gq|Wh$&EV~79&RT-%xtY)(YGul#PVKcX7Yf4?*|*Cj-~0Sd7Be@${sl zowKg$^vLCI{ijA-kh@_~)$U1nBZ_+%rc~U@Acaqk;($23cmR6$8DSR$n{2VhEpyh= zsAyC6l0r!05Ok26a4gEKy7Spa1$g#D5(ERopKt>iN%h^<qSxB_hFfOdJL4MSbJN&D zGf>KC%O?_{vpK9@T%X4e#u<`Uj&NFUvR-|DO$pN_BlFg8o_6fi{cE#2+F#5m+#MsK z(@{qGj^>6i(wM5eg(+uvecKjtrITsAt^N;V>Gssoxwxgx%t=u8U6#BN#*o^irsng1 z1?JHv`oa)%gXp>!Iq)5O^{u6biiZiB(h{Q;xw60wLuLdme-_w`JBk(I+mf@x(Vbeh zvxPX9JkDoeXkU>{r(@tin9xmIiUn>c88@aFA`wIQzvr>ZXt0(%GnCSXmE~wAvjx|& zVT-6@p~Q+BI-_IsnH{B<{qlKAK-80!eSdj(b^3huRKCwA_V@`(g7?V4(`50!vca>T zJknjid+5sXJA41G!o$XhqS^7fn1>-J<$S&SD#DY&=fs;(9nN&OXXo?r>;C{rK(@bI zmbb2J3jXlwI~U#ki-ISVsoI`Qm@vM;>;nmSsIHMV(l`P><g&)8H9hVQ!CHva`4L5- zdN@<49?pa-f6}Z(QVr37rx0~NhR%77LQrDdPS1;2h;@S)@d5S3H^+CxnRr5Q8yNwV zj+4kCO!|omSggkl5jWo$v6o24r1QN$Ve+~des;5NhYsoR3*0;&DHTbrRBR<<Bn~K^ z4B`*e=0Peikb9v}WAbx9B7Q%}l#U-SV+pnD@b+n&6{J{NQL9Bmn&l=|&>G>&Mi@#4 z-EkU+b6YqbSEk2;@tHtxXtdfD39{~CV!Duw(AdXdt=R3-ujHY|Lp~j9gpI%&357xm z99BjY%%UY_zcQwrP&lPO`NWl9^v(>O8mt;Er17Egvb7o`Qv~N3FNQF*_4PqVRjwcs zVlA4b7X4BWeyQ6o7F@NpeQrJyUh0q?%~_l2!fBP%>;{jXGljjOxE?x~eLwrzjHY<o zEPL|O%De?}UnCN7(m0hhdoS=)BL>1W9<V8H0oHMv;B`kp8Vm5>=qO~;*SN3tUG2Xi zF_6C3FKl(c5ILIojqf-9Uq^UPTuLWm#b_};HIYp(k6#uaNDrqCuL0=wC;eUiUwMAx z<DN{wpG4ku{W|h){GG%fBfMV=2NI$=<P+5f81e~f*b?$N)G!DrO^ty>DcloA;jrLn zOgNnmBnqMoyplKT)w}~<&O48WuS_cyKpJZ4{b{rZ6PFX1<TOE^fFz_!6{Ln1*kCbJ zeauW7&0cDKd9*n_3a@=$UA4wdW128A?4F=!iCC6CEqBkrYHN&X{@6#0@xIz(S*b9I zmr5?y1DX7hq|5J)CSpk!Av}J3ihD4riTW^^sWEpfD9&57Rg;1ZYA8IFZ3rq;fvSW6 zq?8AwZlnsq5Q%z%<rE}~oU4a8RU?_g1+~>#1BAG<CRCE8hmZMV-T8?Zb4#LjU%Wd9 z|LgG2M}G6;mZ9l|F9kN-KlheJxn6XA<(lD8Q#3jybTzYy)VdCAd*Xz7j$Z%G;br%C 
z*(j}8RGE)%!1}EXbZFuGtlzPs+l;%cNdJJ&3Lbzq2!LMqq`9FNgZr?3QPl`MUUM;0 z*@z4tIxw9>1qrk}cpg(iOb*cv_kVfh5!m|a^^abpdgpJiY>qBi_jP#tTd)mIUfDSF zlgfRszP<1E?>vOzFFJXO`3>gPJ`jSJYL|$dAiFu2%*tjLZ?y=L*%dUod8<*7+@_G; z$6F18<kNFHz*|Kf5G0SujT|<{?lL=FX5<JkcDKok96r6@r)TUC=#YbD>_DgY`Kso0 z9afQbW|xax;dk2IZj;05^mx2p2oY&?F?O3R5b*n1mU%(IN_;RF3UM4y?llXN#bPuX z45BFTW|K*;*8!K??ehV>WVPDuj*@w&>$|3%#Da<3qsXMW^gDgJoxbma5)!bIWI+&J zqAYui%q8_JNoTM$bn4UcsfjaZsLK{qJ-6qd>A#83t7rbF)}35BMU3xhHz_Rt@<*&T zE?c;evtgyqmSf0~bA<7sGGUv_*i@T(Rr#40XH5cVTd{Bjym-~Zl}{g;Jy|(jzIa9D zyDP7R3+Gmz@xvc|2`=r0KdKbT(OY@yOVt|kz@-$X(io<eP?#!ektEwF&gB>kOjZn3 zHUObcj{!>Iv%JPL{IH`s5nlu<Shf|g79~3*AQal#<U&54M(GE>y!(;Ln}550Pq*sn zx`A8U*tPEN%C*0&{G<Y}jL!Na-1zD*_uu}+Lv##HSjK$@%eX<fLAyzC8;UNs|AM?* z!zT|pS%OYyQs@-s2~P>UrmSF>iz{5q-5bTLtyjyB82-)tp!He9v*y>i*Ihq$zw3I} zeNy=c`wy4H0Ryau^Eo_Dk1OC7bS{J25XgJxcy_sVD}vjDkjv{adUzA#K^*U<t|%Le zb+8SZPN&(8CBjTbnVe>nIPY!`+~e8jL7t;b4hzm-dmbWV5NpR+A?H8nv8}Xiw(YR7 zwoxc(HjS!Yilz)JOh5J2d0vG7z?zx~YMOl|+KhIf-RNcXCVC(J8HvafI!`Mrcc1F4 z>NMqU8kj;%8YEbtG%;A^yf5t5!IyP!>acn|*q1s*WH2h3W2%Rwn)<xm^ED4qpUs^+ zC2spMGjVi~_?0rG!8wpo3jkErULo98*Np`vsI6^8@>zNIBwPiRN3UGFCl>X*`S9cK zW#>Qfk7;oArlqsJkgI$ag%`pHp1SdgYX*<J_SVSy_1`*FIXy*cX~r}H>mJ+&%%c(@ zkC1jCLR~;nO5S%iK>60#tKnChnR$`#G$VJ&m0lNN@H|Ij(K%ol+>ACyu7}s7t3p>P zTN<v5-VS#v4>UgmpNYN@`(E>8#K9}Kz&j(i#2<=04j)5LMD{hm-26`V&&`v~CK>BG zUL+?lA8P4HcVyQ^F3ac}MdbHGN62SU8$dMS17gT$R>RJa&##6x)D(?G8W6N21S8KP z1qqGG$EkVUMTRR#La)#-FvLuZ1mJttpC5%^*DS3GOmL9JY{rWw$||vbE-R2io7GbQ z>OK|qsC!kU9+KLi);7>~qK#?Gi<I|?bRI;?`@{yPgYrHHZFErH=XkWR>In3JnsKYn z_@wd>mQGSB!uK*&zL%--y&B~i$FL$7%R`wIp)ww?G>#>6N?o8_^kP0LVc~o%Md6_( z>n5DBxg{75MVrH!9Bc{VQ@Wuk2f~r8(n@6Sn#@gN12Ol*2elPPsRJC2PM$tsFD6Jh zpdEvqhg44=l8RZ0$lS0R14cp!Q>qH90ek-+OjCeZgM_W;Y%v12p>kiPkW)-S$sb!# zpzPD(C+zc2?;PL#?Ps97|MtyiFR=M_KX`S|%^hpdHUwd1%NLnw>8WeBkH#w3-?q$% z?txF<v}2ErWItR`na%tL%iL4JJkZD7uiYd&d+(1u*v5cnX$9KSxMd*%jeMH_(j5w0 zYVTRGx&4~hz>3}MZtiB+E$-cg+o#<;Yj@XeJ@>lqbwAiM${yhkx(>Qu&%fR^w&KK! 
z*ea&Hfam(%oq9(u>5P`z+F~|*GXwZd!O$U}lwis~$^}kWE<3~}D#1w`OLX+Yd3qDP zc9=?(E<=n51;=E3`mC9*g{(Qw14}f-UN$9oVDQFf2IfD^bTA$`cVD0s%>qa<0n%)w zcMRMyE<_*{^21tFlW+xS1C)os5kPq;2o8jRKp+a?iE?E}Gt%?MxZ@l(@GkCHAc)qZ zL3G4Y5K{=`4*H3@QfgZsLV-0=kxN<_PhBj|<$^cO4Xw+}8|z^KMC{C`HgtCz)V)&J zO;`w!26!^1Ls3zgRa6WV6?XMAFgyM(Ie63HROVinu+%YHuO^0wrO~hcpCD0LRazG? zUb}HPgZZZohWtab`XvECFf~Oxr}reDg0sTmU4V*~IJ^d)h}-R;Ex;1z<b8oi2`3eh z1p>k-k=*@)LZJ#EkTQa@OS((yC<L}^L$&{JeyH{^I8*@%S3)RJ)#jH51@;AjD1hP0 zNH`Vd1&6~GkVps^f%vt5=cfN-EpRYM0Rcr)y&@1uiW3kR2LE*)3<g&K!yr)Fjp>}z z_0xHw-UX=yRX`{y!zrGAbJJf|uoE#Q`U!Lm>L^{4Zsu;(Aj-;=^+P~VFp%z&662r! z%0Dl6XYJgT&K>mU-sxuUn$yBhSqklZP)d`4#^b1vGa#6n3;GHM)0glrv+M_V?YGP_ z`bPn@7YadxLYfb1DKH3!F&+Aircj;jqS6pQB<0@_w_nOs8|Und^}^yEv6NZ`0Tn@Z zbf*qN-QLcz<Dd)yX?vjG6a@UkU>C0jgizsZ|K;;7q2fj}kn`H*jHiw&<_TRU*@bmZ z^=^Z2R2cxbwOl#9+7s&3G_x`W0OA|lYRitkv#8@=*x$#6&4NRho%lbLmunpH{3>23 zP^^qA>+m=x9MR?Z_yv334cmPNOkXlukxLxctGJj%m>Jq-ZJdn6gn|^dss<IoC{Im& zlJ@A;lPPV1b89)0d#vA@xm11X_v;vb{LVj)?NULJi#V^`3ll*n1F>kG0zQEcXG5JP zs?(3$`%d{Jeo`TN>C;%;$fIx3j(pSivgb+K!y@0p%V4ehlnd{vwS2=5{5w6gu>6xA z2n4j7pWUSWpTb5FH7HF~sEW3u(mSVzk}ayrQo9W>-d;GWd{a{b0?P0F*%{x>I1o&& zhMn9{MbFra;7FvX{Eto=x{oG@JM#QP4R@M9HBA8m+i`7*B@!-qQL2X`&D6@FEQkIW z&<cdUV!-qjv!moj|AGoae!e0Z{s0M5p@NWq!tfiwAN^rL-+bcQ4BDr?NRGarc<lPf zE%zZq5K=tUSpSK5W0FX?sE}4gta!@wVngM-L{Y2Zwj0jLQ|E8$-ZW$G@fC+;w-U|K zy5nVyI;NdE4Q%RPMlys^z$VX@50O63GbgT8zmAO_xw}lNZq<DLO5pRIl!~Uw$-u=| zdXT<n9ppZl!-Au7DgDY8rzh^>6&s&9Jmx#&WSf1j?2G9^qv8N3cVyaT>r*Cc)<F3Z zZlq@A{)e+f*}CcLlMp{^lhTCJ{H_WWBgX6Vr3fqN*ZjiX@E|Y#*;qsEl6l*sZw4+s zQ~d1HORUQ(Xe8s4P$IPe)48Vjk<=$L3kl<@M&jl{9E&sD+k9{I6d#`<j>h9XZQ`#5 z0S-(NoPSf4bnhV$1bDY7A^(04u`<Ov(`s#3x<JUC()&#+s3l-V`NShdYH2o>KpP5l z5eWIGVh99EB6sAGYN>~JCQuSft3moCI-Wm&er-E5Dl{}L6mSLthMZ_<X=-Y3YT^;i zTwshLGQN%ty{zFh3{O~F27-JKi63~x*%id)uz?PHE)9g7=el!(Q&ej@tlgLeB+YUx z?1>ZmaMyR^QT`|5P%+1BcHI+w6vqr)D||2i9T=iF#>@MP7iG@~$}<qgAt7<nNQdJ; Dlcg*@ literal 0 HcmV?d00001 -- GitLab From 353ed4c857279541bf3e4a961382c93706183933 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 3 Jan 2025 22:52:50 -0800 Subject: [PATCH 032/109] done blowing up --- hercules_cg/src/gpu.rs | 1019 ++++++++++++++++------------------------ 1 file changed, 407 insertions(+), 612 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 768324ca..f2cfd9cf 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -2,12 +2,11 @@ extern crate bitvec; extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::env::var; use std::fmt::{Error, Write}; use std::iter::FromIterator; use std::os::unix::thread; -use bitvec::field; - use self::hercules_ir::*; /* @@ -30,37 +29,19 @@ pub fn gpu_codegen<W: Write>( ) -> Result<(), Error> { /* * We assert the following: - * - Array element type can't be another array - * - Any array field in a struct must have known size * - Fork node must have >= 1 reduce nodes * - If the returned data type is a collection, it must have - * originated from a parameter. Technically could extend to - * multiple parameters but we aren't going to. + * originated from a single known parameter. Can relax to allow + * one of multiple parameters. * * We don't assert but assume the following: - * - Global memory can't be used in a phi or select node * - max_num_blocks is within constraint of 1D grid size. This can be * relaxed if we want to support larger grids. 
+ * - product types are packed with padding inserted for each element to + * be aligned for its type and for full product to be aligned to its + * largest element */ - for ty in types.iter() { - match ty { - Type::Array(type_id, _) => { - if let Type::Array(..) = types[type_id.idx()] { - panic!("Array element type can't be another array"); - } - } - Type::Product(type_ids) | Type::Summation(type_ids) => { - for type_id in type_ids.iter() { - if let Type::Array(_, extents) = &types[type_id.idx()] && multiply_dynamic_constants(dynamic_constants, &extents).is_none() { - panic!("Array field in product msut have known size") - } - } - } - _ => {} - } - } - let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) .filter(|idx| function.nodes[*idx].is_reduce()) .map(NodeID::new) @@ -246,8 +227,9 @@ impl GPUContext<'_> { self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; - self.codegen_struct_def(&mut top)?; - self.codegen_reused_locals(&mut top)?; + // self.codegen_struct_def(&mut top)?; + self.codegen_phi_declarations(&mut top)?; + self.codegen_helpers(&mut top)?; let (fork_tree, fork_control_map) = self.make_fork_structures(); let (root_forks, num_blocks) = @@ -260,7 +242,6 @@ impl GPUContext<'_> { .next() .unwrap(); let (begin_control, end_control) = self.get_begin_end_control(start, ret); - let global_refs = self.get_global_refs(); // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) @@ -277,7 +258,6 @@ impl GPUContext<'_> { &fork_control_map, &begin_control, &end_control, - &global_refs, &cumul_factors, num_threads, num_blocks, @@ -320,9 +300,9 @@ impl GPUContext<'_> { write!(w, ", ")?; } let param_type = if self.types[ty.idx()].is_primitive() { - self.get_type(*ty, false, false) + self.get_type(*ty, false) } else { - format!("{} __restrict__", self.get_type(*ty, true, true)) + format!("{} __restrict__", self.get_type(*ty, false)) }; write!(w, "{} p{}", param_type, idx)?; } @@ -332,13 +312,13 @@ impl GPUContext<'_> { write!( w, "{} __restrict__ ret", - self.get_type(*self.return_type_id, true, true) + self.get_type(*self.return_type_id, true) )?; } // Type is char since it's simplest to use single bytes for indexing, // casting will be needed for use with different types. - write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n\tsize_t alignment;\n")?; + write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n")?; Ok(()) } @@ -376,83 +356,21 @@ impl GPUContext<'_> { Ok(()) } - // Emit struct definitions for each typeid of product or summation type. If - // multiple typeids have the same type, they're separately emitted. Lastly emit - // dummy alignment for later use in dynamic shared memory slices. 
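For reference, the padding rule assumed above is the usual round-up-to-alignment arithmetic: every field starts at the next multiple of its own alignment, and the whole product is padded out to a multiple of its largest field's alignment. As a purely illustrative case, a product of (uint8_t, uint32_t, uint16_t) would place field_0 at offset 0, field_1 at offset 4 (after 3 bytes of padding), field_2 at offset 8, and occupy 10 bytes rounded up to 12. A minimal CUDA sketch of the corresponding round-up rule (the helper name align_up is hypothetical; the generated kernels inline this arithmetic):

    // Hypothetical helper spelling out the round-up rule assumed above.
    __host__ __device__ inline unsigned long long align_up(unsigned long long offset,
                                                            unsigned long long align) {
        // e.g. align_up(10, 4) == 12 and align_up(8, 8) == 8
        return ((offset + align - 1) / align) * align;
    }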
- fn codegen_struct_def(&self, w: &mut String) -> Result<(), Error> { - for type_id in self.typing.iter() { - let type_id_idx = type_id.idx(); - match &self.types[type_id_idx] { - Type::Product(ref product_ty_ids) => { - let product_size = self.get_size(*type_id); - write!(w, "\ttypedef struct alignas({}) Product_{} {{\n", product_size, type_id_idx)?; - let mut cumul_size = 0; - for (i, product_ty_id) in product_ty_ids.iter().enumerate() { - let field_alignment = self.get_alignment(*product_ty_id); - if (cumul_size % field_alignment) != 0 { - let padding = field_alignment - cumul_size % field_alignment; - cumul_size += padding; - write!( - w, - "\t\tchar[{}] pad{};\n", - padding, - i, - )?; - } - write!( - w, - "\t\t{} field_{};\n", - self.get_type(*product_ty_id, false, false), - i - )?; - cumul_size += self.get_size(*product_ty_id); - } - write!(w, "\t}} __attribute__((packed)) Product_{};\n", type_id_idx)?; - } - Type::Summation(ref summation_ty_ids) => { - let summation_size = self.get_size(*type_id); - write!( - w, - "\ttypedef struct alignas({}) Summation_{} {{\n\t\t union {{\n", - summation_size, - type_id_idx - )?; - for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() { - write!( - w, - "\t\t\t{} field_{};\n", - self.get_type(*summation_ty_id, false, false), - i - )?; - } - write!( - w, - "\t\t}};\n\t}} __attribute__((packed)) Summation_{};\n", - type_id_idx - )?; - } - _ => {} + // We declare all phi values upfront + fn codegen_phi_declarations(&self, w: &mut String) -> Result<(), Error> { + for id in (0..self.function.nodes.len()).map(NodeID::new) { + if let Node::Phi {..} = &self.function.nodes[id.idx()] { + write!(w, "\t{};\n", self.get_value(id, true, false))?; } } - Ok(()) } - // We generate all phi values and all flags for phi and select upfront that - // indicate if collection, whether their current value is global - fn codegen_reused_locals(&self, w: &mut String) -> Result<(), Error> { - for id in (0..self.function.nodes.len()).map(NodeID::new) { - match &self.function.nodes[id.idx()] { - Node::Phi {..} => { - write!(w, "\t{};\n", self.get_value(id, true, true, false))?; - } - _ => {} - } - let global_flag = self.get_global_flag(id, true); - if global_flag.is_some() { - write!(w, "\t{};\n", global_flag.unwrap())?; - } - } + // Emit helper registers that are used throughout the kernel- alignment + // is for proper dynamic shared memory allocation + fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { + write!(w, "\tsize_t alignment;\n")?; + write!(w, "\tsize_t max_variant_size;\n")?; Ok(()) } @@ -539,36 +457,6 @@ impl GPUContext<'_> { (begin_visited, end_visited) } - // Get all globals and global references, where for GPU purposes global = - // collection parameter - fn get_global_refs(&self) -> HashSet<NodeID> { - // We start with collection parameters, and follow any reduce or write users. - let mut queued_nodes: VecDeque<NodeID> = (0..self.function.nodes.len()) - .filter(|idx| { - self.function.nodes[*idx].is_parameter() - && !self.types[self.typing[*idx].idx()].is_primitive() - }) - .map(NodeID::new) - .collect(); - - let def_use = def_use(&self.function); - let mut global_nodes = HashSet::new(); - - while !queued_nodes.is_empty() { - let node_id = queued_nodes.pop_front().unwrap(); - global_nodes.insert(node_id); - let node_users = def_use.get_users(node_id); - for user in node_users { - match self.function.nodes[user.idx()] { - Node::Write { .. } | Node::Reduce { .. 
} => queued_nodes.push_back(*user), - _ => {} - } - } - } - - global_nodes - } - /* * If tree has a single root fork of known size s <= max_num_blocks * with parallel-fork schedule, then set num_blocks to s, else set num_blocks to 1. @@ -593,7 +481,7 @@ impl GPUContext<'_> { let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else { panic!("Expected fork node"); }; - let fork_size = multiply_dynamic_constants(self.dynamic_constants, factors); + let fork_size = self.multiply_fork_factors(factors); if let Some(fork_size) = fork_size && fork_size <= max_num_blocks && self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) @@ -641,7 +529,7 @@ impl GPUContext<'_> { self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) }) && let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] - && let Some(fork_size) = multiply_dynamic_constants(self.dynamic_constants, factors) + && let Some(fork_size) = self.multiply_fork_factors(factors) && fork_size <= self.kernel_params.max_num_threads / cumul_factor { if fork_size % self.kernel_params.greedy_associative_thresh == 0 @@ -713,7 +601,6 @@ impl GPUContext<'_> { fork_control_map: &HashMap<NodeID, Vec<NodeID>>, begin_control: &HashSet<NodeID>, end_control: &HashSet<NodeID>, - global_refs: &HashSet<NodeID>, cumul_factors: &HashMap<NodeID, usize>, num_threads: usize, num_blocks: usize, @@ -737,7 +624,7 @@ impl GPUContext<'_> { for control in begin_control { let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; for data in control_to_data.get(control).unwrap() { - self.codegen_data_node(*data, state, num_threads, body, &mut 1, global_refs)?; + self.codegen_data_node(*data, state, num_threads, body, &mut 1)?; } let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; self.codegen_control_node(*control, term, 1)?; @@ -749,7 +636,7 @@ impl GPUContext<'_> { for control in fork_control_map.get(&root_forks[0]).unwrap() { for data in control_to_data.get(control).unwrap() { let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; - self.codegen_data_node(*data, state, num_threads, body, &mut 1, global_refs)?; + self.codegen_data_node(*data, state, num_threads, body, &mut 1)?; } let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; self.codegen_control_node(*control, term, 1)?; @@ -769,9 +656,8 @@ impl GPUContext<'_> { thread_quota: usize, w: &mut String, num_tabs: &mut usize, - global_refs: &HashSet<NodeID>, ) -> Result<(), Error> { - let declare_variable = self.get_value(id, true, false, false).to_string(); + let declare_variable = self.get_value(id, true, false).to_string(); let tabs = "\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { // Phi registers were already emitted. 
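(A note on the grid-dimension logic above, with made-up numbers: a lone top-level fork whose factors have a known product of 128, with 128 <= max_num_blocks and a ParallelFork schedule, yields num_blocks = 128; any other shape falls back to num_blocks = 1 and the whole fork tree executes within a single block.)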
@@ -799,17 +685,18 @@ impl GPUContext<'_> { } } Node::Reduce { control: _, init, reduct: _ } => { - let init_val = self.get_value(*init, false, false, false); + let init_val = self.get_value(*init, false, false); write!(w, "{}{} = {};\n", tabs, declare_variable, init_val)?; } // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { write!(w, "{}{};\n", tabs, declare_variable)?; - let define_variable = self.get_value(id, false, false, false); + let define_variable = self.get_value(id, false, false); self.codegen_constant( if self.types[self.typing[id.idx()].idx()].is_primitive() { define_variable } else { format!("*{}", define_variable)}, *cons_id, + true, w, *num_tabs, )?; @@ -824,7 +711,7 @@ impl GPUContext<'_> { "{}{} = !{};\n", tabs, declare_variable, - self.get_value(*input, false, false, false), + self.get_value(*input, false, false), )?; } ty if ty.is_fixed() => { @@ -833,7 +720,7 @@ impl GPUContext<'_> { "{}{} = ~{};\n", tabs, declare_variable, - self.get_value(*input, false, false, false), + self.get_value(*input, false, false), )?; } _ => panic!("Unsupported type for not operator"), @@ -845,7 +732,7 @@ impl GPUContext<'_> { "{}{} = -{};\n", tabs, declare_variable, - self.get_value(*input, false, false, false), + self.get_value(*input, false, false), )?; } _ => { @@ -858,14 +745,14 @@ impl GPUContext<'_> { "{}{} = static_cast<{}>({});\n", tabs, declare_variable, - self.get_type(*dst_ty_id, false, false), - self.get_value(*input, false, false, false), + self.get_type(*dst_ty_id, false), + self.get_value(*input, false, false), )?; } }, Node::Binary { op, left, right } => { - let left_val = self.get_value(*left, false, false, false); - let right_val = self.get_value(*right, false, false, false); + let left_val = self.get_value(*left, false, false); + let right_val = self.get_value(*right, false, false); match (op, &self.types[self.typing[left.idx()].idx()]) { (BinaryOperator::Rem, Type::Float32) => write!( w, @@ -925,17 +812,12 @@ impl GPUContext<'_> { TernaryOperator::Select => { write!( w, - "{}{} = {} ? {} : {};\n{}{} = {} ? {} : {};\n", + "{}{} = {} ? {} : {};", tabs, declare_variable, - self.get_value(*first, false, false, false), - self.get_value(*second, false, false, false), - self.get_value(*third, false, false, false), - tabs, - self.get_value(id, false, false, false), - self.get_value(*first, false, false, false), - global_refs.contains(second), - global_refs.contains(third) + self.get_value(*first, false, false), + self.get_value(*second, false, false), + self.get_value(*third, false, false), )?; } }, @@ -948,105 +830,59 @@ impl GPUContext<'_> { tabs, declare_variable, func_name, - self.get_value(args[0], false, false, false), + self.get_value(args[0], false, false), )?; } + // Main difference between read and write is codegen_copy takes the + // returned node's type for read and data node's type for write Node::Read { collect, indices } => { - // Copy from global memory or from shared memory or registers. - // Generate if-else for phi and select where we don't statically know - // the case. 
write!(w, "{}{};\n", tabs, declare_variable); - let define_variable = self.get_value(id, false, false, false); - let global_flag = self.get_global_flag(*collect, false); - let has_global_flag = global_flag.is_some(); - if has_global_flag { - write!(w, "{}if ({}) {{\n{}\t", tabs, global_flag.unwrap(), tabs); - *num_tabs += 1; - } - if global_refs.contains(collect) || has_global_flag { - let is_char = self.is_parameter_char(self.typing[collect.idx()]); - let global_collect = self.codegen_collect(*collect, indices, true, has_global_flag, is_char); - let type_id = self.typing[id.idx()]; - let is_array = self.types[type_id.idx()].is_array(); - self.codegen_copy_from_to_global( - false, - type_id, - &define_variable, - &global_collect, - indices, - if is_array { - Some(thread_quota) - } else { - None - }, - !is_array, - false, - is_char, - w, - *num_tabs, - )?; - } - if has_global_flag { - write!(w, "{}}} else {{\n", tabs); - } - if !global_refs.contains(collect) || has_global_flag { - let local_collect = self.codegen_collect(*collect, indices, false, has_global_flag, false); - write!(w, "{}{} = {};\n", tabs, define_variable, local_collect)?; - } - if has_global_flag { - write!(w, "{}}}\n", tabs); - *num_tabs -= 1; - } + let define_variable = self.get_value(id, false, false); + let is_char = self.is_char(self.typing[collect.idx()]); + let collect_with_indices = self.codegen_collect(*collect, indices, is_char); + let type_id = self.typing[id.idx()]; + self.codegen_copy( + false, + type_id, + &define_variable, + &collect_with_indices, + if !self.types[type_id.idx()].is_primitive() { + Some(thread_quota) + } else { + None + }, + false, + w, + *num_tabs, + )?; } Node::Write { collect, data, indices, } => { - // Only difference vs read is the LHS vs RHS, and creating write- - // labeled reference after write!(w, "{}{};\n", tabs, declare_variable); - let global_flag = self.get_global_flag(*collect, false); - let has_global_flag = global_flag.is_some(); - if has_global_flag { - write!(w, "{}if ({}) {{\n", tabs, global_flag.unwrap()); - *num_tabs += 1; - } - let data_variable = self.get_value(*data, false, false, global_refs.contains(collect)); - if global_refs.contains(collect) || has_global_flag { - let is_char = self.is_parameter_char(self.typing[collect.idx()]); - let global_collect = self.codegen_collect(*collect, indices, true, has_global_flag, is_char); - let type_id = self.typing[id.idx()]; - let is_array = self.types[type_id.idx()].is_array(); - self.codegen_copy_from_to_global( - true, - type_id, - &data_variable, - &global_collect, - indices, - if is_array { - Some(thread_quota) - } else { - None - }, - !is_array, - state == 0, - is_char, - w, - *num_tabs, - )?; - } - if has_global_flag { - write!(w, "{}}} else {{\n", tabs); - } - if !global_refs.contains(collect) || has_global_flag { - let local_collect = self.codegen_collect(*collect, indices, false, has_global_flag, false); - write!(w, "{}{} = {};\n", tabs, local_collect, data_variable)?; - } - if has_global_flag { - write!(w, "{}}}\n", tabs); - *num_tabs -= 1; - } + let data_variable = self.get_value(*data, false, false); + let is_char = self.is_char(self.typing[collect.idx()]); + let collect_with_indices = self.codegen_collect(*collect, indices, is_char); + let type_id = self.typing[data.idx()]; + self.codegen_copy( + true, + type_id, + &data_variable, + &collect_with_indices, + if !self.types[type_id.idx()].is_primitive() { + Some(thread_quota) + } else { + None + }, + state == 0, + w, + *num_tabs, + )?; + let define_variable = 
self.get_value(id, false, false); + let collect_variable = self.get_value(*collect, false, false); + write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } _ => { panic!("Unsupported data node type") @@ -1056,13 +892,10 @@ impl GPUContext<'_> { for phi in phis { write!( w, - "{}{} = {};\n{}{} = {};\n", - tabs, - self.get_value(*phi, false, false, false), - self.get_value(id, false, false, false), + "{}{} = {};\n", tabs, - self.get_global_flag(*phi, false).unwrap(), - global_refs.contains(&id) + self.get_value(*phi, false, false), + self.get_value(id, false, false), )?; } } @@ -1094,7 +927,7 @@ impl GPUContext<'_> { w, "{}if ({}) {{\n", tabs, - self.get_value(*cond, false, false, false) + self.get_value(*cond, false, false) )?; write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; write!(w, "{}}} else {{\n", tabs)?; @@ -1108,7 +941,7 @@ impl GPUContext<'_> { Node::Join { control: _ } => {} Node::Return { control: _, data } => { if self.types[self.typing[data.idx()].idx()].is_primitive() { - let return_val = self.get_value(*data, false, false, false); + let return_val = self.get_value(*data, false, false); write!( w, "{}if (threadIdx.x == 0) {{\n{}\t*ret = {};\n{}}}\n", @@ -1124,40 +957,27 @@ impl GPUContext<'_> { Ok(()) } - // Handles reads/writes from global memory aka parameter node. We tack local - // (shmem + reg) array indexing and struct field access onto data, and tack - // global pointer offset onto global. Thread parallelization is used only for - // shared memory arrays. is_char indicates the global is a char type and we - // need to multiply the global index by the element size. - fn codegen_copy_from_to_global( + // Handles copying data to/from global and shared memory. Thread parallelization + // is used only for arrays (possibly inside another collection). is_char indicates + // a char type and we need to including element size in indexing. 
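Concretely, for an array of primitive elements the function defined next emits a thread-strided copy loop followed by a block-wide barrier. A rough CUDA sketch of that shape (the kernel name, the float element type, and the use of blockDim.x in place of the compile-time thread quota are all illustrative, not the literal generated text):

    // Illustrative shape of the emitted array copy when a thread quota is present.
    __global__ void copy_sketch(float *dst, const float *src, size_t n) {
        for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
            dst[i] = src[i];
        }
        __syncthreads();
        // Without a thread quota, only threadIdx.x == 0 runs a plain serial loop.
        // For char-addressed collections the offset is scaled by the element size,
        // e.g. reinterpret_cast<float *>(bytes + i * sizeof(float)).
    }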
+ fn codegen_copy( &self, is_write: bool, type_id: TypeID, data: &String, - global: &String, - indices: &[Index], + collect: &String, thread_quota: Option<usize>, - thread_restrict: bool, block_restrict: bool, - is_char: bool, w: &mut String, num_tabs: usize, ) -> Result<(), Error> { let tabs = "\t".repeat(num_tabs); match &self.types[type_id.idx()] { Type::Array(element_type_id, extents) => { - let Index::Position(array_indices) = &indices[0] else { - panic!("Expected position index for array access") - }; - if matches!(self.types[element_type_id.idx()], Type::Array(..)) { - panic!("Nested arrays are not supported"); - } let rem_array_size = { let s = extents .iter() - .enumerate() - .filter(|(i, _)| *i >= array_indices.len()) - .map(|(_, id)| format!("dc{}", id.idx())) + .map(|id| format!("dc{}", id.idx())) .collect::<Vec<_>>() .join(" * "); if s.is_empty() { @@ -1166,124 +986,140 @@ impl GPUContext<'_> { s } }; - // If we parallelize over threads, then we index by threadIdx.x, - // else we gate the loop by threadIdx.x == 0 + // Either we parallelize over threads or gate the loop by threadIdx.x + // == 0 + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; + extra_tab = "\t"; + } let has_thread_quota = thread_quota.is_some(); - let begin_copy = if has_thread_quota { - format!( - "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", - tabs, rem_array_size, thread_quota.unwrap() - ) - } else { - format!( - "{}if (threadIdx.x == 0) {{\n{}\tfor (int i = 0; i < {}; i++) {{\n", - tabs, tabs, rem_array_size - ) - }; - write!(w, "{}", begin_copy)?; - let new_global = if is_char { + write!(w, "{}", if has_thread_quota { + format!( + "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", + tabs, rem_array_size, thread_quota.unwrap() + ) + } else { + format!( + "{}if (threadIdx.x == 0) {{\n{}\tfor (int i = 0; i < {}; i++) {{\n", + tabs, tabs, rem_array_size + ) + } + ); + let element_type_name = self.get_type(*element_type_id, true); + let (new_collect, new_data) = if self.is_char(type_id) { + (format!( + "{} + i * {}", + collect, + self.get_size(*element_type_id, None) + ), format!( - "{} + i * sizeof({})", - global, - self.get_type(*element_type_id, false, false) - ) + "{} + i * {}", + data, + self.get_size(*element_type_id, None) + )) } else { - format!("{} + i", global) + (format!("{} + i", collect), format!("{} + i", data)) }; - self.codegen_copy_from_to_global( + let new_collect = format!("{}reinterpret_cast<{}>({})", if self.types[element_type_id.idx()].is_primitive() { "*" } else { "" }, element_type_name, new_collect); + let new_data = format!("{}reinterpret_cast<{}>({})", if self.types[element_type_id.idx()].is_primitive() { "*" } else { "" }, element_type_name, new_data); + self.codegen_copy( is_write, *element_type_id, - &format!("{} + i", data), - &new_global, - &indices[1..], + &new_data, + &new_collect, None, false, - false, - is_char, w, - num_tabs + if has_thread_quota { 1 } else { 2 }, + num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 1 } else { 2 }, )?; if !has_thread_quota { write!(w, "{}\t}}\n", tabs)?; } + if block_restrict { + write!(w, "{}}}\n", tabs)?; + } write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; } - Type::Product(fields) | Type::Summation(fields) => { - if !is_char { - panic!("Global product or summation must be char addressed") + Type::Product(fields) => { + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) 
{{\n", tabs)?; + extra_tab = "\t"; } - let is_product = matches!(self.types[type_id.idx()], Type::Product(..)); - if indices.is_empty() { - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - if thread_restrict { - write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; - extra_tab2 = "\t"; - } - let reinterpret = format!("*reinterpret_cast<{}_{}*>", if is_product { "Product" } else { "Summation" }, type_id.idx()); - let reinterpret_global = format!("{}({})", reinterpret, global); - let reinterpret_data = format!("{}({})", reinterpret, data); - write!( - w, - "{}{}{}{} = {};\n", - tabs, - extra_tab, - extra_tab2, - if is_write { &reinterpret_global } else { &reinterpret_data }, - if is_write { &reinterpret_data } else { &reinterpret_global } - )?; - if thread_restrict { - write!(w, "{}{}}}\n", tabs, extra_tab)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - } else if is_product { - // Iterate over fields in product to find offset - let Index::Field(field_index) = &indices[0] else { - panic!("Expected field index for product access") - }; - let offset = (0..*field_index) - .map(|i| self.get_size(fields[i])) - .sum::<usize>(); - let new_global = format!("{} + {}", global, offset); - let new_data = format!("{} + {}", data, offset); - self.codegen_copy_from_to_global( + let has_thread_quota = thread_quota.is_some(); + if !has_thread_quota { + write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; + extra_tab2 = "\t"; + } + for (i, field) in fields.iter().enumerate() { + let offset = self.get_size(type_id, Some(i)); + let field_type_name = self.get_type(*field, true); + let new_collect = format!("{}reinterpret_cast<{}>({} + {})", if self.types[field.idx()].is_primitive() { "*" } else { "" }, field_type_name, collect, offset); + let new_data = format!("{}reinterpret_cast<{}>({} + {})", if self.types[field.idx()].is_primitive() { "*" } else { "" }, field_type_name, data, offset); + self.codegen_copy( is_write, - fields[*field_index], + *field, &new_data, - &new_global, - &indices[1..], - None, - thread_restrict, - block_restrict, - is_char, + &new_collect, + thread_quota, + false, w, - num_tabs + 1, + num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 0 } else { 1 }, )?; - } else { - // All variants of summations have zero offset - let Index::Variant(variant_index) = &indices[0] else { - panic!("Expected variant index for summation access") - }; - self.codegen_copy_from_to_global( + } + if !has_thread_quota { + write!(w, "{}\t}}\n", tabs)?; + } + if block_restrict { + write!(w, "{}}}\n", tabs)?; + } + write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; + } + Type::Summation(variants) => { + let mut extra_tab = ""; + let mut extra_tab2 = ""; + if block_restrict { + write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; + extra_tab = "\t"; + } + let has_thread_quota = thread_quota.is_some(); + if !has_thread_quota { + write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; + extra_tab2 = "\t"; + } + // We can guarantee correctness for summation by just copying the + // largest variant. 
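As a concrete, purely hypothetical case: for a summation over a 32-bit integer and a 64-bit float, max_variant_size works out to 8, so the if/else-if guard chain emitted just below selects the 64-bit variant and copies those 8 bytes, which also covers the 4 bytes the integer payload would occupy.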
+ let max_variant_size = self.get_size(type_id, None); + write!(w, "{}{}{}max_variant_size = {};\n", tabs, extra_tab, extra_tab2, max_variant_size)?; + for (i, variant) in variants.iter().enumerate() { + let prefix = if i == 0 { "if" } else { "else if" }; + let variant_size = self.get_size(*variant, None); + write!(w, "{}{}{}{} (max_variant_size == {}) {{\n", tabs, extra_tab, extra_tab2, prefix, variant_size)?; + let field_type_name = self.get_type(*variant, true); + let new_collect = format!("{}reinterpret_cast<{}>({})", if self.types[variant.idx()].is_primitive() { "*" } else { "" }, field_type_name, collect); + let new_data = format!("{}reinterpret_cast<{}>({})", if self.types[variant.idx()].is_primitive() { "*" } else { "" }, field_type_name, data); + self.codegen_copy( is_write, - fields[*variant_index], - &data, - &global, - &indices[1..], - None, - thread_restrict, - block_restrict, - is_char, + *variant, + &new_data, + &new_collect, + thread_quota, + false, w, - num_tabs + 1, + num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 0 } else { 1 }, )?; + write!(w, "{}{}{}}}\n", tabs, extra_tab, extra_tab2)?; + } + if !has_thread_quota { + write!(w, "{}\t}}\n", tabs)?; } + if block_restrict { + write!(w, "{}}}\n", tabs)?; + } + write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; } // Primitive types _ => { @@ -1293,34 +1129,21 @@ impl GPUContext<'_> { write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; extra_tab = "\t"; } - if thread_restrict { + let has_thread_quota = thread_quota.is_some(); + if has_thread_quota { write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; extra_tab2 = "\t"; } - if is_char { - let type_name = self.get_type(type_id, true, false); - let reinterpret = format!("*reinterpret_cast<{}>", type_name); - let reinterpret_global = format!("{}({})", reinterpret, global); - let reinterpret_data = format!("{}({})", reinterpret, data); - write!( - w, - "{}{}{}{} = {};\n", - tabs, - extra_tab, - extra_tab2, - if is_write { &reinterpret_global } else { &reinterpret_data }, - if is_write { &reinterpret_data } else { &reinterpret_global } - )?; - } else { - write!( - w, - "{}*{} = *{};\n", - tabs, - if is_write { &global } else { data }, - if is_write { data } else { &global } - )?; - } - if thread_restrict { + write!( + w, + "{}{}{}{} = {};\n", + tabs, + extra_tab, + extra_tab2, + if is_write { collect } else { data }, + if is_write { data } else { collect } + )?; + if has_thread_quota { write!(w, "{}{}}}\n", tabs, extra_tab)?; } if block_restrict { @@ -1331,59 +1154,14 @@ impl GPUContext<'_> { Ok(()) } - // // Read/writes to local collections consist of local name + array indexing - // // and struct field access. 
- // fn codegen_local_collect(&self, collect: NodeID, indices: &[Index], has_global_flag: bool) -> String { - // let mut index_ptr_name = "".to_string(); - // for index in indices { - // match index { - // Index::Field(field) => { - // index_ptr_name.push_str(&format!(".field_{}", field)); - // } - // Index::Variant(variant) => { - // index_ptr_name.push_str(&format!(".field_{}", variant)); - // } - // Index::Position(indices) => { - // index_ptr_name.push_str( - // &indices - // .iter() - // .map(|index| format!("[{}]", self.get_value(*index, false, false, false))) - // .collect::<Vec<_>>() - // .join(""), - // ); - // } - // } - // } - // let name = self.get_value(collect, false, false, false); - // let full_name = if has_global_flag { - // format!("reinterpret_cast<{}>({})", self.get_type(self.typing[collect.idx()], false, false), name) - // } else { - // name - // }; - // format!("{} + {}", full_name, index_ptr_name) - // } - // Read/writes to global collections consist of global name + pointer offset. - fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_global: bool, has_global_flag: bool, is_char: bool) -> String { - let mut index_ptr_name = "0".to_string(); + fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { + let mut index_ptr = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { match index { - // Sum the offset of prior fields in bytes Index::Field(field) => { - let offset = (0..*field) - .map(|i| { - format!( - "offsetof({}, field_{})", - self.get_type(type_id, false, false), - i - ) - }) - .collect::<Vec<_>>() - .join(" + "); - if *field > 0 { - index_ptr_name.push_str(&format!(" + {}", offset)); - } + self.get_size(type_id, Some(*field)); } // Variants of summations have zero offset Index::Variant(_) => {} @@ -1408,112 +1186,213 @@ impl GPUContext<'_> { cumulative_offset = format!( "{} * ({} + ", cumulative_offset, - self.get_value(*index, false, false, false) + self.get_value(*index, false, false) ); } - index_ptr_name.push_str(&format!( + index_ptr.push_str(&format!( " + {}{}", cumulative_offset, ")".repeat(array_indices.len()) )); if is_char { - let element_size = - format!("sizeof({})", self.get_type(*element_type, false, false)); - index_ptr_name.push_str(&format!(" * {}", element_size)); + let element_size = self.get_size(*element_type, None); + index_ptr.push_str(&format!(" * {}", element_size)); } } } } - let name = self.get_value(collect, false, false, false); - let full_name = if is_global && has_global_flag { - format!("reinterpret_cast<{}>({})", self.get_type(type_id, true, true), name) - } else if has_global_flag { - format!("reinterpret_cast<{}>({})", self.get_type(type_id, false, false), name) - } else { - name - }; - format!("{} + {}", full_name, index_ptr_name) + let name = self.get_value(collect, false, false); + format!("{} + {}", name, index_ptr) } // Standalone function allows us to handle recursive initialization for - // product and summation collections + // product and summation collections. `allow_allocate` prevents unnecessary + // shared memory allocations for nested product and summation collections. + // Since not initialized, array collections don't need to be recursed into. 
fn codegen_constant( &self, name: String, cons_id: ConstantID, + allow_allocate: bool, w: &mut String, num_tabs: usize, ) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); match &self.constants[cons_id.idx()] { - Constant::Boolean(val) => write!(w, " = {};\n", val)?, - Constant::Integer8(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger8(val) => write!(w, " = {};\n", val)?, - Constant::Integer16(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger16(val) => write!(w, " = {};\n", val)?, - Constant::Integer32(val) => write!(w, " = {};\n", val)?, - Constant::UnsignedInteger32(val) => write!(w, " = {}ul;\n", val)?, - Constant::Integer64(val) => write!(w, " = {}ll;\n", val)?, - Constant::UnsignedInteger64(val) => write!(w, " = {}ull;\n", val)?, - Constant::Float32(val) => write!(w, " = {}f;\n", val)?, - Constant::Float64(val) => write!(w, " = {};\n", val)?, - Constant::Product(_, fields) => { - write!(w, ";\n")?; - for (i, field) in fields.iter().enumerate() { - // Array size was set by struct definition and we don't emit array content - if !self.constants[field.idx()].is_array() { - // // Don't need type declaration for the fields - // self.codegen_constant( - // format!("{}.field_{}", name, i), - // format!("{}.field_{}", name, i), - // *field, - // w, - // )?; - - } + Constant::Boolean(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Integer8(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::UnsignedInteger8(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Integer16(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::UnsignedInteger16(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Integer32(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::UnsignedInteger32(val) => write!(w, "{}{} = {}ul;\n", tabs, name, val)?, + Constant::Integer64(val) => write!(w, "{}{} = {}ll;\n", tabs, name, val)?, + Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?, + Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, val)?, + Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + // All three followign collections involve align then allocate from the + // single dynamic shared memory buffer by using and updating the offset. 
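That align-then-allocate sequence bumps a single running offset into the kernel's dynamic shared memory buffer. A minimal device-side sketch of the pattern (the helper function is hypothetical; the generator inlines these statements with the per-type alignment and size):

    #include <cstdint>
    // Illustrative only: carving one collection out of the dynamic shared buffer.
    __device__ char *alloc_from_shared(uint64_t &dynamic_shared_offset,
                                       uint64_t alignment, uint64_t size) {
        extern __shared__ char dynamic_shared[];
        // Round the running offset up to the required alignment,
        dynamic_shared_offset =
            ((dynamic_shared_offset + alignment - 1) / alignment) * alignment;
        // hand out that slot,
        char *ptr = dynamic_shared + dynamic_shared_offset;
        // and reserve `size` bytes for it.
        dynamic_shared_offset += size;
        return ptr;
    }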
+ Constant::Product(type_id, constant_fields) => { + if allow_allocate { + let alignment = self.get_alignment(*type_id); + let size = self.get_size(*type_id, None); + write!( + w, + "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; + {}{} = dynamic_shared + dynamic_shared_offset;\n + {}dynamic_shared_offset += {};\n", + tabs, + alignment, + alignment, + alignment, + tabs, + name, + tabs, + size, + )?; + } + let Type::Product(type_fields) = &self.types[type_id.idx()] else { panic!("Product constant should have product type") }; + for i in 0..constant_fields.len() { + // For each field update offset and issue recursive call + let field_constant = &self.constants[constant_fields[i].idx()]; + let field_type = self.get_type(type_fields[i], true); + let offset = self.get_size(type_fields[i], Some(i)); + self.codegen_constant(format!("*reinterpret_cast<{}>({}{})", field_type, name, offset), constant_fields[i], false, w, num_tabs); } } - Constant::Summation(_, variant, field) => { - write!(w, ";\n\t{}.tag = {};\n", name, variant)?; - // See two comments in Constant::Product - if !self.constants[field.idx()].is_array() { - self.codegen_constant( - format!("\t{}.field_{}", name, variant), - format!("\t{}.field_{}", name, variant), - *field, + Constant::Summation(type_id, variant, field) => { + if allow_allocate { + let alignment = self.get_alignment(*type_id); + let size = self.get_size(*type_id, None); + write!( w, + "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; + {}{} = dynamic_shared + dynamic_shared_offset;\n + {}dynamic_shared_offset += {};\n", + tabs, + alignment, + alignment, + alignment, + tabs, + name, + tabs, + size, )?; } + // No offset updating needed since all variants start at 0 + let Type::Summation(variants) = &self.types[type_id.idx()] else { panic!("Summation constant should have summation type") }; + let variant_type = self.get_type(self.typing[variants[*variant as usize].idx()], true); + let variant_constant = &self.constants[field.idx()]; + if variant_constant.is_scalar() { + self.codegen_constant(format!("*reinterpret_cast<{}>{}", variant_type, name) , cons_id, false, w, num_tabs); + } else if !variant_constant.is_array() { + self.codegen_constant(name, cons_id, false, w, num_tabs); + }; } Constant::Array(type_id) => { - let Type::Array(element_type, extents) = &self.types[type_id.idx()] else { + let Type::Array(element_type, _) = &self.types[type_id.idx()] else { panic!("Expected array type") }; - // For now we do element-wise alignment, later could consider (n-1)d array - // alignment. Then we "allocate" from the single dynamic shared memory buffer - // by using and updating the offset. 
- let element_size = - format!("sizeof({})", self.get_type(*element_type, false, false)); - let array_size = extents - .iter() - .map(|id| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join("*"); + let alignment = self.get_alignment(*type_id); + let size = self.get_size(*type_id, None); + let element_type = self.get_type(*element_type, false); write!( w, - ";\n\talignment = {};\n\tdynamic_shared_offset = - ((dynamic_shared_offset + alignment - 1) / alignment) * alignment; - \n\t{} = reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t - dynamic_shared_offset += {}", - element_size, + ";\n{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; + {}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n + {}dynamic_shared_offset += {};\n", + tabs, + alignment, + alignment, + alignment, + tabs, name, - self.get_type(*element_type, false, false), - array_size + element_type, + tabs, + size )?; } } Ok(()) } + // Emit code to calculate data size. For Product types, setting `field_number` + // gives data size up to but not including that field, so = 2 gives 1st field + // and offset to 2nd field. This is useful for generating constant initialization + // and read/write index math. + fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { + match &self.types[type_id.idx()] { + Type::Array(element_type, extents) => { + let array_size = extents + .iter() + .map(|id| format!("dc{}", id.idx())) + .collect::<Vec<_>>() + .join(" * "); + format!("{} * {}", self.get_alignment(*element_type), array_size) + } + Type::Product(fields) => { + let num_fields = &num_fields.unwrap_or(fields.len()); + let with_field = fields + .iter() + .enumerate() + .filter(|(i, _)| i < num_fields) + .map(|(_, id)| (self.get_size(*id, None), self.get_alignment(*id))) + .fold(String::from("0"), |acc, (size, align)| { + if acc == "0" { + size + } else { + format!("({} + {} - 1) / {}) * {} + {}", acc, align, align, align, size) + } + }); + if num_fields < &fields.len() { + format!("{} - {}", with_field, self.get_size(fields[*num_fields], None)) + } else { + with_field + } + } + Type::Summation(variants) => { + // The argmax variant by size is not guaranteed to be same as + // argmax variant by alignment, eg product of 3 4-byte primitives + // vs 1 8-byte primitive, so we need to calculate both. 
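Running the comment's own example through the expression built just below: the product of three 4-byte primitives contributes the larger size (umax(12, 8) = 12) while the lone 8-byte primitive contributes the larger alignment (8), so the summation occupies (12 + 8 - 1) / 8 * 8 = 16 bytes.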
+ let max_size = variants + .iter() + .map(|id| self.get_size(*id, None)) + .fold(String::from("0"), |acc, x| { + if acc == "0" { + x + } else { + format!("umax({}, {})", acc, x) + } + }); + let max_alignment = variants + .iter() + .map(|id| self.get_alignment(*id)) + .max() + .unwrap_or(0); + format!("({} + {} - 1) / {} * {}", max_size, max_alignment, max_alignment, max_alignment) + } + _ => format!("{}", self.get_alignment(type_id)) + } + } + + fn get_alignment(&self, type_id: TypeID) -> usize { + match &self.types[type_id.idx()] { + Type::Array(element_type, _) => self.get_alignment(*element_type), + Type::Product(fields) | Type::Summation(fields) => { + fields + .iter() + .map(|field| self.get_alignment(*field)) + .max() + .unwrap_or(0) + } + Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, + Type::Integer16 | Type::UnsignedInteger16 => 2, + Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, + Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8, + _ => panic!("Unsupported type for alignment"), + } + } + fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { let func_name = match intrinsic { Intrinsic::Abs => match ty { @@ -1629,13 +1508,12 @@ impl GPUContext<'_> { func_name.to_string() } - // Check if a parameter should be represented as char*. Must be a product, - // summation, or array of product/summation types. This should only be - // called on parameters. - fn is_parameter_char(&self, type_id: TypeID) -> bool { + // Check if a type should be represented as char*. Must be a product, + // summation, or array of product/summation types. + fn is_char(&self, type_id: TypeID) -> bool { match &self.types[type_id.idx()] { Type::Product(_) | Type::Summation(_) => true, - Type::Array(element_type, _) => self.is_parameter_char(*element_type), + Type::Array(element_type, _) => self.is_char(*element_type), _ => false, } } @@ -1650,57 +1528,13 @@ impl GPUContext<'_> { Ok(()) } - fn get_size(&self, type_id: TypeID) -> usize { - match &self.types[type_id.idx()] { - Type::Array(element_type, extents) => { - let element_alignment = self.get_alignment(*element_type); - extents - .iter() - .try_fold(element_alignment, |acc, &extent| { - evaluate_dynamic_constant(extent, self.dynamic_constants) - .map(|val| acc.saturating_mul(val)) - }) - .unwrap_or_else(|| panic!("Queried size for array with unknown size")) - } - _ => self.get_alignment(type_id), - } - } - - fn get_alignment(&self, type_id: TypeID) -> usize { - match &self.types[type_id.idx()] { - Type::Array(element_type, _) => self.get_alignment(*element_type), - Type::Product(fields) => { - let product_size = fields - .iter() - .map(|field| self.get_alignment(*field)) - .sum::<usize>(); - let field_alignment = fields - .iter() - .map(|field| self.get_alignment(*field)) - .max() - .unwrap_or(1); - field_alignment * ((product_size + (field_alignment - 1)) / field_alignment) - } , - Type::Summation(fields) => { - fields - .iter() - .map(|field| self.get_alignment(*field)) - .max() - .unwrap_or(0) - } - Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, - Type::Integer16 | Type::UnsignedInteger16 => 2, - Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, - Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8, - _ => panic!("Unsupported type for alignment"), - } - } - fn get_block_name(&self, id: NodeID) -> String { format!("bb_{}", id.idx()) } - fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool, global_pointer: bool) -> String { + // Setting ty = true 
will return with type in declaration format. make_pointer + // is only considered if ty = true and only relevant for primitive types. + fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { if ty { panic!("Dynamic constants shouldn't be re-initialized") @@ -1711,25 +1545,11 @@ impl GPUContext<'_> { panic!("Parameters shouldn't be re-initialized") } format!("p{}", index) - } else if ty - && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] { - // Dynamic shared memory arrays have special formatting - let mut declare_array = format!( - "{} (*{}{})", - self.get_type(*element_type, false, false), - self.function.nodes[id.idx()].lower_case_name(), - id.idx() - ); - for extent in extents.iter().skip(1) { - declare_array.push_str(&format!("[dc{}]", extent.idx())); - } - declare_array } else if ty { format!( - "{} {}{}", - self.get_type(self.typing[id.idx()], make_pointer, global_pointer), - self.function.nodes[id.idx()].lower_case_name(), - id.idx() + "{} {}", + self.get_type(self.typing[id.idx()], make_pointer), + self.get_value(id, false, false) ) } else { format!( @@ -1740,53 +1560,28 @@ impl GPUContext<'_> { } } - fn get_global_flag(&self, id: NodeID, ty: bool) -> Option<String> { - let node = &self.function.nodes[id.idx()]; - if (!node.is_phi() && !matches!(node, Node::Ternary { op: TernaryOperator::Select, ..})) || self.types[self.typing[id.idx()].idx()].is_primitive() { - None - } else if ty { - Some(format!( - "bool {}{}_is_global", - self.function.nodes[id.idx()].lower_case_name(), - id.idx() - )) - } else { - Some(format!( - "{}{}_is_global", - self.function.nodes[id.idx()].lower_case_name(), - id.idx() - )) - } - } - + // Setting make_pointer = true will only affect primitive types- the + // collections are already pointers fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { // Product and summation collections are char* for byte-addressability - // since we can have variable type fields. is_global can only be true - // if make_pointer is true, with the exception of recursive call - // from array match arm + // since we can have variable type fields Type::Product(_) | Type::Summation(_) => { - if make_pointer { - "char*".to_string() - } else { - "char".to_string() - } + "char*".to_string() } Type::Array(element_type, _) => { - // This suffix lets us work with references of dynamic shared memory - // and use n-d array indexing. 
self.get_type(*element_type, true) } _ => convert_type(&self.types[id.idx()], make_pointer), } } -} -fn multiply_dynamic_constants(dcs: &Vec<DynamicConstant>, factors: &[DynamicConstantID]) -> Option<usize> { - factors.iter().try_fold(1usize, |acc, &factor_id| { - evaluate_dynamic_constant(factor_id, dcs) - .map(|val| acc.saturating_mul(val)) - }) + fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Option<usize> { + factors.iter().try_fold(1usize, |acc, &factor_id| { + evaluate_dynamic_constant(factor_id, self.dynamic_constants) + .map(|val| acc.saturating_mul(val)) + }) + } } // TODO: Add float8, float16, bfloat16 dtypes if they come -- GitLab From 90a1c4af468263be6b70083f5806c200c8bc577c Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 3 Jan 2025 23:18:20 -0800 Subject: [PATCH 033/109] theoreticlaly just speicla case left --- hercules_cg/src/gpu.rs | 91 ++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 48 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index f2cfd9cf..122af64b 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -47,12 +47,17 @@ pub fn gpu_codegen<W: Write>( .map(NodeID::new) .collect(); + // Fork reduce map should have all reduces contained in some key let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); + // Reduct reduce map should have all non-parallel and non-associative reduces + // contained in some key. Unlike fork, reduct is not involved in any assertions, + // put it here for convenience but can move. + let reduct_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); for reduce_node in &reduce_nodes { if let Node::Reduce { control, init: _, - reduct: _, + reduct, } = &function.nodes[reduce_node.idx()] { match function.nodes[control.idx()] { @@ -71,6 +76,13 @@ pub fn gpu_codegen<W: Write>( panic!("Reduce's control must be a join or region node"); } } + if !function.schedules[reduce_node.idx()].contains(&Schedule::ParallelReduce) + && !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) { + reduct_reduce_map + .entry(*reduct) + .or_default() + .push(*reduce_node); + } } } for idx in 0..function.nodes.len() { @@ -160,6 +172,7 @@ pub fn gpu_codegen<W: Write>( bbs, kernel_params, fork_reduce_map, + reduct_reduce_map, label_data_for_phi, return_type_id, }; @@ -187,6 +200,7 @@ struct GPUContext<'a> { bbs: &'a Vec<NodeID>, kernel_params: &'a GPUKernelParams, fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, + reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>, return_type_id: &'a TypeID, } @@ -367,7 +381,9 @@ impl GPUContext<'_> { } // Emit helper registers that are used throughout the kernel- alignment - // is for proper dynamic shared memory allocation + // is for proper dynamic shared memory allocation, max_variant_size is + // for variant selection during read/write copies since we don't keep + // tag (don't need and it can double summation memory usage due to alignment) fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!(w, "\tsize_t alignment;\n")?; write!(w, "\tsize_t max_variant_size;\n")?; @@ -555,45 +571,6 @@ impl GPUContext<'_> { } } - // /* - // * For each parallel reduce with a reduct write, meaning it's at the end of - // * a potential parallel reduction chain, we walk back to beginning of chain - // * and update the write's collect to be the beginning's init. 
- // */ - // fn update_write_collects(&self) -> HashMap<NodeID, NodeID> { - // let mut write_collect_map = HashMap::new(); - // let mut parallel_reduces: HashSet<NodeID> = (0..self.function.nodes.len()) - // .map(NodeID::new) - // .filter(|&node_id| { - // self.function.schedules[node_id.idx()].contains(&Schedule::ParallelReduce) - // }) - // .collect(); - // for reduce in parallel_reduces.clone() { - // if let Node::Reduce { - // control: _, - // init, - // reduct, - // } = &self.function.nodes[reduce.idx()] - // && let Node::Write { .. } = &self.function.nodes[reduct.idx()] - // { - // parallel_reduces.remove(&reduce); - // while parallel_reduces.contains(&init) { - // let Node::Reduce { - // control: _, - // init, - // reduct: _, - // } = &self.function.nodes[init.idx()] - // else { - // panic!("Expected reduce node"); - // }; - // parallel_reduces.remove(&init); - // } - // write_collect_map.insert(*reduct, *init); - // } - // } - // write_collect_map - // } - fn codegen_data_control( &self, root_forks: &Vec<NodeID>, @@ -684,10 +661,8 @@ impl GPUContext<'_> { _ => { panic!("Unsupported state for ThreadID") } } } - Node::Reduce { control: _, init, reduct: _ } => { - let init_val = self.get_value(*init, false, false); - write!(w, "{}{} = {};\n", tabs, declare_variable, init_val)?; - } + // Fork initializes the reduce and reduct updates the reduce + Node::Reduce { control: _, init: _, reduct: _ } => {} // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { @@ -889,16 +864,25 @@ impl GPUContext<'_> { } } if let Some(phis) = self.label_data_for_phi.get(&id) { + let val = self.get_value(id, false, false); for phi in phis { + let phi_val = self.get_value(*phi, false, false); write!( w, "{}{} = {};\n", tabs, - self.get_value(*phi, false, false), - self.get_value(id, false, false), + phi_val, + val, )?; } } + if let Some(reduces) = self.reduct_reduce_map.get(&id) { + let val = self.get_value(id, true, false); + for reduce in reduces { + let reduce_val = self.get_value(*reduce, false, false); + write!(w, "{}{} = {};\n", tabs, reduce_val, val)?; + } + } Ok(()) } @@ -937,7 +921,18 @@ impl GPUContext<'_> { Node::Fork { control: _, factors: _, - } => {} + } => { + // Emitting reduces before the fork allows the reduce to be + // used outside of the fork. 
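The effect on the emitted CUDA is roughly the following, with hypothetical names (reduce_12 for the reduce value, dc1 for the fork factor, data_13 for the per-iteration value) and a plain loop standing in for the label/goto control flow that is actually generated:

    const unsigned int dc1 = 32;            // stand-in for the fork factor
    float data_13 = 1.0f;                   // stand-in for the per-iteration data value
    float reduce_12;                        // declared up front like every other data value
    reduce_12 = 0.0f;                       // init emitted when the Fork is visited
    for (unsigned int i = 0; i < dc1; ++i) {
        reduce_12 = reduce_12 + data_13;    // copy-back emitted after the reduct node
    }
    // reduce_12 is still usable after the join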
+ for &reduce in self.fork_reduce_map.get(&id).unwrap() { + let reduce_val = self.get_value(reduce, true, false); + let Node::Reduce { control: _, init, reduct: _ } = &self.function.nodes[reduce.idx()] else { + panic!("Expected reduce node"); + }; + let init_val = self.get_value(*init, true, false); + write!(w, "{}{} = {};\n", tabs, reduce_val, init_val)?; + } + } Node::Join { control: _ } => {} Node::Return { control: _, data } => { if self.types[self.typing[data.idx()].idx()].is_primitive() { -- GitLab From cb1dc56a117c25fd8cdf523006ef08fcc95723e1 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sat, 4 Jan 2025 18:16:57 -0600 Subject: [PATCH 034/109] beating around the bush --- hercules_cg/src/gpu.rs | 662 ++++++++++++++++++++++++++++------------- 1 file changed, 447 insertions(+), 215 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 122af64b..b129fcde 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -2,10 +2,7 @@ extern crate bitvec; extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; -use std::env::var; use std::fmt::{Error, Write}; -use std::iter::FromIterator; -use std::os::unix::thread; use self::hercules_ir::*; @@ -22,8 +19,7 @@ pub fn gpu_codegen<W: Write>( reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, - antideps: &Vec<(NodeID, NodeID)>, - bbs: &Vec<NodeID>, + bbs: &BasicBlocks, collection_objects: &FunctionCollectionObjects, w: &mut W, ) -> Result<(), Error> { @@ -38,8 +34,9 @@ pub fn gpu_codegen<W: Write>( * - max_num_blocks is within constraint of 1D grid size. This can be * relaxed if we want to support larger grids. * - product types are packed with padding inserted for each element to - * be aligned for its type and for full product to be aligned to its + * be aligned for its type and for full product to be aligned to its * largest element + * - similarly, summation types must be aligned to their largest element */ let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) @@ -77,7 +74,8 @@ pub fn gpu_codegen<W: Write>( } } if !function.schedules[reduce_node.idx()].contains(&Schedule::ParallelReduce) - && !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) { + && !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) + { reduct_reduce_map .entry(*reduct) .or_default() @@ -141,7 +139,6 @@ pub fn gpu_codegen<W: Write>( max_num_blocks: 1024, max_num_threads: 1024, threads_per_warp: 32, - greedy_associative_thresh: 32, }; let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { @@ -168,7 +165,6 @@ pub fn gpu_codegen<W: Write>( reverse_postorder, typing, control_subgraph, - antideps, bbs, kernel_params, fork_reduce_map, @@ -179,13 +175,11 @@ pub fn gpu_codegen<W: Write>( ctx.codegen_function(w) } -// Kernel parameters that are fixed prior to codegen. See description of -// greedy_associative_thresh in codegen_function. +// Kernel parameters that are fixed prior to codegen. 
struct GPUKernelParams { max_num_blocks: usize, max_num_threads: usize, threads_per_warp: usize, - greedy_associative_thresh: usize, } struct GPUContext<'a> { @@ -196,8 +190,7 @@ struct GPUContext<'a> { reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, - antideps: &'a Vec<(NodeID, NodeID)>, - bbs: &'a Vec<NodeID>, + bbs: &'a BasicBlocks, kernel_params: &'a GPUKernelParams, fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, @@ -212,6 +205,13 @@ struct CudaGoto { term: String, } +#[derive(Clone, Copy, PartialEq, Debug)] +enum KernelState { + OutBlockFork, + InBlockFork, + InThreadFork, +} + impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { // All possible includes followed by macros for intrinsic calls on @@ -241,14 +241,14 @@ impl GPUContext<'_> { self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; - // self.codegen_struct_def(&mut top)?; - self.codegen_phi_declarations(&mut top)?; + self.codegen_declare_all(&mut top)?; self.codegen_helpers(&mut top)?; let (fork_tree, fork_control_map) = self.make_fork_structures(); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); - let (cumul_factors, num_threads) = self.get_cumulative_factors(&fork_tree, &root_forks); + let thread_root_forks = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); + let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, &thread_root_forks); let start = NodeID::new(0); let ret = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_return()) @@ -256,6 +256,7 @@ impl GPUContext<'_> { .next() .unwrap(); let (begin_control, end_control) = self.get_begin_end_control(start, ret); + // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) @@ -267,12 +268,17 @@ impl GPUContext<'_> { .collect(); self.codegen_data_control( - &root_forks, + if num_blocks > 1 { + Some(root_forks[0]) + } else { + None + }, + &thread_root_forks, &fork_tree, &fork_control_map, &begin_control, &end_control, - &cumul_factors, + &fork_thread_quota_map, num_threads, num_blocks, &mut gotos, @@ -370,12 +376,10 @@ impl GPUContext<'_> { Ok(()) } - // We declare all phi values upfront - fn codegen_phi_declarations(&self, w: &mut String) -> Result<(), Error> { + // To abide by c++ reassignment restrictions, we declare all values upfront. + fn codegen_declare_all(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if let Node::Phi {..} = &self.function.nodes[id.idx()] { - write!(w, "\t{};\n", self.get_value(id, true, false))?; - } + write!(w, "\t{};\n", self.get_value(id, true, false))?; } Ok(()) } @@ -509,132 +513,270 @@ impl GPUContext<'_> { } /* - * Once inside the block-level forks, we initiate a cumul_factor at 1. If - * encountering a child fork with known size s < max_num_threads / cumul_factor, - * with all reduces being parallel or associative, then we parallelize along - * s, else we serialize. Then step into child and update cumul_factor if needed. - * One exception is if fork factor is a multiple of greedy_associative_thresh - * and at least one reduce is associative, in which case we use warp reduction - * and disable cumul_factor change for its subtree. 
At end, we've mapped - * each fork to its cumulative factor, and if not present fork uses it's parent's - * factor. + * This analysis determines the parallelization strategy within threadblocks. + * We run post-order traversal on the fork tree to get the thread quota per + * subtree. In particular, each fork starts with a base factor as the + * maximum over its descendants (leafs have base 1). We traverse up (details + * in helper) and pass the factor and a map from fork node to + * (max quota of its siblings (including itself), its quota, its fork factor) + * - all three are needed for codegen. A node is in the map IFF it will be parallelized. + * If not, the fork will use the parent's quota. Nodes may be removed from the + * map when traversing up the tree due to either of the max scenarios. */ - fn get_cumulative_factors( + fn get_thread_quotas( &self, fork_tree: &HashMap<NodeID, Vec<NodeID>>, root_forks: &Vec<NodeID>, - ) -> (HashMap<NodeID, usize>, usize) { - let mut cumul_factors = HashMap::new(); - for root_fork in root_forks { - cumul_factors.insert(*root_fork, 1); - self.recurse_cumul_factors(*root_fork, fork_tree, 1, &mut cumul_factors); - } - let num_threads = *cumul_factors.values().max().unwrap(); - (cumul_factors, num_threads) + ) -> (HashMap<NodeID, (usize, usize, usize)>, usize) { + // We clone to add dummy root-of-roots fork + let mut fork_tree = fork_tree.clone(); + fork_tree.insert(root_forks[0], root_forks.clone()); + let (tree_map, tree_quota, _) = self.recurse_thread_quotas(root_forks[0], &fork_tree, true); + (tree_map, tree_quota) } - fn recurse_cumul_factors( + // Helper function for post-order traversal of fork tree + fn recurse_thread_quotas( &self, curr_fork: NodeID, fork_tree: &HashMap<NodeID, Vec<NodeID>>, - cumul_factor: usize, - cumul_factors: &mut HashMap<NodeID, usize>, - ) { + is_root: bool, + ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) { + // Subsubtree map is the union of all keys for grandchildren and lower + // nodes, and subtree_quota is constructed map from children to their + // quota + let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree + .get(&curr_fork) + .unwrap() + .iter() + .map(|child| (child, self.recurse_thread_quotas(*child, fork_tree, false))) + .fold( + (HashMap::new(), HashMap::new(), 0), + |(mut subsubtree_map, mut children_quota_map, subtree_quota), (child, (curr_map, curr_quota, use_curr))| { + subsubtree_map.extend(curr_map); + if use_curr { + children_quota_map.insert(child, curr_quota); + } + (subsubtree_map, children_quota_map, subtree_quota.max(curr_quota)) + }, + ); + // First update children_quota_map items with full information and add + // to subsubtree_map + for (&child, quota) in children_quota_map.iter() { + let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else { + panic!("Expected fork node"); + }; + let fork_size = self.multiply_fork_factors(factors).unwrap_or(0); + subsubtree_map.insert(*child, (subtree_quota, *quota, fork_size)); + } + let subtree_map = subsubtree_map; + if is_root { + return (subtree_map, subtree_quota, true) + } + /* + * A node can only be considered for parallelization if: + * a) it has statically known size + * b) the known size is less than or equal to the max_num_threads + * c) the known size is a power of 2 + * d) all reduces are parallel-reduce or associative + * + * Note: in what follows, there are a few cases where we choose between + * parallelizing the fork vs its subtree, by taking max factor over subtree. 
+ * However, parts of the subtree may have had smaller quotas and didn't + * need to be discarded. For now we avoid this complexity and discard full. + */ let reduces = &self.fork_reduce_map[&curr_fork]; - if reduces.iter().all(|&reduce| { - self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) - || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) - }) && let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] + if let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] && let Some(fork_size) = self.multiply_fork_factors(factors) - && fork_size <= self.kernel_params.max_num_threads / cumul_factor + && fork_size <= self.kernel_params.max_num_threads + && fork_size.is_power_of_two() + && reduces.iter().all(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) + || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) + }) { - if fork_size % self.kernel_params.greedy_associative_thresh == 0 - && reduces.iter().any(|&reduce| { + /* + * If there's an associative reduce, + * if fork and subtree fit in warp, parallelize both + * else if fork is a multiple of warp size, parallelize the max between them + * else parallelize subtree + * Else, parallelize both + */ + if fork_size <= self.kernel_params.max_num_threads / subtree_quota { + if reduces.iter().any(|&reduce| { self.function.schedules[reduce.idx()].contains(&Schedule::Associative) }) { - cumul_factors.insert(curr_fork, cumul_factor * fork_size); - } else { - let mut max_factor = cumul_factor * fork_size; - for child in fork_tree[&curr_fork].iter() { - self.recurse_cumul_factors(*child, fork_tree, cumul_factor * fork_size, cumul_factors); - max_factor = max_factor.max(cumul_factors[child]); + if self.kernel_params.threads_per_warp % (fork_size * subtree_quota) == 0 { + (subtree_map, fork_size * subtree_quota, true) + } else if fork_size % self.kernel_params.threads_per_warp == 0 { + if fork_size >= subtree_quota { + (HashMap::new(), fork_size, true) + } else { + (subtree_map, subtree_quota, false) + } + } else { + (subtree_map, subtree_quota, false) + } + } else { + (subtree_map, fork_size * subtree_quota, true) } - cumul_factors.insert(curr_fork, max_factor); } - } else { - let mut max_factor = cumul_factor; - for child in fork_tree[&curr_fork].iter() { - self.recurse_cumul_factors(*child, fork_tree, cumul_factor, cumul_factors); - max_factor = max_factor.max(cumul_factors[child]); + // We have to choose either the fork or its subtree + else if fork_size >= subtree_quota { + (HashMap::new(), fork_size, true) + } else { + (subtree_map, subtree_quota, false) } - cumul_factors.insert(curr_fork, max_factor); + } else { + (subtree_map, subtree_quota, false) } } - fn codegen_data_control( + fn get_thread_root_forks( &self, root_forks: &Vec<NodeID>, fork_tree: &HashMap<NodeID, Vec<NodeID>>, + num_blocks: usize, + ) -> Vec<NodeID> { + if num_blocks > 1 { + root_forks.clone() + } else { + fork_tree.get(&root_forks[0]).unwrap().to_vec() + } + } + + fn codegen_data_control( + &self, + block_fork: Option<NodeID>, + thread_root_forks: &Vec<NodeID>, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, fork_control_map: &HashMap<NodeID, Vec<NodeID>>, begin_control: &HashSet<NodeID>, end_control: &HashSet<NodeID>, - cumul_factors: &HashMap<NodeID, usize>, + fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, num_threads: usize, num_blocks: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { - let control_to_data = 
- (0..self.bbs.len()).fold(HashMap::<NodeID, Vec<NodeID>>::new(), |mut map, id| { - if let Some(control) = self.bbs.get(id) { - map.entry(*control).or_default().push(NodeID::new(id)); - }; - map - }); - // Define the following states: // 0 is above block fork, 1 is in block fork above any thread fork, 2 is // in any thread fork, 3 is below block fork // If num_blocks > 1, initialize state to 0, else 1 - let mut state = if num_blocks > 1 { 0 } else { 1 }; + let has_block_fork = block_fork.is_some(); + let mut state = if has_block_fork { + KernelState::OutBlockFork + } else { + KernelState::InBlockFork + }; // Then generate data and control for each control in begin_control for control in begin_control { - let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; - for data in control_to_data.get(control).unwrap() { - self.codegen_data_node(*data, state, num_threads, body, &mut 1)?; + let body = &mut gotos.get_mut(control).unwrap().body; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; } - let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; + let term = &mut gotos.get_mut(control).unwrap().term; self.codegen_control_node(*control, term, 1)?; } // Then if num_blocks > 1, set state to 1 and generate data and control // for the single root fork - if num_blocks > 1 { - state = 1; - for control in fork_control_map.get(&root_forks[0]).unwrap() { - for data in control_to_data.get(control).unwrap() { - let body = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().body; - self.codegen_data_node(*data, state, num_threads, body, &mut 1)?; + if has_block_fork { + state = KernelState::InBlockFork; + for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { + for data in self.bbs.1[control.idx()].iter() { + let body = &mut gotos.get_mut(control).unwrap().body; + self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; } - let term = &mut gotos.get_mut(&self.bbs[control.idx()]).unwrap().term; + let term = &mut gotos.get_mut(control).unwrap().term; self.codegen_control_node(*control, term, 1)?; } } // Set state to 2 and begin DFS through fork_tree (after root_fork if // visited in previous step), updating thread_quota + for &root_fork in thread_root_forks { + self.codegen_data_control_traverse( + root_fork, + fork_tree, + fork_control_map, + fork_thread_quota_map, + 1, + num_threads, + gotos, + )?; + } // If num_blocks > 1, set state to 3, else 1 + state = if num_blocks > 1 { + KernelState::OutBlockFork + } else { + KernelState::InBlockFork + }; + for control in end_control { + let body = &mut gotos.get_mut(control).unwrap().body; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; + } + let term = &mut gotos.get_mut(control).unwrap().term; + self.codegen_control_node(*control, term, 1)?; + } // Then generate data and control for each control in end_control Ok(()) } + fn codegen_data_control_traverse( + &self, + curr_fork: NodeID, + fork_tree: &HashMap<NodeID, Vec<NodeID>>, + fork_control_map: &HashMap<NodeID, Vec<NodeID>>, + fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, + parent_quota: usize, + num_threads: usize, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + let (available_thread_quota, use_thread_quota, fork_factor) = fork_thread_quota_map + .get(&curr_fork) + .map(|(a, u, f)| (*a, *u, Some(*f))) + 
.unwrap_or((parent_quota, parent_quota, None)); + for control in fork_control_map.get(&curr_fork).unwrap() { + let body = &mut gotos.get_mut(control).unwrap().body; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node( + *data, + KernelState::InThreadFork, + available_thread_quota, + use_thread_quota, + fork_factor, + body, + &mut 1, + )?; + } + let term = &mut gotos.get_mut(control).unwrap().term; + self.codegen_control_node(*control, term, 1)?; + } + for child in fork_tree.get(&curr_fork).unwrap() { + self.codegen_data_control_traverse( + *child, + fork_tree, + fork_control_map, + fork_thread_quota_map, + use_thread_quota, + num_threads, + gotos, + )?; + } + Ok(()) + } + fn codegen_data_node( &self, id: NodeID, - state: usize, - thread_quota: usize, + state: KernelState, + available_thread_quota: usize, + use_thread_quota: usize, + fork_factor: Option<usize>, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { - let declare_variable = self.get_value(id, true, false).to_string(); + let define_variable = self.get_value(id, false, false).to_string(); let tabs = "\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { // Phi registers were already emitted. @@ -642,34 +784,57 @@ impl GPUContext<'_> { control: _, data: _, } => {} - Node::ThreadID { - control, - dimension, - } => { + Node::ThreadID { control, dimension } => { let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { panic!("Expected ThreadID's control to be a fork node"); }; match state { - 1 => { - // Violating DRY with the naming but unsure how to map + KernelState::InBlockFork => { + // Violating DRY with the naming but unsure how to map // DynamicConstantID to NodeID to use `get_value` - let divide = factors.iter().skip(dimension + 1).map(|f| format!("dc{}", f.idx())).collect::<Vec<_>>().join(" * "); + let divide = { + let divide = factors + .iter() + .skip(dimension + 1) + .map(|f| format!("dc{}", f.idx())) + .collect::<Vec<_>>() + .join(" * "); + if divide.is_empty() { + "1".to_string() + } else { + divide + } + }; let modulo = format!("dc{}", factors[*dimension].idx()); - write!(w, "{}{} = (blockIdx.x / ({})) % {};\n", tabs, declare_variable, divide, modulo)?; + write!( + w, + "{}{} = (blockIdx.x / ({})) % {};\n", + tabs, define_variable, divide, modulo + )?; + } + KernelState::InThreadFork => { + todo!() + } + _ => { + panic!("Unsupported state for ThreadID") } - 2 => {} - _ => { panic!("Unsupported state for ThreadID") } } } // Fork initializes the reduce and reduct updates the reduce - Node::Reduce { control: _, init: _, reduct: _ } => {} + Node::Reduce { + control: _, + init: _, + reduct: _, + } => {} // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { - write!(w, "{}{};\n", tabs, declare_variable)?; - let define_variable = self.get_value(id, false, false); self.codegen_constant( - if self.types[self.typing[id.idx()].idx()].is_primitive() { define_variable } else { format!("*{}", define_variable)}, + if self.types[self.typing[id.idx()].idx()].is_primitive() { + define_variable + } else { + format!("*{}", define_variable) + }, *cons_id, true, w, @@ -685,7 +850,7 @@ impl GPUContext<'_> { w, "{}{} = !{};\n", tabs, - declare_variable, + define_variable, self.get_value(*input, false, false), )?; } @@ -694,7 +859,7 @@ impl GPUContext<'_> { w, "{}{} = ~{};\n", tabs, - declare_variable, + define_variable, self.get_value(*input, false, false), )?; } @@ -706,7 +871,7 @@ impl GPUContext<'_> { w, "{}{} = -{};\n", tabs, - 
declare_variable, + define_variable, self.get_value(*input, false, false), )?; } @@ -719,7 +884,7 @@ impl GPUContext<'_> { w, "{}{} = static_cast<{}>({});\n", tabs, - declare_variable, + define_variable, self.get_type(*dst_ty_id, false), self.get_value(*input, false, false), )?; @@ -732,29 +897,29 @@ impl GPUContext<'_> { (BinaryOperator::Rem, Type::Float32) => write!( w, "{}{} = fmodf({}, {});\n", - tabs, declare_variable, left_val, right_val, + tabs, define_variable, left_val, right_val, )?, (BinaryOperator::Rem, Type::Float64) => write!( w, "{}{} = fmod({}, {});\n", - tabs, declare_variable, left_val, right_val, + tabs, define_variable, left_val, right_val, )?, // Doesn't need special syntax but bool type (BinaryOperator::Or, Type::Boolean) => write!( w, "{}{} = {} || {};\n", - tabs, declare_variable, left_val, right_val, + tabs, define_variable, left_val, right_val, )?, (BinaryOperator::And, Type::Boolean) => write!( w, "{}{} = {} && {};\n", - tabs, declare_variable, left_val, right_val, + tabs, define_variable, left_val, right_val, )?, (op, _) => write!( w, "{}{} = {} {} {};\n", tabs, - declare_variable, + define_variable, left_val, match op { BinaryOperator::Add => "+", @@ -789,7 +954,7 @@ impl GPUContext<'_> { w, "{}{} = {} ? {} : {};", tabs, - declare_variable, + define_variable, self.get_value(*first, false, false), self.get_value(*second, false, false), self.get_value(*third, false, false), @@ -803,16 +968,14 @@ impl GPUContext<'_> { w, "{}{} = {}({});\n", tabs, - declare_variable, + define_variable, func_name, self.get_value(args[0], false, false), )?; } - // Main difference between read and write is codegen_copy takes the + // Main difference between read and write is codegen_copy takes the // returned node's type for read and data node's type for write Node::Read { collect, indices } => { - write!(w, "{}{};\n", tabs, declare_variable); - let define_variable = self.get_value(id, false, false); let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); let type_id = self.typing[id.idx()]; @@ -822,7 +985,7 @@ impl GPUContext<'_> { &define_variable, &collect_with_indices, if !self.types[type_id.idx()].is_primitive() { - Some(thread_quota) + Some(use_thread_quota) } else { None }, @@ -836,7 +999,6 @@ impl GPUContext<'_> { data, indices, } => { - write!(w, "{}{};\n", tabs, declare_variable); let data_variable = self.get_value(*data, false, false); let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); @@ -847,15 +1009,14 @@ impl GPUContext<'_> { &data_variable, &collect_with_indices, if !self.types[type_id.idx()].is_primitive() { - Some(thread_quota) + Some(use_thread_quota) } else { None }, - state == 0, + state == KernelState::OutBlockFork, w, *num_tabs, )?; - let define_variable = self.get_value(id, false, false); let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } @@ -867,13 +1028,7 @@ impl GPUContext<'_> { let val = self.get_value(id, false, false); for phi in phis { let phi_val = self.get_value(*phi, false, false); - write!( - w, - "{}{} = {};\n", - tabs, - phi_val, - val, - )?; + write!(w, "{}{} = {};\n", tabs, phi_val, val,)?; } } if let Some(reduces) = self.reduct_reduce_map.get(&id) { @@ -922,11 +1077,16 @@ impl GPUContext<'_> { control: _, factors: _, } => { - // Emitting reduces before the fork allows the reduce to be + // Emitting reduces 
before the fork allows the reduce to be // used outside of the fork. for &reduce in self.fork_reduce_map.get(&id).unwrap() { let reduce_val = self.get_value(reduce, true, false); - let Node::Reduce { control: _, init, reduct: _ } = &self.function.nodes[reduce.idx()] else { + let Node::Reduce { + control: _, + init, + reduct: _, + } = &self.function.nodes[reduce.idx()] + else { panic!("Expected reduce node"); }; let init_val = self.get_value(*init, true, false); @@ -952,8 +1112,8 @@ impl GPUContext<'_> { Ok(()) } - // Handles copying data to/from global and shared memory. Thread parallelization - // is used only for arrays (possibly inside another collection). is_char indicates + // Handles copying data to/from global and shared memory. Thread parallelization + // is used only for arrays (possibly inside another collection). is_char indicates // a char type and we need to including element size in indexing. fn codegen_copy( &self, @@ -981,7 +1141,7 @@ impl GPUContext<'_> { s } }; - // Either we parallelize over threads or gate the loop by threadIdx.x + // Either we parallelize over threads or gate the loop by threadIdx.x // == 0 let mut extra_tab = ""; let mut extra_tab2 = ""; @@ -990,10 +1150,15 @@ impl GPUContext<'_> { extra_tab = "\t"; } let has_thread_quota = thread_quota.is_some(); - write!(w, "{}", if has_thread_quota { + write!( + w, + "{}", + if has_thread_quota { format!( "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", - tabs, rem_array_size, thread_quota.unwrap() + tabs, + rem_array_size, + thread_quota.unwrap() ) } else { format!( @@ -1004,21 +1169,37 @@ impl GPUContext<'_> { ); let element_type_name = self.get_type(*element_type_id, true); let (new_collect, new_data) = if self.is_char(type_id) { - (format!( - "{} + i * {}", - collect, - self.get_size(*element_type_id, None) - ), - format!( - "{} + i * {}", - data, - self.get_size(*element_type_id, None) - )) + ( + format!( + "{} + i * {}", + collect, + self.get_size(*element_type_id, None) + ), + format!("{} + i * {}", data, self.get_size(*element_type_id, None)), + ) } else { (format!("{} + i", collect), format!("{} + i", data)) }; - let new_collect = format!("{}reinterpret_cast<{}>({})", if self.types[element_type_id.idx()].is_primitive() { "*" } else { "" }, element_type_name, new_collect); - let new_data = format!("{}reinterpret_cast<{}>({})", if self.types[element_type_id.idx()].is_primitive() { "*" } else { "" }, element_type_name, new_data); + let new_collect = format!( + "{}reinterpret_cast<{}>({})", + if self.types[element_type_id.idx()].is_primitive() { + "*" + } else { + "" + }, + element_type_name, + new_collect + ); + let new_data = format!( + "{}reinterpret_cast<{}>({})", + if self.types[element_type_id.idx()].is_primitive() { + "*" + } else { + "" + }, + element_type_name, + new_data + ); self.codegen_copy( is_write, *element_type_id, @@ -1027,7 +1208,9 @@ impl GPUContext<'_> { None, false, w, - num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 1 } else { 2 }, + num_tabs + + if block_restrict { 1 } else { 0 } + + if has_thread_quota { 1 } else { 2 }, )?; if !has_thread_quota { write!(w, "{}\t}}\n", tabs)?; @@ -1052,8 +1235,28 @@ impl GPUContext<'_> { for (i, field) in fields.iter().enumerate() { let offset = self.get_size(type_id, Some(i)); let field_type_name = self.get_type(*field, true); - let new_collect = format!("{}reinterpret_cast<{}>({} + {})", if self.types[field.idx()].is_primitive() { "*" } else { "" }, field_type_name, collect, offset); - let new_data = 
format!("{}reinterpret_cast<{}>({} + {})", if self.types[field.idx()].is_primitive() { "*" } else { "" }, field_type_name, data, offset); + let new_collect = format!( + "{}reinterpret_cast<{}>({} + {})", + if self.types[field.idx()].is_primitive() { + "*" + } else { + "" + }, + field_type_name, + collect, + offset + ); + let new_data = format!( + "{}reinterpret_cast<{}>({} + {})", + if self.types[field.idx()].is_primitive() { + "*" + } else { + "" + }, + field_type_name, + data, + offset + ); self.codegen_copy( is_write, *field, @@ -1062,7 +1265,9 @@ impl GPUContext<'_> { thread_quota, false, w, - num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 0 } else { 1 }, + num_tabs + + if block_restrict { 1 } else { 0 } + + if has_thread_quota { 0 } else { 1 }, )?; } if !has_thread_quota { @@ -1088,14 +1293,40 @@ impl GPUContext<'_> { // We can guarantee correctness for summation by just copying the // largest variant. let max_variant_size = self.get_size(type_id, None); - write!(w, "{}{}{}max_variant_size = {};\n", tabs, extra_tab, extra_tab2, max_variant_size)?; + write!( + w, + "{}{}{}max_variant_size = {};\n", + tabs, extra_tab, extra_tab2, max_variant_size + )?; for (i, variant) in variants.iter().enumerate() { let prefix = if i == 0 { "if" } else { "else if" }; let variant_size = self.get_size(*variant, None); - write!(w, "{}{}{}{} (max_variant_size == {}) {{\n", tabs, extra_tab, extra_tab2, prefix, variant_size)?; + write!( + w, + "{}{}{}{} (max_variant_size == {}) {{\n", + tabs, extra_tab, extra_tab2, prefix, variant_size + )?; let field_type_name = self.get_type(*variant, true); - let new_collect = format!("{}reinterpret_cast<{}>({})", if self.types[variant.idx()].is_primitive() { "*" } else { "" }, field_type_name, collect); - let new_data = format!("{}reinterpret_cast<{}>({})", if self.types[variant.idx()].is_primitive() { "*" } else { "" }, field_type_name, data); + let new_collect = format!( + "{}reinterpret_cast<{}>({})", + if self.types[variant.idx()].is_primitive() { + "*" + } else { + "" + }, + field_type_name, + collect + ); + let new_data = format!( + "{}reinterpret_cast<{}>({})", + if self.types[variant.idx()].is_primitive() { + "*" + } else { + "" + }, + field_type_name, + data + ); self.codegen_copy( is_write, *variant, @@ -1104,7 +1335,9 @@ impl GPUContext<'_> { thread_quota, false, w, - num_tabs + if block_restrict { 1 } else { 0 } + if has_thread_quota { 0 } else { 1 }, + num_tabs + + if block_restrict { 1 } else { 0 } + + if has_thread_quota { 0 } else { 1 }, )?; write!(w, "{}{}{}}}\n", tabs, extra_tab, extra_tab2)?; } @@ -1225,7 +1458,7 @@ impl GPUContext<'_> { Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?, Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, val)?, Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, - // All three followign collections involve align then allocate from the + // All three followign collections involve align then allocate from the // single dynamic shared memory buffer by using and updating the offset. 
Constant::Product(type_id, constant_fields) => { if allow_allocate { @@ -1236,23 +1469,24 @@ impl GPUContext<'_> { "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; {}{} = dynamic_shared + dynamic_shared_offset;\n {}dynamic_shared_offset += {};\n", - tabs, - alignment, - alignment, - alignment, - tabs, - name, - tabs, - size, + tabs, alignment, alignment, alignment, tabs, name, tabs, size, )?; } - let Type::Product(type_fields) = &self.types[type_id.idx()] else { panic!("Product constant should have product type") }; + let Type::Product(type_fields) = &self.types[type_id.idx()] else { + panic!("Product constant should have product type") + }; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call let field_constant = &self.constants[constant_fields[i].idx()]; let field_type = self.get_type(type_fields[i], true); let offset = self.get_size(type_fields[i], Some(i)); - self.codegen_constant(format!("*reinterpret_cast<{}>({}{})", field_type, name, offset), constant_fields[i], false, w, num_tabs); + self.codegen_constant( + format!("*reinterpret_cast<{}>({}{})", field_type, name, offset), + constant_fields[i], + false, + w, + num_tabs, + ); } } Constant::Summation(type_id, variant, field) => { @@ -1264,22 +1498,24 @@ impl GPUContext<'_> { "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; {}{} = dynamic_shared + dynamic_shared_offset;\n {}dynamic_shared_offset += {};\n", - tabs, - alignment, - alignment, - alignment, - tabs, - name, - tabs, - size, + tabs, alignment, alignment, alignment, tabs, name, tabs, size, )?; } // No offset updating needed since all variants start at 0 - let Type::Summation(variants) = &self.types[type_id.idx()] else { panic!("Summation constant should have summation type") }; - let variant_type = self.get_type(self.typing[variants[*variant as usize].idx()], true); + let Type::Summation(variants) = &self.types[type_id.idx()] else { + panic!("Summation constant should have summation type") + }; + let variant_type = + self.get_type(self.typing[variants[*variant as usize].idx()], true); let variant_constant = &self.constants[field.idx()]; if variant_constant.is_scalar() { - self.codegen_constant(format!("*reinterpret_cast<{}>{}", variant_type, name) , cons_id, false, w, num_tabs); + self.codegen_constant( + format!("*reinterpret_cast<{}>{}", variant_type, name), + cons_id, + false, + w, + num_tabs, + ); } else if !variant_constant.is_array() { self.codegen_constant(name, cons_id, false, w, num_tabs); }; @@ -1296,15 +1532,7 @@ impl GPUContext<'_> { ";\n{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; {}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n {}dynamic_shared_offset += {};\n", - tabs, - alignment, - alignment, - alignment, - tabs, - name, - element_type, - tabs, - size + tabs, alignment, alignment, alignment, tabs, name, element_type, tabs, size )?; } } @@ -1312,7 +1540,7 @@ impl GPUContext<'_> { } // Emit code to calculate data size. For Product types, setting `field_number` - // gives data size up to but not including that field, so = 2 gives 1st field + // gives data size up to but not including that field, so = 2 gives 1st field // and offset to 2nd field. This is useful for generating constant initialization // and read/write index math. 
fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { @@ -1336,11 +1564,18 @@ impl GPUContext<'_> { if acc == "0" { size } else { - format!("({} + {} - 1) / {}) * {} + {}", acc, align, align, align, size) + format!( + "({} + {} - 1) / {}) * {} + {}", + acc, align, align, align, size + ) } }); if num_fields < &fields.len() { - format!("{} - {}", with_field, self.get_size(fields[*num_fields], None)) + format!( + "{} - {}", + with_field, + self.get_size(fields[*num_fields], None) + ) } else { with_field } @@ -1349,37 +1584,38 @@ impl GPUContext<'_> { // The argmax variant by size is not guaranteed to be same as // argmax variant by alignment, eg product of 3 4-byte primitives // vs 1 8-byte primitive, so we need to calculate both. - let max_size = variants - .iter() - .map(|id| self.get_size(*id, None)) - .fold(String::from("0"), |acc, x| { + let max_size = variants.iter().map(|id| self.get_size(*id, None)).fold( + String::from("0"), + |acc, x| { if acc == "0" { x } else { format!("umax({}, {})", acc, x) } - }); + }, + ); let max_alignment = variants .iter() .map(|id| self.get_alignment(*id)) .max() .unwrap_or(0); - format!("({} + {} - 1) / {} * {}", max_size, max_alignment, max_alignment, max_alignment) + format!( + "({} + {} - 1) / {} * {}", + max_size, max_alignment, max_alignment, max_alignment + ) } - _ => format!("{}", self.get_alignment(type_id)) + _ => format!("{}", self.get_alignment(type_id)), } } fn get_alignment(&self, type_id: TypeID) -> usize { match &self.types[type_id.idx()] { Type::Array(element_type, _) => self.get_alignment(*element_type), - Type::Product(fields) | Type::Summation(fields) => { - fields - .iter() - .map(|field| self.get_alignment(*field)) - .max() - .unwrap_or(0) - } + Type::Product(fields) | Type::Summation(fields) => fields + .iter() + .map(|field| self.get_alignment(*field)) + .max() + .unwrap_or(0), Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, Type::Integer16 | Type::UnsignedInteger16 => 2, Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, @@ -1504,7 +1740,7 @@ impl GPUContext<'_> { } // Check if a type should be represented as char*. Must be a product, - // summation, or array of product/summation types. + // summation, or array of product/summation types. 
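Since these types lower to char*, the emitted kernel reaches their fields through byte offsets plus reinterpret_cast; a small sketch, where the 0 and 8 byte offsets assume a product of a 4-byte int followed by an 8-byte double, and prod is a pointer obtained from the shared-memory allocator shown earlier:

    char *prod = dynamic_shared + dynamic_shared_offset;    // product is just bytes
    int    a = *reinterpret_cast<int*>(prod);                // field 0 at byte offset 0
    double b = *reinterpret_cast<double*>(prod + 8);         // field 1 padded up to offset 8

The helper below identifies exactly the types that need this treatment.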
fn is_char(&self, type_id: TypeID) -> bool { match &self.types[type_id.idx()] { Type::Product(_) | Type::Summation(_) => true, @@ -1555,18 +1791,14 @@ impl GPUContext<'_> { } } - // Setting make_pointer = true will only affect primitive types- the + // Setting make_pointer = true will only affect primitive types- the // collections are already pointers fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { // Product and summation collections are char* for byte-addressability // since we can have variable type fields - Type::Product(_) | Type::Summation(_) => { - "char*".to_string() - } - Type::Array(element_type, _) => { - self.get_type(*element_type, true) - } + Type::Product(_) | Type::Summation(_) => "char*".to_string(), + Type::Array(element_type, _) => self.get_type(*element_type, true), _ => convert_type(&self.types[id.idx()], make_pointer), } } -- GitLab From 4c844cbcb8e14c9674650bfccb4cdeebea373bd8 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 13:28:33 -0600 Subject: [PATCH 035/109] cg will solve it promise --- hercules_cg/src/gpu.rs | 487 +++++++++++++++++++---------------------- 1 file changed, 229 insertions(+), 258 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index b129fcde..00b0051d 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -89,7 +89,7 @@ pub fn gpu_codegen<W: Write>( .get(&NodeID::new(idx)) .map_or(true, |reduces| reduces.is_empty()) { - panic!("Join node {} has no reduce nodes", idx); + panic!("Fork node {} has no reduce nodes", idx); } } @@ -157,6 +157,8 @@ pub fn gpu_codegen<W: Write>( }; let label_data_for_phi = &label_data_for_phi(); + let fork_join_map = &fork_join_map(function, control_subgraph); + let ctx = GPUContext { function, types, @@ -170,6 +172,7 @@ pub fn gpu_codegen<W: Write>( fork_reduce_map, reduct_reduce_map, label_data_for_phi, + fork_join_map, return_type_id, }; ctx.codegen_function(w) @@ -195,6 +198,7 @@ struct GPUContext<'a> { fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>, + fork_join_map: &'a HashMap<NodeID, NodeID>, return_type_id: &'a TypeID, } @@ -244,18 +248,17 @@ impl GPUContext<'_> { self.codegen_declare_all(&mut top)?; self.codegen_helpers(&mut top)?; - let (fork_tree, fork_control_map) = self.make_fork_structures(); + let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); - let thread_root_forks = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); - let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, &thread_root_forks); + let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); + let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); let start = NodeID::new(0); let ret = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_return()) .map(NodeID::new) .next() .unwrap(); - let (begin_control, end_control) = self.get_begin_end_control(start, ret); // We use CUDA's goto to jump between basic blocks. 
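Each control node becomes a labelled block named by get_block_name ("bb_" plus the node id): the block's body holds the data computations assigned to it, and its terminator jumps to a successor. A sketch of the intended shape, with made-up node ids and condition:

    bb_3: {
        x_7 = x_6 + 1;                                   // data nodes placed in this block
        if (cond_9) { goto bb_4; } else { goto bb_5; }   // terminator for an If node
    }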
let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) @@ -269,18 +272,15 @@ impl GPUContext<'_> { self.codegen_data_control( if num_blocks > 1 { - Some(root_forks[0]) + Some(thread_root_root_fork) } else { None }, &thread_root_forks, &fork_tree, &fork_control_map, - &begin_control, - &end_control, &fork_thread_quota_map, num_threads, - num_blocks, &mut gotos, )?; @@ -376,10 +376,19 @@ impl GPUContext<'_> { Ok(()) } - // To abide by c++ reassignment restrictions, we declare all values upfront. + // To abide by c++ reassignment restrictions, we declare all data values + // upfront. We also declare an iteration variable for each fork, which will + // be used for non-parallelized forks. Thus, some may go unused, but we don't + // know which points at time of this call- could move this function after that + // analysis but for now not. fn codegen_declare_all(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - write!(w, "\t{};\n", self.get_value(id, true, false))?; + if !self.function.nodes[id.idx()].is_control() { + write!(w, "\t{};\n", self.get_value(id, true, false))?; + } + if self.function.nodes[id.idx()].is_fork() { + write!(w, "\tunsigned int {} = 0;\n", self.get_fork_iter(id))?; + } } Ok(()) } @@ -394,87 +403,36 @@ impl GPUContext<'_> { Ok(()) } - /* Create two fork structures: - * First, fork_forward_adjacency is a map from each fork node F to all forks satisfying: + /* Create fork_tree, a map from each fork node F to all forks satisfying: * a) domination by F * b) no domination by F's join * c) no domination by any other fork that's also dominated by F, where we don't count self-domination + * Note that the fork_tree also includes the start node, to include all controls + * outside any fork. 
+ * * Second, fork_control_map is a map from fork node to all control nodes (including itself) satisfying: * a) domination by F * b) no domination by F's join * c) no domination by any other fork that's also dominated by F, where we do count self-domination */ - fn make_fork_structures(&self) -> (HashMap<NodeID, Vec<NodeID>>, HashMap<NodeID, Vec<NodeID>>) { - let mut fork_tree: HashMap<NodeID, Vec<NodeID>> = (0..self.function.nodes.len()) - .filter(|idx| self.function.nodes[*idx].is_fork()) - .map(|idx| (NodeID::new(idx), vec![])) - .collect(); - let mut fork_control_map = HashMap::new(); - let mut queued_nodes = VecDeque::new(); - - for (fork_node, fork_children) in fork_tree.iter_mut() { - let mut control_vec = vec![]; - queued_nodes.push_back(*fork_node); - while !queued_nodes.is_empty() { - let node = queued_nodes.pop_front().unwrap(); - control_vec.push(node); - for child in self.control_subgraph.succs(node) { - if self.function.nodes[child.idx()].is_fork() { - fork_children.push(child); - } else if self.function.nodes[child.idx()].is_join() { - control_vec.push(child); - } else { - queued_nodes.push_back(child); - } + fn make_fork_structures(&self, fork_join_map: &HashMap<NodeID, NodeID>) -> (HashMap<NodeID, HashSet<NodeID>>, HashMap<NodeID, HashSet<NodeID>>) { + let dom = dominator(self.control_subgraph, NodeID::new(0)); + let fork_nesting = compute_fork_join_nesting(self.function, &dom, fork_join_map); + fork_nesting.into_iter().fold( + (HashMap::new(), HashMap::new()), + |(mut fork_tree, mut fork_control_map), (control, forks)| { + let nested_fork = forks.first().copied().unwrap_or(NodeID::new(0)); + if self.function.nodes[control.idx()].is_fork() { + fork_tree.entry(nested_fork).or_insert_with(HashSet::new).insert(control); + } else { + fork_control_map.entry(nested_fork).or_insert_with(HashSet::new).insert(control); } - } - fork_control_map.insert(*fork_node, control_vec); - } - (fork_tree, fork_control_map) - } - - // Get control nodes succeeding the start and preceding all forks, and - // control nodes preceding the return and succeeding all joins - fn get_begin_end_control( - &self, - start: NodeID, - ret: NodeID, - ) -> (HashSet<NodeID>, HashSet<NodeID>) { - let mut begin_visited = HashSet::new(); - let mut begin_worklist = VecDeque::new(); - begin_worklist.push_back(start); - - while let Some(node) = begin_worklist.pop_front() { - if begin_visited.contains(&node) { - continue; - } - if self.function.nodes[node.idx()].is_fork() { - continue; - } - begin_visited.insert(node); - for pred in self.control_subgraph.preds(node) { - begin_worklist.push_back(pred); - } - } - - let mut end_visited = HashSet::new(); - let mut end_worklist = VecDeque::new(); - end_worklist.push_back(ret); - - while let Some(node) = end_worklist.pop_front() { - if end_visited.contains(&node) { - continue; - } - if self.function.nodes[node.idx()].is_join() { - continue; - } - end_visited.insert(node); - for succ in self.control_subgraph.preds(node) { - end_worklist.push_back(succ); - } - } - - (begin_visited, end_visited) + for i in 0..forks.len()-1 { + fork_tree.entry(forks[i+1]).or_insert_with(HashSet::new).insert(forks[i]); + } + (fork_tree, fork_control_map) + }, + ) } /* @@ -483,21 +441,15 @@ impl GPUContext<'_> { */ fn get_root_forks_and_num_blocks( &self, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, max_num_blocks: usize, - ) -> (Vec<NodeID>, usize) { - let mut root_forks: HashSet<NodeID> = fork_tree.keys().copied().collect(); - for (_, children) in 
fork_tree.iter() { - for child in children { - root_forks.remove(child); - } - } - let root_forks: Vec<NodeID> = root_forks.into_iter().collect(); + ) -> (HashSet<NodeID>, usize) { + let root_forks: HashSet<NodeID> = fork_tree.get(&NodeID::new(0)).unwrap().clone(); if root_forks.len() != 1 { return (root_forks, 1); } - let root_fork = root_forks[0]; + let root_fork = root_forks.iter().next().unwrap(); let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else { panic!("Expected fork node"); }; @@ -517,7 +469,7 @@ impl GPUContext<'_> { * We run post-order traversal on the fork tree to get the thread quota per * subtree. In particular, each fork starts with a base factor as the * maximum over its descendants (leafs have base 1). We traverse up (details - * in helper) and pass the factor and a map from fork node to + * in helper) and pass the factor and a map from fork node to a tuple of * (max quota of its siblings (including itself), its quota, its fork factor) * - all three are needed for codegen. A node is in the map IFF it will be parallelized. * If not, the fork will use the parent's quota. Nodes may be removed from the @@ -525,13 +477,10 @@ impl GPUContext<'_> { */ fn get_thread_quotas( &self, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, - root_forks: &Vec<NodeID>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + root_fork: NodeID, ) -> (HashMap<NodeID, (usize, usize, usize)>, usize) { - // We clone to add dummy root-of-roots fork - let mut fork_tree = fork_tree.clone(); - fork_tree.insert(root_forks[0], root_forks.clone()); - let (tree_map, tree_quota, _) = self.recurse_thread_quotas(root_forks[0], &fork_tree, true); + let (tree_map, tree_quota, _) = self.recurse_thread_quotas(root_fork, fork_tree, true); (tree_map, tree_quota) } @@ -539,7 +488,7 @@ impl GPUContext<'_> { fn recurse_thread_quotas( &self, curr_fork: NodeID, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, is_root: bool, ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) { // Subsubtree map is the union of all keys for grandchildren and lower @@ -566,7 +515,7 @@ impl GPUContext<'_> { let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else { panic!("Expected fork node"); }; - let fork_size = self.multiply_fork_factors(factors).unwrap_or(0); + let fork_size = self.multiply_fork_factors(factors).unwrap(); subsubtree_map.insert(*child, (subtree_quota, *quota, fork_size)); } let subtree_map = subsubtree_map; @@ -580,7 +529,7 @@ impl GPUContext<'_> { * c) the known size is a power of 2 * d) all reduces are parallel-reduce or associative * - * Note: in what follows, there are a few cases where we choose between + * Note: there are a few cases where we choose between * parallelizing the fork vs its subtree, by taking max factor over subtree. * However, parts of the subtree may have had smaller quotas and didn't * need to be discarded. For now we avoid this complexity and discard full. 
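
To make the thread-quota scheme concrete, here is a minimal hand-written sketch (not verbatim output of this compiler) of the CUDA control flow that the codegen_control_node and codegen_data_node changes later in this patch aim to produce for one parallelized fork wrapped around one serialized fork. Every name and constant in it (sketch, bb_4, bb_9, bb_12, v5, iter_v9, and the sizes 128, 64, 4, 8) is an illustrative assumption:

__global__ void sketch(float *ret) {
    // All data values and per-fork iteration counters are declared up front,
    // since goto-based control flow cannot re-declare them per block.
    unsigned int v5;            // ThreadID of the parallelized outer fork
    unsigned int iter_v9 = 0;   // iteration counter of the serialized inner fork
bb_4:                           // outer fork: factor 4, uses 64 of 128 available threads
    if (threadIdx.x % 128 < 64) {
        v5 = (threadIdx.x % 64) / 16;   // 64 threads / factor 4 = 16 threads per ID
        goto bb_9;
    } else {
        goto bb_12;             // surplus threads go straight to the join
    }
bb_9:                           // inner fork: serialized, runs a dynamic-constant (here 8) times
    // ... fork body ...
    iter_v9 += 1;
    if (iter_v9 == 8) { goto bb_12; } else { goto bb_9; }
bb_12:                          // join
    __syncthreads();
    if (threadIdx.x == 0) { *ret = 0.0f; }
    return;
}

Surplus threads in the outer fork jump directly to the join, while the serialized inner fork loops by incrementing its iteration variable and branching back to its own label; this is why the fork iteration counters are declared alongside the data values and why the Join handler emits the increment, compare, and back-edge goto.
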
@@ -634,68 +583,74 @@ impl GPUContext<'_> { fn get_thread_root_forks( &self, - root_forks: &Vec<NodeID>, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, + root_forks: &HashSet<NodeID>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, num_blocks: usize, - ) -> Vec<NodeID> { + ) -> (NodeID, HashSet<NodeID>) { if num_blocks > 1 { - root_forks.clone() + (NodeID::new(0), root_forks.clone()) } else { - fork_tree.get(&root_forks[0]).unwrap().to_vec() + let root_fork = root_forks.iter().next().unwrap(); + (*root_fork, fork_tree.get(&root_fork).unwrap().iter().copied().collect()) } } fn codegen_data_control( &self, block_fork: Option<NodeID>, - thread_root_forks: &Vec<NodeID>, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, - fork_control_map: &HashMap<NodeID, Vec<NodeID>>, - begin_control: &HashSet<NodeID>, - end_control: &HashSet<NodeID>, + thread_root_forks: &HashSet<NodeID>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, num_threads: usize, - num_blocks: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { // Define the following states: // 0 is above block fork, 1 is in block fork above any thread fork, 2 is // in any thread fork, 3 is below block fork - // If num_blocks > 1, initialize state to 0, else 1 + // First emit data and control gen for each control node outside any fork. + // Recall that this was tracked through a fake fork node with NodeID 0. + // If num_blocks > 1, initialize state to 0, else 1. This is because + // if there is no block fork, then everything is in a single block, which + // is semantically the same as being directly nested in the block fork. let has_block_fork = block_fork.is_some(); let mut state = if has_block_fork { KernelState::OutBlockFork } else { KernelState::InBlockFork }; - // Then generate data and control for each control in begin_control - for control in begin_control { - let body = &mut gotos.get_mut(control).unwrap().body; + for control in fork_control_map.get(&NodeID::new(0)).unwrap() { + let goto = gotos.get_mut(control).unwrap(); + let init = &mut goto.init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(*control, None, None, false, init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; + self.codegen_data_node(*data, state, num_threads, None, None, body, &mut tabs)?; } - let term = &mut gotos.get_mut(control).unwrap().term; - self.codegen_control_node(*control, term, 1)?; } - // Then if num_blocks > 1, set state to 1 and generate data and control - // for the single root fork + // Then generate data and control for the single block fork if it exists if has_block_fork { state = KernelState::InBlockFork; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { + let goto = gotos.get_mut(control).unwrap(); + let init = &mut goto.init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(*control, None, None, false, init, term)?; for data in self.bbs.1[control.idx()].iter() { - let body = &mut gotos.get_mut(control).unwrap().body; - self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; + self.codegen_data_node(*data, state, num_threads, None, Some(block_fork.unwrap()), body, &mut tabs)?; } - let term = &mut gotos.get_mut(control).unwrap().term; - self.codegen_control_node(*control, term, 
1)?; } } - // Set state to 2 and begin DFS through fork_tree (after root_fork if - // visited in previous step), updating thread_quota + // Then generate for the thread fork tree by setting state to 2, traverse, + // and update the thread quota. Any traversal is fine, we choose pre-order. + state = KernelState::InThreadFork; for &root_fork in thread_root_forks { self.codegen_data_control_traverse( root_fork, + state, fork_tree, fork_control_map, fork_thread_quota_map, @@ -704,57 +659,46 @@ impl GPUContext<'_> { gotos, )?; } - // If num_blocks > 1, set state to 3, else 1 - state = if num_blocks > 1 { - KernelState::OutBlockFork - } else { - KernelState::InBlockFork - }; - for control in end_control { - let body = &mut gotos.get_mut(control).unwrap().body; - for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, num_threads, num_threads, None, body, &mut 1)?; - } - let term = &mut gotos.get_mut(control).unwrap().term; - self.codegen_control_node(*control, term, 1)?; - } - // Then generate data and control for each control in end_control Ok(()) } fn codegen_data_control_traverse( &self, curr_fork: NodeID, - fork_tree: &HashMap<NodeID, Vec<NodeID>>, - fork_control_map: &HashMap<NodeID, Vec<NodeID>>, + state: KernelState, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, parent_quota: usize, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { - let (available_thread_quota, use_thread_quota, fork_factor) = fork_thread_quota_map + let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map .get(&curr_fork) .map(|(a, u, f)| (*a, *u, Some(*f))) .unwrap_or((parent_quota, parent_quota, None)); for control in fork_control_map.get(&curr_fork).unwrap() { - let body = &mut gotos.get_mut(control).unwrap().body; + let goto = gotos.get_mut(control).unwrap(); + let init = &mut goto.init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor.is_some(), init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, - KernelState::InThreadFork, - available_thread_quota, + state, use_thread_quota, - fork_factor, + parallel_factor, + Some(curr_fork), body, - &mut 1, + &mut tabs, )?; } - let term = &mut gotos.get_mut(control).unwrap().term; - self.codegen_control_node(*control, term, 1)?; } for child in fork_tree.get(&curr_fork).unwrap() { self.codegen_data_control_traverse( *child, + state, fork_tree, fork_control_map, fork_thread_quota_map, @@ -766,20 +710,28 @@ impl GPUContext<'_> { Ok(()) } + // state dictates where we are in the kernel, and affects ThreadID and Write + // use_thread_quota is the number of threads used by the node, and affects + // ThreadID, Read, Write, and associative Binops + // parallel_factor is parallelization degree, and affects ThreadID and associative + // Binops + // nesting_fork is the fork node that the node is nested in, and affects ThreadID + // and Reduce fn codegen_data_node( &self, id: NodeID, state: KernelState, - available_thread_quota: usize, use_thread_quota: usize, - fork_factor: Option<usize>, + parallel_factor: Option<usize>, + nesting_fork: Option<NodeID>, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { let define_variable = self.get_value(id, false, false).to_string(); let tabs = 
"\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { - // Phi registers were already emitted. + // Phi registers were already emitted and the data nodes it uses will + // update the phi Node::Phi { control: _, data: _, @@ -788,24 +740,10 @@ impl GPUContext<'_> { let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { panic!("Expected ThreadID's control to be a fork node"); }; + let divide = multiply_dcs(&factors[dimension + 1..]); + let modulo = format!("dc{}", factors[*dimension].idx()); match state { KernelState::InBlockFork => { - // Violating DRY with the naming but unsure how to map - // DynamicConstantID to NodeID to use `get_value` - let divide = { - let divide = factors - .iter() - .skip(dimension + 1) - .map(|f| format!("dc{}", f.idx())) - .collect::<Vec<_>>() - .join(" * "); - if divide.is_empty() { - "1".to_string() - } else { - divide - } - }; - let modulo = format!("dc{}", factors[*dimension].idx()); write!( w, "{}{} = (blockIdx.x / ({})) % {};\n", @@ -813,19 +751,40 @@ impl GPUContext<'_> { )?; } KernelState::InThreadFork => { - todo!() + if parallel_factor.is_none() { + let fork_iter = self.get_fork_iter(*control); + write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; + } else { + + + } } _ => { panic!("Unsupported state for ThreadID") } } } - // Fork initializes the reduce and reduct updates the reduce + // Only initialize the reduce, as reduct will update the reduce. If + // serialized, add gate to prevent re-assignment when we hit this reduce + // again Node::Reduce { control: _, - init: _, + init, reduct: _, - } => {} + } => { + let init_val = self.get_value(*init, false, false); + if parallel_factor.is_none() { + let Some(nesting_fork) = nesting_fork else { + panic!("Expected reduce to be nested in a fork node"); + }; + let fork_iter = self.get_fork_iter(nesting_fork); + write!(w, "{}if ({} == 0) {{\n", tabs, fork_iter)?; + write!(w, "{}\t{} = {};\n", tabs, define_variable, init_val)?; + write!(w, "{}}}\n", tabs)?; + } else { + write!(w, "{}{} = {};\n", tabs, define_variable, init_val)?; + } + } // Parameters emitted at top Node::Parameter { index: _ } => {} Node::Constant { id: cons_id } => { @@ -1044,72 +1003,96 @@ impl GPUContext<'_> { fn codegen_control_node( &self, id: NodeID, - w: &mut String, - num_tabs: usize, - ) -> Result<(), Error> { - let tabs = "\t".repeat(num_tabs); - match &self.function.nodes[id.idx()] { + available_thread_quota: Option<usize>, + use_thread_quota: Option<usize>, + is_parallel: bool, + w_init: &mut String, + w_term: &mut String, + ) -> Result<usize, Error> { + let tabs = match &self.function.nodes[id.idx()] { Node::Start | Node::Region { preds: _ } - | Node::Projection { - control: _, - selection: _, - } => { + | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w, "{}goto {}\n", tabs, self.get_block_name(succ))?; + write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + 1 } Node::If { control: _, cond } => { let mut succs = self.control_subgraph.succs(id); let succ1 = succs.next().unwrap(); let succ2 = succs.next().unwrap(); write!( - w, - "{}if ({}) {{\n", - tabs, + w_term, + "\tif ({}) {{\n", self.get_value(*cond, false, false) )?; - write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ1))?; - write!(w, "{}}} else {{\n", tabs)?; - write!(w, "{}\tgoto {};\n", tabs, self.get_block_name(succ2))?; - write!(w, "{}}}\n", tabs)?; + write!(w_term, "\t\tgoto {};\n", get_block_name(succ1))?; + 
write!(w_term, "\t}} else {{\n")?; + write!(w_term, "\t\tgoto {};\n", get_block_name(succ2))?; + write!(w_term, "\t}}\n")?; + 1 } - Node::Fork { - control: _, - factors: _, - } => { - // Emitting reduces before the fork allows the reduce to be - // used outside of the fork. - for &reduce in self.fork_reduce_map.get(&id).unwrap() { - let reduce_val = self.get_value(reduce, true, false); - let Node::Reduce { - control: _, - init, - reduct: _, - } = &self.function.nodes[reduce.idx()] - else { - panic!("Expected reduce node"); + Node::Fork { control: _, factors: _ } => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { + write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + write!(w_term, "\t\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\t}}\n")?; + write!(w_term, "\telse {{\n")?; + let join = self.fork_join_map.get(&id).unwrap(); + write!(w_term, "\t\tgoto {};\n", get_block_name(*join))?; + write!(w_term, "\t}}\n")?; + 2 + } else { + write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + 1 + } + } + Node::Join { control } => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + let has_thread_quota = available_thread_quota.is_some(); + if has_thread_quota { + let available_thread_quota = available_thread_quota.unwrap(); + let use_thread_quota = use_thread_quota.unwrap(); + if use_thread_quota < available_thread_quota { + write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + write!(w_term, "\t}}\n")?; + } + write!(w_term, "\t__syncthreads();\n")?; + } + if is_parallel { + write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + } else { + let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { + panic!("Expected join node to use a fork node"); }; - let init_val = self.get_value(*init, true, false); - write!(w, "{}{} = {};\n", tabs, reduce_val, init_val)?; + let fork_size = multiply_dcs(factors); + let fork_iter = self.get_fork_iter(*control); + write!(w_term, "\t{} += 1;\n", fork_iter)?; + write!(w_term, "\tif ({} == {}) {{\n", fork_iter, fork_size)?; + write!(w_term, "\t\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\t}}\n")?; + write!(w_term, "\telse {{\n")?; + write!(w_term, "\t\tgoto {};\n", get_block_name(*control))?; + write!(w_term, "\t}}\n")?; } + if has_thread_quota { 2 } else { 1 } } - Node::Join { control: _ } => {} Node::Return { control: _, data } => { if self.types[self.typing[data.idx()].idx()].is_primitive() { let return_val = self.get_value(*data, false, false); - write!( - w, - "{}if (threadIdx.x == 0) {{\n{}\t*ret = {};\n{}}}\n", - tabs, tabs, return_val, tabs - )?; + write!(w_term, "\tif (threadIdx.x == 0) {{\n")?; + write!(w_term, "\t\t*ret = {};\n", return_val)?; + write!(w_term, "\t}}\n")?; } - write!(w, "{}return;\n", tabs)?; + write!(w_term, "\treturn;\n")?; + 1 } _ => { panic!("Unsupported control node type") } - } - Ok(()) + }; + Ok(tabs) } // Handles copying data to/from global and shared memory. 
Thread parallelization @@ -1129,18 +1112,7 @@ impl GPUContext<'_> { let tabs = "\t".repeat(num_tabs); match &self.types[type_id.idx()] { Type::Array(element_type_id, extents) => { - let rem_array_size = { - let s = extents - .iter() - .map(|id| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join(" * "); - if s.is_empty() { - "1".to_string() - } else { - s - } - }; + let rem_array_size = multiply_dcs(extents); // Either we parallelize over threads or gate the loop by threadIdx.x // == 0 let mut extra_tab = ""; @@ -1464,20 +1436,15 @@ impl GPUContext<'_> { if allow_allocate { let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); - write!( - w, - "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; - {}{} = dynamic_shared + dynamic_shared_offset;\n - {}dynamic_shared_offset += {};\n", - tabs, alignment, alignment, alignment, tabs, name, tabs, size, - )?; + write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; + write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; } let Type::Product(type_fields) = &self.types[type_id.idx()] else { panic!("Product constant should have product type") }; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call - let field_constant = &self.constants[constant_fields[i].idx()]; let field_type = self.get_type(type_fields[i], true); let offset = self.get_size(type_fields[i], Some(i)); self.codegen_constant( @@ -1493,13 +1460,9 @@ impl GPUContext<'_> { if allow_allocate { let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); - write!( - w, - "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; - {}{} = dynamic_shared + dynamic_shared_offset;\n - {}dynamic_shared_offset += {};\n", - tabs, alignment, alignment, alignment, tabs, name, tabs, size, - )?; + write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; + write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; } // No offset updating needed since all variants start at 0 let Type::Summation(variants) = &self.types[type_id.idx()] else { @@ -1529,11 +1492,11 @@ impl GPUContext<'_> { let element_type = self.get_type(*element_type, false); write!( w, - ";\n{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n; - {}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n - {}dynamic_shared_offset += {};\n", - tabs, alignment, alignment, alignment, tabs, name, element_type, tabs, size + "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", + tabs, alignment, alignment, alignment )?; + write!(w, "{}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n", tabs, name, element_type)?; + write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; } } Ok(()) @@ -1546,11 +1509,7 @@ impl GPUContext<'_> { fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { - let array_size = extents - .iter() - .map(|id| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join(" * "); + let array_size = multiply_dcs(extents); format!("{} * {}", self.get_alignment(*element_type), array_size) } Type::Product(fields) => { @@ 
-1759,8 +1718,8 @@ impl GPUContext<'_> { Ok(()) } - fn get_block_name(&self, id: NodeID) -> String { - format!("bb_{}", id.idx()) + fn get_fork_iter(&self, fork: NodeID) -> String { + format!("{}_iter", self.get_value(fork, false, false)) } // Setting ty = true will return with type in declaration format. make_pointer @@ -1811,6 +1770,18 @@ impl GPUContext<'_> { } } +fn get_block_name(id: NodeID) -> String { + format!("bb_{}", id.idx()) +} + +fn multiply_dcs(dcs: &[DynamicConstantID]) -> String { + if dcs.is_empty() { + "1".to_string() + } else { + dcs.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * ") + } +} + // TODO: Add float8, float16, bfloat16 dtypes if they come fn convert_type(ty: &Type, make_pointer: bool) -> String { let mut result = match ty { -- GitLab From 1f3db8958dbfe63351e6d7454053fd157911f28d Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 16:31:59 -0600 Subject: [PATCH 036/109] tmp --- hercules_cg/src/gpu.rs | 72 ++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 00b0051d..d7629ede 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -202,8 +202,13 @@ struct GPUContext<'a> { return_type_id: &'a TypeID, } +// Pre is its own basic block, to separate one-time vs repeated code. It is +// non-trivial only for Fork nodes to create cooperative groups. +// Init, Body, and Term compose the main basic block, with Init and Term populated +// by control flow (Init used only by Fork and Join) and Body populated by data flow. #[derive(Default, Debug)] struct CudaGoto { + pre: String, init: String, body: String, term: String, @@ -230,6 +235,10 @@ impl GPUContext<'_> { #include <cuda_runtime.h> #include <mma.h> #include <helper_cuda.h> +#include <cooperative_groups.h> +#include <cooperative_groups/memcpy_async.h> +#include <cooperative_groups/reduce.h> +namespace cg = cooperative_groups; #define uabs(a) (a) #define umin(a, b) ((a) < (b) ? (a) : (b)) @@ -377,18 +386,12 @@ impl GPUContext<'_> { } // To abide by c++ reassignment restrictions, we declare all data values - // upfront. We also declare an iteration variable for each fork, which will - // be used for non-parallelized forks. Thus, some may go unused, but we don't - // know which points at time of this call- could move this function after that - // analysis but for now not. + // upfront. 
fn codegen_declare_all(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { if !self.function.nodes[id.idx()].is_control() { write!(w, "\t{};\n", self.get_value(id, true, false))?; } - if self.function.nodes[id.idx()].is_fork() { - write!(w, "\tunsigned int {} = 0;\n", self.get_fork_iter(id))?; - } } Ok(()) } @@ -622,10 +625,11 @@ impl GPUContext<'_> { }; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { let goto = gotos.get_mut(control).unwrap(); + let pre = &mut goto.pre; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, false, init, term)?; + let mut tabs = self.codegen_control_node(*control, None, None, false, pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, state, num_threads, None, None, body, &mut tabs)?; } @@ -635,10 +639,11 @@ impl GPUContext<'_> { state = KernelState::InBlockFork; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { let goto = gotos.get_mut(control).unwrap(); + let pre = &mut goto.pre; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, false, init, term)?; + let mut tabs = self.codegen_control_node(*control, None, None, false, pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, state, num_threads, None, Some(block_fork.unwrap()), body, &mut tabs)?; } @@ -679,10 +684,11 @@ impl GPUContext<'_> { .unwrap_or((parent_quota, parent_quota, None)); for control in fork_control_map.get(&curr_fork).unwrap() { let goto = gotos.get_mut(control).unwrap(); + let pre = &mut goto.pre; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor.is_some(), init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor.is_some(), pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, @@ -752,7 +758,7 @@ impl GPUContext<'_> { } KernelState::InThreadFork => { if parallel_factor.is_none() { - let fork_iter = self.get_fork_iter(*control); + let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; } else { @@ -777,7 +783,7 @@ impl GPUContext<'_> { let Some(nesting_fork) = nesting_fork else { panic!("Expected reduce to be nested in a fork node"); }; - let fork_iter = self.get_fork_iter(nesting_fork); + let fork_iter = self.get_fork_iter(nesting_fork, false); write!(w, "{}if ({} == 0) {{\n", tabs, fork_iter)?; write!(w, "{}\t{} = {};\n", tabs, define_variable, init_val)?; write!(w, "{}}}\n", tabs)?; @@ -1003,9 +1009,11 @@ impl GPUContext<'_> { fn codegen_control_node( &self, id: NodeID, + state: KernelState, available_thread_quota: Option<usize>, use_thread_quota: Option<usize>, is_parallel: bool, + w_pre: &mut String, w_init: &mut String, w_term: &mut String, ) -> Result<usize, Error> { @@ -1014,7 +1022,7 @@ impl GPUContext<'_> { | Node::Region { preds: _ } | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; 
1 } Node::If { control: _, cond } => { @@ -1026,9 +1034,9 @@ impl GPUContext<'_> { "\tif ({}) {{\n", self.get_value(*cond, false, false) )?; - write!(w_term, "\t\tgoto {};\n", get_block_name(succ1))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, true))?; write!(w_term, "\t}} else {{\n")?; - write!(w_term, "\t\tgoto {};\n", get_block_name(succ2))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, true))?; write!(w_term, "\t}}\n")?; 1 } @@ -1036,15 +1044,15 @@ impl GPUContext<'_> { let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; - write!(w_term, "\t\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, true))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; let join = self.fork_join_map.get(&id).unwrap(); - write!(w_term, "\t\tgoto {};\n", get_block_name(*join))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*join, true))?; write!(w_term, "\t}}\n")?; 2 } else { - write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; 1 } } @@ -1061,19 +1069,19 @@ impl GPUContext<'_> { write!(w_term, "\t__syncthreads();\n")?; } if is_parallel { - write!(w_term, "\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; } else { let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { panic!("Expected join node to use a fork node"); }; let fork_size = multiply_dcs(factors); - let fork_iter = self.get_fork_iter(*control); + let fork_iter = self.get_fork_iter(*control, false); write!(w_term, "\t{} += 1;\n", fork_iter)?; write!(w_term, "\tif ({} == {}) {{\n", fork_iter, fork_size)?; - write!(w_term, "\t\tgoto {};\n", get_block_name(succ))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, true))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; - write!(w_term, "\t\tgoto {};\n", get_block_name(*control))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*control, false))?; write!(w_term, "\t}}\n")?; } if has_thread_quota { 2 } else { 1 } @@ -1718,8 +1726,20 @@ impl GPUContext<'_> { Ok(()) } - fn get_fork_iter(&self, fork: NodeID) -> String { - format!("{}_iter", self.get_value(fork, false, false)) + fn get_cg_name(&self, start_or_fork: NodeID) -> String { + format!("cg_{}", self.get_value(start_or_fork, false, false)) + } + + fn get_fork_iter(&self, fork: NodeID, ty: bool) -> String { + if ty { + format!("unsigned int iter_{}", self.get_value(fork, false, false)) + } else { + format!("iter_{}", self.get_value(fork, false, false)) + } + } + + fn get_block_name(&self, id: NodeID, pre: bool) -> String { + format!("bb_{}{}", self.get_value(id, false, false), if pre { "_pre" } else { "" }) } // Setting ty = true will return with type in declaration format. 
make_pointer @@ -1770,10 +1790,6 @@ impl GPUContext<'_> { } } -fn get_block_name(id: NodeID) -> String { - format!("bb_{}", id.idx()) -} - fn multiply_dcs(dcs: &[DynamicConstantID]) -> String { if dcs.is_empty() { "1".to_string() -- GitLab From 7466a988a8258e963b56f27388bf7e4746bb2e6b Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 18:01:50 -0600 Subject: [PATCH 037/109] just reduct left --- hercules_cg/src/gpu.rs | 388 ++++++++--------------------------------- 1 file changed, 70 insertions(+), 318 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index d7629ede..9e70956f 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -254,7 +254,7 @@ namespace cg = cooperative_groups; self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; - self.codegen_declare_all(&mut top)?; + self.codegen_declare_data(&mut top)?; self.codegen_helpers(&mut top)?; let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); @@ -347,7 +347,10 @@ namespace cg = cooperative_groups; // Type is char since it's simplest to use single bytes for indexing, // casting will be needed for use with different types. - write!(w, ") {{\n\textern __shared__ char dynamic_shared[];\n\tuint64_t dynamic_shared_offset = 0;\n")?; + write!(w, ") {{\n")?; + write!(w, "\textern __shared__ char dynamic_shared[];\n")?; + write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; + write!(w, "}}\n")?; Ok(()) } @@ -387,7 +390,7 @@ namespace cg = cooperative_groups; // To abide by c++ reassignment restrictions, we declare all data values // upfront. - fn codegen_declare_all(&self, w: &mut String) -> Result<(), Error> { + fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { if !self.function.nodes[id.idx()].is_control() { write!(w, "\t{};\n", self.get_value(id, true, false))?; @@ -403,6 +406,8 @@ namespace cg = cooperative_groups; fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!(w, "\tsize_t alignment;\n")?; write!(w, "\tsize_t max_variant_size;\n")?; + write!(w, "\tgrid_group grid = this_grid();\n")?; + write!(w, "\tthread_block block = this_thread_block();\n")?; Ok(()) } @@ -614,28 +619,20 @@ namespace cg = cooperative_groups; // First emit data and control gen for each control node outside any fork. // Recall that this was tracked through a fake fork node with NodeID 0. - // If num_blocks > 1, initialize state to 0, else 1. This is because - // if there is no block fork, then everything is in a single block, which - // is semantically the same as being directly nested in the block fork. 
- let has_block_fork = block_fork.is_some(); - let mut state = if has_block_fork { - KernelState::OutBlockFork - } else { - KernelState::InBlockFork - }; + let mut state = KernelState::OutBlockFork; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { let goto = gotos.get_mut(control).unwrap(); let pre = &mut goto.pre; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, false, pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, None, None, None, pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, num_threads, None, None, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, None, None, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists - if has_block_fork { + if block_fork.is_some() { state = KernelState::InBlockFork; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { let goto = gotos.get_mut(control).unwrap(); @@ -643,9 +640,9 @@ namespace cg = cooperative_groups; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, false, pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, num_threads, None, Some(block_fork.unwrap()), body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), body, &mut tabs)?; } } } @@ -688,12 +685,12 @@ namespace cg = cooperative_groups; let init = &mut goto.init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor.is_some(), pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor, pre, init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, state, - use_thread_quota, + Some(use_thread_quota), parallel_factor, Some(curr_fork), body, @@ -727,7 +724,7 @@ namespace cg = cooperative_groups; &self, id: NodeID, state: KernelState, - use_thread_quota: usize, + use_thread_quota: Option<usize>, parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, w: &mut String, @@ -761,8 +758,9 @@ namespace cg = cooperative_groups; let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; } else { - - + // We can directly use use_thread_quota and not worry about available + // because Fork basic block's init section already does gating + write!(w, "{}{} = (threadIdx.x % {}) / {};\n", tabs, define_variable, use_thread_quota.unwrap(), use_thread_quota.unwrap() / parallel_factor.unwrap())?; } } _ => { @@ -943,45 +941,45 @@ namespace cg = cooperative_groups; Node::Read { collect, indices } => { let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); - let type_id = self.typing[id.idx()]; - self.codegen_copy( - false, - type_id, - &define_variable, - &collect_with_indices, - if !self.types[type_id.idx()].is_primitive() { - Some(use_thread_quota) + let data_type_id = self.typing[id.idx()]; + if 
self.types[data_type_id.idx()].is_primitive() { + if is_char { + let type_name = self.get_type(data_type_id, true); + write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?; } else { - None - }, - false, - w, - *num_tabs, - )?; + write!(w, "{}{} = *{};\n", tabs, define_variable, collect_with_indices)?; + } + } else { + let nested_fork = nesting_fork.unwrap(); + let cg_name = self.get_cg_name(nested_fork, false); + let data_size = self.get_size(data_type_id, None); + write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, define_variable, collect_with_indices, data_size)?; + write!(w, "{}wait({});\n", tabs, cg_name)?; + } } Node::Write { collect, data, indices, } => { - let data_variable = self.get_value(*data, false, false); let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); - let type_id = self.typing[data.idx()]; - self.codegen_copy( - true, - type_id, - &data_variable, - &collect_with_indices, - if !self.types[type_id.idx()].is_primitive() { - Some(use_thread_quota) + let data_variable = self.get_value(*data, false, false); + let data_type_id = self.typing[data.idx()]; + if self.types[data_type_id.idx()].is_primitive() { + if is_char { + let type_name = self.get_type(data_type_id, true); + write!(w, "{}*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; } else { - None - }, - state == KernelState::OutBlockFork, - w, - *num_tabs, - )?; + write!(w, "{}*{} = {};\n", tabs, collect_with_indices, data_variable)?; + } + } else { + let nested_fork = nesting_fork.unwrap(); + let cg_name = self.get_cg_name(nested_fork, false); + let data_size = self.get_size(data_type_id, None); + write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, collect_with_indices, data_variable, data_size)?; + write!(w, "{}wait({});\n", tabs, cg_name)?; + } let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } @@ -1009,17 +1007,20 @@ namespace cg = cooperative_groups; fn codegen_control_node( &self, id: NodeID, - state: KernelState, available_thread_quota: Option<usize>, use_thread_quota: Option<usize>, - is_parallel: bool, + parallel_factor: Option<usize>, w_pre: &mut String, w_init: &mut String, w_term: &mut String, ) -> Result<usize, Error> { let tabs = match &self.function.nodes[id.idx()] { - Node::Start - | Node::Region { preds: _ } + Node::Start => { + let succ = self.control_subgraph.succs(id).next().unwrap(); + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + 1 + } + Node::Region { preds: _ } | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; @@ -1041,6 +1042,17 @@ namespace cg = cooperative_groups; 1 } Node::Fork { control: _, factors: _ } => { + // We don't do anything smart to mitigate control flow divergence + // if use_thread_quota < warp size + let cg_name = self.get_cg_name(id, false); + if use_thread_quota.is_some() { + let use_thread_quota = use_thread_quota.unwrap(); + let use_thread_per_id = use_thread_quota / parallel_factor.unwrap(); + write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_per_id, cg_name, use_thread_per_id)?; + let cg_name_full = self.get_cg_name(id, true); + write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", 
use_thread_quota, cg_name_full, use_thread_quota)?; + } + write!(w_pre, "\tgoto {};\n", self.get_block_name(id, false))?; let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; @@ -1066,9 +1078,8 @@ namespace cg = cooperative_groups; write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; write!(w_term, "\t}}\n")?; } - write!(w_term, "\t__syncthreads();\n")?; } - if is_parallel { + if parallel_factor.is_some() { write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; } else { let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { @@ -1103,265 +1114,6 @@ namespace cg = cooperative_groups; Ok(tabs) } - // Handles copying data to/from global and shared memory. Thread parallelization - // is used only for arrays (possibly inside another collection). is_char indicates - // a char type and we need to including element size in indexing. - fn codegen_copy( - &self, - is_write: bool, - type_id: TypeID, - data: &String, - collect: &String, - thread_quota: Option<usize>, - block_restrict: bool, - w: &mut String, - num_tabs: usize, - ) -> Result<(), Error> { - let tabs = "\t".repeat(num_tabs); - match &self.types[type_id.idx()] { - Type::Array(element_type_id, extents) => { - let rem_array_size = multiply_dcs(extents); - // Either we parallelize over threads or gate the loop by threadIdx.x - // == 0 - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - let has_thread_quota = thread_quota.is_some(); - write!( - w, - "{}", - if has_thread_quota { - format!( - "{}for (int i = threadIdx.x; i < {}; i += {}) {{\n", - tabs, - rem_array_size, - thread_quota.unwrap() - ) - } else { - format!( - "{}if (threadIdx.x == 0) {{\n{}\tfor (int i = 0; i < {}; i++) {{\n", - tabs, tabs, rem_array_size - ) - } - ); - let element_type_name = self.get_type(*element_type_id, true); - let (new_collect, new_data) = if self.is_char(type_id) { - ( - format!( - "{} + i * {}", - collect, - self.get_size(*element_type_id, None) - ), - format!("{} + i * {}", data, self.get_size(*element_type_id, None)), - ) - } else { - (format!("{} + i", collect), format!("{} + i", data)) - }; - let new_collect = format!( - "{}reinterpret_cast<{}>({})", - if self.types[element_type_id.idx()].is_primitive() { - "*" - } else { - "" - }, - element_type_name, - new_collect - ); - let new_data = format!( - "{}reinterpret_cast<{}>({})", - if self.types[element_type_id.idx()].is_primitive() { - "*" - } else { - "" - }, - element_type_name, - new_data - ); - self.codegen_copy( - is_write, - *element_type_id, - &new_data, - &new_collect, - None, - false, - w, - num_tabs - + if block_restrict { 1 } else { 0 } - + if has_thread_quota { 1 } else { 2 }, - )?; - if !has_thread_quota { - write!(w, "{}\t}}\n", tabs)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; - } - Type::Product(fields) => { - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - let has_thread_quota = thread_quota.is_some(); - if !has_thread_quota { - write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, 
extra_tab)?; - extra_tab2 = "\t"; - } - for (i, field) in fields.iter().enumerate() { - let offset = self.get_size(type_id, Some(i)); - let field_type_name = self.get_type(*field, true); - let new_collect = format!( - "{}reinterpret_cast<{}>({} + {})", - if self.types[field.idx()].is_primitive() { - "*" - } else { - "" - }, - field_type_name, - collect, - offset - ); - let new_data = format!( - "{}reinterpret_cast<{}>({} + {})", - if self.types[field.idx()].is_primitive() { - "*" - } else { - "" - }, - field_type_name, - data, - offset - ); - self.codegen_copy( - is_write, - *field, - &new_data, - &new_collect, - thread_quota, - false, - w, - num_tabs - + if block_restrict { 1 } else { 0 } - + if has_thread_quota { 0 } else { 1 }, - )?; - } - if !has_thread_quota { - write!(w, "{}\t}}\n", tabs)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; - } - Type::Summation(variants) => { - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - let has_thread_quota = thread_quota.is_some(); - if !has_thread_quota { - write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; - extra_tab2 = "\t"; - } - // We can guarantee correctness for summation by just copying the - // largest variant. - let max_variant_size = self.get_size(type_id, None); - write!( - w, - "{}{}{}max_variant_size = {};\n", - tabs, extra_tab, extra_tab2, max_variant_size - )?; - for (i, variant) in variants.iter().enumerate() { - let prefix = if i == 0 { "if" } else { "else if" }; - let variant_size = self.get_size(*variant, None); - write!( - w, - "{}{}{}{} (max_variant_size == {}) {{\n", - tabs, extra_tab, extra_tab2, prefix, variant_size - )?; - let field_type_name = self.get_type(*variant, true); - let new_collect = format!( - "{}reinterpret_cast<{}>({})", - if self.types[variant.idx()].is_primitive() { - "*" - } else { - "" - }, - field_type_name, - collect - ); - let new_data = format!( - "{}reinterpret_cast<{}>({})", - if self.types[variant.idx()].is_primitive() { - "*" - } else { - "" - }, - field_type_name, - data - ); - self.codegen_copy( - is_write, - *variant, - &new_data, - &new_collect, - thread_quota, - false, - w, - num_tabs - + if block_restrict { 1 } else { 0 } - + if has_thread_quota { 0 } else { 1 }, - )?; - write!(w, "{}{}{}}}\n", tabs, extra_tab, extra_tab2)?; - } - if !has_thread_quota { - write!(w, "{}\t}}\n", tabs)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - write!(w, "{}}}\n{}__syncthreads();\n", tabs, tabs)?; - } - // Primitive types - _ => { - let mut extra_tab = ""; - let mut extra_tab2 = ""; - if block_restrict { - write!(w, "{}if (blockIdx.x == 0) {{\n", tabs)?; - extra_tab = "\t"; - } - let has_thread_quota = thread_quota.is_some(); - if has_thread_quota { - write!(w, "{}{}if (threadIdx.x == 0) {{\n", tabs, extra_tab)?; - extra_tab2 = "\t"; - } - write!( - w, - "{}{}{}{} = {};\n", - tabs, - extra_tab, - extra_tab2, - if is_write { collect } else { data }, - if is_write { data } else { collect } - )?; - if has_thread_quota { - write!(w, "{}{}}}\n", tabs, extra_tab)?; - } - if block_restrict { - write!(w, "{}}}\n", tabs)?; - } - } - } - Ok(()) - } - // Read/writes to global collections consist of global name + pointer offset. 
fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { let mut index_ptr = "0".to_string(); @@ -1726,8 +1478,8 @@ namespace cg = cooperative_groups; Ok(()) } - fn get_cg_name(&self, start_or_fork: NodeID) -> String { - format!("cg_{}", self.get_value(start_or_fork, false, false)) + fn get_cg_name(&self, fork: NodeID, full: bool) -> String { + format!("cg_{}{}", self.get_value(fork, false, false), if full { "_full" } else { "" }) } fn get_fork_iter(&self, fork: NodeID, ty: bool) -> String { -- GitLab From 1ed66a59d822dbfda5f56f5333ea012a8b9780e7 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 18:27:00 -0600 Subject: [PATCH 038/109] sync --- hercules_cg/src/gpu.rs | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 9e70956f..6cf653f3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -216,9 +216,16 @@ struct CudaGoto { #[derive(Clone, Copy, PartialEq, Debug)] enum KernelState { - OutBlockFork, - InBlockFork, - InThreadFork, + OutBlock, + InBlock, + InThread, +} + +#[derive(Clone, Copy, PartialEq, Debug)] +enum CGType { + UsePerId, + Use, + Available, } impl GPUContext<'_> { @@ -619,7 +626,7 @@ namespace cg = cooperative_groups; // First emit data and control gen for each control node outside any fork. // Recall that this was tracked through a fake fork node with NodeID 0. - let mut state = KernelState::OutBlockFork; + let mut state = KernelState::OutBlock; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { let goto = gotos.get_mut(control).unwrap(); let pre = &mut goto.pre; @@ -633,7 +640,7 @@ namespace cg = cooperative_groups; } // Then generate data and control for the single block fork if it exists if block_fork.is_some() { - state = KernelState::InBlockFork; + state = KernelState::InBlock; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { let goto = gotos.get_mut(control).unwrap(); let pre = &mut goto.pre; @@ -648,7 +655,7 @@ namespace cg = cooperative_groups; } // Then generate for the thread fork tree by setting state to 2, traverse, // and update the thread quota. Any traversal is fine, we choose pre-order. 
- state = KernelState::InThreadFork; + state = KernelState::InThread; for &root_fork in thread_root_forks { self.codegen_data_control_traverse( root_fork, @@ -746,14 +753,14 @@ namespace cg = cooperative_groups; let divide = multiply_dcs(&factors[dimension + 1..]); let modulo = format!("dc{}", factors[*dimension].idx()); match state { - KernelState::InBlockFork => { + KernelState::InBlock => { write!( w, "{}{} = (blockIdx.x / ({})) % {};\n", tabs, define_variable, divide, modulo )?; } - KernelState::InThreadFork => { + KernelState::InThread => { if parallel_factor.is_none() { let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; @@ -951,7 +958,7 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_name = self.get_cg_name(nested_fork, false); + let cg_name = self.get_cg_name(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, define_variable, collect_with_indices, data_size)?; write!(w, "{}wait({});\n", tabs, cg_name)?; @@ -975,7 +982,7 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_name = self.get_cg_name(nested_fork, false); + let cg_name = self.get_cg_name(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, collect_with_indices, data_variable, data_size)?; write!(w, "{}wait({});\n", tabs, cg_name)?; @@ -1044,13 +1051,16 @@ namespace cg = cooperative_groups; Node::Fork { control: _, factors: _ } => { // We don't do anything smart to mitigate control flow divergence // if use_thread_quota < warp size - let cg_name = self.get_cg_name(id, false); + let cg_name = self.get_cg_name(id, CGType::UsePerId); if use_thread_quota.is_some() { let use_thread_quota = use_thread_quota.unwrap(); let use_thread_per_id = use_thread_quota / parallel_factor.unwrap(); write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_per_id, cg_name, use_thread_per_id)?; - let cg_name_full = self.get_cg_name(id, true); - write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_quota, cg_name_full, use_thread_quota)?; + let cg_name_use = self.get_cg_name(id, CGType::Use); + write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_quota, cg_name_use, use_thread_quota)?; + let available_thread_quota = available_thread_quota.unwrap(); + let cg_name_available = self.get_cg_name(id, CGType::Available); + write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", available_thread_quota, cg_name_available, available_thread_quota)?; } write!(w_pre, "\tgoto {};\n", self.get_block_name(id, false))?; let succ = self.control_subgraph.succs(id).next().unwrap(); @@ -1080,6 +1090,8 @@ namespace cg = cooperative_groups; } } if parallel_factor.is_some() { + let cg_name_available = self.get_cg_name(id, CGType::Available); + write!(w_term, "\t{}.sync();\n", cg_name_available)?; write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; } else { let Node::Fork { factors, .. 
} = &self.function.nodes[control.idx()] else { @@ -1478,8 +1490,8 @@ namespace cg = cooperative_groups; Ok(()) } - fn get_cg_name(&self, fork: NodeID, full: bool) -> String { - format!("cg_{}{}", self.get_value(fork, false, false), if full { "_full" } else { "" }) + fn get_cg_name(&self, fork: NodeID, cg_type: CGType) -> String { + format!("cg_{}{}", self.get_value(fork, false, false), if cg_type == CGType::Use { "_use" } else if cg_type == CGType::Available { "_available" } else { "" }) } fn get_fork_iter(&self, fork: NodeID, ty: bool) -> String { -- GitLab From 317eb124911a6ca305cf00e64d2ae3447903a0a5 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sun, 5 Jan 2025 22:13:02 -0600 Subject: [PATCH 039/109] el fin --- hercules_cg/src/gpu.rs | 349 +++++++++++++++++++++++---------------- hercules_opt/src/pass.rs | 13 ++ 2 files changed, 218 insertions(+), 144 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 6cf653f3..f1d949ba 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -16,7 +16,6 @@ pub fn gpu_codegen<W: Write>( types: &Vec<Type>, constants: &Vec<Constant>, dynamic_constants: &Vec<DynamicConstant>, - reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, bbs: &BasicBlocks, @@ -37,6 +36,9 @@ pub fn gpu_codegen<W: Write>( * be aligned for its type and for full product to be aligned to its * largest element * - similarly, summation types must be aligned to their largest element + * + * Major TODOs: + * - Matmul/Conv detection */ let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) @@ -164,7 +166,6 @@ pub fn gpu_codegen<W: Write>( types, constants, dynamic_constants, - reverse_postorder, typing, control_subgraph, bbs, @@ -190,7 +191,6 @@ struct GPUContext<'a> { types: &'a Vec<Type>, constants: &'a Vec<Constant>, dynamic_constants: &'a Vec<DynamicConstant>, - reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, bbs: &'a BasicBlocks, @@ -208,8 +208,8 @@ struct GPUContext<'a> { // by control flow (Init used only by Fork and Join) and Body populated by data flow. #[derive(Default, Debug)] struct CudaGoto { - pre: String, init: String, + post_init: String, body: String, term: String, } @@ -232,49 +232,19 @@ impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { // All possible includes followed by macros for intrinsic calls on // types with no library support - write!( - w, - " -#include <assert.h> -#include <stdio.h> -#include <stddef.h> -#include <cuda.h> -#include <cuda_runtime.h> -#include <mma.h> -#include <helper_cuda.h> -#include <cooperative_groups.h> -#include <cooperative_groups/memcpy_async.h> -#include <cooperative_groups/reduce.h> -namespace cg = cooperative_groups; - -#define uabs(a) (a) -#define umin(a, b) ((a) < (b) ? (a) : (b)) -#define umax(a, b) ((a) > (b) ? 
(a) : (b)) -#define powi(a, b) ({{ int res = 1; for(int i = 0; i < b; ++i) res *= a; res; }}) -#define roundi(a) (a) -#define isqrt(a) ((int)sqrtf((float)(a))) - -", - )?; - let mut top = String::new(); - self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; self.codegen_helpers(&mut top)?; + self.codegen_goto_start(&mut top)?; + write!(w, "{}", top)?; let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); - let start = NodeID::new(0); - let ret = (0..self.function.nodes.len()) - .filter(|idx| self.function.nodes[*idx].is_return()) - .map(NodeID::new) - .next() - .unwrap(); // We use CUDA's goto to jump between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) @@ -300,12 +270,10 @@ namespace cg = cooperative_groups; &mut gotos, )?; - // Punting on implementation but can likely run einsum -> matmul/conv - // detector on hierarhical fork joins between block edge and given - // thread edge. + let mut rest = String::new(); + self.codegen_gotos(&mut gotos, &mut rest)?; + write!(w, "{}", rest)?; - // finish kernel - write!(w, "{}", top)?; write!(w, "}}\n")?; Ok(()) @@ -313,6 +281,29 @@ namespace cg = cooperative_groups; // Emit kernel signature, arguments, and dynamic shared memory declaration fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { + write!(w, " +#include <assert.h> +#include <stdio.h> +#include <stddef.h> +#include <cuda.h> +#include <cuda_runtime.h> +#include <mma.h> +#include <helper_cuda.h> +#include <cooperative_groups.h> +#include <cooperative_groups/memcpy_async.h> +#include <cooperative_groups/reduce.h> +namespace cg = cooperative_groups; + +#define uabs(a) (a) +#define umin(a, b) ((a) < (b) ? (a) : (b)) +#define umax(a, b) ((a) > (b) ? 
(a) : (b)) +#define powi(a, b) ({{ int res = 1; for(int i = 0; i < b; ++i) res *= a; res; }}) +#define roundi(a) (a) +#define isqrt(a) ((int)sqrtf((float)(a))) + +", + )?; + write!( w, "__global__ void __launch_bounds__({}) {}(", @@ -357,7 +348,6 @@ namespace cg = cooperative_groups; write!(w, ") {{\n")?; write!(w, "\textern __shared__ char dynamic_shared[];\n")?; write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; - write!(w, "}}\n")?; Ok(()) } @@ -413,8 +403,25 @@ namespace cg = cooperative_groups; fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!(w, "\tsize_t alignment;\n")?; write!(w, "\tsize_t max_variant_size;\n")?; - write!(w, "\tgrid_group grid = this_grid();\n")?; - write!(w, "\tthread_block block = this_thread_block();\n")?; + write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; + write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?; + Ok(()) + } + + fn codegen_goto_start(&self, w: &mut String) -> Result<(), Error> { + let block_start = self.get_block_name(NodeID::new(0), false); + write!(w, "goto {};\n", block_start)?; + Ok(()) + } + + fn codegen_gotos(&self, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> { + write!(w, "\n")?; + for (_, goto) in gotos.iter() { + write!(w, "{}\n", goto.init)?; + write!(w, "{}\n", goto.post_init)?; + write!(w, "{}\n", goto.body)?; + write!(w, "{}\n\n", goto.term)?; + } Ok(()) } @@ -629,13 +636,13 @@ namespace cg = cooperative_groups; let mut state = KernelState::OutBlock; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { let goto = gotos.get_mut(control).unwrap(); - let pre = &mut goto.pre; let init = &mut goto.init; + let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, None, None, None, pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, None, None, None, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, None, None, false, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists @@ -643,13 +650,13 @@ namespace cg = cooperative_groups; state = KernelState::InBlock; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { let goto = gotos.get_mut(control).unwrap(); - let pre = &mut goto.pre; let init = &mut goto.init; + let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, body, &mut tabs)?; } } } @@ -686,13 +693,27 @@ namespace cg = cooperative_groups; .get(&curr_fork) .map(|(a, u, f)| (*a, *u, Some(*f))) .unwrap_or((parent_quota, parent_quota, None)); + let reduces = &self.fork_reduce_map[&curr_fork]; + let reducts = if parallel_factor.is_some() { + reduces + .iter() + .map(|&reduce| { + let Node::Reduce { control: _, init: _, reduct} = &self.function.nodes[reduce.idx()] else { + panic!("Expected reduce node"); + }; + *reduct + }) + 
.collect() + } else { + HashSet::new() + }; for control in fork_control_map.get(&curr_fork).unwrap() { let goto = gotos.get_mut(control).unwrap(); - let pre = &mut goto.pre; let init = &mut goto.init; + let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor, pre, init, term)?; + let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, @@ -700,6 +721,7 @@ namespace cg = cooperative_groups; Some(use_thread_quota), parallel_factor, Some(curr_fork), + reducts.contains(data), body, &mut tabs, )?; @@ -734,6 +756,7 @@ namespace cg = cooperative_groups; use_thread_quota: Option<usize>, parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, + is_special_reduct: bool, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { @@ -863,55 +886,75 @@ namespace cg = cooperative_groups; Node::Binary { op, left, right } => { let left_val = self.get_value(*left, false, false); let right_val = self.get_value(*right, false, false); - match (op, &self.types[self.typing[left.idx()].idx()]) { - (BinaryOperator::Rem, Type::Float32) => write!( - w, - "{}{} = fmodf({}, {});\n", - tabs, define_variable, left_val, right_val, - )?, - (BinaryOperator::Rem, Type::Float64) => write!( - w, - "{}{} = fmod({}, {});\n", - tabs, define_variable, left_val, right_val, - )?, - // Doesn't need special syntax but bool type - (BinaryOperator::Or, Type::Boolean) => write!( - w, - "{}{} = {} || {};\n", - tabs, define_variable, left_val, right_val, - )?, - (BinaryOperator::And, Type::Boolean) => write!( - w, - "{}{} = {} && {};\n", - tabs, define_variable, left_val, right_val, - )?, - (op, _) => write!( - w, - "{}{} = {} {} {};\n", - tabs, - define_variable, - left_val, - match op { - BinaryOperator::Add => "+", - BinaryOperator::Sub => "-", - BinaryOperator::Mul => "*", - BinaryOperator::Div => "/", - BinaryOperator::Rem => "%", - BinaryOperator::LT => "<", - BinaryOperator::LTE => "<=", - BinaryOperator::GT => ">", - BinaryOperator::GTE => ">=", - BinaryOperator::EQ => "==", - BinaryOperator::NE => "!=", - BinaryOperator::Or => "|", - BinaryOperator::And => "&", - BinaryOperator::Xor => "^", - BinaryOperator::LSh => "<<", - BinaryOperator::RSh => ">>", - }, - right_val, - )?, - }; + let id_type = self.typing[id.idx()]; + if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And + | BinaryOperator::Xor) && is_special_reduct { + let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[left.idx()] { + right_val + } else { + left_val + }; + let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + #[allow(unreachable_patterns)] + let cg_op = match op { + BinaryOperator::Add => "plus", + BinaryOperator::Or => "bit_or", + BinaryOperator::And => "bit_and", + BinaryOperator::Xor => "bit_xor", + _ => unreachable!(), + }; + let id_type_name = self.get_type(id_type, false); + write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, cg_tile, non_reduce_arg, cg_op, id_type_name)?; + } else { + match (op, &self.types[id_type.idx()]) { + (BinaryOperator::Or, Type::Boolean) => write!( + w, + "{}{} = {} || {};\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::And, Type::Boolean) => write!( + w, + "{}{} = {} && 
{};\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::Rem, Type::Float32) => write!( + w, + "{}{} = fmodf({}, {});\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::Rem, Type::Float64) => write!( + w, + "{}{} = fmod({}, {});\n", + tabs, define_variable, left_val, right_val, + )?, + (op, _) => write!( + w, + "{}{} = {} {} {};\n", + tabs, + define_variable, + left_val, + match op { + BinaryOperator::Add => "+", + BinaryOperator::Sub => "-", + BinaryOperator::Mul => "*", + BinaryOperator::Div => "/", + BinaryOperator::Rem => "%", + BinaryOperator::LT => "<", + BinaryOperator::LTE => "<=", + BinaryOperator::GT => ">", + BinaryOperator::GTE => ">=", + BinaryOperator::EQ => "==", + BinaryOperator::NE => "!=", + BinaryOperator::Or => "|", + BinaryOperator::And => "&", + BinaryOperator::Xor => "^", + BinaryOperator::LSh => "<<", + BinaryOperator::RSh => ">>", + }, + right_val, + )?, + }; + } } Node::Ternary { op, @@ -932,16 +975,34 @@ namespace cg = cooperative_groups; } }, Node::IntrinsicCall { intrinsic, args } => { - let ty = &self.types[self.typing[args[0].idx()].idx()]; - let func_name = self.codegen_intrinsic(intrinsic, ty); - write!( - w, - "{}{} = {}({});\n", - tabs, - define_variable, - func_name, - self.get_value(args[0], false, false), - )?; + let id_type = self.typing[id.idx()]; + if matches!(intrinsic, Intrinsic::Max | Intrinsic::Min) && is_special_reduct { + let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[args[0].idx()] { + self.get_value(args[1], false, false) + } else { + self.get_value(args[0], false, false) + }; + let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + #[allow(unreachable_patterns)] + let cg_op = match intrinsic { + Intrinsic::Max => "max", + Intrinsic::Min => "min", + _ => unreachable!(), + }; + let id_type_name = self.get_type(id_type, false); + write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, non_reduce_arg, cg_tile, cg_op, id_type_name)?; + } else { + let ty = &self.types[id_type.idx()]; + let func_name = self.codegen_intrinsic(intrinsic, ty); + write!( + w, + "{}{} = {}({});\n", + tabs, + define_variable, + func_name, + self.get_value(args[0], false, false), + )?; + } } // Main difference between read and write is codegen_copy takes the // returned node's type for read and data node's type for write @@ -958,10 +1019,10 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_name = self.get_cg_name(nested_fork, CGType::UsePerId); + let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); - write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, define_variable, collect_with_indices, data_size)?; - write!(w, "{}wait({});\n", tabs, cg_name)?; + write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, define_variable, collect_with_indices, data_size)?; + write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } } Node::Write { @@ -982,10 +1043,10 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_name = self.get_cg_name(nested_fork, CGType::UsePerId); + let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); - write!(w, "{}memcpy_async({}, {}, {}, {});\n", tabs, cg_name, collect_with_indices, data_variable, data_size)?; - write!(w, "{}wait({});\n", tabs, cg_name)?; + write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, 
cg_tile, collect_with_indices, data_variable, data_size)?; + write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; @@ -1017,20 +1078,20 @@ namespace cg = cooperative_groups; available_thread_quota: Option<usize>, use_thread_quota: Option<usize>, parallel_factor: Option<usize>, - w_pre: &mut String, w_init: &mut String, + w_post_init: &mut String, w_term: &mut String, ) -> Result<usize, Error> { let tabs = match &self.function.nodes[id.idx()] { Node::Start => { let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; 1 } Node::Region { preds: _ } | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; 1 } Node::If { control: _, cond } => { @@ -1042,43 +1103,43 @@ namespace cg = cooperative_groups; "\tif ({}) {{\n", self.get_value(*cond, false, false) )?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, true))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, false))?; write!(w_term, "\t}} else {{\n")?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, true))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, false))?; write!(w_term, "\t}}\n")?; 1 } Node::Fork { control: _, factors: _ } => { // We don't do anything smart to mitigate control flow divergence // if use_thread_quota < warp size - let cg_name = self.get_cg_name(id, CGType::UsePerId); + let cg_tile = self.get_cg_tile(id, CGType::UsePerId); if use_thread_quota.is_some() { let use_thread_quota = use_thread_quota.unwrap(); let use_thread_per_id = use_thread_quota / parallel_factor.unwrap(); - write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_per_id, cg_name, use_thread_per_id)?; - let cg_name_use = self.get_cg_name(id, CGType::Use); - write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", use_thread_quota, cg_name_use, use_thread_quota)?; + write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id)?; + let cg_tile_use = self.get_cg_tile(id, CGType::Use); + write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota)?; let available_thread_quota = available_thread_quota.unwrap(); - let cg_name_available = self.get_cg_name(id, CGType::Available); - write!(w_pre, "\tthread_block_tile<{}> {} = tiled_partition<{}>(block);\n", available_thread_quota, cg_name_available, available_thread_quota)?; + let cg_tile_available = self.get_cg_tile(id, CGType::Available); + write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota)?; } - write!(w_pre, "\tgoto {};\n", self.get_block_name(id, false))?; + write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { - write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, 
use_thread_quota)?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_post_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; let join = self.fork_join_map.get(&id).unwrap(); - write!(w_term, "\t\tgoto {};\n", self.get_block_name(*join, true))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*join, false))?; write!(w_term, "\t}}\n")?; 2 } else { - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; 1 } } - Node::Join { control } => { + Node::Join { control: fork } => { let succ = self.control_subgraph.succs(id).next().unwrap(); let has_thread_quota = available_thread_quota.is_some(); if has_thread_quota { @@ -1090,21 +1151,21 @@ namespace cg = cooperative_groups; } } if parallel_factor.is_some() { - let cg_name_available = self.get_cg_name(id, CGType::Available); - write!(w_term, "\t{}.sync();\n", cg_name_available)?; - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, true))?; + let cg_tile_available = self.get_cg_tile(id, CGType::Available); + write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; } else { - let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { + let Node::Fork { factors, .. } = &self.function.nodes[fork.idx()] else { panic!("Expected join node to use a fork node"); }; let fork_size = multiply_dcs(factors); - let fork_iter = self.get_fork_iter(*control, false); + let fork_iter = self.get_fork_iter(*fork, false); write!(w_term, "\t{} += 1;\n", fork_iter)?; write!(w_term, "\tif ({} == {}) {{\n", fork_iter, fork_size)?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, true))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(*control, false))?; + write!(w_term, "\t\tgoto {};\n", self.get_block_name(*fork, true))?; write!(w_term, "\t}}\n")?; } if has_thread_quota { 2 } else { 1 } @@ -1490,7 +1551,7 @@ namespace cg = cooperative_groups; Ok(()) } - fn get_cg_name(&self, fork: NodeID, cg_type: CGType) -> String { + fn get_cg_tile(&self, fork: NodeID, cg_type: CGType) -> String { format!("cg_{}{}", self.get_value(fork, false, false), if cg_type == CGType::Use { "_use" } else if cg_type == CGType::Available { "_available" } else { "" }) } @@ -1502,8 +1563,8 @@ namespace cg = cooperative_groups; } } - fn get_block_name(&self, id: NodeID, pre: bool) -> String { - format!("bb_{}{}", self.get_value(id, false, false), if pre { "_pre" } else { "" }) + fn get_block_name(&self, id: NodeID, post: bool) -> String { + format!("bb_{}{}", self.get_value(id, false, false), if post { "_post" } else { "" }) } // Setting ty = true will return with type in declaration format. 
make_pointer diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index 12444b36..3e7d1089 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -929,6 +929,7 @@ impl PassManager { let mut rust_rt = String::new(); let mut llvm_ir = String::new(); + let mut cuda_ir = String::new(); for idx in 0..self.module.functions.len() { match devices[idx] { Device::LLVM => cpu_codegen( @@ -954,6 +955,18 @@ impl PassManager { &mut rust_rt, ) .unwrap(), + Device::CUDA => gpu_codegen( + &self.module.functions[idx], + &self.module.types, + &self.module.constants, + &self.module.dynamic_constants, + &typing[idx], + &control_subgraphs[idx], + &bbs[idx], + &collection_objects[&FunctionID::new(idx)], + &mut cuda_ir, + ) + .unwrap(), _ => todo!(), } } -- GitLab From 7421e35e18261e45c4981997cd67545da0bb347b Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 6 Jan 2025 09:34:15 -0600 Subject: [PATCH 040/109] cleanup comments --- hercules_cg/src/gpu.rs | 374 ++++++++++++++++++++++++----------------- 1 file changed, 223 insertions(+), 151 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index f1d949ba..10e7d9e3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -8,8 +8,8 @@ use self::hercules_ir::*; /* * The top level function to compile a Hercules IR function into CUDA - * kernel for execution on the GPU. We generate CUDA C textually, based - * on the CPU LLVM approach. + * kernel for execution on the GPU. We generate CUDA C textually, with a lot + * of similarities with the CPU LLVM generation plus custom GPU parallelization. */ pub fn gpu_codegen<W: Write>( function: &Function, @@ -24,21 +24,26 @@ pub fn gpu_codegen<W: Write>( ) -> Result<(), Error> { /* * We assert the following: - * - Fork node must have >= 1 reduce nodes + * - There is at least one Fork node + * - Fork node must have >= 1 Reduce nodes * - If the returned data type is a collection, it must have * originated from a single known parameter. Can relax to allow * one of multiple parameters. * * We don't assert but assume the following: - * - max_num_blocks is within constraint of 1D grid size. This can be - * relaxed if we want to support larger grids. - * - product types are packed with padding inserted for each element to + * - max_num_blocks in KernelParams is within constraint of 1D grid size. This + * can be relaxed if we want to support larger grids. + * - Product types are packed with padding inserted for each element to * be aligned for its type and for full product to be aligned to its * largest element - * - similarly, summation types must be aligned to their largest element + * - Summation types must be aligned to their largest element * * Major TODOs: + * - Fix dynamic shared memory allocation to reuse old shmem. The main case + * for improvement is when we have serialized forks with unused intermediate + * values from previous iterations. * - Matmul/Conv detection + * - Add float8, float16, bfloat16 dtypes if they come */ let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) @@ -46,11 +51,11 @@ pub fn gpu_codegen<W: Write>( .map(NodeID::new) .collect(); - // Fork reduce map should have all reduces contained in some key + // Fork Reduce map should have all reduces contained in some key let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); - // Reduct reduce map should have all non-parallel and non-associative reduces - // contained in some key. 
Unlike fork, reduct is not involved in any assertions, - // put it here for convenience but can move. + // Reduct Reduce map should have all non-parallel and non-associative reduces + // contained in some key. Unlike Fork, Reduct is not involved in any assertions. + // It's placed here for convenience but can be moved. let reduct_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); for reduce_node in &reduce_nodes { if let Node::Reduce { @@ -76,7 +81,7 @@ pub fn gpu_codegen<W: Write>( } } if !function.schedules[reduce_node.idx()].contains(&Schedule::ParallelReduce) - && !function.schedules[reduce_node.idx()].contains(&Schedule::Associative) + && !function.schedules[reduce_node.idx()].contains(&Schedule::TightAssociative) { reduct_reduce_map .entry(*reduct) @@ -85,6 +90,9 @@ pub fn gpu_codegen<W: Write>( } } } + if fork_reduce_map.is_empty() { + panic!("Function must have at least one fork node"); + } for idx in 0..function.nodes.len() { if function.nodes[idx].is_fork() && fork_reduce_map @@ -95,6 +103,9 @@ pub fn gpu_codegen<W: Write>( } } + // Obtain the Return node and if it's a collection, use the collection objects + // analysis to determine the origin. Also save the return node id for later + // conversion of primitive Return into Parameter. let (return_node_id, data_node_id) = { let pos = function .nodes @@ -179,7 +190,6 @@ pub fn gpu_codegen<W: Write>( ctx.codegen_function(w) } -// Kernel parameters that are fixed prior to codegen. struct GPUKernelParams { max_num_blocks: usize, max_num_threads: usize, @@ -202,10 +212,14 @@ struct GPUContext<'a> { return_type_id: &'a TypeID, } -// Pre is its own basic block, to separate one-time vs repeated code. It is -// non-trivial only for Fork nodes to create cooperative groups. -// Init, Body, and Term compose the main basic block, with Init and Term populated -// by control flow (Init used only by Fork and Join) and Body populated by data flow. +/* + * For all control nodes besides forks, Init, Body, and Term compose the main basic + * block, with Init and Term populated by control flow (Init used only by Fork and + * Join) and Body populated by data flow. + * For serialized Fork nodes which may be jumped back to by corresponding Join node, + * init and post_init separate one-time code (currently just cooperative group + * creation) from repeated code. + */ #[derive(Default, Debug)] struct CudaGoto { init: String, @@ -214,6 +228,14 @@ struct CudaGoto { term: String, } +/* + * KernelState is used for data and control node organization and generation. + * We define a block fork as one with each ThreadID being a block, and a thread + * fork as one with each ThreadID being a subset of threads within a block. + * OutBlock is outside a potential block fork at the full grid level, InBlock + * is inside a block fork but outside any thread forks, and InThread is inside + * a thread fork. + */ #[derive(Clone, Copy, PartialEq, Debug)] enum KernelState { OutBlock, @@ -221,6 +243,12 @@ enum KernelState { InThread, } +/* + * CGType is used to track cooperative group types. UsePerId is the group of (CUDA) + * threads for a current ThreadID, Use is the union of such threads for all ThreadIDs + * in the current innermost Fork, and Available is Use plus additional threads not + * used in the current Fork. 
+ */ #[derive(Clone, Copy, PartialEq, Debug)] enum CGType { UsePerId, @@ -230,8 +258,7 @@ enum CGType { impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { - // All possible includes followed by macros for intrinsic calls on - // types with no library support + // Emit all code up to the "goto" to Start's block let mut top = String::new(); self.codegen_kernel_begin(&mut top)?; self.codegen_dynamic_constants(&mut top)?; @@ -240,13 +267,16 @@ impl GPUContext<'_> { self.codegen_goto_start(&mut top)?; write!(w, "{}", top)?; + // Create structures and determine block and thread parallelization strategy let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); + println!("fork join map size: {}", self.fork_join_map.len()); + println!("fork tree size: {}", fork_tree.len()); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); - // We use CUDA's goto to jump between basic blocks. + // Setup for CUDA's "goto" for control flow between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) .map(|idx| { @@ -256,6 +286,7 @@ impl GPUContext<'_> { }) .collect(); + // Core function for the CUDA code of all data and control nodes. self.codegen_data_control( if num_blocks > 1 { Some(thread_root_root_fork) @@ -270,6 +301,7 @@ impl GPUContext<'_> { &mut gotos, )?; + // Emit all code from the previous step let mut rest = String::new(); self.codegen_gotos(&mut gotos, &mut rest)?; write!(w, "{}", rest)?; @@ -279,7 +311,7 @@ impl GPUContext<'_> { Ok(()) } - // Emit kernel signature, arguments, and dynamic shared memory declaration + // Emit kernel headers, signature, arguments, and dynamic shared memory declaration fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { write!(w, " #include <assert.h> @@ -343,8 +375,9 @@ namespace cg = cooperative_groups; )?; } - // Type is char since it's simplest to use single bytes for indexing, - // casting will be needed for use with different types. + // Type is char since it's simplest to use single bytes for indexing + // and it's required for heterogeneous Product and Summation types. + // Casting is later used for conversion to different types like int. write!(w, ") {{\n")?; write!(w, "\textern __shared__ char dynamic_shared[];\n")?; write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; @@ -389,20 +422,23 @@ namespace cg = cooperative_groups; // upfront. fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if !self.function.nodes[id.idx()].is_control() { + if !self.function.nodes[id.idx()].is_control() && + !self.function.nodes[id.idx()].is_dynamic_constant() && + !self.function.nodes[id.idx()].is_parameter() { write!(w, "\t{};\n", self.get_value(id, true, false))?; } } Ok(()) } - // Emit helper registers that are used throughout the kernel- alignment - // is for proper dynamic shared memory allocation, max_variant_size is - // for variant selection during read/write copies since we don't keep - // tag (don't need and it can double summation memory usage due to alignment) + /* + * Emit helper registers that are used throughout the kernel. 
alignment + * is for proper dynamic shared memory allocation. grid and block are + * from CUDA's cooperative groups API and are used specifically for reads and + * writes. + */ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!(w, "\tsize_t alignment;\n")?; - write!(w, "\tsize_t max_variant_size;\n")?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?; Ok(()) @@ -449,8 +485,10 @@ namespace cg = cooperative_groups; } else { fork_control_map.entry(nested_fork).or_insert_with(HashSet::new).insert(control); } - for i in 0..forks.len()-1 { - fork_tree.entry(forks[i+1]).or_insert_with(HashSet::new).insert(forks[i]); + if forks.len() > 1 { + for i in 0..forks.len()-1 { + fork_tree.entry(forks[i+1]).or_insert_with(HashSet::new).insert(forks[i]); + } } (fork_tree, fork_control_map) }, @@ -459,7 +497,9 @@ namespace cg = cooperative_groups; /* * If tree has a single root fork of known size s <= max_num_blocks - * with parallel-fork schedule, then set num_blocks to s, else set num_blocks to 1. + * with parallel-fork schedule, then set num_blocks to s, else set num_blocks + * to 1. Also return the root fork(s) for parallelization strategy within + * threadblocks for threads and their eventual generation. */ fn get_root_forks_and_num_blocks( &self, @@ -493,9 +533,11 @@ namespace cg = cooperative_groups; * maximum over its descendants (leafs have base 1). We traverse up (details * in helper) and pass the factor and a map from fork node to a tuple of * (max quota of its siblings (including itself), its quota, its fork factor) + * from each node to its parents. The parent then compares * - all three are needed for codegen. A node is in the map IFF it will be parallelized. - * If not, the fork will use the parent's quota. Nodes may be removed from the - * map when traversing up the tree due to either of the max scenarios. + * If not, the fork will use the parent's quota and serialize over the Fork's + * ThreadIDs. Nodes may be removed from the map when traversing up the tree + * due to an ancestor having a larger factor that conflicts. */ fn get_thread_quotas( &self, @@ -514,8 +556,10 @@ namespace cg = cooperative_groups; is_root: bool, ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) { // Subsubtree map is the union of all keys for grandchildren and lower - // nodes, and subtree_quota is constructed map from children to their - // quota + // nodes. children_quota_map is a constructed map from parallelized children + // to their quota to update the subsubtree map at grandchildren level to + // subtreemap at children level. subtree_quota is cumulative factor of + // subtree and is then compared to this fork's factor. let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree .get(&curr_fork) .unwrap() @@ -544,18 +588,13 @@ namespace cg = cooperative_groups; if is_root { return (subtree_map, subtree_quota, true) } - /* - * A node can only be considered for parallelization if: - * a) it has statically known size - * b) the known size is less than or equal to the max_num_threads - * c) the known size is a power of 2 - * d) all reduces are parallel-reduce or associative - * - * Note: there are a few cases where we choose between - * parallelizing the fork vs its subtree, by taking max factor over subtree. - * However, parts of the subtree may have had smaller quotas and didn't - * need to be discarded. For now we avoid this complexity and discard full. 
- */ + // A node can only be considered for parallelization if: + // a) it has statically known size + // b) the known size is less than or equal to the max_num_threads + // c) the known size is a power of 2 + // d) all reduces are parallel-reduce or associative + // + // If not, just take the max cumulative factor of its subtree let reduces = &self.fork_reduce_map[&curr_fork]; if let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] && let Some(fork_size) = self.multiply_fork_factors(factors) @@ -563,46 +602,39 @@ namespace cg = cooperative_groups; && fork_size.is_power_of_two() && reduces.iter().all(|&reduce| { self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) - || self.function.schedules[reduce.idx()].contains(&Schedule::Associative) + || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) }) { - /* - * If there's an associative reduce, - * if fork and subtree fit in warp, parallelize both - * else if fork is a multiple of warp size, parallelize the max between them - * else parallelize subtree - * Else, parallelize both - */ - if fork_size <= self.kernel_params.max_num_threads / subtree_quota { - if reduces.iter().any(|&reduce| { - self.function.schedules[reduce.idx()].contains(&Schedule::Associative) - }) { - if self.kernel_params.threads_per_warp % (fork_size * subtree_quota) == 0 { - (subtree_map, fork_size * subtree_quota, true) - } else if fork_size % self.kernel_params.threads_per_warp == 0 { - if fork_size >= subtree_quota { - (HashMap::new(), fork_size, true) - } else { - (subtree_map, subtree_quota, false) - } - } else { - (subtree_map, subtree_quota, false) - } + // If there's an associative Reduce, parallelize the larger factor + // between the Fork and subtree + // Else, all Reduces must be only parallel-reduce, so parallelize + // both if they fit and the larger if not. + // The reason for this distinction is that we only perform Reduces over + // ThreadID-based values over consecutive CUDA threads, so there's no + // opportunity for further nested parallelization. In contrast, this + // restriction doesn't help for parallel Writes, so nested parallelization + // is possible. + if reduces.iter().any(|&reduce| { + self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) + }) || fork_size > self.kernel_params.max_num_threads / subtree_quota { + if fork_size >= subtree_quota { + (HashMap::new(), fork_size, true) } else { - (subtree_map, fork_size * subtree_quota, true) + (subtree_map, subtree_quota, false) } - } - // We have to choose either the fork or its subtree - else if fork_size >= subtree_quota { - (HashMap::new(), fork_size, true) } else { - (subtree_map, subtree_quota, false) + (subtree_map, fork_size * subtree_quota, true) } } else { (subtree_map, subtree_quota, false) } } + /* + * If there's a block fork, then thread root forks are it's child forks. If + * not, thread root forks are the root forks. This will be used to begin the + * thread fork tree traversal for codegen. + */ fn get_thread_root_forks( &self, root_forks: &HashSet<NodeID>, @@ -617,6 +649,9 @@ namespace cg = cooperative_groups; } } + /* + * Codegen for all control and data nodes. 
+ */ fn codegen_data_control( &self, block_fork: Option<NodeID>, @@ -627,10 +662,6 @@ namespace cg = cooperative_groups; num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { - // Define the following states: - // 0 is above block fork, 1 is in block fork above any thread fork, 2 is - // in any thread fork, 3 is below block fork - // First emit data and control gen for each control node outside any fork. // Recall that this was tracked through a fake fork node with NodeID 0. let mut state = KernelState::OutBlock; @@ -660,8 +691,7 @@ namespace cg = cooperative_groups; } } } - // Then generate for the thread fork tree by setting state to 2, traverse, - // and update the thread quota. Any traversal is fine, we choose pre-order. + // Then generate for the thread fork tree through Fork node traversal. state = KernelState::InThread; for &root_fork in thread_root_forks { self.codegen_data_control_traverse( @@ -678,6 +708,12 @@ namespace cg = cooperative_groups; Ok(()) } + /* + * The important feature of this traversal is that we update the available + * thread quota, use thread quota, and parallel factor for each Fork node. + * Either this information is in the precomputed map, or we use the parent's + * quota with no parallel factor. + */ fn codegen_data_control_traverse( &self, curr_fork: NodeID, @@ -742,13 +778,6 @@ namespace cg = cooperative_groups; Ok(()) } - // state dictates where we are in the kernel, and affects ThreadID and Write - // use_thread_quota is the number of threads used by the node, and affects - // ThreadID, Read, Write, and associative Binops - // parallel_factor is parallelization degree, and affects ThreadID and associative - // Binops - // nesting_fork is the fork node that the node is nested in, and affects ThreadID - // and Reduce fn codegen_data_node( &self, id: NodeID, @@ -763,7 +792,7 @@ namespace cg = cooperative_groups; let define_variable = self.get_value(id, false, false).to_string(); let tabs = "\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { - // Phi registers were already emitted and the data nodes it uses will + // Phi registers emitted at top and the data nodes it uses will // update the phi Node::Phi { control: _, @@ -785,6 +814,8 @@ namespace cg = cooperative_groups; } KernelState::InThread => { if parallel_factor.is_none() { + // No dependence on threadIdx.x because each used thread + // will run this Fork serially let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; } else { @@ -798,9 +829,10 @@ namespace cg = cooperative_groups; } } } - // Only initialize the reduce, as reduct will update the reduce. If - // serialized, add gate to prevent re-assignment when we hit this reduce - // again + // The Reduce node only generates it's initialization, as reduct will + // perform the update. If serialized, add gate to prevent re-assignment + // when we hit this reduce again due to the control flow loop between + // the Fork and Join. Node::Reduce { control: _, init, @@ -821,18 +853,27 @@ namespace cg = cooperative_groups; } // Parameters emitted at top Node::Parameter { index: _ } => {} + // If the constant is primitive, it's stored in register so we repeat + // for all threads. Otherwise, it's stored in shared memory so we only + // want to "allocate" and initialize it once. 
Node::Constant { id: cons_id } => { + let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); + if (!is_primitive) { + let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; + *num_tabs += 1; + } self.codegen_constant( - if self.types[self.typing[id.idx()].idx()].is_primitive() { - define_variable - } else { - format!("*{}", define_variable) - }, + define_variable, *cons_id, true, w, *num_tabs, )?; + if (!is_primitive) { + write!(w, "{}}}\n", tabs)?; + *num_tabs -= 1; + } } // Dynamic constants emitted at top Node::DynamicConstant { id: _ } => {} @@ -889,12 +930,17 @@ namespace cg = cooperative_groups; let id_type = self.typing[id.idx()]; if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And | BinaryOperator::Xor) && is_special_reduct { + // For parallelized associative Reduces, use the cooperative + // groups reduce API. Associative multiplication is not + // supported. We need to use CGType::Use not CGType::UsePerId + // because for parallelized reduction we only have one thread + // per ThreadID and the reduction is over Use, not UsePerId. let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[left.idx()] { right_val } else { left_val }; - let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + let cg_tile = self.get_cg_tile(id, CGType::Use); #[allow(unreachable_patterns)] let cg_op = match op { BinaryOperator::Add => "plus", @@ -977,12 +1023,13 @@ namespace cg = cooperative_groups; Node::IntrinsicCall { intrinsic, args } => { let id_type = self.typing[id.idx()]; if matches!(intrinsic, Intrinsic::Max | Intrinsic::Min) && is_special_reduct { + // Similar to associative Binops let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[args[0].idx()] { self.get_value(args[1], false, false) } else { self.get_value(args[0], false, false) }; - let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + let cg_tile = self.get_cg_tile(id, CGType::Use); #[allow(unreachable_patterns)] let cg_op = match intrinsic { Intrinsic::Max => "max", @@ -1004,8 +1051,15 @@ namespace cg = cooperative_groups; )?; } } - // Main difference between read and write is codegen_copy takes the - // returned node's type for read and data node's type for write + // For read, all the cases are: + // 1. Reading collection from/to global to/from shared + // 2. Reading primitive from/to global to/from shared + // 3. Reading primitive from/to global to/from register + // 4. Reading primitive from/to shared to/from register + // The first three can all use cooperative groups memcpy and the last + // one can't. However, the C++/CUDA semantics for the last three are + // identical, so we differentiate the cases by data type instead of + // data source and destination. Node::Read { collect, indices } => { let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); @@ -1025,6 +1079,9 @@ namespace cg = cooperative_groups; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } } + // For write, the cases are the same, but since we're using C++/CUDA + // not-thread-safe write semantics, we need to gate the write with + // a thread rank check. 
Node::Write { collect, data, @@ -1034,16 +1091,18 @@ namespace cg = cooperative_groups; let collect_with_indices = self.codegen_collect(*collect, indices, is_char); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; + let nested_fork = nesting_fork.unwrap(); + let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); if self.types[data_type_id.idx()].is_primitive() { + write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; if is_char { let type_name = self.get_type(data_type_id, true); - write!(w, "{}*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; + write!(w, "{}\t*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; } else { - write!(w, "{}*{} = {};\n", tabs, collect_with_indices, data_variable)?; + write!(w, "{}\t*{} = {};\n", tabs, collect_with_indices, data_variable)?; } + write!(w, "{}}}\n", tabs)?; } else { - let nested_fork = nesting_fork.unwrap(); - let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); let data_size = self.get_size(data_type_id, None); write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, collect_with_indices, data_variable, data_size)?; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; @@ -1055,6 +1114,8 @@ namespace cg = cooperative_groups; panic!("Unsupported data node type") } } + // Since the data uses and reducts are responsible for updating Phi and + // Reduce nodes, respectively, we check and emit those for each data node. if let Some(phis) = self.label_data_for_phi.get(&id) { let val = self.get_value(id, false, false); for phi in phis { @@ -1083,12 +1144,8 @@ namespace cg = cooperative_groups; w_term: &mut String, ) -> Result<usize, Error> { let tabs = match &self.function.nodes[id.idx()] { - Node::Start => { - let succ = self.control_subgraph.succs(id).next().unwrap(); - write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; - 1 - } - Node::Region { preds: _ } + Node::Start + | Node::Region { preds: _ } | Node::Projection { control: _, selection: _ } => { let succ = self.control_subgraph.succs(id).next().unwrap(); write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; @@ -1110,8 +1167,14 @@ namespace cg = cooperative_groups; 1 } Node::Fork { control: _, factors: _ } => { - // We don't do anything smart to mitigate control flow divergence - // if use_thread_quota < warp size + // We create a cooperative group tile for each of: used threads per + // thread ID- for reads and writes-, used threads across all thread + // IDs- for parallelized reductions-, and available threads- to + // synchronize between used and unused threads. We want to create + // these only once, so we create two goto sections for each fork- + // one run only once for creating groups, and other may be ran + // multiple times if the Fork is serialized and Join jumps back + // to it. let cg_tile = self.get_cg_tile(id, CGType::UsePerId); if use_thread_quota.is_some() { let use_thread_quota = use_thread_quota.unwrap(); @@ -1124,6 +1187,9 @@ namespace cg = cooperative_groups; write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota)?; } write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; + // Fork nodes gate the used vs unused threads out of all available + // threads. If unused, we jump straight to the Join, and if used, + // we jump to successor like normal. 
let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { write!(w_post_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; @@ -1140,6 +1206,8 @@ namespace cg = cooperative_groups; } } Node::Join { control: fork } => { + // Join nodes also gate the used vs unused threads with a tile + // sync after the body. let succ = self.control_subgraph.succs(id).next().unwrap(); let has_thread_quota = available_thread_quota.is_some(); if has_thread_quota { @@ -1150,9 +1218,14 @@ namespace cg = cooperative_groups; write!(w_term, "\t}}\n")?; } } + let cg_tile_available = self.get_cg_tile(id, CGType::Available); + write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + // If the Fork was parallelized, each thread or UsedPerId tile of + // threads only runs one ThreadID, so we can jump straight to the + // successor. Else, we jump back to the Fork until all ThreadIDs + // or equivalently the Fork's full factor number of iterations have + // been completed. if parallel_factor.is_some() { - let cg_tile_available = self.get_cg_tile(id, CGType::Available); - write!(w_term, "\t{}.sync();\n", cg_tile_available)?; write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; } else { let Node::Fork { factors, .. } = &self.function.nodes[fork.idx()] else { @@ -1171,6 +1244,8 @@ namespace cg = cooperative_groups; if has_thread_quota { 2 } else { 1 } } Node::Return { control: _, data } => { + // Since we lift originally primitive returns into a parameter, + // we write to that parameter upon return. if self.types[self.typing[data.idx()].idx()].is_primitive() { let return_val = self.get_value(*data, false, false); write!(w_term, "\tif (threadIdx.x == 0) {{\n")?; @@ -1187,7 +1262,13 @@ namespace cg = cooperative_groups; Ok(tabs) } - // Read/writes to global collections consist of global name + pointer offset. + /* + * This function emits collection name + pointer math for the provided indices. + * One nuance is whether the collection is represented as char pointer or + * the original primitive pointer. For Field, it's always char, for Variant, + * it doesn't matter here, and for Array, it depends- so we may need to tack + * on the element size to the index math. + */ fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { let mut index_ptr = "0".to_string(); let type_id = self.typing[collect.idx()]; @@ -1206,15 +1287,7 @@ namespace cg = cooperative_groups; else { panic!("Expected array type") }; - let mut cumulative_offset = "1 * ".to_string() - + extents - .iter() - .enumerate() - .filter(|(i, _)| *i >= array_indices.len()) - .map(|(_, id)| format!("dc{}", id.idx())) - .collect::<Vec<_>>() - .join(" * ") - .as_str(); + let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); for index in array_indices.iter().rev() { cumulative_offset = format!( "{} * ({} + ", @@ -1238,10 +1311,17 @@ namespace cg = cooperative_groups; format!("{} + {}", name, index_ptr) } - // Standalone function allows us to handle recursive initialization for - // product and summation collections. `allow_allocate` prevents unnecessary - // shared memory allocations for nested product and summation collections. - // Since not initialized, array collections don't need to be recursed into. + /* + * The outlined codegen for constants allows us to handle recursive initialization + * for collections. 
We perform "allocation" by atomically incrementing dynamic + * shared memory and CUDA's support for dynamic is limited to a single extern + * array. Dynamic is required here because not all dynamic constants and therefore + * array sizes are known. This approach will need further work, as currently + * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate` + * prevents unnecessary shared memory allocations for nested product and summation + * collections, since the outermost allocates everything for the full collection. + * Since not initialized, array collections don't need to be recursed into. + */ fn codegen_constant( &self, name: String, @@ -1335,10 +1415,12 @@ namespace cg = cooperative_groups; Ok(()) } - // Emit code to calculate data size. For Product types, setting `field_number` - // gives data size up to but not including that field, so = 2 gives 1st field - // and offset to 2nd field. This is useful for generating constant initialization - // and read/write index math. + /* + * Emit code to calculate data size. For Product types, setting `num_fields` + * gives data size up to but not including that field, so = 2 gives 1st field + * and offset to 2nd field. This is useful for constant initialization and read/write + * index math. + */ fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { @@ -1541,16 +1623,6 @@ namespace cg = cooperative_groups; } } - // matmul detection- only called if einsum detected - fn matmul_detection(&self) -> Result<(), Error> { - Ok(()) - } - - // convolution detection- only called if einsum detected - fn convolution_detection(&self) -> Result<(), Error> { - Ok(()) - } - fn get_cg_tile(&self, fork: NodeID, cg_type: CGType) -> String { format!("cg_{}{}", self.get_value(fork, false, false), if cg_type == CGType::Use { "_use" } else if cg_type == CGType::Available { "_available" } else { "" }) } @@ -1567,8 +1639,11 @@ namespace cg = cooperative_groups; format!("bb_{}{}", self.get_value(id, false, false), if post { "_post" } else { "" }) } - // Setting ty = true will return with type in declaration format. make_pointer - // is only considered if ty = true and only relevant for primitive types. + /* + * Setting `ty = true` will return with type in declaration format. `make_pointer` + * is only considered if `ty = true` and only relevant for primitive types- + * otherwise it makes no difference because collections are already pointers. 
+ */ fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] { if ty { @@ -1595,11 +1670,9 @@ namespace cg = cooperative_groups; } } - // Setting make_pointer = true will only affect primitive types- the - // collections are already pointers fn get_type(&self, id: TypeID, make_pointer: bool) -> String { match &self.types[id.idx()] { - // Product and summation collections are char* for byte-addressability + // Product and summation collections are char* for 1 byte-addressability // since we can have variable type fields Type::Product(_) | Type::Summation(_) => "char*".to_string(), Type::Array(element_type, _) => self.get_type(*element_type, true), @@ -1623,7 +1696,6 @@ fn multiply_dcs(dcs: &[DynamicConstantID]) -> String { } } -// TODO: Add float8, float16, bfloat16 dtypes if they come fn convert_type(ty: &Type, make_pointer: bool) -> String { let mut result = match ty { Type::Boolean => "bool".to_string(), -- GitLab From 0607cf119e64a551767d7202e73634e3614eed5f Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 6 Jan 2025 09:57:39 -0600 Subject: [PATCH 041/109] comms --- hercules_cg/src/gpu.rs | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 10e7d9e3..38d4a9bb 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -38,10 +38,37 @@ pub fn gpu_codegen<W: Write>( * largest element * - Summation types must be aligned to their largest element * - * Major TODOs: + * Notes on GPU parallelization strategy and tips for IR transformations: + * - The top level block fork and any lower thread forks require a known Fork + * size. Thus for an otherwise parallelizable Fork with unknown size, + * consider splitting it into two Forks with one of known size. For block + * level, the known fork has to be the (only) top-most fork. + * - The thread-level strategy is determined by starting at the most nested + * Forks and working outwards in a greedy manner, with caps by GPU spec. + * Thus, to ensure some outer Fork is parallelized, ensure the inner + * parallelizable Forks aren't too large or consider removing schedule + * annotations. + * - Tight-Associative reductions can only be efficiently implemented if + * different Hercules ThreadIDs correspond to consecutive CUDA threads. But + * this prevents nested parallelization since each parallel group must always + * be a contiguous tile of threads. We use a heuristic of choosing the larger + * factor when this results in a conflict between a Fork and it's subtree, + * but this choice may not be optimal. + * - A given Fork (not talking about its children) can only be parallelized + * if all its Reduces are Parallel-Reduce or Tight-Associative. So if the + * Fork contains expensive parallelizable operations, ensure all reductions + * are parallelizable or if not try pulling those out into a different Fork. + * - We do nothing to mitigate intra-warp divergence. To mitigate this, the + * IR, for example, should ensure the innermost parallelizable Forks either + * have factor >= warp size (32) or remove Fork/Reduce node schedule + * annotations. + * + * Main TODOs: * - Fix dynamic shared memory allocation to reuse old shmem. The main case - * for improvement is when we have serialized forks with unused intermediate - * values from previous iterations. 
+ * for improvement is when we have serialized forks with unused intermediate + * values from previous iterations. + * - Add mapping from Region node to Fork node if there's a reduce whose control + * is a Region not Join. * - Matmul/Conv detection * - Add float8, float16, bfloat16 dtypes if they come */ -- GitLab From 249294c59101d9c738f18eca5bdb11463fce4680 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 6 Jan 2025 11:16:51 -0600 Subject: [PATCH 042/109] gpu juno --- juno_samples/matmul/src/gpu_matmul.jn | 45 +++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 juno_samples/matmul/src/gpu_matmul.jn diff --git a/juno_samples/matmul/src/gpu_matmul.jn b/juno_samples/matmul/src/gpu_matmul.jn new file mode 100644 index 00000000..e719ba9b --- /dev/null +++ b/juno_samples/matmul/src/gpu_matmul.jn @@ -0,0 +1,45 @@ +#[entry] +fn tiled_64_matmul_with_n_1024<m : usize, l : usize>(a : i32[1024, m], b : i32[m, l]) -> i32 { + let res = 0; + + for bi = 0 to 16 { + for bk = 0 to l / 64 { + // TODO: make these all the same size, clone analysis should undo GVN's + // combining of these three arrays. + let atile : i32[66, 64]; + let btile : i32[65, 64]; + let ctile : i32[64, 64]; + + for tile_idx = 0 to m / 64 { + for ti = 0 to 64 { + for tk = 0 to 64 { + atile[ti, tk] = a[bi * 64 + ti, tile_idx * 64 + tk]; + btile[ti, tk] = b[tile_idx * 64 + ti, bk * 64 + tk]; + // TODO: remove setting ctile to zero explicitly, clone analysis + // should see a lack of a phi for ctile in the block loops and + // induce a copy of an initial value of ctile (all zeros) on each + // iteration of the block loops. + ctile[ti, tk] = 0; + } + } + for ti = 0 to 64 { + for tk = 0 to 64 { + let c_acc = ctile[ti, tk]; + for inner_idx = 0 to 64 { + c_acc += atile[ti, inner_idx] * btile[inner_idx, tk]; + } + ctile[ti, tk] = c_acc; + } + } + } + + for ti = 0 to 64 { + for tk = 0 to 64 { + res += ctile[ti, tk]; + } + } + } + } + + return res; +} -- GitLab From 4897c13fde5e8372bfca19a820699b6d08309e39 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 6 Jan 2025 12:30:03 -0600 Subject: [PATCH 043/109] minor --- hercules_cg/src/gpu.rs | 50 ++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 38d4a9bb..25443be5 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -459,13 +459,11 @@ namespace cg = cooperative_groups; } /* - * Emit helper registers that are used throughout the kernel. alignment - * is for proper dynamic shared memory allocation. grid and block are - * from CUDA's cooperative groups API and are used specifically for reads and - * writes. + * Emit helper registers that are used throughout the kernel. grid and block + * are from CUDA's cooperative groups API and are used specifically for reads + * and writes. 
*/ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { - write!(w, "\tsize_t alignment;\n")?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?; Ok(()) @@ -479,9 +477,15 @@ namespace cg = cooperative_groups; fn codegen_gotos(&self, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> { write!(w, "\n")?; - for (_, goto) in gotos.iter() { + for (id, goto) in gotos.iter() { + let goto_block = self.get_block_name(*id, false); + write!(w, "{}:\n", goto_block)?; write!(w, "{}\n", goto.init)?; - write!(w, "{}\n", goto.post_init)?; + if !goto.post_init.is_empty() { + let goto_block = self.get_block_name(*id, true); + write!(w, "{}:\n", goto_block)?; + write!(w, "{}\n", goto.post_init)?; + } write!(w, "{}\n", goto.body)?; write!(w, "{}\n\n", goto.term)?; } @@ -886,7 +890,11 @@ namespace cg = cooperative_groups; Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); if (!is_primitive) { - let cg_tile = self.get_cg_tile(id, CGType::UsePerId); + let cg_tile = match state { + KernelState::OutBlock + | KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(id, CGType::UsePerId), + }; write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; *num_tabs += 1; } @@ -967,6 +975,8 @@ namespace cg = cooperative_groups; } else { left_val }; + // Special reduct is only enabled for thread parallelization + // so don't need to worry about grid and block cases let cg_tile = self.get_cg_tile(id, CGType::Use); #[allow(unreachable_patterns)] let cg_op = match op { @@ -1086,7 +1096,7 @@ namespace cg = cooperative_groups; // The first three can all use cooperative groups memcpy and the last // one can't. However, the C++/CUDA semantics for the last three are // identical, so we differentiate the cases by data type instead of - // data source and destination. + // data src/dest, with only collection type using collective group. Node::Read { collect, indices } => { let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char); @@ -1100,15 +1110,19 @@ namespace cg = cooperative_groups; } } else { let nested_fork = nesting_fork.unwrap(); - let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); + let cg_tile = match state { + KernelState::OutBlock => "grid".to_string(), + KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(nested_fork, CGType::UsePerId), + }; let data_size = self.get_size(data_type_id, None); write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, define_variable, collect_with_indices, data_size)?; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } } - // For write, the cases are the same, but since we're using C++/CUDA - // not-thread-safe write semantics, we need to gate the write with - // a thread rank check. + // For write, the cases are the same, but when using C++ dereference + // semantics, we need to gate the write with a thread rank check for + // thread safety. 
Node::Write { collect, data, @@ -1119,7 +1133,11 @@ namespace cg = cooperative_groups; let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; let nested_fork = nesting_fork.unwrap(); - let cg_tile = self.get_cg_tile(nested_fork, CGType::UsePerId); + let cg_tile = match state { + KernelState::OutBlock => "grid".to_string(), + KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(nested_fork, CGType::UsePerId), + }; if self.types[data_type_id.idx()].is_primitive() { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; if is_char { @@ -1244,9 +1262,9 @@ namespace cg = cooperative_groups; write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; write!(w_term, "\t}}\n")?; } + let cg_tile_available = self.get_cg_tile(id, CGType::Available); + write!(w_term, "\t{}.sync();\n", cg_tile_available)?; } - let cg_tile_available = self.get_cg_tile(id, CGType::Available); - write!(w_term, "\t{}.sync();\n", cg_tile_available)?; // If the Fork was parallelized, each thread or UsedPerId tile of // threads only runs one ThreadID, so we can jump straight to the // successor. Else, we jump back to the Fork until all ThreadIDs -- GitLab From 21ef764d7ab37df0396154d529254ffbd8dcf19b Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 10 Jan 2025 10:21:09 -0600 Subject: [PATCH 044/109] w host --- hercules_cg/src/device.rs | 2 + hercules_cg/src/gpu.rs | 325 +++++++++++++----- hercules_opt/src/pass.rs | 1 + hercules_samples/matmul/src/matmul.hir | 8 +- juno_samples/test2.jn | 25 ++ .../__pycache__/mobilenet.cpython-310.pyc | Bin 0 -> 582 bytes .../__pycache__/torch_export.cpython-310.pyc | Bin 0 -> 3266 bytes 7 files changed, 264 insertions(+), 97 deletions(-) create mode 100644 juno_samples/test2.jn create mode 100644 torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc create mode 100644 torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc diff --git a/hercules_cg/src/device.rs b/hercules_cg/src/device.rs index 866fa6ad..09a5bc26 100644 --- a/hercules_cg/src/device.rs +++ b/hercules_cg/src/device.rs @@ -9,6 +9,8 @@ pub fn device_placement(functions: &Vec<Function>, callgraph: &CallGraph) -> Vec let mut devices = vec![]; for (idx, function) in functions.into_iter().enumerate() { + devices.push(Device::CUDA); + continue; if let Some(device) = function.device { devices.push(device); } else if function.entry || callgraph.num_callees(FunctionID::new(idx)) != 0 { diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 25443be5..a153b7ef 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -78,6 +78,12 @@ pub fn gpu_codegen<W: Write>( .map(NodeID::new) .collect(); + + let fork_join_map = &fork_join_map(function, control_subgraph); + let join_fork_map: &HashMap<NodeID, NodeID> = &fork_join_map + .into_iter() + .map(|(fork, join)| (*join, *fork)) + .collect(); // Fork Reduce map should have all reduces contained in some key let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); // Reduct Reduce map should have all non-parallel and non-associative reduces @@ -92,9 +98,8 @@ pub fn gpu_codegen<W: Write>( } = &function.nodes[reduce_node.idx()] { match function.nodes[control.idx()] { - Node::Join { - control: fork_node, .. 
- } => { + Node::Join {..} => { + let fork_node = join_fork_map[control]; fork_reduce_map .entry(fork_node) .or_default() @@ -123,8 +128,7 @@ pub fn gpu_codegen<W: Write>( for idx in 0..function.nodes.len() { if function.nodes[idx].is_fork() && fork_reduce_map - .get(&NodeID::new(idx)) - .map_or(true, |reduces| reduces.is_empty()) + .get(&NodeID::new(idx)).is_none_or(|reduces| reduces.is_empty()) { panic!("Fork node {} has no reduce nodes", idx); } @@ -197,7 +201,7 @@ pub fn gpu_codegen<W: Write>( }; let label_data_for_phi = &label_data_for_phi(); - let fork_join_map = &fork_join_map(function, control_subgraph); + let def_use_map = &def_use(function); let ctx = GPUContext { function, @@ -208,10 +212,12 @@ pub fn gpu_codegen<W: Write>( control_subgraph, bbs, kernel_params, + def_use_map, + fork_join_map, + join_fork_map, fork_reduce_map, reduct_reduce_map, label_data_for_phi, - fork_join_map, return_type_id, }; ctx.codegen_function(w) @@ -232,10 +238,12 @@ struct GPUContext<'a> { control_subgraph: &'a Subgraph, bbs: &'a BasicBlocks, kernel_params: &'a GPUKernelParams, + def_use_map: &'a ImmutableDefUseMap, + fork_join_map: &'a HashMap<NodeID, NodeID>, + join_fork_map: &'a HashMap<NodeID, NodeID>, fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>, - fork_join_map: &'a HashMap<NodeID, NodeID>, return_type_id: &'a TypeID, } @@ -296,12 +304,12 @@ impl GPUContext<'_> { // Create structures and determine block and thread parallelization strategy let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); - println!("fork join map size: {}", self.fork_join_map.len()); - println!("fork tree size: {}", fork_tree.len()); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); + println!("num_blocks: {}", num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); + let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); // Setup for CUDA's "goto" for control flow between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) @@ -324,17 +332,22 @@ impl GPUContext<'_> { &fork_tree, &fork_control_map, &fork_thread_quota_map, + &extra_dim_collects, num_threads, &mut gotos, )?; - // Emit all code from the previous step - let mut rest = String::new(); - self.codegen_gotos(&mut gotos, &mut rest)?; - write!(w, "{}", rest)?; - + // Emit all GPU kernel code from previous steps + let mut kernel_body = String::new(); + self.codegen_gotos(&mut gotos, &mut kernel_body)?; + write!(w, "{}", kernel_body)?; write!(w, "}}\n")?; + // Emit host launch code + let mut host_launch = String::new(); + self.codegen_launch_code(num_blocks, num_threads, &mut host_launch)?; + write!(w, "{}", host_launch)?; + Ok(()) } @@ -347,7 +360,6 @@ impl GPUContext<'_> { #include <cuda.h> #include <cuda_runtime.h> #include <mma.h> -#include <helper_cuda.h> #include <cooperative_groups.h> #include <cooperative_groups/memcpy_async.h> #include <cooperative_groups/reduce.h> @@ -407,6 +419,8 @@ namespace cg = cooperative_groups; // Casting is later used for conversion to different types like int. 
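Taken together, the signature and prologue emitted here give every kernel the same shape: dynamic constants arrive as unsigned long long parameters (cast to narrower types at their use sites), collection parameters and any lifted primitive return come in as pointers, and a single extern shared buffer plus a byte offset backs all on-chip allocations. A hand-written sketch of that shape, with illustrative names and bounds:

    #include <cstdint>

    __global__ void __launch_bounds__(1024) example_kernel(
        unsigned long long dc_p0, // dynamic constant, cast where smaller ints are needed
        float *p0,                // ordinary collection parameter
        int *ret                  // primitive return lifted to a pointer parameter
    ) {
        extern __shared__ char dynamic_shared[];
        uint64_t dynamic_shared_offset = 0;
        // ... per-value declarations, cooperative-group helpers, then a goto into
        // the Start node's basic block ...
    }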
write!(w, ") {{\n")?; write!(w, "\textern __shared__ char dynamic_shared[];\n")?; + // This will only get used by thread rank 0 in each block, since it + // does all shared memory "allocation" write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; Ok(()) @@ -471,7 +485,7 @@ namespace cg = cooperative_groups; fn codegen_goto_start(&self, w: &mut String) -> Result<(), Error> { let block_start = self.get_block_name(NodeID::new(0), false); - write!(w, "goto {};\n", block_start)?; + write!(w, "\tgoto {};\n", block_start)?; Ok(()) } @@ -480,15 +494,59 @@ namespace cg = cooperative_groups; for (id, goto) in gotos.iter() { let goto_block = self.get_block_name(*id, false); write!(w, "{}:\n", goto_block)?; - write!(w, "{}\n", goto.init)?; + write!(w, "{}", goto.init)?; if !goto.post_init.is_empty() { let goto_block = self.get_block_name(*id, true); write!(w, "{}:\n", goto_block)?; - write!(w, "{}\n", goto.post_init)?; + write!(w, "{}", goto.post_init)?; + } + write!(w, "{}", goto.body)?; + write!(w, "{}\n", goto.term)?; + } + Ok(()) + } + + fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, w: &mut String) -> Result<(), Error> { + write!(w, " +int main(")?; + // The following steps are for host-side C function arguments, but we also + // need to pass arguments to kernel, so we keep track of the arguments here. + let mut pass_args = String::new(); + // The first set of parameters are dynamic constants. + let mut first_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; + } + write!(w, "unsigned long long dc_p{}", idx)?; + write!(pass_args, "dc_p{}", idx)?; + } + // The second set of parameters are normal arguments. + for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; } - write!(w, "{}\n", goto.body)?; - write!(w, "{}\n\n", goto.term)?; + let param_type = self.get_type(*ty, false); + write!(w, "{} p{}", param_type, idx)?; + write!(pass_args, "p{}", idx)?; + } + // Pull primitive return to a pointer parameter + if self.types[self.return_type_id.idx()].is_primitive() { + write!(w, ", ")?; + write!(pass_args, ", ")?; + let ret_type = self.get_type(*self.return_type_id, true); + write!(w, "{} ret", ret_type)?; + write!(pass_args, "ret")?; } + write!(w, ") {{ + {}<<<{}, {}>>>({}); +}}", self.function.name, num_blocks, num_threads, pass_args); Ok(()) } @@ -510,17 +568,19 @@ namespace cg = cooperative_groups; fork_nesting.into_iter().fold( (HashMap::new(), HashMap::new()), |(mut fork_tree, mut fork_control_map), (control, forks)| { - let nested_fork = forks.first().copied().unwrap_or(NodeID::new(0)); if self.function.nodes[control.idx()].is_fork() { - fork_tree.entry(nested_fork).or_insert_with(HashSet::new).insert(control); - } else { - fork_control_map.entry(nested_fork).or_insert_with(HashSet::new).insert(control); - } - if forks.len() > 1 { - for i in 0..forks.len()-1 { - fork_tree.entry(forks[i+1]).or_insert_with(HashSet::new).insert(forks[i]); - } + // If control node is fork make sure it's in the fork_tree even + // if has no nested forks. + fork_tree.entry(control).or_insert_with(HashSet::new); + // Then get it's nesting fork- index = 1 to not count itself! 
+ let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0)); + fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(control); + println!("fork_tree parent: {}, child: {}", nesting_fork.idx(), control.idx()); } + // Here the desired fork is always the first fork + let fork = forks.first().copied().unwrap_or(NodeID::new(0)); + fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(control); + println!("fork_control_map parent: {}, child: {}", fork.idx(), control.idx()); (fork_tree, fork_control_map) }, ) @@ -557,6 +617,25 @@ namespace cg = cooperative_groups; } } + /* + * If there's a block fork, then thread root forks are it's child forks. If + * not, thread root forks are the root forks. This will be used to begin the + * thread fork tree traversal for codegen. + */ + fn get_thread_root_forks( + &self, + root_forks: &HashSet<NodeID>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, + num_blocks: usize, + ) -> (NodeID, HashSet<NodeID>) { + if num_blocks > 1 { + let root_fork = root_forks.iter().next().unwrap(); + (*root_fork, fork_tree.get(&root_fork).unwrap().iter().copied().collect()) + } else { + (NodeID::new(0), root_forks.clone()) + } + } + /* * This analysis determines the parallelization strategy within threadblocks. * We run post-order traversal on the fork tree to get the thread quota per @@ -661,23 +740,43 @@ namespace cg = cooperative_groups; } } - /* - * If there's a block fork, then thread root forks are it's child forks. If - * not, thread root forks are the root forks. This will be used to begin the - * thread fork tree traversal for codegen. + /* + * All non reduced-over collections used in fork joins have an extra dimension. + * However, this is only useful if ThreadIDs run in parallel not serially, + * otherwise it's unnecessarily consuming shared memory. This function returns + * the set of collections that have an unnecessary extra dimension. 
*/ - fn get_thread_root_forks( + fn get_extra_dim_collects( &self, - root_forks: &HashSet<NodeID>, - fork_tree: &HashMap<NodeID, HashSet<NodeID>>, - num_blocks: usize, - ) -> (NodeID, HashSet<NodeID>) { - if num_blocks > 1 { - (NodeID::new(0), root_forks.clone()) - } else { - let root_fork = root_forks.iter().next().unwrap(); - (*root_fork, fork_tree.get(&root_fork).unwrap().iter().copied().collect()) - } + fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, + fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, + ) -> HashSet<TypeID> { + // Get all constant collection creations + let collect_consts: HashSet<NodeID> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_constant() && !self.types[self.typing[*idx].idx()].is_primitive()) + .map(|idx| NodeID::new(idx)) + .collect(); + // Reverse fork_control_map + let control_fork_map: HashMap<NodeID, NodeID> = fork_control_map.iter() + .flat_map(|(fork, controls)| { + controls.iter().map(move |control| (*control, *fork)) + }) + .collect(); + // Get all uses of each collection, map each use to basic block, then map each basic block to fork + let collect_fork_users: HashMap<NodeID, HashSet<NodeID>> = collect_consts.iter() + .map(|collect_const| { + (*collect_const, self.def_use_map.get_users(*collect_const)) + }) + .map(|(collect_const, users)| { + (collect_const, users.iter().map(|user| control_fork_map[&self.bbs.0[user.idx()]]).collect()) + }) + .collect(); + // For now assert that each collection is used by a single fork and get + // parallel status, TODO: revisit + collect_fork_users.iter() + .filter(|(_, fork_users)| !fork_thread_quota_map.contains_key(fork_users.iter().next().unwrap())) + .map(|(collect_const, _)| self.typing[collect_const.idx()]) + .collect() } /* @@ -690,6 +789,7 @@ namespace cg = cooperative_groups; fork_tree: &HashMap<NodeID, HashSet<NodeID>>, fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, + extra_dim_collects: &HashSet<TypeID>, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -697,6 +797,7 @@ namespace cg = cooperative_groups; // Recall that this was tracked through a fake fork node with NodeID 0. 
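Control flow in the emitted kernel is all labels and gotos: each control node owns one labeled block split into init / post_init / body / term, every data value is declared up front so no jump crosses an initialization, and anything outside a fork hangs off the fake fork with NodeID 0. A serialized (non-parallelized) fork then becomes a loop by having its join jump back to the fork's block. The following is a hand-written sketch of that structure, with made-up labels and an assumed fork factor of 8:

    __global__ void control_skeleton(int *ret) {
        // All data "registers" are declared before the first goto.
        int i;
        int acc;

        goto bb_start;

    bb_start:                 // control node outside any fork (fake fork NodeID 0)
        acc = 0;
        i = 0;
        goto bb_fork;

    bb_fork:                  // serialized fork: one trip per ThreadID
        acc += i;
        goto bb_join;

    bb_join:                  // join jumps back until every ThreadID has run
        i += 1;
        if (i < 8) goto bb_fork;
        goto bb_ret;

    bb_ret:
        if (threadIdx.x == 0) *ret = acc;
        return;
    }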
let mut state = KernelState::OutBlock; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { + println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -704,13 +805,14 @@ namespace cg = cooperative_groups; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, None, None, None, false, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists if block_fork.is_some() { state = KernelState::InBlock; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { + println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -718,7 +820,7 @@ namespace cg = cooperative_groups; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, body, &mut tabs)?; } } } @@ -733,6 +835,7 @@ namespace cg = cooperative_groups; fork_thread_quota_map, 1, num_threads, + extra_dim_collects, gotos, )?; } @@ -754,6 +857,7 @@ namespace cg = cooperative_groups; fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, parent_quota: usize, num_threads: usize, + extra_dim_collections: &HashSet<TypeID>, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map @@ -775,6 +879,7 @@ namespace cg = cooperative_groups; HashSet::new() }; for control in fork_control_map.get(&curr_fork).unwrap() { + println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -789,6 +894,7 @@ namespace cg = cooperative_groups; parallel_factor, Some(curr_fork), reducts.contains(data), + extra_dim_collections, body, &mut tabs, )?; @@ -803,6 +909,7 @@ namespace cg = cooperative_groups; fork_thread_quota_map, use_thread_quota, num_threads, + extra_dim_collections, gotos, )?; } @@ -817,6 +924,7 @@ namespace cg = cooperative_groups; parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, is_special_reduct: bool, + extra_dim_collects: &HashSet<TypeID>, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { @@ -890,10 +998,12 @@ namespace cg = cooperative_groups; Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); if (!is_primitive) { - let cg_tile = match state { - KernelState::OutBlock - | KernelState::InBlock => "block".to_string(), - KernelState::InThread => self.get_cg_tile(id, CGType::UsePerId), + let cg_tile = { + let KernelState::OutBlock = state else { + panic!("Expected constant to be in start basic block + outside any fork"); + }; + "block".to_string() }; write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; *num_tabs += 1; @@ -902,6 +1012,7 @@ namespace cg = cooperative_groups; define_variable, 
*cons_id, true, + Some(extra_dim_collects), w, *num_tabs, )?; @@ -1099,14 +1210,14 @@ namespace cg = cooperative_groups; // data src/dest, with only collection type using collective group. Node::Read { collect, indices } => { let is_char = self.is_char(self.typing[collect.idx()]); - let collect_with_indices = self.codegen_collect(*collect, indices, is_char); + let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_type_id = self.typing[id.idx()]; if self.types[data_type_id.idx()].is_primitive() { if is_char { let type_name = self.get_type(data_type_id, true); write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?; } else { - write!(w, "{}{} = *{};\n", tabs, define_variable, collect_with_indices)?; + write!(w, "{}{} = *({});\n", tabs, define_variable, collect_with_indices)?; } } else { let nested_fork = nesting_fork.unwrap(); @@ -1115,7 +1226,7 @@ namespace cg = cooperative_groups; KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nested_fork, CGType::UsePerId), }; - let data_size = self.get_size(data_type_id, None); + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, define_variable, collect_with_indices, data_size)?; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } @@ -1129,7 +1240,7 @@ namespace cg = cooperative_groups; indices, } => { let is_char = self.is_char(self.typing[collect.idx()]); - let collect_with_indices = self.codegen_collect(*collect, indices, is_char); + let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; let nested_fork = nesting_fork.unwrap(); @@ -1144,11 +1255,11 @@ namespace cg = cooperative_groups; let type_name = self.get_type(data_type_id, true); write!(w, "{}\t*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; } else { - write!(w, "{}\t*{} = {};\n", tabs, collect_with_indices, data_variable)?; + write!(w, "{}\t*({}) = {};\n", tabs, collect_with_indices, data_variable)?; } write!(w, "{}}}\n", tabs)?; } else { - let data_size = self.get_size(data_type_id, None); + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, collect_with_indices, data_variable, data_size)?; write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; } @@ -1169,7 +1280,7 @@ namespace cg = cooperative_groups; } } if let Some(reduces) = self.reduct_reduce_map.get(&id) { - let val = self.get_value(id, true, false); + let val = self.get_value(id, false, false); for reduce in reduces { let reduce_val = self.get_value(*reduce, false, false); write!(w, "{}{} = {};\n", tabs, reduce_val, val)?; @@ -1223,21 +1334,29 @@ namespace cg = cooperative_groups; let cg_tile = self.get_cg_tile(id, CGType::UsePerId); if use_thread_quota.is_some() { let use_thread_quota = use_thread_quota.unwrap(); - let use_thread_per_id = use_thread_quota / parallel_factor.unwrap(); + let use_thread_per_id = if parallel_factor.is_some() { + use_thread_quota / parallel_factor.unwrap() + } else { + use_thread_quota + }; write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id)?; let cg_tile_use 
= self.get_cg_tile(id, CGType::Use); write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota)?; let available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota)?; + if parallel_factor.is_none() { + write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?; + write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; + } } - write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?; // Fork nodes gate the used vs unused threads out of all available // threads. If unused, we jump straight to the Join, and if used, // we jump to successor like normal. let succ = self.control_subgraph.succs(id).next().unwrap(); if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota { - write!(w_post_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; + let w_target = if parallel_factor.is_none() { w_post_init } else { w_init }; + write!(w_target, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?; write!(w_term, "\t}}\n")?; write!(w_term, "\telse {{\n")?; @@ -1246,23 +1365,30 @@ namespace cg = cooperative_groups; write!(w_term, "\t}}\n")?; 2 } else { + // Make sure post-init isn't empty so it goto header generated + if use_thread_quota.is_some() && parallel_factor.is_none() { + write!(w_post_init, " ")?; + } write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; 1 } } - Node::Join { control: fork } => { + Node::Join { control: _ } => { // Join nodes also gate the used vs unused threads with a tile // sync after the body. let succ = self.control_subgraph.succs(id).next().unwrap(); let has_thread_quota = available_thread_quota.is_some(); + let mut tabs = 1; if has_thread_quota { let available_thread_quota = available_thread_quota.unwrap(); let use_thread_quota = use_thread_quota.unwrap(); if use_thread_quota < available_thread_quota { write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?; write!(w_term, "\t}}\n")?; + tabs += 1; } - let cg_tile_available = self.get_cg_tile(id, CGType::Available); + let fork = self.join_fork_map.get(&id).unwrap(); + let cg_tile_available = self.get_cg_tile(*fork, CGType::Available); write!(w_term, "\t{}.sync();\n", cg_tile_available)?; } // If the Fork was parallelized, each thread or UsedPerId tile of @@ -1273,8 +1399,9 @@ namespace cg = cooperative_groups; if parallel_factor.is_some() { write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?; } else { + let fork = self.join_fork_map.get(&id).unwrap(); let Node::Fork { factors, .. 
} = &self.function.nodes[fork.idx()] else { - panic!("Expected join node to use a fork node"); + panic!("Expected join_fork_map to point to a fork node"); }; let fork_size = multiply_dcs(factors); let fork_iter = self.get_fork_iter(*fork, false); @@ -1286,7 +1413,7 @@ namespace cg = cooperative_groups; write!(w_term, "\t\tgoto {};\n", self.get_block_name(*fork, true))?; write!(w_term, "\t}}\n")?; } - if has_thread_quota { 2 } else { 1 } + tabs } Node::Return { control: _, data } => { // Since we lift originally primitive returns into a parameter, @@ -1314,13 +1441,13 @@ namespace cg = cooperative_groups; * it doesn't matter here, and for Array, it depends- so we may need to tack * on the element size to the index math. */ - fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool) -> String { + fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool, has_extra_dim: bool) -> String { let mut index_ptr = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { match index { Index::Field(field) => { - self.get_size(type_id, Some(*field)); + self.get_size(type_id, Some(*field), None); } // Variants of summations have zero offset Index::Variant(_) => {} @@ -1333,11 +1460,12 @@ namespace cg = cooperative_groups; panic!("Expected array type") }; let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); - for index in array_indices.iter().rev() { + for (i, index) in array_indices.iter().skip(if has_extra_dim { 1 } else { 0 }).rev().enumerate() { cumulative_offset = format!( - "{} * ({} + ", + "{} * ({} + {}", cumulative_offset, - self.get_value(*index, false, false) + self.get_value(*index, false, false), + format!("dc{}", extents[i].idx()) ); } index_ptr.push_str(&format!( @@ -1346,7 +1474,7 @@ namespace cg = cooperative_groups; ")".repeat(array_indices.len()) )); if is_char { - let element_size = self.get_size(*element_type, None); + let element_size = self.get_size(*element_type, None, None); index_ptr.push_str(&format!(" * {}", element_size)); } } @@ -1372,6 +1500,7 @@ namespace cg = cooperative_groups; name: String, cons_id: ConstantID, allow_allocate: bool, + extra_dim_collects: Option<&HashSet<TypeID>>, w: &mut String, num_tabs: usize, ) -> Result<(), Error> { @@ -1388,12 +1517,12 @@ namespace cg = cooperative_groups; Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?, Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, val)?, Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, - // All three followign collections involve align then allocate from the + // All three following collections involve align then allocate from the // single dynamic shared memory buffer by using and updating the offset. 
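All of these collection constants reduce to the same three emitted statements: round the running offset up to the type's alignment, take the current position in the shared buffer as the collection's pointer, then bump the offset by the type's size. A hand-written device-side equivalent of that pattern, with an assumed 64x64 float tile:

    #include <cstdint>

    __device__ float *alloc_tile_sketch(uint64_t &dynamic_shared_offset) {
        extern __shared__ char dynamic_shared[];
        const uint64_t alignment = 4;         // alignof(float), assumed
        const uint64_t size = 4ull * 64 * 64; // 64x64 float tile, assumed
        dynamic_shared_offset =
            ((dynamic_shared_offset + alignment - 1) / alignment) * alignment;
        char *collect = dynamic_shared + dynamic_shared_offset;
        dynamic_shared_offset += size;
        return reinterpret_cast<float *>(collect);
    }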
Constant::Product(type_id, constant_fields) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None); + let size = self.get_size(*type_id, None, extra_dim_collects); write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; @@ -1404,20 +1533,26 @@ namespace cg = cooperative_groups; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call let field_type = self.get_type(type_fields[i], true); - let offset = self.get_size(type_fields[i], Some(i)); - self.codegen_constant( - format!("*reinterpret_cast<{}>({}{})", field_type, name, offset), - constant_fields[i], - false, - w, - num_tabs, - ); + let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects); + let field_constant = &self.constants[constant_fields[i].idx()]; + if field_constant.is_scalar() { + self.codegen_constant( + format!("*reinterpret_cast<{}>({}+{})", field_type, name, offset), + constant_fields[i], + false, + extra_dim_collects, + w, + num_tabs, + ); + } else if !field_constant.is_array() { + self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, w, num_tabs); + } } } Constant::Summation(type_id, variant, field) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None); + let size = self.get_size(*type_id, None, extra_dim_collects); write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; @@ -1431,26 +1566,30 @@ namespace cg = cooperative_groups; let variant_constant = &self.constants[field.idx()]; if variant_constant.is_scalar() { self.codegen_constant( - format!("*reinterpret_cast<{}>{}", variant_type, name), - cons_id, + format!("*reinterpret_cast<{}>({})", variant_type, name), + *field, false, + extra_dim_collects, w, num_tabs, ); } else if !variant_constant.is_array() { - self.codegen_constant(name, cons_id, false, w, num_tabs); + self.codegen_constant(name, *field, false, extra_dim_collects, w, num_tabs); }; } Constant::Array(type_id) => { let Type::Array(element_type, _) = &self.types[type_id.idx()] else { panic!("Expected array type") }; + if !allow_allocate { + panic!("Nested array constant should not be re-allocated"); + } let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None); - let element_type = self.get_type(*element_type, false); + let size = self.get_size(*type_id, None, extra_dim_collects); + let element_type = self.get_type(*element_type, true); write!( w, - "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", + "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {};\n", tabs, alignment, alignment, alignment )?; write!(w, "{}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n", tabs, name, element_type)?; @@ -1466,10 +1605,10 @@ namespace cg = cooperative_groups; * and offset to 2nd field. This is useful for constant initialization and read/write * index math. 
*/ - fn get_size(&self, type_id: TypeID, num_fields: Option<usize>) -> String { + fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { - let array_size = multiply_dcs(extents); + let array_size = multiply_dcs(if extra_dim_collects.is_some() && extra_dim_collects.unwrap().contains(&type_id) { &extents[1..] } else { extents }); format!("{} * {}", self.get_alignment(*element_type), array_size) } Type::Product(fields) => { @@ -1478,7 +1617,7 @@ namespace cg = cooperative_groups; .iter() .enumerate() .filter(|(i, _)| i < num_fields) - .map(|(_, id)| (self.get_size(*id, None), self.get_alignment(*id))) + .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects), self.get_alignment(*id))) .fold(String::from("0"), |acc, (size, align)| { if acc == "0" { size @@ -1493,7 +1632,7 @@ namespace cg = cooperative_groups; format!( "{} - {}", with_field, - self.get_size(fields[*num_fields], None) + self.get_size(fields[*num_fields], None, extra_dim_collects) ) } else { with_field @@ -1503,7 +1642,7 @@ namespace cg = cooperative_groups; // The argmax variant by size is not guaranteed to be same as // argmax variant by alignment, eg product of 3 4-byte primitives // vs 1 8-byte primitive, so we need to calculate both. - let max_size = variants.iter().map(|id| self.get_size(*id, None)).fold( + let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects)).fold( String::from("0"), |acc, x| { if acc == "0" { diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index 3e7d1089..7366a336 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -970,6 +970,7 @@ impl PassManager { _ => todo!(), } } + println!("{}", cuda_ir); println!("{}", llvm_ir); println!("{}", rust_rt); diff --git a/hercules_samples/matmul/src/matmul.hir b/hercules_samples/matmul/src/matmul.hir index ab0f384a..400ab5e1 100644 --- a/hercules_samples/matmul/src/matmul.hir +++ b/hercules_samples/matmul/src/matmul.hir @@ -1,9 +1,9 @@ -fn matmul<3>(a: array(i32, #0, #1), b: array(i32, #1, #2)) -> array(i32, #0, #2) - c = constant(array(i32, #0, #2), []) - i_j_ctrl = fork(start, #0, #2) +fn matmul(a: array(i32, 16, 64), b: array(i32, 64, 32)) -> array(i32, 16, 32) + c = constant(array(i32, 16, 32), []) + i_j_ctrl = fork(start, 16, 32) i_idx = thread_id(i_j_ctrl, 0) j_idx = thread_id(i_j_ctrl, 1) - k_ctrl = fork(i_j_ctrl, #1) + k_ctrl = fork(i_j_ctrl, 64) k_idx = thread_id(k_ctrl, 0) k_join_ctrl = join(k_ctrl) i_j_join_ctrl = join(k_join_ctrl) diff --git a/juno_samples/test2.jn b/juno_samples/test2.jn new file mode 100644 index 00000000..a1fc6e65 --- /dev/null +++ b/juno_samples/test2.jn @@ -0,0 +1,25 @@ +#[entry] +fn main<m, n : usize>() -> i32[m, n, 64, 64] { + let res : i32[m, n, 64, 64]; + for bi = 0 to m { + for bj = 0 to n { + let tile : i32[64, 64]; + for ti = 0 to 64 { + for tj = 0 to 64 { + tile[ti, tj] = (ti as i32) + (tj as i32) + (bi as i32) + (bj as i32); + } + } + for si = 1 to 63 { + for sj = 1 to 63 { + tile[si, sj] = tile[si-1, sj-1] + tile[si+1, sj+1]; + } + } + for ri = 0 to 64 { + for rj = 0 to 64 { + res[bi, bj, ri, rj] = tile[ri, rj]; + } + } + } + } + return res; +} diff --git a/torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc b/torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb6ac34cdbe90728338b32e7b1eae9b0edb1b69b GIT binary patch literal 582 
zcmYjN!HN?>5bf^iOlBvO!0tg%Jczd(+<`qWB8Z;Ei+HKa(3`3x9Vb0Q_w2APd-Eo{ z|Bze+e<@c_^9zDtbp*AVe$`b})q7pdbeapcZ>M*wuS$p?G5Fsy4qovM_Y5dd)&ine zwcILDGQN{K*(Q)kK?*gF9wcZIo6#o46vvq1344wCLrQuxM)8<?E=8=^Wt=&|$zHL~ zG3Bh@P4<E(iCw{6DHd4l^-{u=64Erq=@;ff`9aKQ11l2m=kic&n=4zBqps>NV%#-; zv99Q*ZG883E+KC{b>3J<I7n_9o<&F^@Snw8!{ifrM|BnKEkPPOZ;@c!8bs?>ps%b6 zLtNDGl;L*vm+;(l*3V}!I#R;f(e*ai#+`2)q&oP>;3F4|ju}wSLHfFJ{RKj{B-34c z3Xr~IdB#Igo6uE@wQ;Nm@&$*%E-m?krry*r^2VWqf^N*Vt*J786}}5Ui7XL*z9~Nj z@}X?Gkf<w>`f^Qvaa~h*anx7-+0W89t|NzKI8F-XzhAYV57{NF<(`*uVc`iwCJI@| SJbA7ksZ5<EN+}sn_UktrSe*C( literal 0 HcmV?d00001 diff --git a/torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc b/torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ca624dd252afba2297a33d14c41710767938ad1 GIT binary patch literal 3266 zcma)8&2QVt6`vs~ijpW<{!X0jE}Il+<YeW<Uvk`hG-$fpJ=9$^>854mvW#XVQ>ICI zII?4b0#Z1|p0?QgVgc!xKc)Acc->RZEl|K{o4y&+T5mT%OM#;|kMHC6-n{q7uv{)8 zxR$GNXH-S#8?m`~8L;^&+|eIkA&6if+2UXE@quD1SkS7k9%!~EY0b|a=(a9t?C0$~ z$x+lbx&@+B4X7f?x0TC?7_>wR1h)-RB&AoHZ4&$#)yy~G1?|*uqSF_>ka3`LPx?Kd z0-=5C^1UQ?5EAMq75c(G0KVe|y*`iXNh$1epq#en(?n;K_gT<Vg}LlnfPWF|&)|;! zsvs1j5gH?mK8C1>@(N<&xT|#4Q+24uYK%_tP~%#x#rRYi=Hgs8H`HUjs}q%IU*KWB zgNBBOhJ`pkf+)xM5!(IRuoxFd#u$lzwh1)EJ;tnYSR?tkc!by^;N?kvjEQj>rIMr| zpZt~=M<oJ!V`W&1OYCV}8kwX7lI9}V3uZ}K%I!<J%0zDeyK-Q^Or}T`tOzV+GEHVC z^emYZG?|x{UXulhW<Eu8gxTwhdx_4Ra7rm&h-eWa0?7{ImDo7NZ8a{$=3N9aO1ocT zwn#2b^s+dQkl;_anYCe=sKZKJ7)^~4<nNQFGBlg&aB5f`PQ~RB*d>=1(HMj7%wUCA z<1+X(PcbPiB69gpP)Tq~(-W)bIh_Hi>M4$=_c41LS4Y!iX^gSJn{lJR$1`y?p1zC7 zm0{&6R6$VE5TE%HpG;%eVe~t6_?Kh2<BGt2Sc{Dj_z_kPYkW3Ed@9cTW`=ndFi-P2 z@*erYD->5B0kTtpdxyL~nvbhv{1qY}#ItbvBHvm3U!kv6I45JY5DV+jPa>E1AXkUi zNDlOjF&@tG#aMwj=i~YArS4^N1%B2SLUz8p6wi?#lB;4zmY{>Kxg893Zo&1~J)+h_ zP6O+cUt8^vS+4cz$&-+^_AJ>$77u|X`lT-A<?@hqqHReD6H;#b`)X-(PQ<?3i`$m$ zSSxH>*#k>9sRdA?kVO{XbGdckasa~jXzJw0k$`yYGGa~4Ug+p;>*4b#^!uDzJ(vOl zdL+W*;Y6C81nz;?a-u!AN26`)S3T~9f$LkuYe|ZofYV;j3pyDdkf(qKObHc>i#Zcn z$9t4fYbOjSh=H!lfteWxTL%yX*p+Tu*=U*ksTmdh2ndR7OpvX7j@npG7{gY@?0AK2 z?Q^6dI|t{L8(ih$NSrjK^}Bx9a{cIj{g-0QJ!JrxvKCCM$t7rU&%zze!NSog8euWX zVjLq!A?PuBjdyBll8gGFofr;GNsl`Y%Yk@JNwkRiZM*o~ji@7!_%%)nj??npC~}<F z=<F9OzlkV|R(i}0DCH}J9<4xHT75{+hZ65fin`6h0FITcL~<HO89NK6cfv~GaY-Q6 zU@77v9^Clf@z&1)x1g3M*LmPiffI>I9-}UV{AUQ46<}M#S$xcdU81>A%4`bAbb+@U zO7dBey$_p67z9Ae{|BH<t)`1)in?)}r09s`K}vu&9p|v``souHG@B9r%2`%h)b}C( z-=ZnQQzjvq7f<@Kz@@`tAQM+Lwg}`$@a{8v%nP`ac0_`UXZYF~{s|N58C;S0>tQdd zL#+n&cHj5wgtpzj&j*jLH>2g|@r`Y(y|&tX*4$mT?m@{z#WjUSQ>rwjPV-!;dA<3p zzWkuMd((RM`<uHrYHx&}Gh7>_n%}x|p*}ja9&wj(C*rg>*qQhPr{Iq)H+SFZF;#6| z&~09q-mX8`KKJ{eX3ySPU0qvitlnDN+}c=gG&Z(wHSFT*?e)g`*4pNJ!=AgnwZ6XD zSYN-@SZmz6v$1h|^Y(^aTwlGjerL0>dS~NZCblk^*h=udGrT>}(=?vpXK&uZwe#eA z#V5Ku689xo{s`r_n;+sBdc2Q6gYG+&ohM6C9yBzXQoohpqYK9_PrPuqLj!0LcApAs zqBE423uy4cy9;q5Jy3310~N9fmh(NA(SS#ZL0@<gRL1@w*XQjIx1d-I8ni+JbFOwJ z(dd!uCpl?3QGy`Rq1>QmnJE6m?7)E8%gQ*Zq(#Q4OWf!16_ty6zQ^H@Gq0Bv<tazU z9(;r;44%pfp5!_#?Dvv9qdnhk(M0uWkmNj0528ecnoV>u!UB>MKXd&){W)VHOEe!a zC#viFiPjB0K;;E6a5xjWNc1!|@UTxjmK2W}m~%LN!D|b$#hK_3mKTd?5W9RV=Ojso znGN5D;@gpCdv;UERI(O^GKE&^*yWZ_VYm^;_W~N()10}jeYu5f9=R+MAOp%)g>oJk zqPG*3?L|qh9rht0?AS&5&Pc^okGx0>5R~)5E`-eMNaOa@xvLC<eg3T~US<DX9OlI$ z5Fth}EXLHFY7~lCS52&`8m?eNsbB>g>I~e#1CH|YoHBu9;w5-?3D1DkRiHGyfU7_Y zjtO(vd}}Bg)<I9h%3I|hdP%{ivZT)8tMFXM+RLJ1s8x_=9|M3|nbkxlQh4<=!)X!e z{6xG|nP?ADmweCnp39SDw}Hi$#d1SD)St*NfJB8ekXA1QFGr(n594k+R_}|c6TJtE Z7>43!U`5dZt*I#gR#fF*idOmOe*hW`e6j!l literal 0 HcmV?d00001 -- GitLab From fe6ad988dc41a3234a5e3d7f12a5710da28692b7 Mon Sep 17 00:00:00 2001 
From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 10 Jan 2025 10:22:18 -0600 Subject: [PATCH 045/109] no torch frontend --- .../__pycache__/mobilenet.cpython-310.pyc | Bin 582 -> 0 bytes .../__pycache__/torch_export.cpython-310.pyc | Bin 3266 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc delete mode 100644 torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc diff --git a/torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc b/torch_frontend/samples/__pycache__/mobilenet.cpython-310.pyc deleted file mode 100644 index cb6ac34cdbe90728338b32e7b1eae9b0edb1b69b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 582 zcmYjN!HN?>5bf^iOlBvO!0tg%Jczd(+<`qWB8Z;Ei+HKa(3`3x9Vb0Q_w2APd-Eo{ z|Bze+e<@c_^9zDtbp*AVe$`b})q7pdbeapcZ>M*wuS$p?G5Fsy4qovM_Y5dd)&ine zwcILDGQN{K*(Q)kK?*gF9wcZIo6#o46vvq1344wCLrQuxM)8<?E=8=^Wt=&|$zHL~ zG3Bh@P4<E(iCw{6DHd4l^-{u=64Erq=@;ff`9aKQ11l2m=kic&n=4zBqps>NV%#-; zv99Q*ZG883E+KC{b>3J<I7n_9o<&F^@Snw8!{ifrM|BnKEkPPOZ;@c!8bs?>ps%b6 zLtNDGl;L*vm+;(l*3V}!I#R;f(e*ai#+`2)q&oP>;3F4|ju}wSLHfFJ{RKj{B-34c z3Xr~IdB#Igo6uE@wQ;Nm@&$*%E-m?krry*r^2VWqf^N*Vt*J786}}5Ui7XL*z9~Nj z@}X?Gkf<w>`f^Qvaa~h*anx7-+0W89t|NzKI8F-XzhAYV57{NF<(`*uVc`iwCJI@| SJbA7ksZ5<EN+}sn_UktrSe*C( diff --git a/torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc b/torch_frontend/samples/__pycache__/torch_export.cpython-310.pyc deleted file mode 100644 index 3ca624dd252afba2297a33d14c41710767938ad1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3266 zcma)8&2QVt6`vs~ijpW<{!X0jE}Il+<YeW<Uvk`hG-$fpJ=9$^>854mvW#XVQ>ICI zII?4b0#Z1|p0?QgVgc!xKc)Acc->RZEl|K{o4y&+T5mT%OM#;|kMHC6-n{q7uv{)8 zxR$GNXH-S#8?m`~8L;^&+|eIkA&6if+2UXE@quD1SkS7k9%!~EY0b|a=(a9t?C0$~ z$x+lbx&@+B4X7f?x0TC?7_>wR1h)-RB&AoHZ4&$#)yy~G1?|*uqSF_>ka3`LPx?Kd z0-=5C^1UQ?5EAMq75c(G0KVe|y*`iXNh$1epq#en(?n;K_gT<Vg}LlnfPWF|&)|;! 
zsvs1j5gH?mK8C1>@(N<&xT|#4Q+24uYK%_tP~%#x#rRYi=Hgs8H`HUjs}q%IU*KWB zgNBBOhJ`pkf+)xM5!(IRuoxFd#u$lzwh1)EJ;tnYSR?tkc!by^;N?kvjEQj>rIMr| zpZt~=M<oJ!V`W&1OYCV}8kwX7lI9}V3uZ}K%I!<J%0zDeyK-Q^Or}T`tOzV+GEHVC z^emYZG?|x{UXulhW<Eu8gxTwhdx_4Ra7rm&h-eWa0?7{ImDo7NZ8a{$=3N9aO1ocT zwn#2b^s+dQkl;_anYCe=sKZKJ7)^~4<nNQFGBlg&aB5f`PQ~RB*d>=1(HMj7%wUCA z<1+X(PcbPiB69gpP)Tq~(-W)bIh_Hi>M4$=_c41LS4Y!iX^gSJn{lJR$1`y?p1zC7 zm0{&6R6$VE5TE%HpG;%eVe~t6_?Kh2<BGt2Sc{Dj_z_kPYkW3Ed@9cTW`=ndFi-P2 z@*erYD->5B0kTtpdxyL~nvbhv{1qY}#ItbvBHvm3U!kv6I45JY5DV+jPa>E1AXkUi zNDlOjF&@tG#aMwj=i~YArS4^N1%B2SLUz8p6wi?#lB;4zmY{>Kxg893Zo&1~J)+h_ zP6O+cUt8^vS+4cz$&-+^_AJ>$77u|X`lT-A<?@hqqHReD6H;#b`)X-(PQ<?3i`$m$ zSSxH>*#k>9sRdA?kVO{XbGdckasa~jXzJw0k$`yYGGa~4Ug+p;>*4b#^!uDzJ(vOl zdL+W*;Y6C81nz;?a-u!AN26`)S3T~9f$LkuYe|ZofYV;j3pyDdkf(qKObHc>i#Zcn z$9t4fYbOjSh=H!lfteWxTL%yX*p+Tu*=U*ksTmdh2ndR7OpvX7j@npG7{gY@?0AK2 z?Q^6dI|t{L8(ih$NSrjK^}Bx9a{cIj{g-0QJ!JrxvKCCM$t7rU&%zze!NSog8euWX zVjLq!A?PuBjdyBll8gGFofr;GNsl`Y%Yk@JNwkRiZM*o~ji@7!_%%)nj??npC~}<F z=<F9OzlkV|R(i}0DCH}J9<4xHT75{+hZ65fin`6h0FITcL~<HO89NK6cfv~GaY-Q6 zU@77v9^Clf@z&1)x1g3M*LmPiffI>I9-}UV{AUQ46<}M#S$xcdU81>A%4`bAbb+@U zO7dBey$_p67z9Ae{|BH<t)`1)in?)}r09s`K}vu&9p|v``souHG@B9r%2`%h)b}C( z-=ZnQQzjvq7f<@Kz@@`tAQM+Lwg}`$@a{8v%nP`ac0_`UXZYF~{s|N58C;S0>tQdd zL#+n&cHj5wgtpzj&j*jLH>2g|@r`Y(y|&tX*4$mT?m@{z#WjUSQ>rwjPV-!;dA<3p zzWkuMd((RM`<uHrYHx&}Gh7>_n%}x|p*}ja9&wj(C*rg>*qQhPr{Iq)H+SFZF;#6| z&~09q-mX8`KKJ{eX3ySPU0qvitlnDN+}c=gG&Z(wHSFT*?e)g`*4pNJ!=AgnwZ6XD zSYN-@SZmz6v$1h|^Y(^aTwlGjerL0>dS~NZCblk^*h=udGrT>}(=?vpXK&uZwe#eA z#V5Ku689xo{s`r_n;+sBdc2Q6gYG+&ohM6C9yBzXQoohpqYK9_PrPuqLj!0LcApAs zqBE423uy4cy9;q5Jy3310~N9fmh(NA(SS#ZL0@<gRL1@w*XQjIx1d-I8ni+JbFOwJ z(dd!uCpl?3QGy`Rq1>QmnJE6m?7)E8%gQ*Zq(#Q4OWf!16_ty6zQ^H@Gq0Bv<tazU z9(;r;44%pfp5!_#?Dvv9qdnhk(M0uWkmNj0528ecnoV>u!UB>MKXd&){W)VHOEe!a zC#viFiPjB0K;;E6a5xjWNc1!|@UTxjmK2W}m~%LN!D|b$#hK_3mKTd?5W9RV=Ojso znGN5D;@gpCdv;UERI(O^GKE&^*yWZ_VYm^;_W~N()10}jeYu5f9=R+MAOp%)g>oJk zqPG*3?L|qh9rht0?AS&5&Pc^okGx0>5R~)5E`-eMNaOa@xvLC<eg3T~US<DX9OlI$ z5Fth}EXLHFY7~lCS52&`8m?eNsbB>g>I~e#1CH|YoHBu9;w5-?3D1DkRiHGyfU7_Y zjtO(vd}}Bg)<I9h%3I|hdP%{ivZT)8tMFXM+RLJ1s8x_=9|M3|nbkxlQh4<=!)X!e z{6xG|nP?ADmweCnp39SDw}Hi$#d1SD)St*NfJB8ekXA1QFGr(n594k+R_}|c6TJtE Z7>43!U`5dZt*I#gR#fF*idOmOe*hW`e6j!l -- GitLab From 1c92d5b86684fc302a591657731df75ff9e3b8a2 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 10 Jan 2025 18:53:09 -0600 Subject: [PATCH 046/109] runs --- hercules_cg/src/gpu.rs | 212 +++++++++++++++++++++++++++++------------ 1 file changed, 149 insertions(+), 63 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a153b7ef..499ecce8 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -296,6 +296,7 @@ impl GPUContext<'_> { // Emit all code up to the "goto" to Start's block let mut top = String::new(); self.codegen_kernel_begin(&mut top)?; + let mut dynamic_shared_offset = "0".to_string(); self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; self.codegen_helpers(&mut top)?; @@ -306,10 +307,11 @@ impl GPUContext<'_> { let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); - println!("num_blocks: {}", num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); - let 
extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); + // TODO: Uncomment and adjust once we know logic of extra dim + // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); + let extra_dim_collects = HashSet::new(); // Setup for CUDA's "goto" for control flow between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) @@ -333,19 +335,20 @@ impl GPUContext<'_> { &fork_control_map, &fork_thread_quota_map, &extra_dim_collects, + &mut dynamic_shared_offset, num_threads, &mut gotos, )?; // Emit all GPU kernel code from previous steps let mut kernel_body = String::new(); - self.codegen_gotos(&mut gotos, &mut kernel_body)?; + self.codegen_gotos(false, &mut gotos, &mut kernel_body)?; write!(w, "{}", kernel_body)?; write!(w, "}}\n")?; // Emit host launch code let mut host_launch = String::new(); - self.codegen_launch_code(num_blocks, num_threads, &mut host_launch)?; + self.codegen_launch_code(true, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; write!(w, "{}", host_launch)?; Ok(()) @@ -420,8 +423,9 @@ namespace cg = cooperative_groups; write!(w, ") {{\n")?; write!(w, "\textern __shared__ char dynamic_shared[];\n")?; // This will only get used by thread rank 0 in each block, since it - // does all shared memory "allocation" - write!(w, "\tuint64_t dynamic_shared_offset = 0;\n")?; + // does all shared memory "allocation". The actual state is preserved + // in Rust string and this offset is assigned to for ease of readability. + write!(w, "\tuint64_t dynamic_shared_offset;\n")?; Ok(()) } @@ -489,11 +493,14 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_gotos(&self, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> { + fn codegen_gotos(&self, goto_debug: bool, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> { write!(w, "\n")?; for (id, goto) in gotos.iter() { let goto_block = self.get_block_name(*id, false); write!(w, "{}:\n", goto_block)?; + if goto_debug { + write!(w, "\tprintf(\"goto {}\\n\");\n", goto_block)?; + } write!(w, "{}", goto.init)?; if !goto.post_init.is_empty() { let goto_block = self.get_block_name(*id, true); @@ -506,47 +513,119 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, w: &mut String) -> Result<(), Error> { + fn codegen_launch_code(&self, run_debug: bool, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { write!(w, " -int main(")?; +int main() {{ +")?; // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let mut pass_args = String::new(); - // The first set of parameters are dynamic constants. - let mut first_param = true; - for idx in 0..self.function.num_dynamic_constants { - if first_param { - first_param = false; - } else { - write!(w, ", ")?; + if run_debug { + // The first set of parameters are dynamic constants. + let mut first_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(pass_args, ", ")?; + } + write!(w, "\tunsigned long long dc_p{} = 1ull;\n", idx)?; + write!(pass_args, "dc_p{}", idx)?; + } + self.codegen_dynamic_constants(w)?; + // The second set of parameters are normal arguments. 
+ for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(pass_args, ", ")?; + } + let param_type = self.get_type(*ty, false); + if self.types[ty.idx()].is_primitive() { + write!(w, "\t{} p{} = 1;\n", param_type, idx)?; + } else { + let param_size = self.get_size(*ty, None, None); + write!(w, "\t{} p{};\n", param_type, idx); + write!(w, "\tif (cudaMalloc(&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?; + write!(w, "\t\tprintf(\"Error allocating memory for parameter %d\\n\", {});\n", idx)?; + write!(w, "\t\treturn -1;\n"); + write!(w, "\t}}\n"); + } + write!(pass_args, "p{}", idx)?; + } + // Pull primitive return to a pointer parameter + if self.types[self.return_type_id.idx()].is_primitive() { write!(pass_args, ", ")?; + let ret_type_no_pnt = self.get_type(*self.return_type_id, false); + let ret_type = self.get_type(*self.return_type_id, true); + write!(w, "\t{} ret;\n", ret_type)?; + write!(w, "\tif (cudaMalloc(&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?; + write!(w, "\t\tprintf(\"Error allocating memory for return value\\n\");\n")?; + write!(w, "\t\treturn -1;\n")?; + write!(w, "\t}}\n"); + write!(pass_args, "ret")?; } - write!(w, "unsigned long long dc_p{}", idx)?; - write!(pass_args, "dc_p{}", idx)?; - } - // The second set of parameters are normal arguments. - for (idx, ty) in self.function.param_types.iter().enumerate() { - if first_param { - first_param = false; - } else { + write!(w, "\t{}<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args); + write!(w, "\tbool skip = false;\n")?; + write!(w, "\tcudaError_t err = cudaGetLastError();\n")?; + write!(w, "\tif (err != cudaSuccess) {{\n")?; + write!(w, "\t\tprintf(\"Error launching kernel: %s\\n\", cudaGetErrorString(err));\n")?; + write!(w, "\t\tskip = true;\n")?; + write!(w, "\t}}\n"); + write!(w, "\tif (cudaDeviceSynchronize() != cudaSuccess && !skip) {{\n")?; + write!(w, "\t\tprintf(\"Error synchronizing device\\n\");\n")?; + write!(w, "\t\tskip = true;\n")?; + write!(w, "\t}}\n"); + for (idx, ty) in self.function.param_types.iter().enumerate() { + if !self.types[ty.idx()].is_primitive() { + write!(w, "\tcudaFree(p{});\n", idx)?; + } + } + if self.types[self.return_type_id.idx()].is_primitive() { + write!(w, "\tcudaFree(ret);\n"); + } + write!(w, "\treturn 0;\n"); + write!(w, "}}\n"); + } + + else { + // The first set of parameters are dynamic constants. + let mut first_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; + } + write!(w, "unsigned long long dc_p{}", idx)?; + write!(pass_args, "dc_p{}", idx)?; + } + // The second set of parameters are normal arguments. 
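The run_debug branch above amounts to a throwaway harness: allocate device buffers for every non-primitive parameter and the lifted return, launch with the computed dynamic shared size, check cudaGetLastError and cudaDeviceSynchronize, then free. A self-contained hand-written equivalent (the kernel body, sizes, and names are placeholders, not generated code):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void example_kernel(unsigned long long dc_p0, float *p0, int *ret) {
        if (blockIdx.x == 0 && threadIdx.x == 0) *ret = (int)dc_p0;
    }

    int main() {
        unsigned long long dc_p0 = 1ull;
        float *p0;
        int *ret;
        if (cudaMalloc(&p0, sizeof(float) * 64) != cudaSuccess) { printf("alloc p0 failed\n"); return -1; }
        if (cudaMalloc(&ret, sizeof(int)) != cudaSuccess) { printf("alloc ret failed\n"); return -1; }
        example_kernel<<<1, 256, /*dynamic shared bytes*/ 4096>>>(dc_p0, p0, ret);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("launch error: %s\n", cudaGetErrorString(err));
        if (cudaDeviceSynchronize() != cudaSuccess) printf("sync error\n");
        cudaFree(p0);
        cudaFree(ret);
        return 0;
    }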
+ for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; + } + let param_type = self.get_type(*ty, false); + write!(w, "{} p{}", param_type, idx)?; + write!(pass_args, "p{}", idx)?; + } + // Pull primitive return to a pointer parameter + if self.types[self.return_type_id.idx()].is_primitive() { write!(w, ", ")?; write!(pass_args, ", ")?; + let ret_type = self.get_type(*self.return_type_id, true); + write!(w, "{} ret", ret_type)?; + write!(pass_args, "ret")?; } - let param_type = self.get_type(*ty, false); - write!(w, "{} p{}", param_type, idx)?; - write!(pass_args, "p{}", idx)?; - } - // Pull primitive return to a pointer parameter - if self.types[self.return_type_id.idx()].is_primitive() { - write!(w, ", ")?; - write!(pass_args, ", ")?; - let ret_type = self.get_type(*self.return_type_id, true); - write!(w, "{} ret", ret_type)?; - write!(pass_args, "ret")?; + write!(w, ") {{ + {}<<<{}, {}, {}>>>({}); +}}", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args); } - write!(w, ") {{ - {}<<<{}, {}>>>({}); -}}", self.function.name, num_blocks, num_threads, pass_args); + Ok(()) } @@ -575,12 +654,10 @@ int main(")?; // Then get it's nesting fork- index = 1 to not count itself! let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0)); fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(control); - println!("fork_tree parent: {}, child: {}", nesting_fork.idx(), control.idx()); } // Here the desired fork is always the first fork let fork = forks.first().copied().unwrap_or(NodeID::new(0)); fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(control); - println!("fork_control_map parent: {}, child: {}", fork.idx(), control.idx()); (fork_tree, fork_control_map) }, ) @@ -676,7 +753,7 @@ int main(")?; .iter() .map(|child| (child, self.recurse_thread_quotas(*child, fork_tree, false))) .fold( - (HashMap::new(), HashMap::new(), 0), + (HashMap::new(), HashMap::new(), 1), |(mut subsubtree_map, mut children_quota_map, subtree_quota), (child, (curr_map, curr_quota, use_curr))| { subsubtree_map.extend(curr_map); if use_curr { @@ -771,8 +848,6 @@ int main(")?; (collect_const, users.iter().map(|user| control_fork_map[&self.bbs.0[user.idx()]]).collect()) }) .collect(); - // For now assert that each collection is used by a single fork and get - // parallel status, TODO: revisit collect_fork_users.iter() .filter(|(_, fork_users)| !fork_thread_quota_map.contains_key(fork_users.iter().next().unwrap())) .map(|(collect_const, _)| self.typing[collect_const.idx()]) @@ -790,6 +865,7 @@ int main(")?; fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -797,7 +873,6 @@ int main(")?; // Recall that this was tracked through a fake fork node with NodeID 0. 
let mut state = KernelState::OutBlock; for control in fork_control_map.get(&NodeID::new(0)).unwrap() { - println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -805,14 +880,13 @@ int main(")?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists if block_fork.is_some() { state = KernelState::InBlock; for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { - println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -820,7 +894,7 @@ int main(")?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } } @@ -836,6 +910,7 @@ int main(")?; 1, num_threads, extra_dim_collects, + dynamic_shared_offset, gotos, )?; } @@ -858,6 +933,7 @@ int main(")?; parent_quota: usize, num_threads: usize, extra_dim_collections: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map @@ -879,7 +955,6 @@ int main(")?; HashSet::new() }; for control in fork_control_map.get(&curr_fork).unwrap() { - println!("gen for control: {}", control.idx()); let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -895,6 +970,7 @@ int main(")?; Some(curr_fork), reducts.contains(data), extra_dim_collections, + dynamic_shared_offset, body, &mut tabs, )?; @@ -910,6 +986,7 @@ int main(")?; use_thread_quota, num_threads, extra_dim_collections, + dynamic_shared_offset, gotos, )?; } @@ -925,6 +1002,7 @@ int main(")?; nesting_fork: Option<NodeID>, is_special_reduct: bool, extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, w: &mut String, num_tabs: &mut usize, ) -> Result<(), Error> { @@ -1013,6 +1091,7 @@ int main(")?; *cons_id, true, Some(extra_dim_collects), + dynamic_shared_offset, w, *num_tabs, )?; @@ -1460,18 +1539,23 @@ int main(")?; panic!("Expected array type") }; let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); + let max_left_array_index = array_indices.len() - 1 - if has_extra_dim { 1 } else { 0 }; for (i, index) in array_indices.iter().skip(if has_extra_dim { 1 } else { 0 }).rev().enumerate() { cumulative_offset = format!( - "{} * ({} + {}", + "{} * ({}{}", cumulative_offset, self.get_value(*index, false, false), - format!("dc{}", extents[i].idx()) + if i != max_left_array_index { + format!(" + dc{}", extents[max_left_array_index - i].idx()) + } else { + "".to_string() + } ); } index_ptr.push_str(&format!( " + 
{}{}", cumulative_offset, - ")".repeat(array_indices.len()) + ")".repeat(array_indices.len() - if has_extra_dim { 1 } else { 0 }) )); if is_char { let element_size = self.get_size(*element_type, None, None); @@ -1501,6 +1585,7 @@ int main(")?; cons_id: ConstantID, allow_allocate: bool, extra_dim_collects: Option<&HashSet<TypeID>>, + dynamic_shared_offset: &mut String, w: &mut String, num_tabs: usize, ) -> Result<(), Error> { @@ -1523,9 +1608,10 @@ int main(")?; if allow_allocate { let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None, extra_dim_collects); - write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; - write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } let Type::Product(type_fields) = &self.types[type_id.idx()] else { panic!("Product constant should have product type") @@ -1541,11 +1627,12 @@ int main(")?; constant_fields[i], false, extra_dim_collects, + dynamic_shared_offset, w, num_tabs, ); } else if !field_constant.is_array() { - self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, w, num_tabs); + self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, dynamic_shared_offset, w, num_tabs); } } } @@ -1553,9 +1640,10 @@ int main(")?; if allow_allocate { let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None, extra_dim_collects); - write!(w, "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {}\n", tabs, alignment, alignment, alignment)?; + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; - write!(w, "{}dynamic_shared_offset += {};\n", tabs, size)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } // No offset updating needed since all variants start at 0 let Type::Summation(variants) = &self.types[type_id.idx()] else { @@ -1570,11 +1658,12 @@ int main(")?; *field, false, extra_dim_collects, + dynamic_shared_offset, w, num_tabs, ); } else if !variant_constant.is_array() { - self.codegen_constant(name, *field, false, extra_dim_collects, w, num_tabs); + self.codegen_constant(name, *field, false, extra_dim_collects, dynamic_shared_offset, w, num_tabs); }; } Constant::Array(type_id) => { @@ -1587,13 +1676,10 @@ int main(")?; let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None, extra_dim_collects); let element_type = self.get_type(*element_type, true); - write!( - w, - "{}dynamic_shared_offset = ((dynamic_shared_offset + {} - 1) / {}) * {};\n", - tabs, alignment, alignment, alignment - )?; + *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); + write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n", tabs, name, element_type)?; - write!(w, 
"{}dynamic_shared_offset += {};\n", tabs, size)?; + *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } } Ok(()) -- GitLab From ca20c08fc21f24a0f348b5eed134d538e87c5086 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Tue, 14 Jan 2025 16:39:16 -0600 Subject: [PATCH 047/109] rm stuff --- hercules_cg/src/device.rs | 2 - hercules_cg/src/gpu.rs | 118 +++++++++++++++++++------------------- 2 files changed, 59 insertions(+), 61 deletions(-) diff --git a/hercules_cg/src/device.rs b/hercules_cg/src/device.rs index 09a5bc26..866fa6ad 100644 --- a/hercules_cg/src/device.rs +++ b/hercules_cg/src/device.rs @@ -9,8 +9,6 @@ pub fn device_placement(functions: &Vec<Function>, callgraph: &CallGraph) -> Vec let mut devices = vec![]; for (idx, function) in functions.into_iter().enumerate() { - devices.push(Device::CUDA); - continue; if let Some(device) = function.device { devices.push(device); } else if function.entry || callgraph.num_callees(FunctionID::new(idx)) != 0 { diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 499ecce8..ab6e8f41 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -31,24 +31,24 @@ pub fn gpu_codegen<W: Write>( * one of multiple parameters. * * We don't assert but assume the following: - * - max_num_blocks in KernelParams is within constraint of 1D grid size. This + * - max_num_blocks in KernelParams is within constraint of 1D grid size. This * can be relaxed if we want to support larger grids. * - Product types are packed with padding inserted for each element to * be aligned for its type and for full product to be aligned to its * largest element * - Summation types must be aligned to their largest element - * + * * Notes on GPU parallelization strategy and tips for IR transformations: * - The top level block fork and any lower thread forks require a known Fork - * size. Thus for an otherwise parallelizable Fork with unknown size, + * size. Thus for an otherwise parallelizable Fork with unknown size, * consider splitting it into two Forks with one of known size. For block * level, the known fork has to be the (only) top-most fork. - * - The thread-level strategy is determined by starting at the most nested + * - The thread-level strategy is determined by starting at the most nested * Forks and working outwards in a greedy manner, with caps by GPU spec. * Thus, to ensure some outer Fork is parallelized, ensure the inner * parallelizable Forks aren't too large or consider removing schedule * annotations. - * - Tight-Associative reductions can only be efficiently implemented if + * - Tight-Associative reductions can only be efficiently implemented if * different Hercules ThreadIDs correspond to consecutive CUDA threads. But * this prevents nested parallelization since each parallel group must always * be a contiguous tile of threads. We use a heuristic of choosing the larger @@ -59,10 +59,10 @@ pub fn gpu_codegen<W: Write>( * Fork contains expensive parallelizable operations, ensure all reductions * are parallelizable or if not try pulling those out into a different Fork. * - We do nothing to mitigate intra-warp divergence. To mitigate this, the - * IR, for example, should ensure the innermost parallelizable Forks either + * IR, for example, should ensure the innermost parallelizable Forks either * have factor >= warp size (32) or remove Fork/Reduce node schedule * annotations. - * + * * Main TODOs: * - Fix dynamic shared memory allocation to reuse old shmem. 
The main case * for improvement is when we have serialized forks with unused intermediate @@ -135,7 +135,7 @@ pub fn gpu_codegen<W: Write>( } // Obtain the Return node and if it's a collection, use the collection objects - // analysis to determine the origin. Also save the return node id for later + // analysis to determine the origin. Also save the return node id for later // conversion of primitive Return into Parameter. let (return_node_id, data_node_id) = { let pos = function @@ -248,11 +248,11 @@ struct GPUContext<'a> { } /* - * For all control nodes besides forks, Init, Body, and Term compose the main basic - * block, with Init and Term populated by control flow (Init used only by Fork and - * Join) and Body populated by data flow. + * For all control nodes besides forks, Init, Body, and Term compose the main basic + * block, with Init and Term populated by control flow (Init used only by Fork and + * Join) and Body populated by data flow. * For serialized Fork nodes which may be jumped back to by corresponding Join node, - * init and post_init separate one-time code (currently just cooperative group + * init and post_init separate one-time code (currently just cooperative group * creation) from repeated code. */ #[derive(Default, Debug)] @@ -279,9 +279,9 @@ enum KernelState { } /* - * CGType is used to track cooperative group types. UsePerId is the group of (CUDA) + * CGType is used to track cooperative group types. UsePerId is the group of (CUDA) * threads for a current ThreadID, Use is the union of such threads for all ThreadIDs - * in the current innermost Fork, and Available is Use plus additional threads not + * in the current innermost Fork, and Available is Use plus additional threads not * used in the current Fork. */ #[derive(Clone, Copy, PartialEq, Debug)] @@ -348,7 +348,7 @@ impl GPUContext<'_> { // Emit host launch code let mut host_launch = String::new(); - self.codegen_launch_code(true, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; + self.codegen_launch_code(false, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; write!(w, "{}", host_launch)?; Ok(()) @@ -375,7 +375,7 @@ namespace cg = cooperative_groups; #define roundi(a) (a) #define isqrt(a) ((int)sqrtf((float)(a))) -", +", )?; write!( @@ -463,12 +463,12 @@ namespace cg = cooperative_groups; Ok(()) } - // To abide by c++ reassignment restrictions, we declare all data values + // To abide by c++ reassignment restrictions, we declare all data values // upfront. fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if !self.function.nodes[id.idx()].is_control() && - !self.function.nodes[id.idx()].is_dynamic_constant() && + if !self.function.nodes[id.idx()].is_control() && + !self.function.nodes[id.idx()].is_dynamic_constant() && !self.function.nodes[id.idx()].is_parameter() { write!(w, "\t{};\n", self.get_value(id, true, false))?; } @@ -477,8 +477,8 @@ namespace cg = cooperative_groups; } /* - * Emit helper registers that are used throughout the kernel. grid and block - * are from CUDA's cooperative groups API and are used specifically for reads + * Emit helper registers that are used throughout the kernel. grid and block + * are from CUDA's cooperative groups API and are used specifically for reads * and writes. 
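     * As a rough sketch (not necessarily the verbatim output), and assuming the
     * cg namespace alias declared in the kernel preamble above, these helpers
     * amount to:
     *   cg::grid_group grid = cg::this_grid();
     *   cg::thread_block block = cg::this_thread_block();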
*/ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { @@ -517,7 +517,7 @@ namespace cg = cooperative_groups; write!(w, " int main() {{ ")?; - // The following steps are for host-side C function arguments, but we also + // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let mut pass_args = String::new(); if run_debug { @@ -586,8 +586,8 @@ int main() {{ } write!(w, "\treturn 0;\n"); write!(w, "}}\n"); - } - + } + else { // The first set of parameters are dynamic constants. let mut first_param = true; @@ -633,9 +633,9 @@ int main() {{ * a) domination by F * b) no domination by F's join * c) no domination by any other fork that's also dominated by F, where we don't count self-domination - * Note that the fork_tree also includes the start node, to include all controls + * Note that the fork_tree also includes the start node, to include all controls * outside any fork. - * + * * Second, fork_control_map is a map from fork node to all control nodes (including itself) satisfying: * a) domination by F * b) no domination by F's join @@ -665,8 +665,8 @@ int main() {{ /* * If tree has a single root fork of known size s <= max_num_blocks - * with parallel-fork schedule, then set num_blocks to s, else set num_blocks - * to 1. Also return the root fork(s) for parallelization strategy within + * with parallel-fork schedule, then set num_blocks to s, else set num_blocks + * to 1. Also return the root fork(s) for parallelization strategy within * threadblocks for threads and their eventual generation. */ fn get_root_forks_and_num_blocks( @@ -720,10 +720,10 @@ int main() {{ * maximum over its descendants (leafs have base 1). We traverse up (details * in helper) and pass the factor and a map from fork node to a tuple of * (max quota of its siblings (including itself), its quota, its fork factor) - * from each node to its parents. The parent then compares + * from each node to its parents. The parent then compares * - all three are needed for codegen. A node is in the map IFF it will be parallelized. - * If not, the fork will use the parent's quota and serialize over the Fork's - * ThreadIDs. Nodes may be removed from the map when traversing up the tree + * If not, the fork will use the parent's quota and serialize over the Fork's + * ThreadIDs. Nodes may be removed from the map when traversing up the tree * due to an ancestor having a larger factor that conflicts. */ fn get_thread_quotas( @@ -743,9 +743,9 @@ int main() {{ is_root: bool, ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) { // Subsubtree map is the union of all keys for grandchildren and lower - // nodes. children_quota_map is a constructed map from parallelized children + // nodes. children_quota_map is a constructed map from parallelized children // to their quota to update the subsubtree map at grandchildren level to - // subtreemap at children level. subtree_quota is cumulative factor of + // subtreemap at children level. subtree_quota is cumulative factor of // subtree and is then compared to this fork's factor. 
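        // Illustrative example with hypothetical sizes: a Fork of known factor 8
        // whose subtree already has a quota of 64. With only parallel-reduce
        // Reduces and 8 * 64 = 512 <= max_num_threads, both levels parallelize
        // for a quota of 512; with a tight-associative Reduce, only the larger of
        // the two (the subtree's 64) is kept and this Fork instead serializes
        // over its ThreadIDs.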
let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree .get(&curr_fork) @@ -762,7 +762,7 @@ int main() {{ (subsubtree_map, children_quota_map, subtree_quota.max(curr_quota)) }, ); - // First update children_quota_map items with full information and add + // First update children_quota_map items with full information and add // to subsubtree_map for (&child, quota) in children_quota_map.iter() { let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else { @@ -780,7 +780,7 @@ int main() {{ // b) the known size is less than or equal to the max_num_threads // c) the known size is a power of 2 // d) all reduces are parallel-reduce or associative - // + // // If not, just take the max cumulative factor of its subtree let reduces = &self.fork_reduce_map[&curr_fork]; if let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] @@ -792,12 +792,12 @@ int main() {{ || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) }) { - // If there's an associative Reduce, parallelize the larger factor + // If there's an associative Reduce, parallelize the larger factor // between the Fork and subtree - // Else, all Reduces must be only parallel-reduce, so parallelize + // Else, all Reduces must be only parallel-reduce, so parallelize // both if they fit and the larger if not. // The reason for this distinction is that we only perform Reduces over - // ThreadID-based values over consecutive CUDA threads, so there's no + // ThreadID-based values over consecutive CUDA threads, so there's no // opportunity for further nested parallelization. In contrast, this // restriction doesn't help for parallel Writes, so nested parallelization // is possible. @@ -817,10 +817,10 @@ int main() {{ } } - /* + /* * All non reduced-over collections used in fork joins have an extra dimension. - * However, this is only useful if ThreadIDs run in parallel not serially, - * otherwise it's unnecessarily consuming shared memory. This function returns + * However, this is only useful if ThreadIDs run in parallel not serially, + * otherwise it's unnecessarily consuming shared memory. This function returns * the set of collections that have an unnecessary extra dimension. */ fn get_extra_dim_collects( @@ -1036,8 +1036,8 @@ int main() {{ let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; } else { - // We can directly use use_thread_quota and not worry about available - // because Fork basic block's init section already does gating + // We can directly use use_thread_quota and not worry about available + // because Fork basic block's init section already does gating write!(w, "{}{} = (threadIdx.x % {}) / {};\n", tabs, define_variable, use_thread_quota.unwrap(), use_thread_quota.unwrap() / parallel_factor.unwrap())?; } } @@ -1046,8 +1046,8 @@ int main() {{ } } } - // The Reduce node only generates it's initialization, as reduct will - // perform the update. If serialized, add gate to prevent re-assignment + // The Reduce node only generates it's initialization, as reduct will + // perform the update. If serialized, add gate to prevent re-assignment // when we hit this reduce again due to the control flow loop between // the Fork and Join. 
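        // As a sketch (names are illustrative, not the verbatim output), the
        // serialized-Fork gate amounts to:
        //   if (<fork iteration> == 0) { <reduce value> = <init>; }
        // so the initializer runs only once even though control loops back
        // through the Fork on every iteration.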
Node::Reduce { @@ -1076,7 +1076,7 @@ int main() {{ Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); if (!is_primitive) { - let cg_tile = { + let cg_tile = { let KernelState::OutBlock = state else { panic!("Expected constant to be in start basic block outside any fork"); @@ -1153,9 +1153,9 @@ int main() {{ let left_val = self.get_value(*left, false, false); let right_val = self.get_value(*right, false, false); let id_type = self.typing[id.idx()]; - if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And + if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And | BinaryOperator::Xor) && is_special_reduct { - // For parallelized associative Reduces, use the cooperative + // For parallelized associative Reduces, use the cooperative // groups reduce API. Associative multiplication is not // supported. We need to use CGType::Use not CGType::UsePerId // because for parallelized reduction we only have one thread @@ -1349,7 +1349,7 @@ int main() {{ panic!("Unsupported data node type") } } - // Since the data uses and reducts are responsible for updating Phi and + // Since the data uses and reducts are responsible for updating Phi and // Reduce nodes, respectively, we check and emit those for each data node. if let Some(phis) = self.label_data_for_phi.get(&id) { let val = self.get_value(id, false, false); @@ -1403,12 +1403,12 @@ int main() {{ } Node::Fork { control: _, factors: _ } => { // We create a cooperative group tile for each of: used threads per - // thread ID- for reads and writes-, used threads across all thread + // thread ID- for reads and writes-, used threads across all thread // IDs- for parallelized reductions-, and available threads- to // synchronize between used and unused threads. We want to create // these only once, so we create two goto sections for each fork- // one run only once for creating groups, and other may be ran - // multiple times if the Fork is serialized and Join jumps back + // multiple times if the Fork is serialized and Join jumps back // to it. let cg_tile = self.get_cg_tile(id, CGType::UsePerId); if use_thread_quota.is_some() { @@ -1453,7 +1453,7 @@ int main() {{ } } Node::Join { control: _ } => { - // Join nodes also gate the used vs unused threads with a tile + // Join nodes also gate the used vs unused threads with a tile // sync after the body. let succ = self.control_subgraph.succs(id).next().unwrap(); let has_thread_quota = available_thread_quota.is_some(); @@ -1516,7 +1516,7 @@ int main() {{ /* * This function emits collection name + pointer math for the provided indices. * One nuance is whether the collection is represented as char pointer or - * the original primitive pointer. For Field, it's always char, for Variant, + * the original primitive pointer. For Field, it's always char, for Variant, * it doesn't matter here, and for Array, it depends- so we may need to tack * on the element size to the index math. */ @@ -1571,12 +1571,12 @@ int main() {{ /* * The outlined codegen for constants allows us to handle recursive initialization * for collections. We perform "allocation" by atomically incrementing dynamic - * shared memory and CUDA's support for dynamic is limited to a single extern + * shared memory and CUDA's support for dynamic is limited to a single extern * array. Dynamic is required here because not all dynamic constants and therefore - * array sizes are known. 
This approach will need further work, as currently - * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate` - * prevents unnecessary shared memory allocations for nested product and summation - * collections, since the outermost allocates everything for the full collection. + * array sizes are known. This approach will need further work, as currently + * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate` + * prevents unnecessary shared memory allocations for nested product and summation + * collections, since the outermost allocates everything for the full collection. * Since not initialized, array collections don't need to be recursed into. */ fn codegen_constant( @@ -1911,7 +1911,7 @@ int main() {{ /* * Setting `ty = true` will return with type in declaration format. `make_pointer` - * is only considered if `ty = true` and only relevant for primitive types- + * is only considered if `ty = true` and only relevant for primitive types- * otherwise it makes no difference because collections are already pointers. */ fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { -- GitLab From d8e05684b589a02e660b2868d9256b5cde06ccc8 Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Tue, 14 Jan 2025 16:47:56 -0600 Subject: [PATCH 048/109] use cuda as default device if cuda feature present --- hercules_cg/Cargo.toml | 3 +++ hercules_cg/src/device.rs | 2 ++ 2 files changed, 5 insertions(+) diff --git a/hercules_cg/Cargo.toml b/hercules_cg/Cargo.toml index cf0767de..0952ee57 100644 --- a/hercules_cg/Cargo.toml +++ b/hercules_cg/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" +[features] +cuda = [] + [dependencies] rand = "*" ordered-float = "*" diff --git a/hercules_cg/src/device.rs b/hercules_cg/src/device.rs index 866fa6ad..9f71c1a7 100644 --- a/hercules_cg/src/device.rs +++ b/hercules_cg/src/device.rs @@ -13,6 +13,8 @@ pub fn device_placement(functions: &Vec<Function>, callgraph: &CallGraph) -> Vec devices.push(device); } else if function.entry || callgraph.num_callees(FunctionID::new(idx)) != 0 { devices.push(Device::AsyncRust); + } else if cfg!(feature = "cuda") { + devices.push(Device::CUDA); } else { devices.push(Device::LLVM); } -- GitLab From 492f2a7ea361b1161cb8392f82b75797eb87dde1 Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Tue, 14 Jan 2025 16:57:14 -0600 Subject: [PATCH 049/109] propagate cuda feature in cargo.tomls --- hercules_opt/Cargo.toml | 3 +++ hercules_samples/fac/Cargo.toml | 3 +++ juno_build/Cargo.toml | 3 +++ juno_frontend/Cargo.toml | 3 +++ 4 files changed, 12 insertions(+) diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml index 84f6aca8..c8f7780f 100644 --- a/hercules_opt/Cargo.toml +++ b/hercules_opt/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" +[features] +cuda = ["hercules_cg/cuda"] + [dependencies] ordered-float = "*" bitvec = "*" diff --git a/hercules_samples/fac/Cargo.toml b/hercules_samples/fac/Cargo.toml index d4b9c5fe..350e3658 100644 --- a/hercules_samples/fac/Cargo.toml +++ b/hercules_samples/fac/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" +[features] +cuda = ["juno_build/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_build/Cargo.toml 
b/juno_build/Cargo.toml index 72faf4bd..13889171 100644 --- a/juno_build/Cargo.toml +++ b/juno_build/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" +[features] +cuda = ["juno_frontend/cuda"] + [dependencies] juno_frontend = { path = "../juno_frontend" } hercules_ir = { path = "../hercules_ir" } diff --git a/juno_frontend/Cargo.toml b/juno_frontend/Cargo.toml index 39e18baa..3c3d557f 100644 --- a/juno_frontend/Cargo.toml +++ b/juno_frontend/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" +[features] +cuda = ["hercules_opt/cuda"] + [[bin]] name = "juno" path = "src/main.rs" -- GitLab From 217e4d09c03a757d8befe2eda153bcb8ca43459a Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Tue, 14 Jan 2025 17:15:45 -0600 Subject: [PATCH 050/109] fix --- hercules_cg/src/gpu.rs | 118 +++++++++++----------- hercules_ir/src/ir.rs | 221 +---------------------------------------- 2 files changed, 65 insertions(+), 274 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 499ecce8..36666ac7 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -6,6 +6,8 @@ use std::fmt::{Error, Write}; use self::hercules_ir::*; +use crate::*; + /* * The top level function to compile a Hercules IR function into CUDA * kernel for execution on the GPU. We generate CUDA C textually, with a lot @@ -31,24 +33,24 @@ pub fn gpu_codegen<W: Write>( * one of multiple parameters. * * We don't assert but assume the following: - * - max_num_blocks in KernelParams is within constraint of 1D grid size. This + * - max_num_blocks in KernelParams is within constraint of 1D grid size. This * can be relaxed if we want to support larger grids. * - Product types are packed with padding inserted for each element to * be aligned for its type and for full product to be aligned to its * largest element * - Summation types must be aligned to their largest element - * + * * Notes on GPU parallelization strategy and tips for IR transformations: * - The top level block fork and any lower thread forks require a known Fork - * size. Thus for an otherwise parallelizable Fork with unknown size, + * size. Thus for an otherwise parallelizable Fork with unknown size, * consider splitting it into two Forks with one of known size. For block * level, the known fork has to be the (only) top-most fork. - * - The thread-level strategy is determined by starting at the most nested + * - The thread-level strategy is determined by starting at the most nested * Forks and working outwards in a greedy manner, with caps by GPU spec. * Thus, to ensure some outer Fork is parallelized, ensure the inner * parallelizable Forks aren't too large or consider removing schedule * annotations. - * - Tight-Associative reductions can only be efficiently implemented if + * - Tight-Associative reductions can only be efficiently implemented if * different Hercules ThreadIDs correspond to consecutive CUDA threads. But * this prevents nested parallelization since each parallel group must always * be a contiguous tile of threads. We use a heuristic of choosing the larger @@ -59,10 +61,10 @@ pub fn gpu_codegen<W: Write>( * Fork contains expensive parallelizable operations, ensure all reductions * are parallelizable or if not try pulling those out into a different Fork. * - We do nothing to mitigate intra-warp divergence. 
To mitigate this, the - * IR, for example, should ensure the innermost parallelizable Forks either + * IR, for example, should ensure the innermost parallelizable Forks either * have factor >= warp size (32) or remove Fork/Reduce node schedule * annotations. - * + * * Main TODOs: * - Fix dynamic shared memory allocation to reuse old shmem. The main case * for improvement is when we have serialized forks with unused intermediate @@ -135,7 +137,7 @@ pub fn gpu_codegen<W: Write>( } // Obtain the Return node and if it's a collection, use the collection objects - // analysis to determine the origin. Also save the return node id for later + // analysis to determine the origin. Also save the return node id for later // conversion of primitive Return into Parameter. let (return_node_id, data_node_id) = { let pos = function @@ -248,11 +250,11 @@ struct GPUContext<'a> { } /* - * For all control nodes besides forks, Init, Body, and Term compose the main basic - * block, with Init and Term populated by control flow (Init used only by Fork and - * Join) and Body populated by data flow. + * For all control nodes besides forks, Init, Body, and Term compose the main basic + * block, with Init and Term populated by control flow (Init used only by Fork and + * Join) and Body populated by data flow. * For serialized Fork nodes which may be jumped back to by corresponding Join node, - * init and post_init separate one-time code (currently just cooperative group + * init and post_init separate one-time code (currently just cooperative group * creation) from repeated code. */ #[derive(Default, Debug)] @@ -279,9 +281,9 @@ enum KernelState { } /* - * CGType is used to track cooperative group types. UsePerId is the group of (CUDA) + * CGType is used to track cooperative group types. UsePerId is the group of (CUDA) * threads for a current ThreadID, Use is the union of such threads for all ThreadIDs - * in the current innermost Fork, and Available is Use plus additional threads not + * in the current innermost Fork, and Available is Use plus additional threads not * used in the current Fork. */ #[derive(Clone, Copy, PartialEq, Debug)] @@ -375,7 +377,7 @@ namespace cg = cooperative_groups; #define roundi(a) (a) #define isqrt(a) ((int)sqrtf((float)(a))) -", +", )?; write!( @@ -463,12 +465,12 @@ namespace cg = cooperative_groups; Ok(()) } - // To abide by c++ reassignment restrictions, we declare all data values + // To abide by c++ reassignment restrictions, we declare all data values // upfront. fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { - if !self.function.nodes[id.idx()].is_control() && - !self.function.nodes[id.idx()].is_dynamic_constant() && + if !self.function.nodes[id.idx()].is_control() && + !self.function.nodes[id.idx()].is_dynamic_constant() && !self.function.nodes[id.idx()].is_parameter() { write!(w, "\t{};\n", self.get_value(id, true, false))?; } @@ -477,8 +479,8 @@ namespace cg = cooperative_groups; } /* - * Emit helper registers that are used throughout the kernel. grid and block - * are from CUDA's cooperative groups API and are used specifically for reads + * Emit helper registers that are used throughout the kernel. grid and block + * are from CUDA's cooperative groups API and are used specifically for reads * and writes. 
*/ fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { @@ -517,7 +519,7 @@ namespace cg = cooperative_groups; write!(w, " int main() {{ ")?; - // The following steps are for host-side C function arguments, but we also + // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let mut pass_args = String::new(); if run_debug { @@ -586,8 +588,8 @@ int main() {{ } write!(w, "\treturn 0;\n"); write!(w, "}}\n"); - } - + } + else { // The first set of parameters are dynamic constants. let mut first_param = true; @@ -633,9 +635,9 @@ int main() {{ * a) domination by F * b) no domination by F's join * c) no domination by any other fork that's also dominated by F, where we don't count self-domination - * Note that the fork_tree also includes the start node, to include all controls + * Note that the fork_tree also includes the start node, to include all controls * outside any fork. - * + * * Second, fork_control_map is a map from fork node to all control nodes (including itself) satisfying: * a) domination by F * b) no domination by F's join @@ -665,8 +667,8 @@ int main() {{ /* * If tree has a single root fork of known size s <= max_num_blocks - * with parallel-fork schedule, then set num_blocks to s, else set num_blocks - * to 1. Also return the root fork(s) for parallelization strategy within + * with parallel-fork schedule, then set num_blocks to s, else set num_blocks + * to 1. Also return the root fork(s) for parallelization strategy within * threadblocks for threads and their eventual generation. */ fn get_root_forks_and_num_blocks( @@ -720,10 +722,10 @@ int main() {{ * maximum over its descendants (leafs have base 1). We traverse up (details * in helper) and pass the factor and a map from fork node to a tuple of * (max quota of its siblings (including itself), its quota, its fork factor) - * from each node to its parents. The parent then compares + * from each node to its parents. The parent then compares * - all three are needed for codegen. A node is in the map IFF it will be parallelized. - * If not, the fork will use the parent's quota and serialize over the Fork's - * ThreadIDs. Nodes may be removed from the map when traversing up the tree + * If not, the fork will use the parent's quota and serialize over the Fork's + * ThreadIDs. Nodes may be removed from the map when traversing up the tree * due to an ancestor having a larger factor that conflicts. */ fn get_thread_quotas( @@ -743,9 +745,9 @@ int main() {{ is_root: bool, ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) { // Subsubtree map is the union of all keys for grandchildren and lower - // nodes. children_quota_map is a constructed map from parallelized children + // nodes. children_quota_map is a constructed map from parallelized children // to their quota to update the subsubtree map at grandchildren level to - // subtreemap at children level. subtree_quota is cumulative factor of + // subtreemap at children level. subtree_quota is cumulative factor of // subtree and is then compared to this fork's factor. 
let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree .get(&curr_fork) @@ -762,7 +764,7 @@ int main() {{ (subsubtree_map, children_quota_map, subtree_quota.max(curr_quota)) }, ); - // First update children_quota_map items with full information and add + // First update children_quota_map items with full information and add // to subsubtree_map for (&child, quota) in children_quota_map.iter() { let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else { @@ -780,7 +782,7 @@ int main() {{ // b) the known size is less than or equal to the max_num_threads // c) the known size is a power of 2 // d) all reduces are parallel-reduce or associative - // + // // If not, just take the max cumulative factor of its subtree let reduces = &self.fork_reduce_map[&curr_fork]; if let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()] @@ -792,12 +794,12 @@ int main() {{ || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) }) { - // If there's an associative Reduce, parallelize the larger factor + // If there's an associative Reduce, parallelize the larger factor // between the Fork and subtree - // Else, all Reduces must be only parallel-reduce, so parallelize + // Else, all Reduces must be only parallel-reduce, so parallelize // both if they fit and the larger if not. // The reason for this distinction is that we only perform Reduces over - // ThreadID-based values over consecutive CUDA threads, so there's no + // ThreadID-based values over consecutive CUDA threads, so there's no // opportunity for further nested parallelization. In contrast, this // restriction doesn't help for parallel Writes, so nested parallelization // is possible. @@ -817,10 +819,10 @@ int main() {{ } } - /* + /* * All non reduced-over collections used in fork joins have an extra dimension. - * However, this is only useful if ThreadIDs run in parallel not serially, - * otherwise it's unnecessarily consuming shared memory. This function returns + * However, this is only useful if ThreadIDs run in parallel not serially, + * otherwise it's unnecessarily consuming shared memory. This function returns * the set of collections that have an unnecessary extra dimension. */ fn get_extra_dim_collects( @@ -1036,8 +1038,8 @@ int main() {{ let fork_iter = self.get_fork_iter(*control, false); write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?; } else { - // We can directly use use_thread_quota and not worry about available - // because Fork basic block's init section already does gating + // We can directly use use_thread_quota and not worry about available + // because Fork basic block's init section already does gating write!(w, "{}{} = (threadIdx.x % {}) / {};\n", tabs, define_variable, use_thread_quota.unwrap(), use_thread_quota.unwrap() / parallel_factor.unwrap())?; } } @@ -1046,8 +1048,8 @@ int main() {{ } } } - // The Reduce node only generates it's initialization, as reduct will - // perform the update. If serialized, add gate to prevent re-assignment + // The Reduce node only generates it's initialization, as reduct will + // perform the update. If serialized, add gate to prevent re-assignment // when we hit this reduce again due to the control flow loop between // the Fork and Join. 
Node::Reduce { @@ -1076,7 +1078,7 @@ int main() {{ Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); if (!is_primitive) { - let cg_tile = { + let cg_tile = { let KernelState::OutBlock = state else { panic!("Expected constant to be in start basic block outside any fork"); @@ -1153,9 +1155,9 @@ int main() {{ let left_val = self.get_value(*left, false, false); let right_val = self.get_value(*right, false, false); let id_type = self.typing[id.idx()]; - if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And + if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And | BinaryOperator::Xor) && is_special_reduct { - // For parallelized associative Reduces, use the cooperative + // For parallelized associative Reduces, use the cooperative // groups reduce API. Associative multiplication is not // supported. We need to use CGType::Use not CGType::UsePerId // because for parallelized reduction we only have one thread @@ -1349,7 +1351,7 @@ int main() {{ panic!("Unsupported data node type") } } - // Since the data uses and reducts are responsible for updating Phi and + // Since the data uses and reducts are responsible for updating Phi and // Reduce nodes, respectively, we check and emit those for each data node. if let Some(phis) = self.label_data_for_phi.get(&id) { let val = self.get_value(id, false, false); @@ -1403,12 +1405,12 @@ int main() {{ } Node::Fork { control: _, factors: _ } => { // We create a cooperative group tile for each of: used threads per - // thread ID- for reads and writes-, used threads across all thread + // thread ID- for reads and writes-, used threads across all thread // IDs- for parallelized reductions-, and available threads- to // synchronize between used and unused threads. We want to create // these only once, so we create two goto sections for each fork- // one run only once for creating groups, and other may be ran - // multiple times if the Fork is serialized and Join jumps back + // multiple times if the Fork is serialized and Join jumps back // to it. let cg_tile = self.get_cg_tile(id, CGType::UsePerId); if use_thread_quota.is_some() { @@ -1453,7 +1455,7 @@ int main() {{ } } Node::Join { control: _ } => { - // Join nodes also gate the used vs unused threads with a tile + // Join nodes also gate the used vs unused threads with a tile // sync after the body. let succ = self.control_subgraph.succs(id).next().unwrap(); let has_thread_quota = available_thread_quota.is_some(); @@ -1516,7 +1518,7 @@ int main() {{ /* * This function emits collection name + pointer math for the provided indices. * One nuance is whether the collection is represented as char pointer or - * the original primitive pointer. For Field, it's always char, for Variant, + * the original primitive pointer. For Field, it's always char, for Variant, * it doesn't matter here, and for Array, it depends- so we may need to tack * on the element size to the index math. */ @@ -1571,12 +1573,12 @@ int main() {{ /* * The outlined codegen for constants allows us to handle recursive initialization * for collections. We perform "allocation" by atomically incrementing dynamic - * shared memory and CUDA's support for dynamic is limited to a single extern + * shared memory and CUDA's support for dynamic is limited to a single extern * array. Dynamic is required here because not all dynamic constants and therefore - * array sizes are known. 
This approach will need further work, as currently - * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate` - * prevents unnecessary shared memory allocations for nested product and summation - * collections, since the outermost allocates everything for the full collection. + * array sizes are known. This approach will need further work, as currently + * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate` + * prevents unnecessary shared memory allocations for nested product and summation + * collections, since the outermost allocates everything for the full collection. * Since not initialized, array collections don't need to be recursed into. */ fn codegen_constant( @@ -1911,7 +1913,7 @@ int main() {{ /* * Setting `ty = true` will return with type in declaration format. `make_pointer` - * is only considered if `ty = true` and only relevant for primitive types- + * is only considered if `ty = true` and only relevant for primitive types- * otherwise it makes no difference because collections are already pointers. */ fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String { diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index ba01d8bf..3ae79a84 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -829,16 +829,16 @@ impl Type { } } - pub fn is_summation(&self) -> bool { - if let Type::Summation(_) = self { + pub fn is_array(&self) -> bool { + if let Type::Array(_, _) = self { true } else { false } } - pub fn is_array(&self) -> bool { - if let Type::Array(_, _) = self { + pub fn is_summation(&self) -> bool { + if let Type::Summation(_) = self { true } else { false @@ -1000,218 +1000,6 @@ impl DynamicConstant { } } -#[derive(Default, Clone)] -struct DynamicConstantRange { - min: isize, - max: isize, -} - -// The ith element is the exponent of the ith parameter, all together giving a -// unique key for each combination of parameters aka term. -#[derive(Eq, PartialEq, Hash)] -struct ParamKey(Vec<isize>); - -pub fn dynamic_constant_cmp( - a: DynamicConstantID, - b: DynamicConstantID, - dcs: &Vec<DynamicConstant>, - num_params: usize, -) -> Result<Option<Ordering>, Error> { - fn dynamic_constant_evaluation_iter( - a: DynamicConstantID, - dcs: &Vec<DynamicConstant>, - num_params: usize, - ) -> Result<HashMap<ParamKey, DynamicConstantRange>, Error> { - // We evaluate each dynamic constant by constructing range for each "term", - // aka unique combination of parameter exponents (eg param1^0 * param2^0 - // aka scalar represented by [0, 0] or param1^1 * param2^2 by [1, 2]). - // Range instead of single value is needed due to use of modulo. - let mut ranges = HashMap::new(); - match dcs[a.idx()] { - DynamicConstant::Parameter(idx) => { - let mut param_vec = vec![0; num_params]; - param_vec[idx] = 1; - ranges.insert(ParamKey(param_vec), DynamicConstantRange { min: 1, max: 1 }); - } - DynamicConstant::Constant(cons) => { - let param_vec = vec![0; num_params]; - ranges.insert( - ParamKey(param_vec), - DynamicConstantRange { - min: cons.try_into().map_err(|_| Error)?, - max: cons.try_into().map_err(|_| Error)?, - }, - ); - } - DynamicConstant::Add(left, right) => { - // Add same-form terms by adding their values. 
- let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - ranges.extend(left_ranges); - for r in right_ranges { - if let Some(l) = ranges.get_mut(&r.0) { - l.min += r.1.min; - l.max += r.1.max; - } else { - ranges.insert(r.0, r.1); - } - } - } - DynamicConstant::Sub(left, right) => { - // Subtract same-form terms by subtracting their values. - let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - ranges.extend(left_ranges); - for r in right_ranges { - if let Some(l) = ranges.get_mut(&r.0) { - l.min -= r.1.max; - l.max -= r.1.min; - } else { - ranges.insert( - r.0, - DynamicConstantRange { - min: -r.1.max, - max: -r.1.min, - }, - ); - } - } - } - DynamicConstant::Mul(left, right) => { - // Pairwise multiply each term by elementwise adding the two - // exponent keys and multiplying the values. - let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - for l in left_ranges { - for r in right_ranges.iter() { - let mut param_vec = l.0 .0.clone(); - for (idx, r_val) in r.0 .0.iter().enumerate() { - param_vec[idx] += r_val; - } - ranges.insert( - ParamKey(param_vec), - DynamicConstantRange { - min: l.1.min * r.1.min, - max: l.1.max * r.1.max, - }, - ); - } - } - } - DynamicConstant::Div(left, right) => { - // Pairwise divide each term by elementwise subtracting the two - // exponent keys and dividing the values. - let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - for l in left_ranges { - for r in right_ranges.iter() { - let mut param_vec = l.0 .0.clone(); - for (idx, r_val) in r.0 .0.iter().enumerate() { - param_vec[idx] -= r_val; - } - ranges.insert( - ParamKey(param_vec), - DynamicConstantRange { - min: l.1.min / r.1.min, - max: l.1.max / r.1.max, - }, - ); - } - } - } - DynamicConstant::Rem(left, right) => { - // We do simplest check for 0 or scalar multiple, and ignore all - // other cases of pure multiple. If check fails, the remainder is - // somewhere between 0 and the right value. - let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let mut is_zero = true; - for l in left_ranges.iter() { - if l.1.min != 0 || l.1.max != 0 { - is_zero = false; - break; - } - } - if is_zero { - return Ok(ranges); - } - - // Scalar multiple requires both that all right terms have left - // term with same positive multiplier, and there are no - // outstanding left terms after matching. 
- let mut is_scalar_multiple = true; - let mut scalar_factor = 0; - let mut remaining_left_terms = left_ranges.len(); - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - for r in right_ranges.iter() { - if let Some(l_range) = left_ranges.get(r.0) { - if l_range.min != l_range.max || r.1.min != r.1.max || l_range.min % r.1.min != 0 || (scalar_factor != 0 && l_range.min / r.1.min != scalar_factor) { - is_scalar_multiple = false; - break; - } - scalar_factor = l_range.min / r.1.min; - remaining_left_terms -= 1; - } - } - if is_scalar_multiple && scalar_factor >= 0 && remaining_left_terms == 0 { - return Ok(ranges); - } - - for r in right_ranges { - ranges.insert( - r.0, - DynamicConstantRange { - min: min(0, r.1.min), - max: max(0, r.1.max), - }, - ); - } - } - } - Ok(ranges) - } - - let a_ranges = dynamic_constant_evaluation_iter(a, dcs, num_params)?; - let b_ranges = dynamic_constant_evaluation_iter(b, dcs, num_params)?; - // a >= b iff a's min >= b's max. >= requires all terms in b to satisfy: - // if also in a, then a's coef >= b's coef; if not in a, have b's coef <= 0. - let mut a_is_greater = true; - for b in b_ranges.iter() { - if let Some(a) = a_ranges.get(b.0) { - if a.min < b.1.max { - a_is_greater = false; - break; - } - } else if b.1.min > 0 { - a_is_greater = false; - break; - } - } - - // Now check if b >= a. - let mut b_is_greater = true; - for a in a_ranges.iter() { - if let Some(b) = b_ranges.get(a.0) { - if b.min < a.1.max { - b_is_greater = false; - break; - } - } else if a.1.min > 0 { - b_is_greater = false; - break; - } - } - - if a_is_greater && b_is_greater { - Ok(Some(Ordering::Equal)) - } else if a_is_greater { - Ok(Some(Ordering::Greater)) - } else if b_is_greater { - Ok(Some(Ordering::Less)) - } else { - Ok(None) - } -} - pub fn evaluate_dynamic_constant( cons: DynamicConstantID, dcs: &Vec<DynamicConstant>, @@ -1363,6 +1151,7 @@ impl Node { selection: _ } ); + define_pattern_predicate!(is_undef, Node::Undef { ty: _ }); pub fn try_region(&self) -> Option<&[NodeID]> { -- GitLab From 5e3de6c327f1de7d1b0932bf4e57b83650fd25ae Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Tue, 14 Jan 2025 17:50:23 -0600 Subject: [PATCH 051/109] gpu --- hercules_cg/src/gpu.rs | 87 +++++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 36666ac7..477c5a9f 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -26,7 +26,6 @@ pub fn gpu_codegen<W: Write>( ) -> Result<(), Error> { /* * We assert the following: - * - There is at least one Fork node * - Fork node must have >= 1 Reduce nodes * - If the returned data type is a collection, it must have * originated from a single known parameter. 
Can relax to allow @@ -124,9 +123,6 @@ pub fn gpu_codegen<W: Write>( } } } - if fork_reduce_map.is_empty() { - panic!("Function must have at least one fork node"); - } for idx in 0..function.nodes.len() { if function.nodes[idx].is_fork() && fork_reduce_map @@ -305,16 +301,6 @@ impl GPUContext<'_> { self.codegen_goto_start(&mut top)?; write!(w, "{}", top)?; - // Create structures and determine block and thread parallelization strategy - let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); - let (root_forks, num_blocks) = - self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); - let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); - let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); - // TODO: Uncomment and adjust once we know logic of extra dim - // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); - let extra_dim_collects = HashSet::new(); - // Setup for CUDA's "goto" for control flow between basic blocks. let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len()) .filter(|idx| self.function.nodes[*idx].is_control()) @@ -325,22 +311,39 @@ impl GPUContext<'_> { }) .collect(); - // Core function for the CUDA code of all data and control nodes. - self.codegen_data_control( - if num_blocks > 1 { - Some(thread_root_root_fork) - } else { - None - }, - &thread_root_forks, - &fork_tree, - &fork_control_map, - &fork_thread_quota_map, - &extra_dim_collects, - &mut dynamic_shared_offset, - num_threads, - &mut gotos, - )?; + // If there are no forks, fast forward to single-block, single-thread codegen + let (num_blocks, num_threads) = if self.fork_join_map.is_empty() { + self.codegen_data_control_no_forks(&HashSet::new(), &mut dynamic_shared_offset, &mut gotos)?; + (1, 1) + } else { + // Create structures and determine block and thread parallelization strategy + let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); + let (root_forks, num_blocks) = + self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); + let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); + let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); + // TODO: Uncomment and adjust once we know logic of extra dim + // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); + let extra_dim_collects = HashSet::new(); + + // Core function for the CUDA code of all data and control nodes. 
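            // At this point fork_tree / fork_control_map describe the nesting,
            // the block pass above chose num_blocks (for example, a single root
            // fork of known factor 64 with a parallel-fork schedule yields
            // num_blocks = 64), and the thread pass chose per-fork quotas that
            // num_threads summarizes.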
+ self.codegen_data_control( + if num_blocks > 1 { + Some(thread_root_root_fork) + } else { + None + }, + &thread_root_forks, + &fork_tree, + &fork_control_map, + &fork_thread_quota_map, + &extra_dim_collects, + &mut dynamic_shared_offset, + num_threads, + &mut gotos, + )?; + (num_blocks, num_threads) + }; // Emit all GPU kernel code from previous steps let mut kernel_body = String::new(); @@ -856,6 +859,29 @@ int main() {{ .collect() } + fn codegen_data_control_no_forks( + &self, + extra_dim_collects: &HashSet<TypeID>, + dynamic_shared_offset: &mut String, + gotos: &mut BTreeMap<NodeID, CudaGoto>, + ) -> Result<(), Error> { + (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_control()) + .try_for_each(|idx| -> Result<(), Error> { + let control = NodeID::new(idx); + let goto = gotos.get_mut(&control).unwrap(); + let init = &mut goto.init; + let post_init = &mut goto.post_init; + let body = &mut goto.body; + let term = &mut goto.term; + let mut tabs = self.codegen_control_node(control, None, None, None, init, post_init, term)?; + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_node(*data, KernelState::OutBlock, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + } + Ok(()) + }) + } + /* * Codegen for all control and data nodes. */ @@ -1348,6 +1374,7 @@ int main() {{ write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } _ => { + println!("Unsupported data node type: {:?}", self.function.nodes[id.idx()]); panic!("Unsupported data node type") } } -- GitLab From a864072fb6ccffd510c3b6f39f8713abdd20d4ec Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Tue, 14 Jan 2025 18:05:25 -0600 Subject: [PATCH 052/109] fix --- hercules_cg/src/device.rs | 2 - hercules_ir/src/ir.rs | 239 ++------------------------------------ 2 files changed, 7 insertions(+), 234 deletions(-) diff --git a/hercules_cg/src/device.rs b/hercules_cg/src/device.rs index 174c7e9d..9f71c1a7 100644 --- a/hercules_cg/src/device.rs +++ b/hercules_cg/src/device.rs @@ -9,8 +9,6 @@ pub fn device_placement(functions: &Vec<Function>, callgraph: &CallGraph) -> Vec let mut devices = vec![]; for (idx, function) in functions.into_iter().enumerate() { - devices.push(Device::CUDA); - continue; if let Some(device) = function.device { devices.push(device); } else if function.entry || callgraph.num_callees(FunctionID::new(idx)) != 0 { diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 97330e63..822b3f45 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -1,19 +1,13 @@ -extern crate bitvec; -extern crate ordered_float; -extern crate serde; - -use self::bitvec::prelude::*; -use self::serde::Deserialize; -use self::serde::Serialize; -use std::cmp::Ordering; -use std::cmp::{max, min}; -use std::collections::HashMap; -use std::convert::TryInto; -use std::fmt::{Error, Write}; +use std::fmt::Write; use std::ops::Coroutine; use std::ops::CoroutineState; use std::pin::Pin; +use bitvec::prelude::*; +use ordered_float::OrderedFloat; +use serde::Deserialize; +use serde::Serialize; + use crate::*; /* @@ -851,14 +845,6 @@ impl Type { } } - pub fn is_summation(&self) -> bool { - if let Type::Summation(_) = self { - true - } else { - false - } - } - pub fn try_element_type(&self) -> Option<TypeID> { if let Type::Array(elem, _) = self { Some(*elem) @@ -1014,218 +1000,6 @@ impl DynamicConstant { } } -#[derive(Default, Clone)] -struct DynamicConstantRange { - min: isize, - max: isize, -} - -// The ith element is the 
exponent of the ith parameter, all together giving a -// unique key for each combination of parameters aka term. -#[derive(Eq, PartialEq, Hash)] -struct ParamKey(Vec<isize>); - -pub fn dynamic_constant_cmp( - a: DynamicConstantID, - b: DynamicConstantID, - dcs: &Vec<DynamicConstant>, - num_params: usize, -) -> Result<Option<Ordering>, Error> { - fn dynamic_constant_evaluation_iter( - a: DynamicConstantID, - dcs: &Vec<DynamicConstant>, - num_params: usize, - ) -> Result<HashMap<ParamKey, DynamicConstantRange>, Error> { - // We evaluate each dynamic constant by constructing range for each "term", - // aka unique combination of parameter exponents (eg param1^0 * param2^0 - // aka scalar represented by [0, 0] or param1^1 * param2^2 by [1, 2]). - // Range instead of single value is needed due to use of modulo. - let mut ranges = HashMap::new(); - match dcs[a.idx()] { - DynamicConstant::Parameter(idx) => { - let mut param_vec = vec![0; num_params]; - param_vec[idx] = 1; - ranges.insert(ParamKey(param_vec), DynamicConstantRange { min: 1, max: 1 }); - } - DynamicConstant::Constant(cons) => { - let param_vec = vec![0; num_params]; - ranges.insert( - ParamKey(param_vec), - DynamicConstantRange { - min: cons.try_into().map_err(|_| Error)?, - max: cons.try_into().map_err(|_| Error)?, - }, - ); - } - DynamicConstant::Add(left, right) => { - // Add same-form terms by adding their values. - let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - ranges.extend(left_ranges); - for r in right_ranges { - if let Some(l) = ranges.get_mut(&r.0) { - l.min += r.1.min; - l.max += r.1.max; - } else { - ranges.insert(r.0, r.1); - } - } - } - DynamicConstant::Sub(left, right) => { - // Subtract same-form terms by subtracting their values. - let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - ranges.extend(left_ranges); - for r in right_ranges { - if let Some(l) = ranges.get_mut(&r.0) { - l.min -= r.1.max; - l.max -= r.1.min; - } else { - ranges.insert( - r.0, - DynamicConstantRange { - min: -r.1.max, - max: -r.1.min, - }, - ); - } - } - } - DynamicConstant::Mul(left, right) => { - // Pairwise multiply each term by elementwise adding the two - // exponent keys and multiplying the values. - let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - for l in left_ranges { - for r in right_ranges.iter() { - let mut param_vec = l.0 .0.clone(); - for (idx, r_val) in r.0 .0.iter().enumerate() { - param_vec[idx] += r_val; - } - ranges.insert( - ParamKey(param_vec), - DynamicConstantRange { - min: l.1.min * r.1.min, - max: l.1.max * r.1.max, - }, - ); - } - } - } - DynamicConstant::Div(left, right) => { - // Pairwise divide each term by elementwise subtracting the two - // exponent keys and dividing the values. 
- let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - for l in left_ranges { - for r in right_ranges.iter() { - let mut param_vec = l.0 .0.clone(); - for (idx, r_val) in r.0 .0.iter().enumerate() { - param_vec[idx] -= r_val; - } - ranges.insert( - ParamKey(param_vec), - DynamicConstantRange { - min: l.1.min / r.1.min, - max: l.1.max / r.1.max, - }, - ); - } - } - } - DynamicConstant::Rem(left, right) => { - // We do simplest check for 0 or scalar multiple, and ignore all - // other cases of pure multiple. If check fails, the remainder is - // somewhere between 0 and the right value. - let left_ranges = dynamic_constant_evaluation_iter(left, dcs, num_params)?; - let mut is_zero = true; - for l in left_ranges.iter() { - if l.1.min != 0 || l.1.max != 0 { - is_zero = false; - break; - } - } - if is_zero { - return Ok(ranges); - } - - // Scalar multiple requires both that all right terms have left - // term with same positive multiplier, and there are no - // outstanding left terms after matching. - let mut is_scalar_multiple = true; - let mut scalar_factor = 0; - let mut remaining_left_terms = left_ranges.len(); - let right_ranges = dynamic_constant_evaluation_iter(right, dcs, num_params)?; - for r in right_ranges.iter() { - if let Some(l_range) = left_ranges.get(r.0) { - if l_range.min != l_range.max || r.1.min != r.1.max || l_range.min % r.1.min != 0 || (scalar_factor != 0 && l_range.min / r.1.min != scalar_factor) { - is_scalar_multiple = false; - break; - } - scalar_factor = l_range.min / r.1.min; - remaining_left_terms -= 1; - } - } - if is_scalar_multiple && scalar_factor >= 0 && remaining_left_terms == 0 { - return Ok(ranges); - } - - for r in right_ranges { - ranges.insert( - r.0, - DynamicConstantRange { - min: min(0, r.1.min), - max: max(0, r.1.max), - }, - ); - } - } - } - Ok(ranges) - } - - let a_ranges = dynamic_constant_evaluation_iter(a, dcs, num_params)?; - let b_ranges = dynamic_constant_evaluation_iter(b, dcs, num_params)?; - // a >= b iff a's min >= b's max. >= requires all terms in b to satisfy: - // if also in a, then a's coef >= b's coef; if not in a, have b's coef <= 0. - let mut a_is_greater = true; - for b in b_ranges.iter() { - if let Some(a) = a_ranges.get(b.0) { - if a.min < b.1.max { - a_is_greater = false; - break; - } - } else if b.1.min > 0 { - a_is_greater = false; - break; - } - } - - // Now check if b >= a. 
- let mut b_is_greater = true; - for a in a_ranges.iter() { - if let Some(b) = b_ranges.get(a.0) { - if b.min < a.1.max { - b_is_greater = false; - break; - } - } else if a.1.min > 0 { - b_is_greater = false; - break; - } - } - - if a_is_greater && b_is_greater { - Ok(Some(Ordering::Equal)) - } else if a_is_greater { - Ok(Some(Ordering::Greater)) - } else if b_is_greater { - Ok(Some(Ordering::Less)) - } else { - Ok(None) - } -} - pub fn evaluate_dynamic_constant( cons: DynamicConstantID, dcs: &Vec<DynamicConstant>, @@ -1377,6 +1151,7 @@ impl Node { selection: _ } ); + define_pattern_predicate!(is_undef, Node::Undef { ty: _ }); pub fn try_region(&self) -> Option<&[NodeID]> { -- GitLab From ebcdf33b89d9e97230d92d8602745d2903fe79f0 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 08:31:21 -0600 Subject: [PATCH 053/109] objs --- hercules_opt/src/pass.rs | 49 ++- test.jn | 752 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 791 insertions(+), 10 deletions(-) create mode 100644 test.jn diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index 7366a336..a9654b06 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -970,36 +970,65 @@ impl PassManager { _ => todo!(), } } - println!("{}", cuda_ir); println!("{}", llvm_ir); + println!("{}", cuda_ir); println!("{}", rust_rt); // Write the LLVM IR into a temporary file. let tmp_dir = TempDir::new().unwrap(); - let mut tmp_path = tmp_dir.path().to_path_buf(); - tmp_path.push(format!("{}.ll", module_name)); - println!("{}", tmp_path.display()); - let mut file = File::create(&tmp_path) + let mut llvm_path = tmp_dir.path().to_path_buf(); + llvm_path.push(format!("{}.ll", module_name)); + println!("{}", llvm_path.display()); + let mut file = File::create(&llvm_path) .expect("PANIC: Unable to open output LLVM IR file."); file.write_all(llvm_ir.as_bytes()) .expect("PANIC: Unable to write output LLVM IR file contents."); // Compile LLVM IR into an ELF object file. - let output_archive = format!("{}/lib{}.a", output_dir, module_name); - println!("{}", output_archive); + let llvm_object = format!("{}/{}_cpu.o", tmp_dir.path().to_str().unwrap(), module_name); let mut clang_process = Command::new("clang") - .arg(&tmp_path) - .arg("--emit-static-lib") + .arg(&llvm_path) + .arg("-c") .arg("-O3") .arg("-march=native") .arg("-o") - .arg(&output_archive) + .arg(&llvm_object) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .spawn() .expect("Error running clang. Is it installed?"); assert!(clang_process.wait().unwrap().success()); + // Write the CUDA IR into a temporary file. + let mut cuda_path = tmp_dir.path().to_path_buf(); + cuda_path.push(format!("{}.cu", module_name)); + let mut file = File::create(&cuda_path) + .expect("PANIC: Unable to open output CUDA IR file."); + file.write_all(cuda_ir.as_bytes()) + .expect("PANIC: Unable to write output CUDA IR file contents."); + + let cuda_object = format!("{}/{}_cuda.o", tmp_dir.path().to_str().unwrap(), module_name); + let mut nvcc_process = Command::new("nvcc") + .arg("-c") + .arg("-O3") + .arg("-o") + .arg(&cuda_object) + .arg(&cuda_path) + .spawn() + .expect("Error running nvcc. Is it installed?"); + assert!(nvcc_process.wait().unwrap().success()); + + let output_archive = format!("{}/lib{}.a", output_dir, module_name); + println!("{}", output_archive); + let mut ar_process = Command::new("ar") + .arg("crus") + .arg(&output_archive) + .arg(&llvm_object) + .arg(&cuda_object) + .spawn() + .expect("Error running ar. 
Is it installed?"); + assert!(ar_process.wait().unwrap().success()); + // Write the Rust runtime into a file. let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name); println!("{}", output_rt); diff --git a/test.jn b/test.jn new file mode 100644 index 00000000..5e9eb513 --- /dev/null +++ b/test.jn @@ -0,0 +1,752 @@ +fn Flat1d<t:number, n:usize>(X: t[n]) -> t[n] { + return X; +} +fn Flat1dB0<t:number, n:usize>(X: t[1]) -> t[n] { + let Z : t[n]; + for i = 0 to n { + Z[i] = X[0]; + } + return Z; +} +fn Flat1dI0<t:number, n:usize, p:usize>(X: t[p]) -> t[n * p] { + let Z : t[n * p]; + for i = 0 to n { + for j = 0 to p { + Z[i * p + j] = X[j]; + } + } + return Z; +} +fn Flat1dI1<t:number, n:usize, p:usize>(X: t[n]) -> t[n * p] { + let Z : t[n * p]; + for i = 0 to n { + for j = 0 to p { + Z[i * p + j] = X[i]; + } + } + return Z; +} +fn Flat2d<t:number, n:usize, p:usize>(X: t[n, p]) -> t[n * p] { + let Z : t[n * p]; + for i = 0 to n { + for j = 0 to p { + Z[i * p + j] = X[i, j]; + } + } + return Z; +} +fn Flat2dB0<t:number, n:usize, p:usize>(X: t[1, p]) -> t[n * p] { + let Z : t[n * p]; + for i = 0 to n { + for j = 0 to p { + Z[i * p + j] = X[0, j]; + } + } + return Z; +} +fn Flat2dB1<t:number, n:usize, p:usize>(X: t[n, 1]) -> t[n * p] { + let Z : t[n * p]; + for i = 0 to n { + for j = 0 to p { + Z[i * p + j] = X[i, 0]; + } + } + return Z; +} +fn Flat2dI0<t:number, n:usize, p:usize, q:usize>(X: t[p, q]) -> t[n * p * q] { + let Z : t[n * p * q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[i * p * q + j * q + k] = X[j, k]; + } + } + } + return Z; +} +fn Flat2dI1<t:number, n:usize, p:usize, q:usize>(X: t[n, q]) -> t[n * p * q] { + let Z : t[n * p * q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[i * p * q + j * q + k] = X[i, k]; + } + } + } + return Z; +} +fn Flat2dI2<t:number, n:usize, p:usize, q:usize>(X: t[n, p]) -> t[n * p * q] { + let Z : t[n * p * q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[i * p * q + j * q + k] = X[i, j]; + } + } + } + return Z; +} +fn Flat3d<t:number, n:usize, p:usize, q:usize>(X: t[n, p, q]) -> t[n * p * q] { + let Z : t[n * p * q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[i * p * q + j * q + k] = X[i, j, k]; + } + } + } + return Z; +} +fn Flat3dB0<t:number, n:usize, p:usize, q:usize>(X: t[1, p, q]) -> t[n * p * q] { + let Z : t[n * p * q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[i * p * q + j * q + k] = X[0, j, k]; + } + } + } + return Z; +} +fn Flat3dB1<t:number, n:usize, p:usize, q:usize>(X: t[n, 1, q]) -> t[n * p * q] { + let Z : t[n * p * q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[i * p * q + j * q + k] = X[i, 0, k]; + } + } + } + return Z; +} +fn Flat3dB2<t:number, n:usize, p:usize, q:usize>(X: t[n, p, 1]) -> t[n * p * q] { + let Z : t[n * p * q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[i * p * q + j * q + k] = X[i, j, 0]; + } + } + } + return Z; +} +fn Flat3dI0<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[p, q, r]) -> t[n * p * q * r] { + let Z : t[n * p * q * r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i * p * q * r + j * q * r + k * r + l] = X[j, k, l]; + } + } + } + } + return Z; +} +fn Flat3dI1<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, q, r]) -> t[n * p * q * r] { + let Z : t[n * p * q * r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i * p * q * r + j * q * r + k * r + l] = X[i, k, l]; + } + } + } + } + return Z; +} +fn 
Flat3dI2<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, r]) -> t[n * p * q * r] { + let Z : t[n * p * q * r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, l]; + } + } + } + } + return Z; +} +fn Flat3dI3<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q]) -> t[n * p * q * r] { + let Z : t[n * p * q * r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, k]; + } + } + } + } + return Z; +} +fn Flat4d<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[n * p * q * r] { + let Z : t[n * p * q * r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, k, l]; + } + } + } + } + return Z; +} +fn Flat4dB0<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[1, p, q, r]) -> t[n * p * q * r] { + let Z : t[n * p * q * r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i * p * q * r + j * q * r + k * r + l] = X[0, j, k, l]; + } + } + } + } + return Z; +} +fn Flat4dB1<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, 1, q, r]) -> t[n * p * q * r] { + let Z : t[n * p * q * r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i * p * q * r + j * q * r + k * r + l] = X[i, 0, k, l]; + } + } + } + } + return Z; +} +fn Flat4dB2<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, 1, r]) -> t[n * p * q * r] { + let Z : t[n * p * q * r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, 0, l]; + } + } + } + } + return Z; +} +fn Flat4dB3<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, 1]) -> t[n * p * q * r] { + let Z : t[n * p * q * r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, k, 0]; + } + } + } + } + return Z; +} +fn Unflat2d<t:number, n:usize, p:usize>(X: t[n * p]) -> t[n, p] { + let Z : t[n, p]; + for i = 0 to n { + for j = 0 to p { + Z[i, j] = X[i * p + j]; + } + } + return Z; +} +fn Unflat3d<t:number, n:usize, p:usize, q:usize>(X: t[n * p * q]) -> t[n, p, q] { + let Z : t[n, p, q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[i, j, k] = X[i * p * q + j * q + k]; + } + } + } + return Z; +} +fn Unflat4d<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n * p * q * r]) -> t[n, p, q, r] { + let Z : t[n, p, q, r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i, j, k, l] = X[i * p * q * r + j * q * r + k * r + l]; + } + } + } + } + return Z; +} +fn Permute2d<t:number, n:usize, p:usize>(X: t[n, p]) -> t[p, n] { + let Z : t[p, n]; + for i = 0 to n { + for j = 0 to p { + Z[j, i] = X[i, j]; + } + } + return Z; +} +fn Permute3d01<t:number, n:usize, p:usize, q:usize>(X: t[n, p, q]) -> t[p, n, q] { + let Z : t[p, n, q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[j, i, k] = X[i, j, k]; + } + } + } + return Z; +} +fn Permute3d02<t:number, n:usize, p:usize, q:usize>(X: t[n, p, q]) -> t[q, p, n] { + let Z : t[q, p, n]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[k, j, i] = X[i, j, k]; + } + } + } + return Z; +} +fn Permute3d12<t:number, n:usize, p:usize, q:usize>(X: t[n, p, q]) -> t[n, q, p] { + let Z : t[n, q, p]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + Z[i, k, j] = X[i, j, k]; + } + } + } + return Z; +} 
+fn Permute4d01<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[p, n, q, r] { + let Z : t[p, n, q, r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[j, i, k, l] = X[i, j, k, l]; + } + } + } + } + return Z; +} +fn Permute4d02<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[q, p, n, r] { + let Z : t[q, p, n, r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[k, j, i, l] = X[i, j, k, l]; + } + } + } + } + return Z; +} +fn Permute4d03<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[r, p, q, n] { + let Z : t[r, p, q, n]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[l, j, k, i] = X[i, j, k, l]; + } + } + } + } + return Z; +} +fn Permute4d12<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[n, q, p, r] { + let Z : t[n, q, p, r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i, k, j, l] = X[i, j, k, l]; + } + } + } + } + return Z; +} +fn Permute4d13<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[n, r, q, p] { + let Z : t[n, r, q, p]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i, l, k, j] = X[i, j, k, l]; + } + } + } + } + return Z; +} +fn Permute4d23<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[n, p, r, q] { + let Z : t[n, p, r, q]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to q { + for l = 0 to r { + Z[i, j, l, k] = X[i, j, k, l]; + } + } + } + } + return Z; +} +fn Add<t:number, n:usize>(X: t[n], Y: t[n]) -> t[n] { + let Z : t[n]; + for i = 0 to n { + Z[i] = X[i] + Y[i]; + } + return Z; +} +fn AddConst<t:number, n:usize>(X: t[n], Y: t) -> t[n] { + let Z : t[n]; + for i = 0 to n { + Z[i] = X[i] + Y; + } + return Z; +} +fn Mul<t:number, n:usize>(X: t[n], Y: t[n]) -> t[n] { + let Z : t[n]; + for i = 0 to n { + Z[i] = X[i] * Y[i]; + } + return Z; +} +fn MulConst<t:number, n:usize>(X: t[n], Y: t) -> t[n] { + let Z : t[n]; + for i = 0 to n { + Z[i] = X[i] * Y; + } + return Z; +} +fn HardTanh<t:number, n:usize>(X: t[n], Min: t, Max: t) -> t[n] { + let Z : t[n]; + for i = 0 to n { + Z[i] = max!::<t>(Min, min!::<t>(X[i], Max)); + } + return Z; +} +fn Mean<t: number, n:usize, p:usize>(X: t[n, p]) -> t[n] { + let Z : t[n]; + for i = 0 to n { + let sum : t = 0; + for j = 0 to p { + sum += X[i, j]; + } + Z[i] = sum / (p as t); + } + return Z; +} +fn MeanKD<t: number, n:usize, p:usize>(X: t[n, p]) -> t[n, p] { + let Z : t[n, p]; + for i = 0 to n { + let sum : t = 0; + for j = 0 to p { + sum += X[i, j]; + } + for j = 0 to p { + Z[i, j] = sum / (p as t); + } + } + return Z; +} +fn BatchNorm<t: number, n:usize, p:usize>(X: t[n, p], W: t[n], B: t[n], M: t[n], V: t[n], E: t) -> t[n, p] { + let Z : t[n, p]; + for i = 0 to n { + for j = 0 to p { + Z[i, j] = W[i] * (X[i, j] - M[i]) / (sqrt!::<f32>((V[i] + E) as f32) as t) + B[i]; + } + } + return Z; +} +fn Conv2d<t:number, m:usize, n:usize, p:usize, q:usize, r:usize, s:usize, u:usize, v:usize, w:usize, stride1:usize, stride2:usize, padding1:usize, padding2:usize, dilation1:usize, dilation2:usize, output_padding1:usize, output_padding2:usize, groups:usize>(X: t[m, n, p, q], W: t[r, n, s, u]) -> t[m, r, v, w] { + let Z : t[m, r, v, w]; + for i = 0 to m { + for j = 0 to r { + let C: t[v, w]; + for k = 0 to v { + for l = 0 to w { + C[k, l] = 0; + } + } + let gid = (j * groups) / r; + for k = 0 to (n / groups) { + for l = 0 to s { + let real_l : usize = l * dilation1; + 
let ih : usize = (padding1 / stride1 + 1) * stride1 - padding1 + real_l; + let oh : usize = (padding1 / stride1 + 1); + let numh : usize = min!::<usize>(v - oh, (p - ih) / stride1 + 1); + for g = 0 to u { + let real_g : usize = g * dilation2; + let iw : usize = (padding2 / stride2 + 1) * stride2 - padding2 + real_g; + let ow : usize = (padding2 / stride2 + 1); + let numw : usize = min!::<usize>(w - ow, (q - iw) / stride2 + 1); + for iterh = 0 to numh { + for iterw = 0 to numw { + C[output_padding1 + oh + iterh, output_padding2 + ow + iterw] += X[i, (gid * n) / groups + k, ih + iterh * stride1, iw + iterw * stride2] * W[j, k, l, g]; + } + } + } + } + } + for k = 0 to v { + for l = 0 to w { + Z[i, j, k, l] = C[k, l]; + } + } + } + } + return Z; +} + +fn BatchMatMul<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q], Y: t[n, q, r]) -> t[n, p, r] { + let Z : t[n, p, r]; + for i = 0 to n { + for j = 0 to p { + for k = 0 to r { + Z[i, j, k] = 0; + for l = 0 to q { + Z[i, j, k] += X[i, j, l] * Y[i, l, k]; + } + } + } + } + return Z; +} +fn MatMul<t:number, n:usize, p:usize, q:usize>(X: t[n, p], Y: t[p, q]) -> t[n, q] { + let Z : t[n, q]; + for i = 0 to n { + for j = 0 to q { + Z[i, j] = 0; + for k = 0 to p { + Z[i, j] += X[i, k] * Y[k, j]; + } + } + } + return Z; +} + +#[entry] +fn main(arg0_1: f32[32, 1, 3, 3], arg1_1: f32[32], arg2_1: f32[32], arg3_1: f32[32, 1, 3, 3], arg4_1: f32[32], arg5_1: f32[32], arg6_1: f32[16, 32, 1, 1], arg7_1: f32[16], arg8_1: f32[16], arg9_1: f32[96, 16, 1, 1], arg10_1: f32[96], arg11_1: f32[96], arg12_1: f32[96, 1, 3, 3], arg13_1: f32[96], arg14_1: f32[96], arg15_1: f32[24, 96, 1, 1], arg16_1: f32[24], arg17_1: f32[24], arg18_1: f32[144, 24, 1, 1], arg19_1: f32[144], arg20_1: f32[144], arg21_1: f32[144, 1, 3, 3], arg22_1: f32[144], arg23_1: f32[144], arg24_1: f32[24, 144, 1, 1], arg25_1: f32[24], arg26_1: f32[24], arg27_1: f32[144, 24, 1, 1], arg28_1: f32[144], arg29_1: f32[144], arg30_1: f32[144, 1, 3, 3], arg31_1: f32[144], arg32_1: f32[144], arg33_1: f32[32, 144, 1, 1], arg34_1: f32[32], arg35_1: f32[32], arg36_1: f32[192, 32, 1, 1], arg37_1: f32[192], arg38_1: f32[192], arg39_1: f32[192, 1, 3, 3], arg40_1: f32[192], arg41_1: f32[192], arg42_1: f32[32, 192, 1, 1], arg43_1: f32[32], arg44_1: f32[32], arg45_1: f32[192, 32, 1, 1], arg46_1: f32[192], arg47_1: f32[192], arg48_1: f32[192, 1, 3, 3], arg49_1: f32[192], arg50_1: f32[192], arg51_1: f32[32, 192, 1, 1], arg52_1: f32[32], arg53_1: f32[32], arg54_1: f32[192, 32, 1, 1], arg55_1: f32[192], arg56_1: f32[192], arg57_1: f32[192, 1, 3, 3], arg58_1: f32[192], arg59_1: f32[192], arg60_1: f32[64, 192, 1, 1], arg61_1: f32[64], arg62_1: f32[64], arg63_1: f32[384, 64, 1, 1], arg64_1: f32[384], arg65_1: f32[384], arg66_1: f32[384, 1, 3, 3], arg67_1: f32[384], arg68_1: f32[384], arg69_1: f32[64, 384, 1, 1], arg70_1: f32[64], arg71_1: f32[64], arg72_1: f32[384, 64, 1, 1], arg73_1: f32[384], arg74_1: f32[384], arg75_1: f32[384, 1, 3, 3], arg76_1: f32[384], arg77_1: f32[384], arg78_1: f32[64, 384, 1, 1], arg79_1: f32[64], arg80_1: f32[64], arg81_1: f32[384, 64, 1, 1], arg82_1: f32[384], arg83_1: f32[384], arg84_1: f32[384, 1, 3, 3], arg85_1: f32[384], arg86_1: f32[384], arg87_1: f32[64, 384, 1, 1], arg88_1: f32[64], arg89_1: f32[64], arg90_1: f32[384, 64, 1, 1], arg91_1: f32[384], arg92_1: f32[384], arg93_1: f32[384, 1, 3, 3], arg94_1: f32[384], arg95_1: f32[384], arg96_1: f32[96, 384, 1, 1], arg97_1: f32[96], arg98_1: f32[96], arg99_1: f32[576, 96, 1, 1], arg100_1: f32[576], arg101_1: f32[576], arg102_1: f32[576, 1, 
3, 3], arg103_1: f32[576], arg104_1: f32[576], arg105_1: f32[96, 576, 1, 1], arg106_1: f32[96], arg107_1: f32[96], arg108_1: f32[576, 96, 1, 1], arg109_1: f32[576], arg110_1: f32[576], arg111_1: f32[576, 1, 3, 3], arg112_1: f32[576], arg113_1: f32[576], arg114_1: f32[96, 576, 1, 1], arg115_1: f32[96], arg116_1: f32[96], arg117_1: f32[576, 96, 1, 1], arg118_1: f32[576], arg119_1: f32[576], arg120_1: f32[576, 1, 3, 3], arg121_1: f32[576], arg122_1: f32[576], arg123_1: f32[160, 576, 1, 1], arg124_1: f32[160], arg125_1: f32[160], arg126_1: f32[960, 160, 1, 1], arg127_1: f32[960], arg128_1: f32[960], arg129_1: f32[960, 1, 3, 3], arg130_1: f32[960], arg131_1: f32[960], arg132_1: f32[160, 960, 1, 1], arg133_1: f32[160], arg134_1: f32[160], arg135_1: f32[960, 160, 1, 1], arg136_1: f32[960], arg137_1: f32[960], arg138_1: f32[960, 1, 3, 3], arg139_1: f32[960], arg140_1: f32[960], arg141_1: f32[160, 960, 1, 1], arg142_1: f32[160], arg143_1: f32[160], arg144_1: f32[960, 160, 1, 1], arg145_1: f32[960], arg146_1: f32[960], arg147_1: f32[960, 1, 3, 3], arg148_1: f32[960], arg149_1: f32[960], arg150_1: f32[320, 960, 1, 1], arg151_1: f32[320], arg152_1: f32[320], arg153_1: f32[1280, 320, 1, 1], arg154_1: f32[1280], arg155_1: f32[1280], arg156_1: f32[10, 1280], arg157_1: f32[10], arg158_1: f32[32], arg159_1: f32[32], arg160_1: i64[], arg161_1: f32[32], arg162_1: f32[32], arg163_1: i64[], arg164_1: f32[16], arg165_1: f32[16], arg166_1: i64[], arg167_1: f32[96], arg168_1: f32[96], arg169_1: i64[], arg170_1: f32[96], arg171_1: f32[96], arg172_1: i64[], arg173_1: f32[24], arg174_1: f32[24], arg175_1: i64[], arg176_1: f32[144], arg177_1: f32[144], arg178_1: i64[], arg179_1: f32[144], arg180_1: f32[144], arg181_1: i64[], arg182_1: f32[24], arg183_1: f32[24], arg184_1: i64[], arg185_1: f32[144], arg186_1: f32[144], arg187_1: i64[], arg188_1: f32[144], arg189_1: f32[144], arg190_1: i64[], arg191_1: f32[32], arg192_1: f32[32], arg193_1: i64[], arg194_1: f32[192], arg195_1: f32[192], arg196_1: i64[], arg197_1: f32[192], arg198_1: f32[192], arg199_1: i64[], arg200_1: f32[32], arg201_1: f32[32], arg202_1: i64[], arg203_1: f32[192], arg204_1: f32[192], arg205_1: i64[], arg206_1: f32[192], arg207_1: f32[192], arg208_1: i64[], arg209_1: f32[32], arg210_1: f32[32], arg211_1: i64[], arg212_1: f32[192], arg213_1: f32[192], arg214_1: i64[], arg215_1: f32[192], arg216_1: f32[192], arg217_1: i64[], arg218_1: f32[64], arg219_1: f32[64], arg220_1: i64[], arg221_1: f32[384], arg222_1: f32[384], arg223_1: i64[], arg224_1: f32[384], arg225_1: f32[384], arg226_1: i64[], arg227_1: f32[64], arg228_1: f32[64], arg229_1: i64[], arg230_1: f32[384], arg231_1: f32[384], arg232_1: i64[], arg233_1: f32[384], arg234_1: f32[384], arg235_1: i64[], arg236_1: f32[64], arg237_1: f32[64], arg238_1: i64[], arg239_1: f32[384], arg240_1: f32[384], arg241_1: i64[], arg242_1: f32[384], arg243_1: f32[384], arg244_1: i64[], arg245_1: f32[64], arg246_1: f32[64], arg247_1: i64[], arg248_1: f32[384], arg249_1: f32[384], arg250_1: i64[], arg251_1: f32[384], arg252_1: f32[384], arg253_1: i64[], arg254_1: f32[96], arg255_1: f32[96], arg256_1: i64[], arg257_1: f32[576], arg258_1: f32[576], arg259_1: i64[], arg260_1: f32[576], arg261_1: f32[576], arg262_1: i64[], arg263_1: f32[96], arg264_1: f32[96], arg265_1: i64[], arg266_1: f32[576], arg267_1: f32[576], arg268_1: i64[], arg269_1: f32[576], arg270_1: f32[576], arg271_1: i64[], arg272_1: f32[96], arg273_1: f32[96], arg274_1: i64[], arg275_1: f32[576], arg276_1: f32[576], arg277_1: i64[], arg278_1: f32[576], 
arg279_1: f32[576], arg280_1: i64[], arg281_1: f32[160], arg282_1: f32[160], arg283_1: i64[], arg284_1: f32[960], arg285_1: f32[960], arg286_1: i64[], arg287_1: f32[960], arg288_1: f32[960], arg289_1: i64[], arg290_1: f32[160], arg291_1: f32[160], arg292_1: i64[], arg293_1: f32[960], arg294_1: f32[960], arg295_1: i64[], arg296_1: f32[960], arg297_1: f32[960], arg298_1: i64[], arg299_1: f32[160], arg300_1: f32[160], arg301_1: i64[], arg302_1: f32[960], arg303_1: f32[960], arg304_1: i64[], arg305_1: f32[960], arg306_1: f32[960], arg307_1: i64[], arg308_1: f32[320], arg309_1: f32[320], arg310_1: i64[], arg311_1: f32[1280], arg312_1: f32[1280], arg313_1: i64[], arg314_1: f32[1, 1, 28, 28]) -> (f32[1, 10]) { +let convolution = Conv2d::<f32, 1, 1, 28, 28, 32, 3, 3, 15, 15, 2, 2, 1, 1, 1, 1, 0, 0, 1>(arg314_1, arg0_1); +let getitem_0 = Flat4d::<f32, 1, 32, 14, 14>(getitem); +let hardtanh_0 = HardTanh::<f32, 6272>(getitem_0, 0, 6); +let hardtanh = Unflat4d::<f32, 1, 32, 14, 14>(hardtanh_0); +let convolution_1 = Conv2d::<f32, 1, 32, 14, 14, 32, 3, 3, 15, 15, 1, 1, 1, 1, 1, 1, 0, 0, 32>(hardtanh, arg3_1); +let getitem_1_0 = Flat4d::<f32, 1, 32, 14, 14>(getitem_1); +let hardtanh_1_0 = HardTanh::<f32, 6272>(getitem_1_0, 0, 6); +let hardtanh_1 = Unflat4d::<f32, 1, 32, 14, 14>(hardtanh_1_0); +let convolution_2 = Conv2d::<f32, 1, 32, 14, 14, 16, 1, 1, 15, 15, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_1, arg6_1); +let convolution_3 = Conv2d::<f32, 1, 16, 14, 14, 96, 1, 1, 15, 15, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_2, arg9_1); +let getitem_3_0 = Flat4d::<f32, 1, 96, 14, 14>(getitem_3); +let hardtanh_2_0 = HardTanh::<f32, 18816>(getitem_3_0, 0, 6); +let hardtanh_2 = Unflat4d::<f32, 1, 96, 14, 14>(hardtanh_2_0); +let convolution_4 = Conv2d::<f32, 1, 96, 14, 14, 96, 3, 3, 8, 8, 2, 2, 1, 1, 1, 1, 0, 0, 96>(hardtanh_2, arg12_1); +let getitem_4_0 = Flat4d::<f32, 1, 96, 7, 7>(getitem_4); +let hardtanh_3_0 = HardTanh::<f32, 4704>(getitem_4_0, 0, 6); +let hardtanh_3 = Unflat4d::<f32, 1, 96, 7, 7>(hardtanh_3_0); +let convolution_5 = Conv2d::<f32, 1, 96, 7, 7, 24, 1, 1, 8, 8, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_3, arg15_1); +let convolution_6 = Conv2d::<f32, 1, 24, 7, 7, 144, 1, 1, 8, 8, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_5, arg18_1); +let getitem_6_0 = Flat4d::<f32, 1, 144, 7, 7>(getitem_6); +let hardtanh_4_0 = HardTanh::<f32, 7056>(getitem_6_0, 0, 6); +let hardtanh_4 = Unflat4d::<f32, 1, 144, 7, 7>(hardtanh_4_0); +let convolution_7 = Conv2d::<f32, 1, 144, 7, 7, 144, 3, 3, 8, 8, 1, 1, 1, 1, 1, 1, 0, 0, 144>(hardtanh_4, arg21_1); +let getitem_7_0 = Flat4d::<f32, 1, 144, 7, 7>(getitem_7); +let hardtanh_5_0 = HardTanh::<f32, 7056>(getitem_7_0, 0, 6); +let hardtanh_5 = Unflat4d::<f32, 1, 144, 7, 7>(hardtanh_5_0); +let convolution_8 = Conv2d::<f32, 1, 144, 7, 7, 24, 1, 1, 8, 8, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_5, arg24_1); +let getitem_5_0 = Flat4d::<f32, 1, 24, 7, 7>(getitem_5); +let getitem_8_0 = Flat4d::<f32, 1, 24, 7, 7>(getitem_8); +let add_0 = Add::<f32, 1176>(getitem_5_0, getitem_8_0); +let add = Unflat4d::<f32, 1, 24, 7, 7>(add_0); +let convolution_9 = Conv2d::<f32, 1, 24, 7, 7, 144, 1, 1, 8, 8, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add, arg27_1); +let getitem_9_0 = Flat4d::<f32, 1, 144, 7, 7>(getitem_9); +let hardtanh_6_0 = HardTanh::<f32, 7056>(getitem_9_0, 0, 6); +let hardtanh_6 = Unflat4d::<f32, 1, 144, 7, 7>(hardtanh_6_0); +let convolution_10 = Conv2d::<f32, 1, 144, 7, 7, 144, 3, 3, 4, 4, 2, 2, 1, 1, 1, 1, 0, 0, 144>(hardtanh_6, arg30_1); +let getitem_10_0 = Flat4d::<f32, 1, 144, 4, 4>(getitem_10); +let hardtanh_7_0 = 
HardTanh::<f32, 2304>(getitem_10_0, 0, 6); +let hardtanh_7 = Unflat4d::<f32, 1, 144, 4, 4>(hardtanh_7_0); +let convolution_11 = Conv2d::<f32, 1, 144, 4, 4, 32, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_7, arg33_1); +let convolution_12 = Conv2d::<f32, 1, 32, 4, 4, 192, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_11, arg36_1); +let getitem_12_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_12); +let hardtanh_8_0 = HardTanh::<f32, 3072>(getitem_12_0, 0, 6); +let hardtanh_8 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_8_0); +let convolution_13 = Conv2d::<f32, 1, 192, 4, 4, 192, 3, 3, 5, 5, 1, 1, 1, 1, 1, 1, 0, 0, 192>(hardtanh_8, arg39_1); +let getitem_13_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_13); +let hardtanh_9_0 = HardTanh::<f32, 3072>(getitem_13_0, 0, 6); +let hardtanh_9 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_9_0); +let convolution_14 = Conv2d::<f32, 1, 192, 4, 4, 32, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_9, arg42_1); +let getitem_11_0 = Flat4d::<f32, 1, 32, 4, 4>(getitem_11); +let getitem_14_0 = Flat4d::<f32, 1, 32, 4, 4>(getitem_14); +let add_1_0 = Add::<f32, 512>(getitem_11_0, getitem_14_0); +let add_1 = Unflat4d::<f32, 1, 32, 4, 4>(add_1_0); +let convolution_15 = Conv2d::<f32, 1, 32, 4, 4, 192, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_1, arg45_1); +let getitem_15_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_15); +let hardtanh_10_0 = HardTanh::<f32, 3072>(getitem_15_0, 0, 6); +let hardtanh_10 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_10_0); +let convolution_16 = Conv2d::<f32, 1, 192, 4, 4, 192, 3, 3, 5, 5, 1, 1, 1, 1, 1, 1, 0, 0, 192>(hardtanh_10, arg48_1); +let getitem_16_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_16); +let hardtanh_11_0 = HardTanh::<f32, 3072>(getitem_16_0, 0, 6); +let hardtanh_11 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_11_0); +let convolution_17 = Conv2d::<f32, 1, 192, 4, 4, 32, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_11, arg51_1); +let add_1_0 = Flat4d::<f32, 1, 32, 4, 4>(add_1); +let getitem_17_0 = Flat4d::<f32, 1, 32, 4, 4>(getitem_17); +let add_2_0 = Add::<f32, 512>(add_1_0, getitem_17_0); +let add_2 = Unflat4d::<f32, 1, 32, 4, 4>(add_2_0); +let convolution_18 = Conv2d::<f32, 1, 32, 4, 4, 192, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_2, arg54_1); +let getitem_18_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_18); +let hardtanh_12_0 = HardTanh::<f32, 3072>(getitem_18_0, 0, 6); +let hardtanh_12 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_12_0); +let convolution_19 = Conv2d::<f32, 1, 192, 4, 4, 192, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 192>(hardtanh_12, arg57_1); +let getitem_19_0 = Flat4d::<f32, 1, 192, 2, 2>(getitem_19); +let hardtanh_13_0 = HardTanh::<f32, 768>(getitem_19_0, 0, 6); +let hardtanh_13 = Unflat4d::<f32, 1, 192, 2, 2>(hardtanh_13_0); +let convolution_20 = Conv2d::<f32, 1, 192, 2, 2, 64, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_13, arg60_1); +let convolution_21 = Conv2d::<f32, 1, 64, 2, 2, 384, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_20, arg63_1); +let getitem_21_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_21); +let hardtanh_14_0 = HardTanh::<f32, 1536>(getitem_21_0, 0, 6); +let hardtanh_14 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_14_0); +let convolution_22 = Conv2d::<f32, 1, 384, 2, 2, 384, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 384>(hardtanh_14, arg66_1); +let getitem_22_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_22); +let hardtanh_15_0 = HardTanh::<f32, 1536>(getitem_22_0, 0, 6); +let hardtanh_15 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_15_0); +let convolution_23 = Conv2d::<f32, 1, 384, 2, 2, 64, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 
1>(hardtanh_15, arg69_1); +let getitem_20_0 = Flat4d::<f32, 1, 64, 2, 2>(getitem_20); +let getitem_23_0 = Flat4d::<f32, 1, 64, 2, 2>(getitem_23); +let add_3_0 = Add::<f32, 256>(getitem_20_0, getitem_23_0); +let add_3 = Unflat4d::<f32, 1, 64, 2, 2>(add_3_0); +let convolution_24 = Conv2d::<f32, 1, 64, 2, 2, 384, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_3, arg72_1); +let getitem_24_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_24); +let hardtanh_16_0 = HardTanh::<f32, 1536>(getitem_24_0, 0, 6); +let hardtanh_16 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_16_0); +let convolution_25 = Conv2d::<f32, 1, 384, 2, 2, 384, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 384>(hardtanh_16, arg75_1); +let getitem_25_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_25); +let hardtanh_17_0 = HardTanh::<f32, 1536>(getitem_25_0, 0, 6); +let hardtanh_17 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_17_0); +let convolution_26 = Conv2d::<f32, 1, 384, 2, 2, 64, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_17, arg78_1); +let add_3_0 = Flat4d::<f32, 1, 64, 2, 2>(add_3); +let getitem_26_0 = Flat4d::<f32, 1, 64, 2, 2>(getitem_26); +let add_4_0 = Add::<f32, 256>(add_3_0, getitem_26_0); +let add_4 = Unflat4d::<f32, 1, 64, 2, 2>(add_4_0); +let convolution_27 = Conv2d::<f32, 1, 64, 2, 2, 384, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_4, arg81_1); +let getitem_27_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_27); +let hardtanh_18_0 = HardTanh::<f32, 1536>(getitem_27_0, 0, 6); +let hardtanh_18 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_18_0); +let convolution_28 = Conv2d::<f32, 1, 384, 2, 2, 384, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 384>(hardtanh_18, arg84_1); +let getitem_28_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_28); +let hardtanh_19_0 = HardTanh::<f32, 1536>(getitem_28_0, 0, 6); +let hardtanh_19 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_19_0); +let convolution_29 = Conv2d::<f32, 1, 384, 2, 2, 64, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_19, arg87_1); +let add_4_0 = Flat4d::<f32, 1, 64, 2, 2>(add_4); +let getitem_29_0 = Flat4d::<f32, 1, 64, 2, 2>(getitem_29); +let add_5_0 = Add::<f32, 256>(add_4_0, getitem_29_0); +let add_5 = Unflat4d::<f32, 1, 64, 2, 2>(add_5_0); +let convolution_30 = Conv2d::<f32, 1, 64, 2, 2, 384, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_5, arg90_1); +let getitem_30_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_30); +let hardtanh_20_0 = HardTanh::<f32, 1536>(getitem_30_0, 0, 6); +let hardtanh_20 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_20_0); +let convolution_31 = Conv2d::<f32, 1, 384, 2, 2, 384, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 384>(hardtanh_20, arg93_1); +let getitem_31_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_31); +let hardtanh_21_0 = HardTanh::<f32, 1536>(getitem_31_0, 0, 6); +let hardtanh_21 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_21_0); +let convolution_32 = Conv2d::<f32, 1, 384, 2, 2, 96, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_21, arg96_1); +let convolution_33 = Conv2d::<f32, 1, 96, 2, 2, 576, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_32, arg99_1); +let getitem_33_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_33); +let hardtanh_22_0 = HardTanh::<f32, 2304>(getitem_33_0, 0, 6); +let hardtanh_22 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_22_0); +let convolution_34 = Conv2d::<f32, 1, 576, 2, 2, 576, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 576>(hardtanh_22, arg102_1); +let getitem_34_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_34); +let hardtanh_23_0 = HardTanh::<f32, 2304>(getitem_34_0, 0, 6); +let hardtanh_23 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_23_0); +let convolution_35 = Conv2d::<f32, 1, 576, 2, 2, 96, 1, 1, 3, 3, 1, 1, 
0, 0, 1, 1, 0, 0, 1>(hardtanh_23, arg105_1); +let getitem_32_0 = Flat4d::<f32, 1, 96, 2, 2>(getitem_32); +let getitem_35_0 = Flat4d::<f32, 1, 96, 2, 2>(getitem_35); +let add_6_0 = Add::<f32, 384>(getitem_32_0, getitem_35_0); +let add_6 = Unflat4d::<f32, 1, 96, 2, 2>(add_6_0); +let convolution_36 = Conv2d::<f32, 1, 96, 2, 2, 576, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_6, arg108_1); +let getitem_36_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_36); +let hardtanh_24_0 = HardTanh::<f32, 2304>(getitem_36_0, 0, 6); +let hardtanh_24 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_24_0); +let convolution_37 = Conv2d::<f32, 1, 576, 2, 2, 576, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 576>(hardtanh_24, arg111_1); +let getitem_37_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_37); +let hardtanh_25_0 = HardTanh::<f32, 2304>(getitem_37_0, 0, 6); +let hardtanh_25 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_25_0); +let convolution_38 = Conv2d::<f32, 1, 576, 2, 2, 96, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_25, arg114_1); +let add_6_0 = Flat4d::<f32, 1, 96, 2, 2>(add_6); +let getitem_38_0 = Flat4d::<f32, 1, 96, 2, 2>(getitem_38); +let add_7_0 = Add::<f32, 384>(add_6_0, getitem_38_0); +let add_7 = Unflat4d::<f32, 1, 96, 2, 2>(add_7_0); +let convolution_39 = Conv2d::<f32, 1, 96, 2, 2, 576, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_7, arg117_1); +let getitem_39_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_39); +let hardtanh_26_0 = HardTanh::<f32, 2304>(getitem_39_0, 0, 6); +let hardtanh_26 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_26_0); +let convolution_40 = Conv2d::<f32, 1, 576, 2, 2, 576, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 576>(hardtanh_26, arg120_1); +let getitem_40_0 = Flat4d::<f32, 1, 576, 1, 1>(getitem_40); +let hardtanh_27_0 = HardTanh::<f32, 576>(getitem_40_0, 0, 6); +let hardtanh_27 = Unflat4d::<f32, 1, 576, 1, 1>(hardtanh_27_0); +let convolution_41 = Conv2d::<f32, 1, 576, 1, 1, 160, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_27, arg123_1); +let convolution_42 = Conv2d::<f32, 1, 160, 1, 1, 960, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_41, arg126_1); +let getitem_42_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_42); +let hardtanh_28_0 = HardTanh::<f32, 960>(getitem_42_0, 0, 6); +let hardtanh_28 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_28_0); +let convolution_43 = Conv2d::<f32, 1, 960, 1, 1, 960, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 960>(hardtanh_28, arg129_1); +let getitem_43_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_43); +let hardtanh_29_0 = HardTanh::<f32, 960>(getitem_43_0, 0, 6); +let hardtanh_29 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_29_0); +let convolution_44 = Conv2d::<f32, 1, 960, 1, 1, 160, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_29, arg132_1); +let getitem_41_0 = Flat4d::<f32, 1, 160, 1, 1>(getitem_41); +let getitem_44_0 = Flat4d::<f32, 1, 160, 1, 1>(getitem_44); +let add_8_0 = Add::<f32, 160>(getitem_41_0, getitem_44_0); +let add_8 = Unflat4d::<f32, 1, 160, 1, 1>(add_8_0); +let convolution_45 = Conv2d::<f32, 1, 160, 1, 1, 960, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_8, arg135_1); +let getitem_45_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_45); +let hardtanh_30_0 = HardTanh::<f32, 960>(getitem_45_0, 0, 6); +let hardtanh_30 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_30_0); +let convolution_46 = Conv2d::<f32, 1, 960, 1, 1, 960, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 960>(hardtanh_30, arg138_1); +let getitem_46_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_46); +let hardtanh_31_0 = HardTanh::<f32, 960>(getitem_46_0, 0, 6); +let hardtanh_31 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_31_0); +let convolution_47 = 
Conv2d::<f32, 1, 960, 1, 1, 160, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_31, arg141_1); +let add_8_0 = Flat4d::<f32, 1, 160, 1, 1>(add_8); +let getitem_47_0 = Flat4d::<f32, 1, 160, 1, 1>(getitem_47); +let add_9_0 = Add::<f32, 160>(add_8_0, getitem_47_0); +let add_9 = Unflat4d::<f32, 1, 160, 1, 1>(add_9_0); +let convolution_48 = Conv2d::<f32, 1, 160, 1, 1, 960, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_9, arg144_1); +let getitem_48_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_48); +let hardtanh_32_0 = HardTanh::<f32, 960>(getitem_48_0, 0, 6); +let hardtanh_32 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_32_0); +let convolution_49 = Conv2d::<f32, 1, 960, 1, 1, 960, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 960>(hardtanh_32, arg147_1); +let getitem_49_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_49); +let hardtanh_33_0 = HardTanh::<f32, 960>(getitem_49_0, 0, 6); +let hardtanh_33 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_33_0); +let convolution_50 = Conv2d::<f32, 1, 960, 1, 1, 320, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_33, arg150_1); +let convolution_51 = Conv2d::<f32, 1, 320, 1, 1, 1280, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_50, arg153_1); +let getitem_51_0 = Flat4d::<f32, 1, 1280, 1, 1>(getitem_51); +let hardtanh_34_0 = HardTanh::<f32, 1280>(getitem_51_0, 0, 6); +let hardtanh_34 = Unflat4d::<f32, 1, 1280, 1, 1>(hardtanh_34_0); +let hardtanh_34_1_0 = Permute4d01::<f32, 1, 1280, 1, 1>(hardtanh_34); +let hardtanh_34_1_1 = Permute4d12::<f32, 1280, 1, 1, 1>(hardtanh_34_1_0); +let hardtanh_34_1_1_0 = Flat4d::<f32, 1280, 1, 1, 1>(hardtanh_34_1_1); +let hardtanh_34_1_1_1 = Unflat2d::<f32, 1280, 1>(hardtanh_34_1_1_0); +let mean_0 = MeanKD::<f32, 1280, 1>(hardtanh_34_1_1_1); +let mean_1 = Flat2d::<f32, 1, 1280, 1, 1>(mean_0); +let mean = Unflat4d::<f32, 1, 1280, 1, 1>(mean_1); +let clone_0 = Flat2d::<f32, 1, 1280>(clone); +let clone_1 = Unflat2d::<f32, 1, 1280>(clone_0); +let clone_1_0 = Flat2d::<f32, 1, 10>(clone_1); +let arg157_1_0 = Flat1dI0::<f32, 1, 10>(arg157_1); +let arg157_1_0_0 = Flat2d::<f32, 1, 10>(arg157_1_0); +let addmm_0 = Add::<f32, 10>(arg157_1_0_0, clone_1_0); +let addmm = Unflat2d::<f32, 1, 10>(addmm_0); +return (addmm); +} -- GitLab From 87b27f3463a185ef518fae03416a3927c0f07dad Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 08:31:42 -0600 Subject: [PATCH 054/109] objs --- test.jn | 752 -------------------------------------------------------- 1 file changed, 752 deletions(-) delete mode 100644 test.jn diff --git a/test.jn b/test.jn deleted file mode 100644 index 5e9eb513..00000000 --- a/test.jn +++ /dev/null @@ -1,752 +0,0 @@ -fn Flat1d<t:number, n:usize>(X: t[n]) -> t[n] { - return X; -} -fn Flat1dB0<t:number, n:usize>(X: t[1]) -> t[n] { - let Z : t[n]; - for i = 0 to n { - Z[i] = X[0]; - } - return Z; -} -fn Flat1dI0<t:number, n:usize, p:usize>(X: t[p]) -> t[n * p] { - let Z : t[n * p]; - for i = 0 to n { - for j = 0 to p { - Z[i * p + j] = X[j]; - } - } - return Z; -} -fn Flat1dI1<t:number, n:usize, p:usize>(X: t[n]) -> t[n * p] { - let Z : t[n * p]; - for i = 0 to n { - for j = 0 to p { - Z[i * p + j] = X[i]; - } - } - return Z; -} -fn Flat2d<t:number, n:usize, p:usize>(X: t[n, p]) -> t[n * p] { - let Z : t[n * p]; - for i = 0 to n { - for j = 0 to p { - Z[i * p + j] = X[i, j]; - } - } - return Z; -} -fn Flat2dB0<t:number, n:usize, p:usize>(X: t[1, p]) -> t[n * p] { - let Z : t[n * p]; - for i = 0 to n { - for j = 0 to p { - Z[i * p + j] = X[0, j]; - } - } - return Z; -} -fn Flat2dB1<t:number, n:usize, p:usize>(X: t[n, 1]) -> t[n 
* p] { - let Z : t[n * p]; - for i = 0 to n { - for j = 0 to p { - Z[i * p + j] = X[i, 0]; - } - } - return Z; -} -fn Flat2dI0<t:number, n:usize, p:usize, q:usize>(X: t[p, q]) -> t[n * p * q] { - let Z : t[n * p * q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[i * p * q + j * q + k] = X[j, k]; - } - } - } - return Z; -} -fn Flat2dI1<t:number, n:usize, p:usize, q:usize>(X: t[n, q]) -> t[n * p * q] { - let Z : t[n * p * q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[i * p * q + j * q + k] = X[i, k]; - } - } - } - return Z; -} -fn Flat2dI2<t:number, n:usize, p:usize, q:usize>(X: t[n, p]) -> t[n * p * q] { - let Z : t[n * p * q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[i * p * q + j * q + k] = X[i, j]; - } - } - } - return Z; -} -fn Flat3d<t:number, n:usize, p:usize, q:usize>(X: t[n, p, q]) -> t[n * p * q] { - let Z : t[n * p * q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[i * p * q + j * q + k] = X[i, j, k]; - } - } - } - return Z; -} -fn Flat3dB0<t:number, n:usize, p:usize, q:usize>(X: t[1, p, q]) -> t[n * p * q] { - let Z : t[n * p * q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[i * p * q + j * q + k] = X[0, j, k]; - } - } - } - return Z; -} -fn Flat3dB1<t:number, n:usize, p:usize, q:usize>(X: t[n, 1, q]) -> t[n * p * q] { - let Z : t[n * p * q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[i * p * q + j * q + k] = X[i, 0, k]; - } - } - } - return Z; -} -fn Flat3dB2<t:number, n:usize, p:usize, q:usize>(X: t[n, p, 1]) -> t[n * p * q] { - let Z : t[n * p * q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[i * p * q + j * q + k] = X[i, j, 0]; - } - } - } - return Z; -} -fn Flat3dI0<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[p, q, r]) -> t[n * p * q * r] { - let Z : t[n * p * q * r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i * p * q * r + j * q * r + k * r + l] = X[j, k, l]; - } - } - } - } - return Z; -} -fn Flat3dI1<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, q, r]) -> t[n * p * q * r] { - let Z : t[n * p * q * r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i * p * q * r + j * q * r + k * r + l] = X[i, k, l]; - } - } - } - } - return Z; -} -fn Flat3dI2<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, r]) -> t[n * p * q * r] { - let Z : t[n * p * q * r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, l]; - } - } - } - } - return Z; -} -fn Flat3dI3<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q]) -> t[n * p * q * r] { - let Z : t[n * p * q * r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, k]; - } - } - } - } - return Z; -} -fn Flat4d<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[n * p * q * r] { - let Z : t[n * p * q * r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, k, l]; - } - } - } - } - return Z; -} -fn Flat4dB0<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[1, p, q, r]) -> t[n * p * q * r] { - let Z : t[n * p * q * r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i * p * q * r + j * q * r + k * r + l] = X[0, j, k, l]; - } - } - } - } - return Z; -} -fn Flat4dB1<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, 1, q, r]) -> t[n * p * q * r] { 
- let Z : t[n * p * q * r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i * p * q * r + j * q * r + k * r + l] = X[i, 0, k, l]; - } - } - } - } - return Z; -} -fn Flat4dB2<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, 1, r]) -> t[n * p * q * r] { - let Z : t[n * p * q * r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, 0, l]; - } - } - } - } - return Z; -} -fn Flat4dB3<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, 1]) -> t[n * p * q * r] { - let Z : t[n * p * q * r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i * p * q * r + j * q * r + k * r + l] = X[i, j, k, 0]; - } - } - } - } - return Z; -} -fn Unflat2d<t:number, n:usize, p:usize>(X: t[n * p]) -> t[n, p] { - let Z : t[n, p]; - for i = 0 to n { - for j = 0 to p { - Z[i, j] = X[i * p + j]; - } - } - return Z; -} -fn Unflat3d<t:number, n:usize, p:usize, q:usize>(X: t[n * p * q]) -> t[n, p, q] { - let Z : t[n, p, q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[i, j, k] = X[i * p * q + j * q + k]; - } - } - } - return Z; -} -fn Unflat4d<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n * p * q * r]) -> t[n, p, q, r] { - let Z : t[n, p, q, r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i, j, k, l] = X[i * p * q * r + j * q * r + k * r + l]; - } - } - } - } - return Z; -} -fn Permute2d<t:number, n:usize, p:usize>(X: t[n, p]) -> t[p, n] { - let Z : t[p, n]; - for i = 0 to n { - for j = 0 to p { - Z[j, i] = X[i, j]; - } - } - return Z; -} -fn Permute3d01<t:number, n:usize, p:usize, q:usize>(X: t[n, p, q]) -> t[p, n, q] { - let Z : t[p, n, q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[j, i, k] = X[i, j, k]; - } - } - } - return Z; -} -fn Permute3d02<t:number, n:usize, p:usize, q:usize>(X: t[n, p, q]) -> t[q, p, n] { - let Z : t[q, p, n]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[k, j, i] = X[i, j, k]; - } - } - } - return Z; -} -fn Permute3d12<t:number, n:usize, p:usize, q:usize>(X: t[n, p, q]) -> t[n, q, p] { - let Z : t[n, q, p]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - Z[i, k, j] = X[i, j, k]; - } - } - } - return Z; -} -fn Permute4d01<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[p, n, q, r] { - let Z : t[p, n, q, r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[j, i, k, l] = X[i, j, k, l]; - } - } - } - } - return Z; -} -fn Permute4d02<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[q, p, n, r] { - let Z : t[q, p, n, r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[k, j, i, l] = X[i, j, k, l]; - } - } - } - } - return Z; -} -fn Permute4d03<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[r, p, q, n] { - let Z : t[r, p, q, n]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[l, j, k, i] = X[i, j, k, l]; - } - } - } - } - return Z; -} -fn Permute4d12<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[n, q, p, r] { - let Z : t[n, q, p, r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i, k, j, l] = X[i, j, k, l]; - } - } - } - } - return Z; -} -fn Permute4d13<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[n, r, q, p] { - let Z : t[n, r, q, p]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - 
Z[i, l, k, j] = X[i, j, k, l]; - } - } - } - } - return Z; -} -fn Permute4d23<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q, r]) -> t[n, p, r, q] { - let Z : t[n, p, r, q]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to q { - for l = 0 to r { - Z[i, j, l, k] = X[i, j, k, l]; - } - } - } - } - return Z; -} -fn Add<t:number, n:usize>(X: t[n], Y: t[n]) -> t[n] { - let Z : t[n]; - for i = 0 to n { - Z[i] = X[i] + Y[i]; - } - return Z; -} -fn AddConst<t:number, n:usize>(X: t[n], Y: t) -> t[n] { - let Z : t[n]; - for i = 0 to n { - Z[i] = X[i] + Y; - } - return Z; -} -fn Mul<t:number, n:usize>(X: t[n], Y: t[n]) -> t[n] { - let Z : t[n]; - for i = 0 to n { - Z[i] = X[i] * Y[i]; - } - return Z; -} -fn MulConst<t:number, n:usize>(X: t[n], Y: t) -> t[n] { - let Z : t[n]; - for i = 0 to n { - Z[i] = X[i] * Y; - } - return Z; -} -fn HardTanh<t:number, n:usize>(X: t[n], Min: t, Max: t) -> t[n] { - let Z : t[n]; - for i = 0 to n { - Z[i] = max!::<t>(Min, min!::<t>(X[i], Max)); - } - return Z; -} -fn Mean<t: number, n:usize, p:usize>(X: t[n, p]) -> t[n] { - let Z : t[n]; - for i = 0 to n { - let sum : t = 0; - for j = 0 to p { - sum += X[i, j]; - } - Z[i] = sum / (p as t); - } - return Z; -} -fn MeanKD<t: number, n:usize, p:usize>(X: t[n, p]) -> t[n, p] { - let Z : t[n, p]; - for i = 0 to n { - let sum : t = 0; - for j = 0 to p { - sum += X[i, j]; - } - for j = 0 to p { - Z[i, j] = sum / (p as t); - } - } - return Z; -} -fn BatchNorm<t: number, n:usize, p:usize>(X: t[n, p], W: t[n], B: t[n], M: t[n], V: t[n], E: t) -> t[n, p] { - let Z : t[n, p]; - for i = 0 to n { - for j = 0 to p { - Z[i, j] = W[i] * (X[i, j] - M[i]) / (sqrt!::<f32>((V[i] + E) as f32) as t) + B[i]; - } - } - return Z; -} -fn Conv2d<t:number, m:usize, n:usize, p:usize, q:usize, r:usize, s:usize, u:usize, v:usize, w:usize, stride1:usize, stride2:usize, padding1:usize, padding2:usize, dilation1:usize, dilation2:usize, output_padding1:usize, output_padding2:usize, groups:usize>(X: t[m, n, p, q], W: t[r, n, s, u]) -> t[m, r, v, w] { - let Z : t[m, r, v, w]; - for i = 0 to m { - for j = 0 to r { - let C: t[v, w]; - for k = 0 to v { - for l = 0 to w { - C[k, l] = 0; - } - } - let gid = (j * groups) / r; - for k = 0 to (n / groups) { - for l = 0 to s { - let real_l : usize = l * dilation1; - let ih : usize = (padding1 / stride1 + 1) * stride1 - padding1 + real_l; - let oh : usize = (padding1 / stride1 + 1); - let numh : usize = min!::<usize>(v - oh, (p - ih) / stride1 + 1); - for g = 0 to u { - let real_g : usize = g * dilation2; - let iw : usize = (padding2 / stride2 + 1) * stride2 - padding2 + real_g; - let ow : usize = (padding2 / stride2 + 1); - let numw : usize = min!::<usize>(w - ow, (q - iw) / stride2 + 1); - for iterh = 0 to numh { - for iterw = 0 to numw { - C[output_padding1 + oh + iterh, output_padding2 + ow + iterw] += X[i, (gid * n) / groups + k, ih + iterh * stride1, iw + iterw * stride2] * W[j, k, l, g]; - } - } - } - } - } - for k = 0 to v { - for l = 0 to w { - Z[i, j, k, l] = C[k, l]; - } - } - } - } - return Z; -} - -fn BatchMatMul<t:number, n:usize, p:usize, q:usize, r:usize>(X: t[n, p, q], Y: t[n, q, r]) -> t[n, p, r] { - let Z : t[n, p, r]; - for i = 0 to n { - for j = 0 to p { - for k = 0 to r { - Z[i, j, k] = 0; - for l = 0 to q { - Z[i, j, k] += X[i, j, l] * Y[i, l, k]; - } - } - } - } - return Z; -} -fn MatMul<t:number, n:usize, p:usize, q:usize>(X: t[n, p], Y: t[p, q]) -> t[n, q] { - let Z : t[n, q]; - for i = 0 to n { - for j = 0 to q { - Z[i, j] = 0; - for k = 0 to p { - Z[i, j] += X[i, k] * Y[k, 
j]; - } - } - } - return Z; -} - -#[entry] -fn main(arg0_1: f32[32, 1, 3, 3], arg1_1: f32[32], arg2_1: f32[32], arg3_1: f32[32, 1, 3, 3], arg4_1: f32[32], arg5_1: f32[32], arg6_1: f32[16, 32, 1, 1], arg7_1: f32[16], arg8_1: f32[16], arg9_1: f32[96, 16, 1, 1], arg10_1: f32[96], arg11_1: f32[96], arg12_1: f32[96, 1, 3, 3], arg13_1: f32[96], arg14_1: f32[96], arg15_1: f32[24, 96, 1, 1], arg16_1: f32[24], arg17_1: f32[24], arg18_1: f32[144, 24, 1, 1], arg19_1: f32[144], arg20_1: f32[144], arg21_1: f32[144, 1, 3, 3], arg22_1: f32[144], arg23_1: f32[144], arg24_1: f32[24, 144, 1, 1], arg25_1: f32[24], arg26_1: f32[24], arg27_1: f32[144, 24, 1, 1], arg28_1: f32[144], arg29_1: f32[144], arg30_1: f32[144, 1, 3, 3], arg31_1: f32[144], arg32_1: f32[144], arg33_1: f32[32, 144, 1, 1], arg34_1: f32[32], arg35_1: f32[32], arg36_1: f32[192, 32, 1, 1], arg37_1: f32[192], arg38_1: f32[192], arg39_1: f32[192, 1, 3, 3], arg40_1: f32[192], arg41_1: f32[192], arg42_1: f32[32, 192, 1, 1], arg43_1: f32[32], arg44_1: f32[32], arg45_1: f32[192, 32, 1, 1], arg46_1: f32[192], arg47_1: f32[192], arg48_1: f32[192, 1, 3, 3], arg49_1: f32[192], arg50_1: f32[192], arg51_1: f32[32, 192, 1, 1], arg52_1: f32[32], arg53_1: f32[32], arg54_1: f32[192, 32, 1, 1], arg55_1: f32[192], arg56_1: f32[192], arg57_1: f32[192, 1, 3, 3], arg58_1: f32[192], arg59_1: f32[192], arg60_1: f32[64, 192, 1, 1], arg61_1: f32[64], arg62_1: f32[64], arg63_1: f32[384, 64, 1, 1], arg64_1: f32[384], arg65_1: f32[384], arg66_1: f32[384, 1, 3, 3], arg67_1: f32[384], arg68_1: f32[384], arg69_1: f32[64, 384, 1, 1], arg70_1: f32[64], arg71_1: f32[64], arg72_1: f32[384, 64, 1, 1], arg73_1: f32[384], arg74_1: f32[384], arg75_1: f32[384, 1, 3, 3], arg76_1: f32[384], arg77_1: f32[384], arg78_1: f32[64, 384, 1, 1], arg79_1: f32[64], arg80_1: f32[64], arg81_1: f32[384, 64, 1, 1], arg82_1: f32[384], arg83_1: f32[384], arg84_1: f32[384, 1, 3, 3], arg85_1: f32[384], arg86_1: f32[384], arg87_1: f32[64, 384, 1, 1], arg88_1: f32[64], arg89_1: f32[64], arg90_1: f32[384, 64, 1, 1], arg91_1: f32[384], arg92_1: f32[384], arg93_1: f32[384, 1, 3, 3], arg94_1: f32[384], arg95_1: f32[384], arg96_1: f32[96, 384, 1, 1], arg97_1: f32[96], arg98_1: f32[96], arg99_1: f32[576, 96, 1, 1], arg100_1: f32[576], arg101_1: f32[576], arg102_1: f32[576, 1, 3, 3], arg103_1: f32[576], arg104_1: f32[576], arg105_1: f32[96, 576, 1, 1], arg106_1: f32[96], arg107_1: f32[96], arg108_1: f32[576, 96, 1, 1], arg109_1: f32[576], arg110_1: f32[576], arg111_1: f32[576, 1, 3, 3], arg112_1: f32[576], arg113_1: f32[576], arg114_1: f32[96, 576, 1, 1], arg115_1: f32[96], arg116_1: f32[96], arg117_1: f32[576, 96, 1, 1], arg118_1: f32[576], arg119_1: f32[576], arg120_1: f32[576, 1, 3, 3], arg121_1: f32[576], arg122_1: f32[576], arg123_1: f32[160, 576, 1, 1], arg124_1: f32[160], arg125_1: f32[160], arg126_1: f32[960, 160, 1, 1], arg127_1: f32[960], arg128_1: f32[960], arg129_1: f32[960, 1, 3, 3], arg130_1: f32[960], arg131_1: f32[960], arg132_1: f32[160, 960, 1, 1], arg133_1: f32[160], arg134_1: f32[160], arg135_1: f32[960, 160, 1, 1], arg136_1: f32[960], arg137_1: f32[960], arg138_1: f32[960, 1, 3, 3], arg139_1: f32[960], arg140_1: f32[960], arg141_1: f32[160, 960, 1, 1], arg142_1: f32[160], arg143_1: f32[160], arg144_1: f32[960, 160, 1, 1], arg145_1: f32[960], arg146_1: f32[960], arg147_1: f32[960, 1, 3, 3], arg148_1: f32[960], arg149_1: f32[960], arg150_1: f32[320, 960, 1, 1], arg151_1: f32[320], arg152_1: f32[320], arg153_1: f32[1280, 320, 1, 1], arg154_1: f32[1280], arg155_1: f32[1280], arg156_1: f32[10, 
1280], arg157_1: f32[10], arg158_1: f32[32], arg159_1: f32[32], arg160_1: i64[], arg161_1: f32[32], arg162_1: f32[32], arg163_1: i64[], arg164_1: f32[16], arg165_1: f32[16], arg166_1: i64[], arg167_1: f32[96], arg168_1: f32[96], arg169_1: i64[], arg170_1: f32[96], arg171_1: f32[96], arg172_1: i64[], arg173_1: f32[24], arg174_1: f32[24], arg175_1: i64[], arg176_1: f32[144], arg177_1: f32[144], arg178_1: i64[], arg179_1: f32[144], arg180_1: f32[144], arg181_1: i64[], arg182_1: f32[24], arg183_1: f32[24], arg184_1: i64[], arg185_1: f32[144], arg186_1: f32[144], arg187_1: i64[], arg188_1: f32[144], arg189_1: f32[144], arg190_1: i64[], arg191_1: f32[32], arg192_1: f32[32], arg193_1: i64[], arg194_1: f32[192], arg195_1: f32[192], arg196_1: i64[], arg197_1: f32[192], arg198_1: f32[192], arg199_1: i64[], arg200_1: f32[32], arg201_1: f32[32], arg202_1: i64[], arg203_1: f32[192], arg204_1: f32[192], arg205_1: i64[], arg206_1: f32[192], arg207_1: f32[192], arg208_1: i64[], arg209_1: f32[32], arg210_1: f32[32], arg211_1: i64[], arg212_1: f32[192], arg213_1: f32[192], arg214_1: i64[], arg215_1: f32[192], arg216_1: f32[192], arg217_1: i64[], arg218_1: f32[64], arg219_1: f32[64], arg220_1: i64[], arg221_1: f32[384], arg222_1: f32[384], arg223_1: i64[], arg224_1: f32[384], arg225_1: f32[384], arg226_1: i64[], arg227_1: f32[64], arg228_1: f32[64], arg229_1: i64[], arg230_1: f32[384], arg231_1: f32[384], arg232_1: i64[], arg233_1: f32[384], arg234_1: f32[384], arg235_1: i64[], arg236_1: f32[64], arg237_1: f32[64], arg238_1: i64[], arg239_1: f32[384], arg240_1: f32[384], arg241_1: i64[], arg242_1: f32[384], arg243_1: f32[384], arg244_1: i64[], arg245_1: f32[64], arg246_1: f32[64], arg247_1: i64[], arg248_1: f32[384], arg249_1: f32[384], arg250_1: i64[], arg251_1: f32[384], arg252_1: f32[384], arg253_1: i64[], arg254_1: f32[96], arg255_1: f32[96], arg256_1: i64[], arg257_1: f32[576], arg258_1: f32[576], arg259_1: i64[], arg260_1: f32[576], arg261_1: f32[576], arg262_1: i64[], arg263_1: f32[96], arg264_1: f32[96], arg265_1: i64[], arg266_1: f32[576], arg267_1: f32[576], arg268_1: i64[], arg269_1: f32[576], arg270_1: f32[576], arg271_1: i64[], arg272_1: f32[96], arg273_1: f32[96], arg274_1: i64[], arg275_1: f32[576], arg276_1: f32[576], arg277_1: i64[], arg278_1: f32[576], arg279_1: f32[576], arg280_1: i64[], arg281_1: f32[160], arg282_1: f32[160], arg283_1: i64[], arg284_1: f32[960], arg285_1: f32[960], arg286_1: i64[], arg287_1: f32[960], arg288_1: f32[960], arg289_1: i64[], arg290_1: f32[160], arg291_1: f32[160], arg292_1: i64[], arg293_1: f32[960], arg294_1: f32[960], arg295_1: i64[], arg296_1: f32[960], arg297_1: f32[960], arg298_1: i64[], arg299_1: f32[160], arg300_1: f32[160], arg301_1: i64[], arg302_1: f32[960], arg303_1: f32[960], arg304_1: i64[], arg305_1: f32[960], arg306_1: f32[960], arg307_1: i64[], arg308_1: f32[320], arg309_1: f32[320], arg310_1: i64[], arg311_1: f32[1280], arg312_1: f32[1280], arg313_1: i64[], arg314_1: f32[1, 1, 28, 28]) -> (f32[1, 10]) { -let convolution = Conv2d::<f32, 1, 1, 28, 28, 32, 3, 3, 15, 15, 2, 2, 1, 1, 1, 1, 0, 0, 1>(arg314_1, arg0_1); -let getitem_0 = Flat4d::<f32, 1, 32, 14, 14>(getitem); -let hardtanh_0 = HardTanh::<f32, 6272>(getitem_0, 0, 6); -let hardtanh = Unflat4d::<f32, 1, 32, 14, 14>(hardtanh_0); -let convolution_1 = Conv2d::<f32, 1, 32, 14, 14, 32, 3, 3, 15, 15, 1, 1, 1, 1, 1, 1, 0, 0, 32>(hardtanh, arg3_1); -let getitem_1_0 = Flat4d::<f32, 1, 32, 14, 14>(getitem_1); -let hardtanh_1_0 = HardTanh::<f32, 6272>(getitem_1_0, 0, 6); -let hardtanh_1 = 
Unflat4d::<f32, 1, 32, 14, 14>(hardtanh_1_0); -let convolution_2 = Conv2d::<f32, 1, 32, 14, 14, 16, 1, 1, 15, 15, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_1, arg6_1); -let convolution_3 = Conv2d::<f32, 1, 16, 14, 14, 96, 1, 1, 15, 15, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_2, arg9_1); -let getitem_3_0 = Flat4d::<f32, 1, 96, 14, 14>(getitem_3); -let hardtanh_2_0 = HardTanh::<f32, 18816>(getitem_3_0, 0, 6); -let hardtanh_2 = Unflat4d::<f32, 1, 96, 14, 14>(hardtanh_2_0); -let convolution_4 = Conv2d::<f32, 1, 96, 14, 14, 96, 3, 3, 8, 8, 2, 2, 1, 1, 1, 1, 0, 0, 96>(hardtanh_2, arg12_1); -let getitem_4_0 = Flat4d::<f32, 1, 96, 7, 7>(getitem_4); -let hardtanh_3_0 = HardTanh::<f32, 4704>(getitem_4_0, 0, 6); -let hardtanh_3 = Unflat4d::<f32, 1, 96, 7, 7>(hardtanh_3_0); -let convolution_5 = Conv2d::<f32, 1, 96, 7, 7, 24, 1, 1, 8, 8, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_3, arg15_1); -let convolution_6 = Conv2d::<f32, 1, 24, 7, 7, 144, 1, 1, 8, 8, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_5, arg18_1); -let getitem_6_0 = Flat4d::<f32, 1, 144, 7, 7>(getitem_6); -let hardtanh_4_0 = HardTanh::<f32, 7056>(getitem_6_0, 0, 6); -let hardtanh_4 = Unflat4d::<f32, 1, 144, 7, 7>(hardtanh_4_0); -let convolution_7 = Conv2d::<f32, 1, 144, 7, 7, 144, 3, 3, 8, 8, 1, 1, 1, 1, 1, 1, 0, 0, 144>(hardtanh_4, arg21_1); -let getitem_7_0 = Flat4d::<f32, 1, 144, 7, 7>(getitem_7); -let hardtanh_5_0 = HardTanh::<f32, 7056>(getitem_7_0, 0, 6); -let hardtanh_5 = Unflat4d::<f32, 1, 144, 7, 7>(hardtanh_5_0); -let convolution_8 = Conv2d::<f32, 1, 144, 7, 7, 24, 1, 1, 8, 8, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_5, arg24_1); -let getitem_5_0 = Flat4d::<f32, 1, 24, 7, 7>(getitem_5); -let getitem_8_0 = Flat4d::<f32, 1, 24, 7, 7>(getitem_8); -let add_0 = Add::<f32, 1176>(getitem_5_0, getitem_8_0); -let add = Unflat4d::<f32, 1, 24, 7, 7>(add_0); -let convolution_9 = Conv2d::<f32, 1, 24, 7, 7, 144, 1, 1, 8, 8, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add, arg27_1); -let getitem_9_0 = Flat4d::<f32, 1, 144, 7, 7>(getitem_9); -let hardtanh_6_0 = HardTanh::<f32, 7056>(getitem_9_0, 0, 6); -let hardtanh_6 = Unflat4d::<f32, 1, 144, 7, 7>(hardtanh_6_0); -let convolution_10 = Conv2d::<f32, 1, 144, 7, 7, 144, 3, 3, 4, 4, 2, 2, 1, 1, 1, 1, 0, 0, 144>(hardtanh_6, arg30_1); -let getitem_10_0 = Flat4d::<f32, 1, 144, 4, 4>(getitem_10); -let hardtanh_7_0 = HardTanh::<f32, 2304>(getitem_10_0, 0, 6); -let hardtanh_7 = Unflat4d::<f32, 1, 144, 4, 4>(hardtanh_7_0); -let convolution_11 = Conv2d::<f32, 1, 144, 4, 4, 32, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_7, arg33_1); -let convolution_12 = Conv2d::<f32, 1, 32, 4, 4, 192, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_11, arg36_1); -let getitem_12_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_12); -let hardtanh_8_0 = HardTanh::<f32, 3072>(getitem_12_0, 0, 6); -let hardtanh_8 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_8_0); -let convolution_13 = Conv2d::<f32, 1, 192, 4, 4, 192, 3, 3, 5, 5, 1, 1, 1, 1, 1, 1, 0, 0, 192>(hardtanh_8, arg39_1); -let getitem_13_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_13); -let hardtanh_9_0 = HardTanh::<f32, 3072>(getitem_13_0, 0, 6); -let hardtanh_9 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_9_0); -let convolution_14 = Conv2d::<f32, 1, 192, 4, 4, 32, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_9, arg42_1); -let getitem_11_0 = Flat4d::<f32, 1, 32, 4, 4>(getitem_11); -let getitem_14_0 = Flat4d::<f32, 1, 32, 4, 4>(getitem_14); -let add_1_0 = Add::<f32, 512>(getitem_11_0, getitem_14_0); -let add_1 = Unflat4d::<f32, 1, 32, 4, 4>(add_1_0); -let convolution_15 = Conv2d::<f32, 1, 32, 4, 4, 192, 1, 1, 5, 5, 1, 1, 0, 0, 1, 
1, 0, 0, 1>(add_1, arg45_1); -let getitem_15_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_15); -let hardtanh_10_0 = HardTanh::<f32, 3072>(getitem_15_0, 0, 6); -let hardtanh_10 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_10_0); -let convolution_16 = Conv2d::<f32, 1, 192, 4, 4, 192, 3, 3, 5, 5, 1, 1, 1, 1, 1, 1, 0, 0, 192>(hardtanh_10, arg48_1); -let getitem_16_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_16); -let hardtanh_11_0 = HardTanh::<f32, 3072>(getitem_16_0, 0, 6); -let hardtanh_11 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_11_0); -let convolution_17 = Conv2d::<f32, 1, 192, 4, 4, 32, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_11, arg51_1); -let add_1_0 = Flat4d::<f32, 1, 32, 4, 4>(add_1); -let getitem_17_0 = Flat4d::<f32, 1, 32, 4, 4>(getitem_17); -let add_2_0 = Add::<f32, 512>(add_1_0, getitem_17_0); -let add_2 = Unflat4d::<f32, 1, 32, 4, 4>(add_2_0); -let convolution_18 = Conv2d::<f32, 1, 32, 4, 4, 192, 1, 1, 5, 5, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_2, arg54_1); -let getitem_18_0 = Flat4d::<f32, 1, 192, 4, 4>(getitem_18); -let hardtanh_12_0 = HardTanh::<f32, 3072>(getitem_18_0, 0, 6); -let hardtanh_12 = Unflat4d::<f32, 1, 192, 4, 4>(hardtanh_12_0); -let convolution_19 = Conv2d::<f32, 1, 192, 4, 4, 192, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 192>(hardtanh_12, arg57_1); -let getitem_19_0 = Flat4d::<f32, 1, 192, 2, 2>(getitem_19); -let hardtanh_13_0 = HardTanh::<f32, 768>(getitem_19_0, 0, 6); -let hardtanh_13 = Unflat4d::<f32, 1, 192, 2, 2>(hardtanh_13_0); -let convolution_20 = Conv2d::<f32, 1, 192, 2, 2, 64, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_13, arg60_1); -let convolution_21 = Conv2d::<f32, 1, 64, 2, 2, 384, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_20, arg63_1); -let getitem_21_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_21); -let hardtanh_14_0 = HardTanh::<f32, 1536>(getitem_21_0, 0, 6); -let hardtanh_14 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_14_0); -let convolution_22 = Conv2d::<f32, 1, 384, 2, 2, 384, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 384>(hardtanh_14, arg66_1); -let getitem_22_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_22); -let hardtanh_15_0 = HardTanh::<f32, 1536>(getitem_22_0, 0, 6); -let hardtanh_15 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_15_0); -let convolution_23 = Conv2d::<f32, 1, 384, 2, 2, 64, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_15, arg69_1); -let getitem_20_0 = Flat4d::<f32, 1, 64, 2, 2>(getitem_20); -let getitem_23_0 = Flat4d::<f32, 1, 64, 2, 2>(getitem_23); -let add_3_0 = Add::<f32, 256>(getitem_20_0, getitem_23_0); -let add_3 = Unflat4d::<f32, 1, 64, 2, 2>(add_3_0); -let convolution_24 = Conv2d::<f32, 1, 64, 2, 2, 384, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_3, arg72_1); -let getitem_24_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_24); -let hardtanh_16_0 = HardTanh::<f32, 1536>(getitem_24_0, 0, 6); -let hardtanh_16 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_16_0); -let convolution_25 = Conv2d::<f32, 1, 384, 2, 2, 384, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 384>(hardtanh_16, arg75_1); -let getitem_25_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_25); -let hardtanh_17_0 = HardTanh::<f32, 1536>(getitem_25_0, 0, 6); -let hardtanh_17 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_17_0); -let convolution_26 = Conv2d::<f32, 1, 384, 2, 2, 64, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_17, arg78_1); -let add_3_0 = Flat4d::<f32, 1, 64, 2, 2>(add_3); -let getitem_26_0 = Flat4d::<f32, 1, 64, 2, 2>(getitem_26); -let add_4_0 = Add::<f32, 256>(add_3_0, getitem_26_0); -let add_4 = Unflat4d::<f32, 1, 64, 2, 2>(add_4_0); -let convolution_27 = Conv2d::<f32, 1, 64, 2, 2, 384, 1, 1, 3, 3, 
1, 1, 0, 0, 1, 1, 0, 0, 1>(add_4, arg81_1); -let getitem_27_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_27); -let hardtanh_18_0 = HardTanh::<f32, 1536>(getitem_27_0, 0, 6); -let hardtanh_18 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_18_0); -let convolution_28 = Conv2d::<f32, 1, 384, 2, 2, 384, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 384>(hardtanh_18, arg84_1); -let getitem_28_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_28); -let hardtanh_19_0 = HardTanh::<f32, 1536>(getitem_28_0, 0, 6); -let hardtanh_19 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_19_0); -let convolution_29 = Conv2d::<f32, 1, 384, 2, 2, 64, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_19, arg87_1); -let add_4_0 = Flat4d::<f32, 1, 64, 2, 2>(add_4); -let getitem_29_0 = Flat4d::<f32, 1, 64, 2, 2>(getitem_29); -let add_5_0 = Add::<f32, 256>(add_4_0, getitem_29_0); -let add_5 = Unflat4d::<f32, 1, 64, 2, 2>(add_5_0); -let convolution_30 = Conv2d::<f32, 1, 64, 2, 2, 384, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_5, arg90_1); -let getitem_30_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_30); -let hardtanh_20_0 = HardTanh::<f32, 1536>(getitem_30_0, 0, 6); -let hardtanh_20 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_20_0); -let convolution_31 = Conv2d::<f32, 1, 384, 2, 2, 384, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 384>(hardtanh_20, arg93_1); -let getitem_31_0 = Flat4d::<f32, 1, 384, 2, 2>(getitem_31); -let hardtanh_21_0 = HardTanh::<f32, 1536>(getitem_31_0, 0, 6); -let hardtanh_21 = Unflat4d::<f32, 1, 384, 2, 2>(hardtanh_21_0); -let convolution_32 = Conv2d::<f32, 1, 384, 2, 2, 96, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_21, arg96_1); -let convolution_33 = Conv2d::<f32, 1, 96, 2, 2, 576, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_32, arg99_1); -let getitem_33_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_33); -let hardtanh_22_0 = HardTanh::<f32, 2304>(getitem_33_0, 0, 6); -let hardtanh_22 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_22_0); -let convolution_34 = Conv2d::<f32, 1, 576, 2, 2, 576, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 576>(hardtanh_22, arg102_1); -let getitem_34_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_34); -let hardtanh_23_0 = HardTanh::<f32, 2304>(getitem_34_0, 0, 6); -let hardtanh_23 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_23_0); -let convolution_35 = Conv2d::<f32, 1, 576, 2, 2, 96, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_23, arg105_1); -let getitem_32_0 = Flat4d::<f32, 1, 96, 2, 2>(getitem_32); -let getitem_35_0 = Flat4d::<f32, 1, 96, 2, 2>(getitem_35); -let add_6_0 = Add::<f32, 384>(getitem_32_0, getitem_35_0); -let add_6 = Unflat4d::<f32, 1, 96, 2, 2>(add_6_0); -let convolution_36 = Conv2d::<f32, 1, 96, 2, 2, 576, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_6, arg108_1); -let getitem_36_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_36); -let hardtanh_24_0 = HardTanh::<f32, 2304>(getitem_36_0, 0, 6); -let hardtanh_24 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_24_0); -let convolution_37 = Conv2d::<f32, 1, 576, 2, 2, 576, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 0, 0, 576>(hardtanh_24, arg111_1); -let getitem_37_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_37); -let hardtanh_25_0 = HardTanh::<f32, 2304>(getitem_37_0, 0, 6); -let hardtanh_25 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_25_0); -let convolution_38 = Conv2d::<f32, 1, 576, 2, 2, 96, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_25, arg114_1); -let add_6_0 = Flat4d::<f32, 1, 96, 2, 2>(add_6); -let getitem_38_0 = Flat4d::<f32, 1, 96, 2, 2>(getitem_38); -let add_7_0 = Add::<f32, 384>(add_6_0, getitem_38_0); -let add_7 = Unflat4d::<f32, 1, 96, 2, 2>(add_7_0); -let convolution_39 = Conv2d::<f32, 1, 96, 
2, 2, 576, 1, 1, 3, 3, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_7, arg117_1); -let getitem_39_0 = Flat4d::<f32, 1, 576, 2, 2>(getitem_39); -let hardtanh_26_0 = HardTanh::<f32, 2304>(getitem_39_0, 0, 6); -let hardtanh_26 = Unflat4d::<f32, 1, 576, 2, 2>(hardtanh_26_0); -let convolution_40 = Conv2d::<f32, 1, 576, 2, 2, 576, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 576>(hardtanh_26, arg120_1); -let getitem_40_0 = Flat4d::<f32, 1, 576, 1, 1>(getitem_40); -let hardtanh_27_0 = HardTanh::<f32, 576>(getitem_40_0, 0, 6); -let hardtanh_27 = Unflat4d::<f32, 1, 576, 1, 1>(hardtanh_27_0); -let convolution_41 = Conv2d::<f32, 1, 576, 1, 1, 160, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_27, arg123_1); -let convolution_42 = Conv2d::<f32, 1, 160, 1, 1, 960, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_41, arg126_1); -let getitem_42_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_42); -let hardtanh_28_0 = HardTanh::<f32, 960>(getitem_42_0, 0, 6); -let hardtanh_28 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_28_0); -let convolution_43 = Conv2d::<f32, 1, 960, 1, 1, 960, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 960>(hardtanh_28, arg129_1); -let getitem_43_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_43); -let hardtanh_29_0 = HardTanh::<f32, 960>(getitem_43_0, 0, 6); -let hardtanh_29 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_29_0); -let convolution_44 = Conv2d::<f32, 1, 960, 1, 1, 160, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_29, arg132_1); -let getitem_41_0 = Flat4d::<f32, 1, 160, 1, 1>(getitem_41); -let getitem_44_0 = Flat4d::<f32, 1, 160, 1, 1>(getitem_44); -let add_8_0 = Add::<f32, 160>(getitem_41_0, getitem_44_0); -let add_8 = Unflat4d::<f32, 1, 160, 1, 1>(add_8_0); -let convolution_45 = Conv2d::<f32, 1, 160, 1, 1, 960, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_8, arg135_1); -let getitem_45_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_45); -let hardtanh_30_0 = HardTanh::<f32, 960>(getitem_45_0, 0, 6); -let hardtanh_30 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_30_0); -let convolution_46 = Conv2d::<f32, 1, 960, 1, 1, 960, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 960>(hardtanh_30, arg138_1); -let getitem_46_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_46); -let hardtanh_31_0 = HardTanh::<f32, 960>(getitem_46_0, 0, 6); -let hardtanh_31 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_31_0); -let convolution_47 = Conv2d::<f32, 1, 960, 1, 1, 160, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_31, arg141_1); -let add_8_0 = Flat4d::<f32, 1, 160, 1, 1>(add_8); -let getitem_47_0 = Flat4d::<f32, 1, 160, 1, 1>(getitem_47); -let add_9_0 = Add::<f32, 160>(add_8_0, getitem_47_0); -let add_9 = Unflat4d::<f32, 1, 160, 1, 1>(add_9_0); -let convolution_48 = Conv2d::<f32, 1, 160, 1, 1, 960, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(add_9, arg144_1); -let getitem_48_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_48); -let hardtanh_32_0 = HardTanh::<f32, 960>(getitem_48_0, 0, 6); -let hardtanh_32 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_32_0); -let convolution_49 = Conv2d::<f32, 1, 960, 1, 1, 960, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 960>(hardtanh_32, arg147_1); -let getitem_49_0 = Flat4d::<f32, 1, 960, 1, 1>(getitem_49); -let hardtanh_33_0 = HardTanh::<f32, 960>(getitem_49_0, 0, 6); -let hardtanh_33 = Unflat4d::<f32, 1, 960, 1, 1>(hardtanh_33_0); -let convolution_50 = Conv2d::<f32, 1, 960, 1, 1, 320, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(hardtanh_33, arg150_1); -let convolution_51 = Conv2d::<f32, 1, 320, 1, 1, 1280, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 1>(getitem_50, arg153_1); -let getitem_51_0 = Flat4d::<f32, 1, 1280, 1, 1>(getitem_51); -let hardtanh_34_0 = HardTanh::<f32, 
1280>(getitem_51_0, 0, 6); -let hardtanh_34 = Unflat4d::<f32, 1, 1280, 1, 1>(hardtanh_34_0); -let hardtanh_34_1_0 = Permute4d01::<f32, 1, 1280, 1, 1>(hardtanh_34); -let hardtanh_34_1_1 = Permute4d12::<f32, 1280, 1, 1, 1>(hardtanh_34_1_0); -let hardtanh_34_1_1_0 = Flat4d::<f32, 1280, 1, 1, 1>(hardtanh_34_1_1); -let hardtanh_34_1_1_1 = Unflat2d::<f32, 1280, 1>(hardtanh_34_1_1_0); -let mean_0 = MeanKD::<f32, 1280, 1>(hardtanh_34_1_1_1); -let mean_1 = Flat2d::<f32, 1, 1280, 1, 1>(mean_0); -let mean = Unflat4d::<f32, 1, 1280, 1, 1>(mean_1); -let clone_0 = Flat2d::<f32, 1, 1280>(clone); -let clone_1 = Unflat2d::<f32, 1, 1280>(clone_0); -let clone_1_0 = Flat2d::<f32, 1, 10>(clone_1); -let arg157_1_0 = Flat1dI0::<f32, 1, 10>(arg157_1); -let arg157_1_0_0 = Flat2d::<f32, 1, 10>(arg157_1_0); -let addmm_0 = Add::<f32, 10>(arg157_1_0_0, clone_1_0); -let addmm = Unflat2d::<f32, 1, 10>(addmm_0); -return (addmm); -} -- GitLab From 050e7dfaf11b18951561a38440a006e6d88f9387 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 10:42:18 -0600 Subject: [PATCH 055/109] sm --- hercules_cg/src/gpu.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 89155109..37f0cd31 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -155,7 +155,7 @@ pub fn gpu_codegen<W: Write>( (NodeID::new(pos), *data) }; - let return_type_id = &typing[return_node_id.idx()]; + let return_type_id = &typing[data_node_id.idx()]; let return_type = &types[return_type_id.idx()]; if return_type.is_array() || return_type.is_product() || return_type.is_summation() { let objects = &collection_objects.objects(data_node_id); @@ -520,12 +520,13 @@ namespace cg = cooperative_groups; fn codegen_launch_code(&self, run_debug: bool, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { write!(w, " -int main() {{ -")?; +int main(")?; // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let mut pass_args = String::new(); if run_debug { + write!(w, ") {{ +")?; // The first set of parameters are dynamic constants. let mut first_param = true; for idx in 0..self.function.num_dynamic_constants { -- GitLab From ee408623f14eb01fe405642f73e7b93fe894fee6 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 11:20:53 -0600 Subject: [PATCH 056/109] sm --- hercules_cg/src/gpu.rs | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 37f0cd31..f51479a7 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -291,9 +291,13 @@ enum CGType { impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { + // If run_debug, wrapping C host code is self-contained with malloc, etc, + // else it only does kernel launch. 
+ let run_debug = false; + // Emit all code up to the "goto" to Start's block let mut top = String::new(); - self.codegen_kernel_begin(&mut top)?; + self.codegen_kernel_begin(run_debug, &mut top)?; let mut dynamic_shared_offset = "0".to_string(); self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; @@ -353,14 +357,14 @@ impl GPUContext<'_> { // Emit host launch code let mut host_launch = String::new(); - self.codegen_launch_code(false, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; + self.codegen_launch_code(run_debug, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; write!(w, "{}", host_launch)?; Ok(()) } // Emit kernel headers, signature, arguments, and dynamic shared memory declaration - fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { + fn codegen_kernel_begin(&self, run_debug: bool, w: &mut String) -> Result<(), Error> { write!(w, " #include <assert.h> #include <stdio.h> @@ -385,8 +389,8 @@ namespace cg = cooperative_groups; write!( w, - "__global__ void __launch_bounds__({}) {}(", - self.kernel_params.max_num_threads, self.function.name + "__global__ void __launch_bounds__({}) {}{}(", + self.kernel_params.max_num_threads, self.function.name, if run_debug { "" } else { "_gpu" } )?; // The first set of parameters are dynamic constants. let mut first_param = true; @@ -519,13 +523,12 @@ namespace cg = cooperative_groups; } fn codegen_launch_code(&self, run_debug: bool, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { - write!(w, " -int main(")?; // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let mut pass_args = String::new(); if run_debug { - write!(w, ") {{ + write!(w, " +int main() {{ ")?; // The first set of parameters are dynamic constants. let mut first_param = true; @@ -588,13 +591,13 @@ int main(")?; } } if self.types[self.return_type_id.idx()].is_primitive() { - write!(w, "\tcudaFree(ret);\n"); + write!(w, "\tcudaFree(ret);\n")?; } - write!(w, "\treturn 0;\n"); - write!(w, "}}\n"); } else { + write!(w, " +extern \"C\" int {}(", self.function.name)?; // The first set of parameters are dynamic constants. 
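(For orientation, a rough sketch of the self-contained host wrapper the run_debug branch is aiming to emit, assuming a hypothetical kernel my_kernel with one dynamic constant, one array parameter, and a primitive float return; the names, launch configuration, and allocation sizes below are illustrative placeholders, not the exact generated text.)

#include <cuda_runtime.h>
#include <stdio.h>

// Hypothetical generated kernel: dynamic constant, array parameter, out-pointer for the return.
__global__ void my_kernel(unsigned long long dc_p0, float* p0, float* ret) {}

int main() {
    // Dynamic constants are defaulted to 1 for a smoke test.
    unsigned long long dc_p0 = 1;
    // Pointer-typed parameters get device allocations sized from the dynamic constants.
    float* p0;
    if (cudaMalloc((void**)&p0, dc_p0 * sizeof(float)) != cudaSuccess) {
        printf("Error allocating memory for parameter 0\n");
        return -1;
    }
    // A primitive return value is passed to the kernel as an extra out-pointer.
    float* ret;
    if (cudaMalloc((void**)&ret, sizeof(float)) != cudaSuccess) {
        printf("Error allocating memory for return value\n");
        return -1;
    }
    // Launch with the computed block/thread counts and dynamic shared memory size (literals here).
    my_kernel<<<1024, 1024, 0>>>(dc_p0, p0, ret);
    bool skip = false;
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("Error launching kernel: %s\n", cudaGetErrorString(err));
        skip = true;
    }
    if (cudaDeviceSynchronize() != cudaSuccess && !skip) {
        printf("Error synchronizing device\n");
        skip = true;
    }
    cudaFree(p0);
    cudaFree(ret);
    return 0;
}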
let mut first_param = true; for idx in 0..self.function.num_dynamic_constants { @@ -627,11 +630,13 @@ int main(")?; write!(w, "{} ret", ret_type)?; write!(pass_args, "ret")?; } - write!(w, ") {{ - {}<<<{}, {}, {}>>>({}); -}}", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args); + write!(w, ") {{\n")?; + write!(w, "\t{}<<<{}_gpu, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; + write!(w, "\tcudaDeviceSynchronize();\n")?; } + write!(w, "\treturn 0;\n")?; + write!(w, "}}\n")?; Ok(()) } -- GitLab From 6a2e59af63ba5cd27dd4df40c4d670f10e5ce5ec Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 14:34:47 -0600 Subject: [PATCH 057/109] host --- hercules_cg/src/gpu.rs | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index f51479a7..a3a46d93 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -555,7 +555,7 @@ int main() {{ } else { let param_size = self.get_size(*ty, None, None); write!(w, "\t{} p{};\n", param_type, idx); - write!(w, "\tif (cudaMalloc(&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?; + write!(w, "\tif (cudaMalloc((void**)&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?; write!(w, "\t\tprintf(\"Error allocating memory for parameter %d\\n\", {});\n", idx)?; write!(w, "\t\treturn -1;\n"); write!(w, "\t}}\n"); @@ -564,15 +564,14 @@ int main() {{ } // Pull primitive return to a pointer parameter if self.types[self.return_type_id.idx()].is_primitive() { - write!(pass_args, ", ")?; let ret_type_no_pnt = self.get_type(*self.return_type_id, false); let ret_type = self.get_type(*self.return_type_id, true); write!(w, "\t{} ret;\n", ret_type)?; - write!(w, "\tif (cudaMalloc(&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?; + write!(w, "\tif (cudaMalloc((void**)&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?; write!(w, "\t\tprintf(\"Error allocating memory for return value\\n\");\n")?; write!(w, "\t\treturn -1;\n")?; write!(w, "\t}}\n"); - write!(pass_args, "ret")?; + write!(pass_args, ", ret")?; } write!(w, "\t{}<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args); write!(w, "\tbool skip = false;\n")?; @@ -596,8 +595,10 @@ int main() {{ } else { + let ret_primitive = self.types[self.return_type_id.idx()].is_primitive(); + let ret_type = self.get_type(*self.return_type_id, false); write!(w, " -extern \"C\" int {}(", self.function.name)?; +extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_string() }, self.function.name)?; // The first set of parameters are dynamic constants. 
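(Similarly, a hedged sketch of the library-mode wrapper the non-debug branch is converging on across these patches: an extern "C" symbol that allocates a device slot for a primitive return, launches the kernel, which is assumed to carry a "_gpu" suffix so the plain name stays free for the host entry point, synchronizes, and copies the result back. my_func and its parameters are hypothetical placeholders, and the grid, block, and shared memory sizes are literal stand-ins for the computed values.)

#include <cuda_runtime.h>

// Hypothetical generated kernel; the "_gpu" suffix keeps the plain name free for the host wrapper.
__global__ void my_func_gpu(unsigned long long dc_p0, float* p0, float* ret) {}

extern "C" float my_func(unsigned long long dc_p0, float* p0) {
    // Device-side slot for the primitive return value.
    float* ret;
    cudaMalloc((void**)&ret, sizeof(float));
    // Launch with the precomputed grid, block, and dynamic shared memory sizes (literals here).
    my_func_gpu<<<1024, 1024, 0>>>(dc_p0, p0, ret);
    cudaDeviceSynchronize();
    // Copy the primitive result back to the host and return it by value.
    float host_ret;
    cudaMemcpy(&host_ret, ret, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(ret);
    return host_ret;
}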
let mut first_param = true; for idx in 0..self.function.num_dynamic_constants { @@ -622,20 +623,23 @@ extern \"C\" int {}(", self.function.name)?; write!(w, "{} p{}", param_type, idx)?; write!(pass_args, "p{}", idx)?; } - // Pull primitive return to a pointer parameter - if self.types[self.return_type_id.idx()].is_primitive() { - write!(w, ", ")?; - write!(pass_args, ", ")?; - let ret_type = self.get_type(*self.return_type_id, true); - write!(w, "{} ret", ret_type)?; - write!(pass_args, "ret")?; - } write!(w, ") {{\n")?; + // Pull primitive return as pointer parameter for kernel + if ret_primitive { + let ret_type_pnt = self.get_type(*self.return_type_id, true); + write!(w, "\t{} ret;\n", ret_type_pnt)?; + write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; + write!(pass_args, ", ret")?; + } write!(w, "\t{}<<<{}_gpu, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; write!(w, "\tcudaDeviceSynchronize();\n")?; + if ret_primitive { + write!(w, "\t{} host_ret;\n", ret_type)?; + write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; + write!(w, "\treturn host_ret;\n")?; + } } - write!(w, "\treturn 0;\n")?; write!(w, "}}\n")?; Ok(()) } -- GitLab From ab75a1d659632be853a3a0b5edad04675ec76e94 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 16:56:10 -0600 Subject: [PATCH 058/109] cuda --- Cargo.lock | 1 + hercules_opt/src/pass.rs | 51 +++++++++++++++++++++++----------------- hercules_rt/build.rs | 1 + juno_build/Cargo.toml | 3 ++- 4 files changed, 33 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ea295fdf..67731f60 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -718,6 +718,7 @@ name = "juno_build" version = "0.1.0" dependencies = [ "hercules_ir", + "hercules_rt", "juno_frontend", "with_builtin_macros", ] diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index a9654b06..a8e33a38 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -974,6 +974,9 @@ impl PassManager { println!("{}", cuda_ir); println!("{}", rust_rt); + let output_archive = format!("{}/lib{}.a", output_dir, module_name); + println!("{}", output_archive); + // Write the LLVM IR into a temporary file. let tmp_dir = TempDir::new().unwrap(); let mut llvm_path = tmp_dir.path().to_path_buf(); @@ -999,32 +1002,36 @@ impl PassManager { .expect("Error running clang. Is it installed?"); assert!(clang_process.wait().unwrap().success()); - // Write the CUDA IR into a temporary file. - let mut cuda_path = tmp_dir.path().to_path_buf(); - cuda_path.push(format!("{}.cu", module_name)); - let mut file = File::create(&cuda_path) - .expect("PANIC: Unable to open output CUDA IR file."); - file.write_all(cuda_ir.as_bytes()) - .expect("PANIC: Unable to write output CUDA IR file contents."); + let mut ar_args = vec!["crus", &output_archive, &llvm_object]; let cuda_object = format!("{}/{}_cuda.o", tmp_dir.path().to_str().unwrap(), module_name); - let mut nvcc_process = Command::new("nvcc") - .arg("-c") - .arg("-O3") - .arg("-o") - .arg(&cuda_object) - .arg(&cuda_path) - .spawn() - .expect("Error running nvcc. Is it installed?"); - assert!(nvcc_process.wait().unwrap().success()); + if cfg!(feature = "cuda") { + // Write the CUDA IR into a temporary file. 
+ let mut cuda_path = tmp_dir.path().to_path_buf(); + cuda_path.push(format!("{}.cu", module_name)); + let mut file = File::create(&cuda_path) + .expect("PANIC: Unable to open output CUDA IR file."); + file.write_all(cuda_ir.as_bytes()) + .expect("PANIC: Unable to write output CUDA IR file contents."); + + let mut nvcc_process = Command::new("nvcc") + .arg("-c") + .arg("-O3") + .arg("-lcudart") + .arg("-L/usr/lib/x86_64-linux-gnu") + .arg("-L/usr/local/cuda/lib64") + .arg("-o") + .arg(&cuda_object) + .arg(&cuda_path) + .spawn() + .expect("Error running nvcc. Is it installed?"); + assert!(nvcc_process.wait().unwrap().success()); + + ar_args.push(&cuda_object); + } - let output_archive = format!("{}/lib{}.a", output_dir, module_name); - println!("{}", output_archive); let mut ar_process = Command::new("ar") - .arg("crus") - .arg(&output_archive) - .arg(&llvm_object) - .arg(&cuda_object) + .args(&ar_args) .spawn() .expect("Error running ar. Is it installed?"); assert!(ar_process.wait().unwrap().success()); diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs index 15b9f639..6db177de 100644 --- a/hercules_rt/build.rs +++ b/hercules_rt/build.rs @@ -18,6 +18,7 @@ fn main() { println!("cargo::rustc-link-search=native={}", out_dir); println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); println!("cargo::rustc-link-lib=static=rtdefs"); println!("cargo::rustc-link-lib=cudart"); println!("cargo::rerun-if-changed=src/rtdefs.cu"); diff --git a/juno_build/Cargo.toml b/juno_build/Cargo.toml index 13889171..67b5bd7e 100644 --- a/juno_build/Cargo.toml +++ b/juno_build/Cargo.toml @@ -5,9 +5,10 @@ authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" [features] -cuda = ["juno_frontend/cuda"] +cuda = ["juno_frontend/cuda", "hercules_rt/cuda"] [dependencies] juno_frontend = { path = "../juno_frontend" } +hercules_rt = { path = "../hercules_rt", optional = true } hercules_ir = { path = "../hercules_ir" } with_builtin_macros = "0.1.0" -- GitLab From 5f8e7ef966c6057ef01f5bfb46a878ce20d1ad75 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 16:58:27 -0600 Subject: [PATCH 059/109] cuda --- hercules_cg/src/gpu.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a3a46d93..85ee7d90 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -631,7 +631,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; write!(pass_args, ", ret")?; } - write!(w, "\t{}<<<{}_gpu, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; + write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; write!(w, "\tcudaDeviceSynchronize();\n")?; if ret_primitive { write!(w, "\t{} host_ret;\n", ret_type)?; -- GitLab From c06b1537d7b8811ba47457e020c243f3693ab272 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 18:05:56 -0600 Subject: [PATCH 060/109] cuda --- hercules_cg/src/gpu.rs | 12 ++++++++++-- hercules_rt/build.rs | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 85ee7d90..3e0ae81d 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1639,8 +1639,8 @@ extern \"C\" {} {}(", if 
ret_primitive { ret_type.clone() } else { "void".to_str Constant::UnsignedInteger32(val) => write!(w, "{}{} = {}ul;\n", tabs, name, val)?, Constant::Integer64(val) => write!(w, "{}{} = {}ll;\n", tabs, name, val)?, Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?, - Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, val)?, - Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, val)?, + Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, format_float(**val as f64))?, + Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, format_float(**val))?, // All three following collections involve align then allocate from the // single dynamic shared memory buffer by using and updating the offset. Constant::Product(type_id, constant_fields) => { @@ -2025,3 +2025,11 @@ fn convert_type(ty: &Type, make_pointer: bool) -> String { } result } + +fn format_float(val: f64) -> String { + let mut s = val.to_string(); + if !s.contains('.') && !s.contains('e') && !s.contains('E') { + s.push_str(".0"); + } + s +} diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs index 6db177de..04c9ef93 100644 --- a/hercules_rt/build.rs +++ b/hercules_rt/build.rs @@ -18,7 +18,7 @@ fn main() { println!("cargo::rustc-link-search=native={}", out_dir); println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + println!("cargo::rustc-link-search=native=/usr/local/cuda/lib64"); println!("cargo::rustc-link-lib=static=rtdefs"); println!("cargo::rustc-link-lib=cudart"); println!("cargo::rerun-if-changed=src/rtdefs.cu"); -- GitLab From aa79f0181bdc01fc1576349fc6ad2794ae564a03 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 18:23:34 -0600 Subject: [PATCH 061/109] cuda woprk? 
--- Cargo.lock | 1 - hercules_samples/ccp/Cargo.toml | 3 +++ hercules_samples/ccp/build.rs | 7 +++++++ hercules_samples/dot/Cargo.toml | 3 +++ hercules_samples/dot/build.rs | 7 +++++++ hercules_samples/matmul/Cargo.toml | 3 +++ hercules_samples/matmul/build.rs | 7 +++++++ juno_build/Cargo.toml | 3 +-- 8 files changed, 31 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 67731f60..ea295fdf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -718,7 +718,6 @@ name = "juno_build" version = "0.1.0" dependencies = [ "hercules_ir", - "hercules_rt", "juno_frontend", "with_builtin_macros", ] diff --git a/hercules_samples/ccp/Cargo.toml b/hercules_samples/ccp/Cargo.toml index 3547aa52..a5259a92 100644 --- a/hercules_samples/ccp/Cargo.toml +++ b/hercules_samples/ccp/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" +[features] +cuda = ["juno_build/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs index f04d48c7..0b984a0f 100644 --- a/hercules_samples/ccp/build.rs +++ b/hercules_samples/ccp/build.rs @@ -6,4 +6,11 @@ fn main() { .unwrap() .build() .unwrap(); + + #[cfg(feature = "cuda")] + println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-lib=cudart"); } diff --git a/hercules_samples/dot/Cargo.toml b/hercules_samples/dot/Cargo.toml index 69cd39e3..9b11ddc1 100644 --- a/hercules_samples/dot/Cargo.toml +++ b/hercules_samples/dot/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs index 2a239bc6..43cd34f9 100644 --- a/hercules_samples/dot/build.rs +++ b/hercules_samples/dot/build.rs @@ -6,4 +6,11 @@ fn main() { .unwrap() .build() .unwrap(); + + #[cfg(feature = "cuda")] + println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-lib=cudart"); } diff --git a/hercules_samples/matmul/Cargo.toml b/hercules_samples/matmul/Cargo.toml index 9066c153..49f05f29 100644 --- a/hercules_samples/matmul/Cargo.toml +++ b/hercules_samples/matmul/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs index 08478dea..b170024b 100644 --- a/hercules_samples/matmul/build.rs +++ b/hercules_samples/matmul/build.rs @@ -6,4 +6,11 @@ fn main() { .unwrap() .build() .unwrap(); + + #[cfg(feature = "cuda")] + println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-lib=cudart"); } diff --git a/juno_build/Cargo.toml b/juno_build/Cargo.toml index 67b5bd7e..13889171 100644 --- a/juno_build/Cargo.toml +++ b/juno_build/Cargo.toml @@ -5,10 +5,9 @@ authors = 
["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" [features] -cuda = ["juno_frontend/cuda", "hercules_rt/cuda"] +cuda = ["juno_frontend/cuda"] [dependencies] juno_frontend = { path = "../juno_frontend" } -hercules_rt = { path = "../hercules_rt", optional = true } hercules_ir = { path = "../hercules_ir" } with_builtin_macros = "0.1.0" -- GitLab From df630f8422142de13c4ba4b66e7ec4369a46f968 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 18:26:34 -0600 Subject: [PATCH 062/109] cuda nowoprk? --- hercules_cg/src/gpu.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 3e0ae81d..e7195223 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1360,11 +1360,12 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; - let nested_fork = nesting_fork.unwrap(); let cg_tile = match state { KernelState::OutBlock => "grid".to_string(), KernelState::InBlock => "block".to_string(), - KernelState::InThread => self.get_cg_tile(nested_fork, CGType::UsePerId), + KernelState::InThread => { + self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId) + } }; if self.types[data_type_id.idx()].is_primitive() { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; -- GitLab From 2c6c38c0dbbb6268ebbb499fb80966c19500e09b Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 18:33:50 -0600 Subject: [PATCH 063/109] cuda nowoprk? --- hercules_cg/src/gpu.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index e7195223..0c69544b 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -371,6 +371,7 @@ impl GPUContext<'_> { #include <stddef.h> #include <cuda.h> #include <cuda_runtime.h> +#include <math_constants.h> #include <mma.h> #include <cooperative_groups.h> #include <cooperative_groups/memcpy_async.h> @@ -1906,7 +1907,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str _ => "sinh", }, Intrinsic::Sqrt => match ty { - ty if ty.is_float() => "__sqrtf", + Type::Float32 => "__sqrtf", ty if ty.is_signed() || ty.is_unsigned() => "isqrt", _ => "sqrt", }, -- GitLab From 89a93e3c9c585a9ee2e97816de721bc7dbde4097 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Fri, 17 Jan 2025 22:37:54 -0600 Subject: [PATCH 064/109] cu --- hercules_cg/src/gpu.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 0c69544b..2151dc0d 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -419,7 +419,9 @@ namespace cg = cooperative_groups; } // Pull primitive return to a pointer parameter if self.types[self.return_type_id.idx()].is_primitive() { - write!(w, ", ")?; + if !first_param { + write!(w, ", ")?; + } write!( w, "{} __restrict__ ret", @@ -1907,7 +1909,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str _ => "sinh", }, Intrinsic::Sqrt => match ty { - Type::Float32 => "__sqrtf", + Type::Float32 => "sqrtf", ty if ty.is_signed() || ty.is_unsigned() => "isqrt", _ => "sqrt", }, -- GitLab From 889e426dd862906ace6c9d0bf67110c21550ac6c Mon Sep 17 
00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sat, 18 Jan 2025 17:51:46 +0000 Subject: [PATCH 065/109] fac works --- hercules_opt/src/pass.rs | 3 --- hercules_samples/fac/build.rs | 7 +++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index a8e33a38..6afba501 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -1017,9 +1017,6 @@ impl PassManager { let mut nvcc_process = Command::new("nvcc") .arg("-c") .arg("-O3") - .arg("-lcudart") - .arg("-L/usr/lib/x86_64-linux-gnu") - .arg("-L/usr/local/cuda/lib64") .arg("-o") .arg(&cuda_object) .arg(&cuda_path) diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs index 4d8226f1..6863b27b 100644 --- a/hercules_samples/fac/build.rs +++ b/hercules_samples/fac/build.rs @@ -6,4 +6,11 @@ fn main() { .unwrap() .build() .unwrap(); + + #[cfg(feature = "cuda")] + println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-lib=cudart"); } -- GitLab From 4a05568cb7046cc75a7ba9b26166c716aa383057 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sat, 18 Jan 2025 11:58:01 -0600 Subject: [PATCH 066/109] mod --- hercules_cg/src/gpu.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 2151dc0d..23e58c14 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -470,6 +470,12 @@ namespace cg = cooperative_groups; DynamicConstant::Rem(left, right) => { write!(w, "\t{} = dc{} % dc{};\n", dc_val, left.idx(), right.idx())? } + DynamicConstant::Min(left, right) => { + write!(w, "\t{} = min(dc{}, dc{});\n", dc_val, left.idx(), right.idx())? + } + DynamicConstant::Max(left, right) => { + write!(w, "\t{} = max(dc{}, dc{});\n", dc_val, left.idx(), right.idx())? 
+ } } } Ok(()) @@ -1880,8 +1886,8 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str _ => "max", }, Intrinsic::Min => match ty { - Type::Float32 => "__fminf", - Type::Float64 => "__fmin", + Type::Float32 => "fminf", + Type::Float64 => "fmin", ty if ty.is_signed() => "smin", ty if ty.is_unsigned() => "umin", _ => "min", -- GitLab From 2d80e858aea516be8676fdb56e04a93bbb8e07b9 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sat, 18 Jan 2025 12:13:26 -0600 Subject: [PATCH 067/109] no cuda for juno --- juno_samples/antideps/Cargo.toml | 6 +++--- juno_samples/casts_and_intrinsics/Cargo.toml | 4 ++-- juno_samples/cava/Cargo.toml | 6 +++--- juno_samples/concat/Cargo.toml | 6 +++--- juno_samples/implicit_clone/Cargo.toml | 6 +++--- juno_samples/matmul/Cargo.toml | 6 +++--- juno_samples/nested_ccp/Cargo.toml | 6 +++--- juno_samples/simple3/Cargo.toml | 6 +++--- 8 files changed, 23 insertions(+), 23 deletions(-) diff --git a/juno_samples/antideps/Cargo.toml b/juno_samples/antideps/Cargo.toml index 9bd1d5a0..e492e2ae 100644 --- a/juno_samples/antideps/Cargo.toml +++ b/juno_samples/antideps/Cargo.toml @@ -9,10 +9,10 @@ name = "juno_antideps" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } [dependencies] -juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } +juno_build = { path = "../../juno_build", features = [] } +hercules_rt = { path = "../../hercules_rt", features = [] } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/casts_and_intrinsics/Cargo.toml b/juno_samples/casts_and_intrinsics/Cargo.toml index af74c07a..83d5be58 100644 --- a/juno_samples/casts_and_intrinsics/Cargo.toml +++ b/juno_samples/casts_and_intrinsics/Cargo.toml @@ -9,9 +9,9 @@ name = "juno_casts_and_intrinsics" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } [dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml index ff375d80..dfde5978 100644 --- a/juno_samples/cava/Cargo.toml +++ b/juno_samples/cava/Cargo.toml @@ -9,11 +9,11 @@ name = "juno_cava" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } [dependencies] -juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } +juno_build = { path = "../../juno_build", features = [] } +hercules_rt = { path = "../../hercules_rt", features = [] } async-std = "*" clap = { version = "*", features = ["derive"] } image = "*" diff --git a/juno_samples/concat/Cargo.toml b/juno_samples/concat/Cargo.toml index 24ba1acf..888a083f 100644 --- a/juno_samples/concat/Cargo.toml +++ b/juno_samples/concat/Cargo.toml @@ -9,10 +9,10 @@ name = "juno_concat" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } [dependencies] -juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } +juno_build = { path = "../../juno_build", features = [] } +hercules_rt = { path = "../../hercules_rt", features = [] } with_builtin_macros = "0.1.0" async-std = "*" diff --git 
a/juno_samples/implicit_clone/Cargo.toml b/juno_samples/implicit_clone/Cargo.toml index b312f5de..4f5387e7 100644 --- a/juno_samples/implicit_clone/Cargo.toml +++ b/juno_samples/implicit_clone/Cargo.toml @@ -9,10 +9,10 @@ name = "juno_implicit_clone" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } [dependencies] -juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } +juno_build = { path = "../../juno_build", features = [] } +hercules_rt = { path = "../../hercules_rt", features = [] } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/matmul/Cargo.toml b/juno_samples/matmul/Cargo.toml index 8ad95853..1e004dcd 100644 --- a/juno_samples/matmul/Cargo.toml +++ b/juno_samples/matmul/Cargo.toml @@ -9,11 +9,11 @@ name = "juno_matmul" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } [dependencies] -juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } +juno_build = { path = "../../juno_build", features = [] } +hercules_rt = { path = "../../hercules_rt", features = [] } with_builtin_macros = "0.1.0" async-std = "*" rand = "*" diff --git a/juno_samples/nested_ccp/Cargo.toml b/juno_samples/nested_ccp/Cargo.toml index 8c9b969d..bcf1fff8 100644 --- a/juno_samples/nested_ccp/Cargo.toml +++ b/juno_samples/nested_ccp/Cargo.toml @@ -9,10 +9,10 @@ name = "juno_nested_ccp" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } [dependencies] -juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } +juno_build = { path = "../../juno_build", features = [] } +hercules_rt = { path = "../../hercules_rt", features = [] } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/simple3/Cargo.toml b/juno_samples/simple3/Cargo.toml index 8060c5b3..c66dc977 100644 --- a/juno_samples/simple3/Cargo.toml +++ b/juno_samples/simple3/Cargo.toml @@ -9,10 +9,10 @@ name = "juno_simple3" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } [dependencies] -juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } +juno_build = { path = "../../juno_build", features = [] } +hercules_rt = { path = "../../hercules_rt", features = [] } with_builtin_macros = "0.1.0" async-std = "*" -- GitLab From 9e7e4c8ab3cfa2a4ddb1adc072a6bd314d7e57a2 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sat, 18 Jan 2025 12:19:31 -0600 Subject: [PATCH 068/109] smol --- hercules_cg/src/gpu.rs | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 23e58c14..731e57e3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1,7 +1,7 @@ extern crate bitvec; extern crate hercules_ir; -use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::{Error, Write}; use self::hercules_ir::*; @@ -135,7 +135,7 @@ pub fn gpu_codegen<W: Write>( // Obtain the Return node and if it's a collection, use the collection objects // analysis to determine the origin. 
Also save the return node id for later // conversion of primitive Return into Parameter. - let (return_node_id, data_node_id) = { + let (_, data_node_id) = { let pos = function .nodes .iter() @@ -563,11 +563,11 @@ int main() {{ write!(w, "\t{} p{} = 1;\n", param_type, idx)?; } else { let param_size = self.get_size(*ty, None, None); - write!(w, "\t{} p{};\n", param_type, idx); + write!(w, "\t{} p{};\n", param_type, idx)?; write!(w, "\tif (cudaMalloc((void**)&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?; write!(w, "\t\tprintf(\"Error allocating memory for parameter %d\\n\", {});\n", idx)?; - write!(w, "\t\treturn -1;\n"); - write!(w, "\t}}\n"); + write!(w, "\t\treturn -1;\n")?; + write!(w, "\t}}\n")?; } write!(pass_args, "p{}", idx)?; } @@ -579,20 +579,20 @@ int main() {{ write!(w, "\tif (cudaMalloc((void**)&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?; write!(w, "\t\tprintf(\"Error allocating memory for return value\\n\");\n")?; write!(w, "\t\treturn -1;\n")?; - write!(w, "\t}}\n"); + write!(w, "\t}}\n")?; write!(pass_args, ", ret")?; } - write!(w, "\t{}<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args); + write!(w, "\t{}<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; write!(w, "\tbool skip = false;\n")?; write!(w, "\tcudaError_t err = cudaGetLastError();\n")?; write!(w, "\tif (err != cudaSuccess) {{\n")?; write!(w, "\t\tprintf(\"Error launching kernel: %s\\n\", cudaGetErrorString(err));\n")?; write!(w, "\t\tskip = true;\n")?; - write!(w, "\t}}\n"); + write!(w, "\t}}\n")?; write!(w, "\tif (cudaDeviceSynchronize() != cudaSuccess && !skip) {{\n")?; write!(w, "\t\tprintf(\"Error synchronizing device\\n\");\n")?; write!(w, "\t\tskip = true;\n")?; - write!(w, "\t}}\n"); + write!(w, "\t}}\n")?; for (idx, ty) in self.function.param_types.iter().enumerate() { if !self.types[ty.idx()].is_primitive() { write!(w, "\tcudaFree(p{});\n", idx)?; @@ -1122,7 +1122,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str // want to "allocate" and initialize it once. 
Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); - if (!is_primitive) { + if !is_primitive { let cg_tile = { let KernelState::OutBlock = state else { panic!("Expected constant to be in start basic block @@ -1142,7 +1142,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str w, *num_tabs, )?; - if (!is_primitive) { + if !is_primitive { write!(w, "{}}}\n", tabs)?; *num_tabs -= 1; } @@ -1679,9 +1679,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str dynamic_shared_offset, w, num_tabs, - ); + )?; } else if !field_constant.is_array() { - self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, dynamic_shared_offset, w, num_tabs); + self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, dynamic_shared_offset, w, num_tabs)?; } } } @@ -1710,9 +1710,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str dynamic_shared_offset, w, num_tabs, - ); + )?; } else if !variant_constant.is_array() { - self.codegen_constant(name, *field, false, extra_dim_collects, dynamic_shared_offset, w, num_tabs); + self.codegen_constant(name, *field, false, extra_dim_collects, dynamic_shared_offset, w, num_tabs)?; }; } Constant::Array(type_id) => { -- GitLab From 30a184b2a75eb85e294a7c026640ddd90decd7f4 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sat, 18 Jan 2025 12:21:41 -0600 Subject: [PATCH 069/109] smol --- hercules_samples/call/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hercules_samples/call/Cargo.toml b/hercules_samples/call/Cargo.toml index 4a2fbb86..52c588e7 100644 --- a/hercules_samples/call/Cargo.toml +++ b/hercules_samples/call/Cargo.toml @@ -5,10 +5,10 @@ authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" [build-dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } [dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" -- GitLab From 96f005d31747a6bd92da0abfba7fedec997a0eaf Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sat, 18 Jan 2025 12:27:20 -0600 Subject: [PATCH 070/109] feature dep --- hercules_samples/ccp/Cargo.toml | 8 ++++---- juno_build/Cargo.toml | 5 +++-- juno_frontend/Cargo.toml | 6 ++++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/hercules_samples/ccp/Cargo.toml b/hercules_samples/ccp/Cargo.toml index a5259a92..c665e73f 100644 --- a/hercules_samples/ccp/Cargo.toml +++ b/hercules_samples/ccp/Cargo.toml @@ -7,11 +7,11 @@ edition = "2021" [features] cuda = ["juno_build/cuda"] -[build-dependencies] -juno_build = { path = "../../juno_build" } - [dependencies] -juno_build = { path = "../../juno_build" } +juno_build = { path = "../../juno_build", features = [] } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" + +[build-dependencies] +juno_build = { path = "../../juno_build", features = [] } diff --git a/juno_build/Cargo.toml b/juno_build/Cargo.toml index 13889171..79b29c87 100644 --- a/juno_build/Cargo.toml +++ b/juno_build/Cargo.toml @@ -5,9 +5,10 @@ authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" [features] -cuda = ["juno_frontend/cuda"] +cuda = ["juno_frontend/cuda", "dep:cuda-support"] [dependencies] -juno_frontend 
= { path = "../juno_frontend" } +juno_frontend = { path = "../juno_frontend", default-features = false } hercules_ir = { path = "../hercules_ir" } with_builtin_macros = "0.1.0" +cuda-support = { path = "../cuda-support", optional = true } diff --git a/juno_frontend/Cargo.toml b/juno_frontend/Cargo.toml index 3c3d557f..05957554 100644 --- a/juno_frontend/Cargo.toml +++ b/juno_frontend/Cargo.toml @@ -5,7 +5,8 @@ authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" [features] -cuda = ["hercules_opt/cuda"] +cuda = ["hercules_opt/cuda", "dep:cuda-support"] +default = [] [[bin]] name = "juno" @@ -21,6 +22,8 @@ lrlex = "0.13" lrpar = "0.13" [dependencies] +hercules_opt = { path = "../hercules_opt", default-features = false } +cuda-support = { path = "../cuda-support", optional = true } cfgrammar = "0.13" clap = { version = "*", features = ["derive"] } lrlex = "0.13" @@ -30,5 +33,4 @@ num-traits = "*" ordered-float = "*" phf = { version = "0.11", features = ["macros"] } hercules_ir = { path = "../hercules_ir" } -hercules_opt = { path = "../hercules_opt" } juno_scheduler = { path = "../juno_scheduler" } -- GitLab From b3a7740bf16f43371e1b91716c36d57083659b90 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Sat, 18 Jan 2025 12:41:23 -0600 Subject: [PATCH 071/109] undo --- hercules_samples/ccp/Cargo.toml | 4 ++-- juno_build/Cargo.toml | 3 +-- juno_frontend/Cargo.toml | 3 +-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/hercules_samples/ccp/Cargo.toml b/hercules_samples/ccp/Cargo.toml index c665e73f..313fd179 100644 --- a/hercules_samples/ccp/Cargo.toml +++ b/hercules_samples/ccp/Cargo.toml @@ -8,10 +8,10 @@ edition = "2021" cuda = ["juno_build/cuda"] [dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } diff --git a/juno_build/Cargo.toml b/juno_build/Cargo.toml index 79b29c87..11ef85db 100644 --- a/juno_build/Cargo.toml +++ b/juno_build/Cargo.toml @@ -5,10 +5,9 @@ authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" [features] -cuda = ["juno_frontend/cuda", "dep:cuda-support"] +cuda = ["juno_frontend/cuda"] [dependencies] juno_frontend = { path = "../juno_frontend", default-features = false } hercules_ir = { path = "../hercules_ir" } with_builtin_macros = "0.1.0" -cuda-support = { path = "../cuda-support", optional = true } diff --git a/juno_frontend/Cargo.toml b/juno_frontend/Cargo.toml index 05957554..dff81db2 100644 --- a/juno_frontend/Cargo.toml +++ b/juno_frontend/Cargo.toml @@ -5,7 +5,7 @@ authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" [features] -cuda = ["hercules_opt/cuda", "dep:cuda-support"] +cuda = ["hercules_opt/cuda"] default = [] [[bin]] @@ -23,7 +23,6 @@ lrpar = "0.13" [dependencies] hercules_opt = { path = "../hercules_opt", default-features = false } -cuda-support = { path = "../cuda-support", optional = true } cfgrammar = "0.13" clap = { version = "*", features = ["derive"] } lrlex = "0.13" -- GitLab From 7c6aeb76e914efbb4362e6aa992fc23dd1e111ba Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sat, 18 Jan 2025 16:05:47 -0600 Subject: [PATCH 072/109] extra --- .gitignore | 3 +- Cargo.lock | 8 ++-- Cargo.toml | 4 +- hercules_cg/src/gpu.rs | 44 ++++++++++++++----- hercules_ir/src/dot.rs | 3 +- hercules_opt/src/pass.rs | 6 +++ 
hercules_samples/call/Cargo.toml | 7 ++- hercules_samples/ccp/build.rs | 7 --- hercules_samples/dot/build.rs | 7 --- hercules_samples/fac/build.rs | 7 --- hercules_samples/matmul/build.rs | 7 --- hercules_samples/matmul/src/matmul.hir | 10 ++--- juno_build/build.rs | 8 ++++ juno_build/src/lib.rs | 2 +- juno_samples/antideps/Cargo.toml | 9 ++-- juno_samples/antideps/antideps.mod | Bin 0 -> 1583 bytes juno_samples/casts_and_intrinsics/Cargo.toml | 7 ++- juno_samples/cava/Cargo.toml | 6 +-- juno_samples/concat/Cargo.toml | 6 +-- juno_samples/implicit_clone/Cargo.toml | 9 ++-- juno_samples/matmul/Cargo.toml | 9 ++-- juno_samples/nested_ccp/Cargo.toml | 9 ++-- juno_samples/simple3/Cargo.toml | 9 ++-- 23 files changed, 109 insertions(+), 78 deletions(-) create mode 100644 juno_build/build.rs create mode 100644 juno_samples/antideps/antideps.mod diff --git a/.gitignore b/.gitignore index 45f2e61b..29eb3e04 100644 --- a/.gitignore +++ b/.gitignore @@ -4,11 +4,10 @@ *.out *.ll *.c +*.cu *.o *.a *.hrt .*.swp .vscode *_env - -juno_samples/matmul/src/matmul_indented.jn diff --git a/Cargo.lock b/Cargo.lock index a1eb77de..3f1e2e7c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -374,9 +374,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.9" +version = "1.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" +checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" dependencies = [ "jobserver", "libc", @@ -1278,9 +1278,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394" +checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" dependencies = [ "adler2", "simd-adler32", diff --git a/Cargo.toml b/Cargo.toml index c57125f7..4e5826ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,13 +5,13 @@ members = [ "hercules_ir", "hercules_opt", "hercules_rt", - + "hercules_tools/hercules_driver", "juno_frontend", "juno_scheduler", "juno_build", - + #"hercules_test/hercules_interpreter", #"hercules_test/hercules_tests", diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 731e57e3..31c50212 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -638,7 +638,10 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str let ret_type_pnt = self.get_type(*self.return_type_id, true); write!(w, "\t{} ret;\n", ret_type_pnt)?; write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; - write!(pass_args, ", ret")?; + if !first_param { + write!(pass_args, ", ")?; + } + write!(pass_args, "ret")?; } write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; write!(w, "\tcudaDeviceSynchronize();\n")?; @@ -897,6 +900,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, KernelState::OutBlock, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_phi(*data, tabs, body)?; + } Ok(()) }) } @@ -929,6 +935,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str for data in 
self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_phi(*data, tabs, body)?; + } } // Then generate data and control for the single block fork if it exists if block_fork.is_some() { @@ -943,6 +952,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_phi(*data, tabs, body)?; + } } } // Then generate for the thread fork tree through Fork node traversal. @@ -1022,6 +1034,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str &mut tabs, )?; } + for data in self.bbs.1[control.idx()].iter() { + self.codegen_data_phi(*data, tabs, body)?; + } } for child in fork_tree.get(&curr_fork).unwrap() { self.codegen_data_control_traverse( @@ -1398,15 +1413,8 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str panic!("Unsupported data node type") } } - // Since the data uses and reducts are responsible for updating Phi and - // Reduce nodes, respectively, we check and emit those for each data node. - if let Some(phis) = self.label_data_for_phi.get(&id) { - let val = self.get_value(id, false, false); - for phi in phis { - let phi_val = self.get_value(*phi, false, false); - write!(w, "{}{} = {};\n", tabs, phi_val, val,)?; - } - } + // Since reducts are responsible for updating Reduce nodes, + // we check and emit those for each data node. if let Some(reduces) = self.reduct_reduce_map.get(&id) { let val = self.get_value(id, false, false); for reduce in reduces { @@ -1417,6 +1425,22 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str Ok(()) } + /* + * Update Phi assignments for each data node. This is run after all data nodes + * for given control block have been emitted. 
+ */ + fn codegen_data_phi(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> { + let tabs = "\t".repeat(num_tabs); + if let Some(phis) = self.label_data_for_phi.get(&id) { + let val = self.get_value(id, false, false); + for phi in phis { + let phi_val = self.get_value(*phi, false, false); + write!(w, "{}{} = {};\n", tabs, phi_val, val)?; + } + } + Ok(()) + } + fn codegen_control_node( &self, id: NodeID, diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs index 4d526366..fe6fee09 100644 --- a/hercules_ir/src/dot.rs +++ b/hercules_ir/src/dot.rs @@ -1,5 +1,5 @@ use std::collections::HashMap; -use std::env::temp_dir; +use std::env::{temp_dir}; use std::fmt::Write; use std::fs::File; use std::io::Write as _; @@ -23,6 +23,7 @@ pub fn xdot_module( let mut rng = rand::thread_rng(); let num: u64 = rng.gen(); tmp_path.push(format!("hercules_dot_{}.dot", num)); + let tmp_path = std::path::PathBuf::from(format!("hercules_dot.dot")); let mut file = File::create(&tmp_path).expect("PANIC: Unable to open output file."); let mut contents = String::new(); write_dot( diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index 149e4eeb..dbc24016 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -1079,6 +1079,12 @@ impl PassManager { file.write_all(cuda_ir.as_bytes()) .expect("PANIC: Unable to write output CUDA IR file contents."); + let cuda_text_path = format!("{}.cu", module_name); + let mut cuda_text_file = File::create(&cuda_text_path) + .expect("PANIC: Unable to open CUDA IR text file."); + cuda_text_file.write_all(cuda_ir.as_bytes()) + .expect("PANIC: Unable to write CUDA IR text file contents."); + let mut nvcc_process = Command::new("nvcc") .arg("-c") .arg("-O3") diff --git a/hercules_samples/call/Cargo.toml b/hercules_samples/call/Cargo.toml index 52c588e7..a5a44c2e 100644 --- a/hercules_samples/call/Cargo.toml +++ b/hercules_samples/call/Cargo.toml @@ -4,11 +4,14 @@ version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] edition = "2021" +[features] +cuda = ["juno_build/cuda"] + [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } [dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs index 0b984a0f..f04d48c7 100644 --- a/hercules_samples/ccp/build.rs +++ b/hercules_samples/ccp/build.rs @@ -6,11 +6,4 @@ fn main() { .unwrap() .build() .unwrap(); - - #[cfg(feature = "cuda")] - println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); - #[cfg(feature = "cuda")] - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - #[cfg(feature = "cuda")] - println!("cargo:rustc-link-lib=cudart"); } diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs index 43cd34f9..2a239bc6 100644 --- a/hercules_samples/dot/build.rs +++ b/hercules_samples/dot/build.rs @@ -6,11 +6,4 @@ fn main() { .unwrap() .build() .unwrap(); - - #[cfg(feature = "cuda")] - println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); - #[cfg(feature = "cuda")] - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - #[cfg(feature = "cuda")] - println!("cargo:rustc-link-lib=cudart"); } diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs index 6863b27b..4d8226f1 100644 --- a/hercules_samples/fac/build.rs +++ 
b/hercules_samples/fac/build.rs @@ -6,11 +6,4 @@ fn main() { .unwrap() .build() .unwrap(); - - #[cfg(feature = "cuda")] - println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); - #[cfg(feature = "cuda")] - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - #[cfg(feature = "cuda")] - println!("cargo:rustc-link-lib=cudart"); } diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs index b170024b..08478dea 100644 --- a/hercules_samples/matmul/build.rs +++ b/hercules_samples/matmul/build.rs @@ -6,11 +6,4 @@ fn main() { .unwrap() .build() .unwrap(); - - #[cfg(feature = "cuda")] - println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); - #[cfg(feature = "cuda")] - println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); - #[cfg(feature = "cuda")] - println!("cargo:rustc-link-lib=cudart"); } diff --git a/hercules_samples/matmul/src/matmul.hir b/hercules_samples/matmul/src/matmul.hir index 400ab5e1..b0c31da4 100644 --- a/hercules_samples/matmul/src/matmul.hir +++ b/hercules_samples/matmul/src/matmul.hir @@ -1,9 +1,9 @@ -fn matmul(a: array(i32, 16, 64), b: array(i32, 64, 32)) -> array(i32, 16, 32) - c = constant(array(i32, 16, 32), []) - i_j_ctrl = fork(start, 16, 32) +fn matmul<3>(a: array(i32, #0, #1), b: array(i32, #1, #2)) -> array(i32, #0, #2) + c = constant(array(i32, #0, #2), []) + i_j_ctrl = fork(start, #0, #2) i_idx = thread_id(i_j_ctrl, 0) j_idx = thread_id(i_j_ctrl, 1) - k_ctrl = fork(i_j_ctrl, 64) + k_ctrl = fork(i_j_ctrl, #1) k_idx = thread_id(k_ctrl, 0) k_join_ctrl = join(k_ctrl) i_j_join_ctrl = join(k_join_ctrl) @@ -15,4 +15,4 @@ fn matmul(a: array(i32, 16, 64), b: array(i32, 64, 32)) -> array(i32, 16, 32) add = add(mul, dot) dot = reduce(k_join_ctrl, zero, add) update_c = write(update_i_j_c, dot, position(i_idx, j_idx)) - update_i_j_c = reduce(i_j_join_ctrl, c, update_c) \ No newline at end of file + update_i_j_c = reduce(i_j_join_ctrl, c, update_c) diff --git a/juno_build/build.rs b/juno_build/build.rs new file mode 100644 index 00000000..7ba34c8c --- /dev/null +++ b/juno_build/build.rs @@ -0,0 +1,8 @@ +fn main() { + #[cfg(feature = "cuda")] + println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); + #[cfg(feature = "cuda")] + println!("cargo:rustc-link-lib=cudart"); +} diff --git a/juno_build/src/lib.rs b/juno_build/src/lib.rs index 0c676e4c..40660806 100644 --- a/juno_build/src/lib.rs +++ b/juno_build/src/lib.rs @@ -27,7 +27,7 @@ impl JunoCompiler { src_path: None, out_path: None, verify: JunoVerify::None, - x_dot: false, + x_dot: true, schedule: JunoSchedule::None, } } diff --git a/juno_samples/antideps/Cargo.toml b/juno_samples/antideps/Cargo.toml index e492e2ae..e6f38e09 100644 --- a/juno_samples/antideps/Cargo.toml +++ b/juno_samples/antideps/Cargo.toml @@ -8,11 +8,14 @@ edition = "2021" name = "juno_antideps" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } [dependencies] -juno_build = { path = "../../juno_build", features = [] } -hercules_rt = { path = "../../hercules_rt", features = [] } +juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/antideps/antideps.mod b/juno_samples/antideps/antideps.mod new file mode 100644 
index 0000000000000000000000000000000000000000..b4abaef48222ace6f66264d6746dd2d7924b9039 GIT binary patch literal 1583 zcmb7E{c;;M5Z_(h>2#-)e3s;|IEihN*s;?z{z#gH4$wF8K$|~0Q%Hs~3_KnT!vN)f zUV*oO)nP*8!c4gt?d`7iOFyl)XuN*;`Q59V^`~Ea_438d-D?6szzhl~Fi#{RXsD7D z1Yw%N<}<HuZ}0A1kd08XM=sK6kCl)>n8=NF^Z9K|@y)%%qV|Y6a#3f{x~xP70+WAo zyyLap&eN`hdOJ_+>zgmXU2i8tt^!q^D~r{2bx-#zkwe1Sr26|v(A)xIJr@8ed)pcd zXOIvrVeXA$mRwpZ_^6E}FF*^OaQIwj1=;(ce=&^*aIF`FlH=}jT;FeUlwi?k1R^9D z60nwqUKSJv1+G=^YX`nn)@<9O>(pGc0b1!o=bB`X$S^@GyUZcU0x8Y&ZdKs!Ry7Yr zzm3;oy%cCH7kJGDx+_k#Z}$lNrC@eaD<&3)T(Aroh74?<g^MC+3>y6I2W&R7vxAJz z8==Bbfjfzwb~XhQ2%Kf9WqljGJ!1CIpN!+anGvuJ`uT#zrA{<ys~ft~O$OI#uY)$a zr3-mZ*k_WQ$_uhxCMP|)qV$Zp&X_l@9x)jlGnwol<qU^n?3ylLJua}I>xT&ZAL$*m zmg^N^gTb1{+dSrUIPlcx-=2$j4Q$c}i@R^3h5H$)4O+It!jTOP3=MF@=w{sr{Lt6F zp4kvPV9$>B(M4njHtoi1?`X+r#H&9d(Pn}jIi>?44_QPq+*f6VbeQMkUV+p1KV;`$ zU(xzJfvV{_%ricy#p9HKg1{ia8WyfLz2ML0T;$ww`yu*;j)I-fz%8S~+M}GVH(lXB znX&HF#iO2jKjmMj+kNI%%%9kH2khWcMhCsHgJB2!`Goade^LMJlPA3WIHRY-Fv2hb zJ7Hjk+$bY%nFGD6kGX-2p2{ows9=JI$;YL<jN)bFp3-WMNo$|Ucn}{9Qo@h>(jmn* zZuavJ4`as^;@SHH_#c+tJSm~%!BH}DmPZ9IB$3KYF&8z0S7HHEUXoYFOCgkqCBmsm z4Nogk2&;Ki@Vp_JibMoK#u%$m{D`7dh4=xL7T=?&#ExkQX)4n+&Ulq5qe>1a)N}Qj tnyELa{`l^<GxaN~-!qk|7djzIbsMGT@se{2gak?;c!Tf@!p{gl;h$>7iFE(~ literal 0 HcmV?d00001 diff --git a/juno_samples/casts_and_intrinsics/Cargo.toml b/juno_samples/casts_and_intrinsics/Cargo.toml index 83d5be58..9fac18b7 100644 --- a/juno_samples/casts_and_intrinsics/Cargo.toml +++ b/juno_samples/casts_and_intrinsics/Cargo.toml @@ -8,10 +8,13 @@ edition = "2021" name = "juno_casts_and_intrinsics" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda"] + [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } [dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml index dfde5978..ff375d80 100644 --- a/juno_samples/cava/Cargo.toml +++ b/juno_samples/cava/Cargo.toml @@ -9,11 +9,11 @@ name = "juno_cava" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } [dependencies] -juno_build = { path = "../../juno_build", features = [] } -hercules_rt = { path = "../../hercules_rt", features = [] } +juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } async-std = "*" clap = { version = "*", features = ["derive"] } image = "*" diff --git a/juno_samples/concat/Cargo.toml b/juno_samples/concat/Cargo.toml index 888a083f..24ba1acf 100644 --- a/juno_samples/concat/Cargo.toml +++ b/juno_samples/concat/Cargo.toml @@ -9,10 +9,10 @@ name = "juno_concat" path = "src/main.rs" [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } [dependencies] -juno_build = { path = "../../juno_build", features = [] } -hercules_rt = { path = "../../hercules_rt", features = [] } +juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/implicit_clone/Cargo.toml b/juno_samples/implicit_clone/Cargo.toml index 4f5387e7..96129371 100644 --- a/juno_samples/implicit_clone/Cargo.toml +++ b/juno_samples/implicit_clone/Cargo.toml @@ -8,11 +8,14 @@ edition = "2021" name = "juno_implicit_clone" path = "src/main.rs" +[features] +cuda = 
["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } [dependencies] -juno_build = { path = "../../juno_build", features = [] } -hercules_rt = { path = "../../hercules_rt", features = [] } +juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/matmul/Cargo.toml b/juno_samples/matmul/Cargo.toml index 1e004dcd..eac83d15 100644 --- a/juno_samples/matmul/Cargo.toml +++ b/juno_samples/matmul/Cargo.toml @@ -8,12 +8,15 @@ edition = "2021" name = "juno_matmul" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } [dependencies] -juno_build = { path = "../../juno_build", features = [] } -hercules_rt = { path = "../../hercules_rt", features = [] } +juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" rand = "*" diff --git a/juno_samples/nested_ccp/Cargo.toml b/juno_samples/nested_ccp/Cargo.toml index bcf1fff8..5ee3f747 100644 --- a/juno_samples/nested_ccp/Cargo.toml +++ b/juno_samples/nested_ccp/Cargo.toml @@ -8,11 +8,14 @@ edition = "2021" name = "juno_nested_ccp" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } [dependencies] -juno_build = { path = "../../juno_build", features = [] } -hercules_rt = { path = "../../hercules_rt", features = [] } +juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/simple3/Cargo.toml b/juno_samples/simple3/Cargo.toml index c66dc977..36d50dbd 100644 --- a/juno_samples/simple3/Cargo.toml +++ b/juno_samples/simple3/Cargo.toml @@ -8,11 +8,14 @@ edition = "2021" name = "juno_simple3" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] -juno_build = { path = "../../juno_build", features = [] } +juno_build = { path = "../../juno_build" } [dependencies] -juno_build = { path = "../../juno_build", features = [] } -hercules_rt = { path = "../../hercules_rt", features = [] } +juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" -- GitLab From 23f920e58d5888cd46c00e794c9292f6f5aae530 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 00:19:21 +0000 Subject: [PATCH 073/109] phi works --- .gitignore | 3 +- hercules_cg/src/gpu.rs | 79 +++++++++++++++--------------------------- hercules_ir/src/dot.rs | 3 +- juno_build/src/lib.rs | 2 +- 4 files changed, 31 insertions(+), 56 deletions(-) diff --git a/.gitignore b/.gitignore index 29eb3e04..749cea40 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ *.o *.a *.hrt -.*.swp +*.png +*.swp .vscode *_env diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 31c50212..a0281795 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -183,21 +183,19 @@ pub fn gpu_codegen<W: Write>( threads_per_warp: 32, }; - let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> { - let mut label_data_for_phi = HashMap::new(); - for (idx, node) in 
function.nodes.iter().enumerate() { - if let Node::Phi { control: _, data } = node { - for &data_id in data.iter() { - label_data_for_phi - .entry(data_id) - .or_insert(vec![]) - .push(NodeID::new(idx)); - } + // Map from control to pairs of data to update phi + // For each phi, we go to its region and get region's controls + let control_data_phi_map: &mut HashMap<NodeID, Vec<(NodeID, NodeID)>> = &mut HashMap::new(); + for (idx, node) in function.nodes.iter().enumerate() { + if let Node::Phi { control, data } = node { + let Node::Region { preds } = &function.nodes[control.idx()] else { + panic!("Phi's control must be a region node"); + }; + for (i, &pred) in preds.iter().enumerate() { + control_data_phi_map.entry(pred).or_insert(vec![]).push((data[i], NodeID::new(idx))); } } - label_data_for_phi - }; - let label_data_for_phi = &label_data_for_phi(); + } let def_use_map = &def_use(function); @@ -215,7 +213,7 @@ pub fn gpu_codegen<W: Write>( join_fork_map, fork_reduce_map, reduct_reduce_map, - label_data_for_phi, + control_data_phi_map, return_type_id, }; ctx.codegen_function(w) @@ -241,7 +239,7 @@ struct GPUContext<'a> { join_fork_map: &'a HashMap<NodeID, NodeID>, fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, - label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>, + control_data_phi_map: &'a HashMap<NodeID, Vec<(NodeID, NodeID)>>, return_type_id: &'a TypeID, } @@ -490,6 +488,9 @@ namespace cg = cooperative_groups; !self.function.nodes[id.idx()].is_parameter() { write!(w, "\t{};\n", self.get_value(id, true, false))?; } + if self.function.nodes[id.idx()].is_phi() { + write!(w, "\t{}_tmp;\n", self.get_value(id, true, false))?; + } } Ok(()) } @@ -900,9 +901,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, KernelState::OutBlock, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } - for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_phi(*data, tabs, body)?; - } Ok(()) }) } @@ -935,9 +933,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } - for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_phi(*data, tabs, body)?; - } } // Then generate data and control for the single block fork if it exists if block_fork.is_some() { @@ -952,9 +947,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } - for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_phi(*data, tabs, body)?; - } } } // Then generate for the thread fork tree through Fork node traversal. 
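The phi handling above works by staging: each phi gets a shadow `_tmp` declaration alongside its real variable, every predecessor control block stores the incoming value into the shadow in its terminator (driven by control_data_phi_map, which is keyed by the region's predecessors), and the phi itself simply copies the shadow back at the top of its block, so mutually dependent phis in one region never read a value that was already overwritten in the same step. A minimal sketch of the CUDA shape this yields for a counting loop; the names, bound, and labels are illustrative, not generator output:

// Sketch only: staged-phi pattern, not verbatim generated code.
__device__ unsigned long long staged_phi_loop_sketch() {
    unsigned long long phi_i;     // the phi's live value
    unsigned long long phi_i_tmp; // shadow written by predecessor terminators
    // predecessor "start": stage the initial value, then enter the region
    phi_i_tmp = 0;
    goto bb_region;
bb_region:
    // the phi reads only its shadow here
    phi_i = phi_i_tmp;
    // ... loop body uses phi_i ...
    // predecessor "latch": stage the next-iteration value in its terminator
    phi_i_tmp = phi_i + 1;
    if (phi_i_tmp < 10) goto bb_region;
    return phi_i;
}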
@@ -1034,9 +1026,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str &mut tabs, )?; } - for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_phi(*data, tabs, body)?; - } } for child in fork_tree.get(&curr_fork).unwrap() { self.codegen_data_control_traverse( @@ -1071,12 +1060,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str let define_variable = self.get_value(id, false, false).to_string(); let tabs = "\t".repeat(*num_tabs); match &self.function.nodes[id.idx()] { - // Phi registers emitted at top and the data nodes it uses will - // update the phi - Node::Phi { - control: _, - data: _, - } => {} + Node::Phi { control: _, data: _ } => { + write!(w, "{}{} = {}_tmp;\n", tabs, define_variable, define_variable)?; + } Node::ThreadID { control, dimension } => { let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else { panic!("Expected ThreadID's control to be a fork node"); @@ -1413,8 +1399,8 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str panic!("Unsupported data node type") } } - // Since reducts are responsible for updating Reduce nodes, - // we check and emit those for each data node. + // Since reducts are responsible for updating Reduce nodes, we check and + // emit those for each data node. if let Some(reduces) = self.reduct_reduce_map.get(&id) { let val = self.get_value(id, false, false); for reduce in reduces { @@ -1425,22 +1411,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str Ok(()) } - /* - * Update Phi assignments for each data node. This is run after all data nodes - * for given control block have been emitted. - */ - fn codegen_data_phi(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> { - let tabs = "\t".repeat(num_tabs); - if let Some(phis) = self.label_data_for_phi.get(&id) { - let val = self.get_value(id, false, false); - for phi in phis { - let phi_val = self.get_value(*phi, false, false); - write!(w, "{}{} = {};\n", tabs, phi_val, val)?; - } - } - Ok(()) - } - fn codegen_control_node( &self, id: NodeID, @@ -1451,6 +1421,11 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str w_post_init: &mut String, w_term: &mut String, ) -> Result<usize, Error> { + for (data, phi) in self.control_data_phi_map.get(&id).unwrap_or(&vec![]).iter() { + let data = self.get_value(*data, false, false); + let phi = self.get_value(*phi, false, false); + write!(w_term, "\t{}_tmp = {};\n", phi, data)?; + } let tabs = match &self.function.nodes[id.idx()] { Node::Start | Node::Region { preds: _ } @@ -1572,7 +1547,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str // we write to that parameter upon return. 
if self.types[self.typing[data.idx()].idx()].is_primitive() { let return_val = self.get_value(*data, false, false); - write!(w_term, "\tif (threadIdx.x == 0) {{\n")?; + write!(w_term, "\tif (grid.thread_rank() == 0) {{\n")?; write!(w_term, "\t\t*ret = {};\n", return_val)?; write!(w_term, "\t}}\n")?; } diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs index fe6fee09..4d526366 100644 --- a/hercules_ir/src/dot.rs +++ b/hercules_ir/src/dot.rs @@ -1,5 +1,5 @@ use std::collections::HashMap; -use std::env::{temp_dir}; +use std::env::temp_dir; use std::fmt::Write; use std::fs::File; use std::io::Write as _; @@ -23,7 +23,6 @@ pub fn xdot_module( let mut rng = rand::thread_rng(); let num: u64 = rng.gen(); tmp_path.push(format!("hercules_dot_{}.dot", num)); - let tmp_path = std::path::PathBuf::from(format!("hercules_dot.dot")); let mut file = File::create(&tmp_path).expect("PANIC: Unable to open output file."); let mut contents = String::new(); write_dot( diff --git a/juno_build/src/lib.rs b/juno_build/src/lib.rs index 40660806..0c676e4c 100644 --- a/juno_build/src/lib.rs +++ b/juno_build/src/lib.rs @@ -27,7 +27,7 @@ impl JunoCompiler { src_path: None, out_path: None, verify: JunoVerify::None, - x_dot: true, + x_dot: false, schedule: JunoSchedule::None, } } -- GitLab From d137cb23704c9e921c770ac8149fc825c4990404 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 06:44:54 +0000 Subject: [PATCH 074/109] itm --- hercules_cg/src/gpu.rs | 104 +++++++++++++++++++-------------- hercules_opt/src/pass.rs | 7 +-- juno_samples/cava/Cargo.toml | 3 + juno_samples/concat/Cargo.toml | 3 + 4 files changed, 67 insertions(+), 50 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a0281795..efd7ba4b 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -8,6 +8,9 @@ use self::hercules_ir::*; use crate::*; +use std::fs::OpenOptions; +use std::io::Write as IoWrite; + /* * The top level function to compile a Hercules IR function into CUDA * kernel for execution on the GPU. 
We generate CUDA C textually, with a lot @@ -157,24 +160,25 @@ pub fn gpu_codegen<W: Write>( let return_type_id = &typing[data_node_id.idx()]; let return_type = &types[return_type_id.idx()]; - if return_type.is_array() || return_type.is_product() || return_type.is_summation() { + let return_param_idx = if !return_type.is_primitive() { let objects = &collection_objects.objects(data_node_id); - if objects.len() > 1 { - let origin = collection_objects.origin(objects[0]); - if !objects - .iter() - .all(|obj| collection_objects.origin(*obj) == origin) - { - panic!( - "Returned data node {} has multiple collection objects with different origins", - data_node_id.idx() - ); - } - if !matches!(origin, CollectionObjectOrigin::Parameter(..)) { - panic!("Returns collection object that did not originate from a parameter"); - } + let origin = collection_objects.origin(objects[0]); + if !objects + .iter() + .all(|obj| collection_objects.origin(*obj) == origin) + { + panic!( + "Returned data node {} has multiple collection objects with different origins", + data_node_id.idx() + ); } - } + let CollectionObjectOrigin::Parameter(param_idx) = origin else { + panic!("Returns collection object that did not originate from a parameter"); + }; + Some(param_idx) + } else { + None + }; // Temporary hardcoded values let kernel_params = &GPUKernelParams { @@ -192,7 +196,7 @@ pub fn gpu_codegen<W: Write>( panic!("Phi's control must be a region node"); }; for (i, &pred) in preds.iter().enumerate() { - control_data_phi_map.entry(pred).or_insert(vec![]).push((data[i], NodeID::new(idx))); + control_data_phi_map.entry(pred).or_default().push((data[i], NodeID::new(idx))); } } } @@ -215,6 +219,7 @@ pub fn gpu_codegen<W: Write>( reduct_reduce_map, control_data_phi_map, return_type_id, + return_param_idx, }; ctx.codegen_function(w) } @@ -241,6 +246,7 @@ struct GPUContext<'a> { reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, control_data_phi_map: &'a HashMap<NodeID, Vec<(NodeID, NodeID)>>, return_type_id: &'a TypeID, + return_param_idx: Option<usize>, } /* @@ -372,7 +378,6 @@ impl GPUContext<'_> { #include <math_constants.h> #include <mma.h> #include <cooperative_groups.h> -#include <cooperative_groups/memcpy_async.h> #include <cooperative_groups/reduce.h> namespace cg = cooperative_groups; @@ -608,7 +613,7 @@ int main() {{ let ret_primitive = self.types[self.return_type_id.idx()].is_primitive(); let ret_type = self.get_type(*self.return_type_id, false); write!(w, " -extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_string() }, self.function.name)?; +extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // The first set of parameters are dynamic constants. let mut first_param = true; for idx in 0..self.function.num_dynamic_constants { @@ -650,6 +655,8 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str write!(w, "\t{} host_ret;\n", ret_type)?; write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; write!(w, "\treturn host_ret;\n")?; + } else { + write!(w, "\treturn p{};\n", self.return_param_idx.unwrap())?; } } @@ -1286,7 +1293,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str TernaryOperator::Select => { write!( w, - "{}{} = {} ? {} : {};", + "{}{} = {} ? 
{} : {};\n", tabs, define_variable, self.get_value(*first, false, false), @@ -1315,26 +1322,23 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, non_reduce_arg, cg_tile, cg_op, id_type_name)?; } else { let ty = &self.types[id_type.idx()]; - let func_name = self.codegen_intrinsic(intrinsic, ty); + let intrinsic = self.codegen_intrinsic(intrinsic, ty); + let args = args.iter() + .map(|arg| self.get_value(*arg, false, false)) + .collect::<Vec<_>>() + .join(", "); write!( w, "{}{} = {}({});\n", tabs, define_variable, - func_name, - self.get_value(args[0], false, false), + intrinsic, + args, )?; } } - // For read, all the cases are: - // 1. Reading collection from/to global to/from shared - // 2. Reading primitive from/to global to/from shared - // 3. Reading primitive from/to global to/from register - // 4. Reading primitive from/to shared to/from register - // The first three can all use cooperative groups memcpy and the last - // one can't. However, the C++/CUDA semantics for the last three are - // identical, so we differentiate the cases by data type instead of - // data src/dest, with only collection type using collective group. + // If we read collection, distribute elements among threads with cg + // sync after. If we read primitive, copy read on all threads. Node::Read { collect, indices } => { let is_char = self.is_char(self.typing[collect.idx()]); let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); @@ -1347,20 +1351,27 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str write!(w, "{}{} = *({});\n", tabs, define_variable, collect_with_indices)?; } } else { - let nested_fork = nesting_fork.unwrap(); + // Divide up "elements", which are collection size divided + // by element size, among threads. let cg_tile = match state { KernelState::OutBlock => "grid".to_string(), KernelState::InBlock => "block".to_string(), - KernelState::InThread => self.get_cg_tile(nested_fork, CGType::UsePerId), + KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), }; let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); - write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, define_variable, collect_with_indices, data_size)?; - write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; + let data_type = self.get_type(data_type_id, false); + let num_elements = format!("(({}) / sizeof({}))", data_size, data_type.strip_suffix('*').unwrap()); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {} / {}.size()) {{\n", tabs, cg_tile, num_elements, num_elements, cg_tile)?; + write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, define_variable, collect_with_indices)?; + write!(w, "{}}}\n", tabs)?; + write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; + write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, define_variable, cg_tile, num_elements, cg_tile, cg_tile, collect_with_indices, cg_tile, num_elements, cg_tile, cg_tile)?; + write!(w, "{}}}\n", tabs)?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; } } - // For write, the cases are the same, but when using C++ dereference - // semantics, we need to gate the write with a thread rank check for - // thread safety. 
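The read path above drops `cg::memcpy_async`/`cg::wait` in favor of an explicit element-wise copy that is divided among the threads of the cooperative-group tile and closed with a `sync`. For reference, a self-contained sketch of that tile-strided copy pattern; the kernel name, element type, and the choice of the whole thread block as the tile are assumptions for the example, not the generator's exact output:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Sketch only: tile-strided element copy with a trailing barrier.
__global__ void tile_copy_sketch(const int *src, int *dst, int num_elements) {
    cg::thread_block block = cg::this_thread_block();
    // each thread copies a strided subset of the elements
    for (int i = block.thread_rank(); i < num_elements; i += block.size()) {
        dst[i] = src[i];
    }
    // make every copy visible before any thread in the tile reads dst
    block.sync();
}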
+ // Write is same as read, but when writing a primitive, we need to gate with + // a thread rank check. Node::Write { collect, data, @@ -1373,9 +1384,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str let cg_tile = match state { KernelState::OutBlock => "grid".to_string(), KernelState::InBlock => "block".to_string(), - KernelState::InThread => { - self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId) - } + KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), }; if self.types[data_type_id.idx()].is_primitive() { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; @@ -1388,8 +1397,15 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str write!(w, "{}}}\n", tabs)?; } else { let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); - write!(w, "{}cg::memcpy_async({}, {}, {}, {});\n", tabs, cg_tile, collect_with_indices, data_variable, data_size)?; - write!(w, "{}cg::wait({});\n", tabs, cg_tile)?; + let data_type = self.get_type(data_type_id, false); + let num_elements = format!("(({}) / sizeof({}))", data_size, data_type.strip_suffix('*').unwrap()); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {} / {}.size()) {{\n", tabs, cg_tile, num_elements, num_elements, cg_tile)?; + write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, collect_with_indices, data_variable)?; + write!(w, "{}}}\n", tabs)?; + write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; + write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, num_elements, cg_tile, cg_tile, data_variable, cg_tile, num_elements, cg_tile, cg_tile)?; + write!(w, "{}}}\n", tabs)?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; } let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index dbc24016..bb70bf08 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -1079,12 +1079,6 @@ impl PassManager { file.write_all(cuda_ir.as_bytes()) .expect("PANIC: Unable to write output CUDA IR file contents."); - let cuda_text_path = format!("{}.cu", module_name); - let mut cuda_text_file = File::create(&cuda_text_path) - .expect("PANIC: Unable to open CUDA IR text file."); - cuda_text_file.write_all(cuda_ir.as_bytes()) - .expect("PANIC: Unable to write CUDA IR text file contents."); - let mut nvcc_process = Command::new("nvcc") .arg("-c") .arg("-O3") @@ -1111,6 +1105,7 @@ impl PassManager { .expect("PANIC: Unable to open output Rust runtime file."); file.write_all(rust_rt.as_bytes()) .expect("PANIC: Unable to write output Rust runtime file contents."); + } Pass::Serialize(output_file) => { let module_contents: Vec<u8> = postcard::to_allocvec(&self.module).unwrap(); diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml index ff375d80..63b6b2ac 100644 --- a/juno_samples/cava/Cargo.toml +++ b/juno_samples/cava/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_cava" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/concat/Cargo.toml b/juno_samples/concat/Cargo.toml index 24ba1acf..f2f90237 100644 --- a/juno_samples/concat/Cargo.toml +++ b/juno_samples/concat/Cargo.toml @@ 
-8,6 +8,9 @@ edition = "2021" name = "juno_concat" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } -- GitLab From 107a376f74ee7e6b2946b167445c508a344054e1 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 06:45:36 +0000 Subject: [PATCH 075/109] sitm --- hercules_cg/src/gpu.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index efd7ba4b..b14a136f 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -8,9 +8,6 @@ use self::hercules_ir::*; use crate::*; -use std::fs::OpenOptions; -use std::io::Write as IoWrite; - /* * The top level function to compile a Hercules IR function into CUDA * kernel for execution on the GPU. We generate CUDA C textually, with a lot -- GitLab From f2ce75095348fa2ab5491676ca36c03063c05cb6 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 16:40:21 +0000 Subject: [PATCH 076/109] yay --- hercules_cg/src/gpu.rs | 242 +++++++++++++--------------------- hercules_rt/src/rtdefs.cu | 11 +- juno_samples/cava/src/main.rs | 12 +- 3 files changed, 106 insertions(+), 159 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index b14a136f..bb28db8a 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -292,13 +292,9 @@ enum CGType { impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { - // If run_debug, wrapping C host code is self-contained with malloc, etc, - // else it only does kernel launch. - let run_debug = false; - // Emit all code up to the "goto" to Start's block let mut top = String::new(); - self.codegen_kernel_begin(run_debug, &mut top)?; + self.codegen_kernel_begin(&mut top)?; let mut dynamic_shared_offset = "0".to_string(); self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; @@ -344,6 +340,7 @@ impl GPUContext<'_> { &fork_thread_quota_map, &extra_dim_collects, &mut dynamic_shared_offset, + num_blocks, num_threads, &mut gotos, )?; @@ -358,14 +355,14 @@ impl GPUContext<'_> { // Emit host launch code let mut host_launch = String::new(); - self.codegen_launch_code(run_debug, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; + self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; write!(w, "{}", host_launch)?; Ok(()) } // Emit kernel headers, signature, arguments, and dynamic shared memory declaration - fn codegen_kernel_begin(&self, run_debug: bool, w: &mut String) -> Result<(), Error> { + fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { write!(w, " #include <assert.h> #include <stdio.h> @@ -390,8 +387,8 @@ namespace cg = cooperative_groups; write!( w, - "__global__ void __launch_bounds__({}) {}{}(", - self.kernel_params.max_num_threads, self.function.name, if run_debug { "" } else { "_gpu" } + "__global__ void __launch_bounds__({}) {}_gpu(", + self.kernel_params.max_num_threads, self.function.name )?; // The first set of parameters are dynamic constants. 
let mut first_param = true; @@ -534,129 +531,59 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_launch_code(&self, run_debug: bool, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { + fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let mut pass_args = String::new(); - if run_debug { - write!(w, " -int main() {{ -")?; - // The first set of parameters are dynamic constants. - let mut first_param = true; - for idx in 0..self.function.num_dynamic_constants { - if first_param { - first_param = false; - } else { - write!(pass_args, ", ")?; - } - write!(w, "\tunsigned long long dc_p{} = 1ull;\n", idx)?; - write!(pass_args, "dc_p{}", idx)?; - } - self.codegen_dynamic_constants(w)?; - // The second set of parameters are normal arguments. - for (idx, ty) in self.function.param_types.iter().enumerate() { - if first_param { - first_param = false; - } else { - write!(pass_args, ", ")?; - } - let param_type = self.get_type(*ty, false); - if self.types[ty.idx()].is_primitive() { - write!(w, "\t{} p{} = 1;\n", param_type, idx)?; - } else { - let param_size = self.get_size(*ty, None, None); - write!(w, "\t{} p{};\n", param_type, idx)?; - write!(w, "\tif (cudaMalloc((void**)&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?; - write!(w, "\t\tprintf(\"Error allocating memory for parameter %d\\n\", {});\n", idx)?; - write!(w, "\t\treturn -1;\n")?; - write!(w, "\t}}\n")?; - } - write!(pass_args, "p{}", idx)?; - } - // Pull primitive return to a pointer parameter - if self.types[self.return_type_id.idx()].is_primitive() { - let ret_type_no_pnt = self.get_type(*self.return_type_id, false); - let ret_type = self.get_type(*self.return_type_id, true); - write!(w, "\t{} ret;\n", ret_type)?; - write!(w, "\tif (cudaMalloc((void**)&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?; - write!(w, "\t\tprintf(\"Error allocating memory for return value\\n\");\n")?; - write!(w, "\t\treturn -1;\n")?; - write!(w, "\t}}\n")?; - write!(pass_args, ", ret")?; - } - write!(w, "\t{}<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; - write!(w, "\tbool skip = false;\n")?; - write!(w, "\tcudaError_t err = cudaGetLastError();\n")?; - write!(w, "\tif (err != cudaSuccess) {{\n")?; - write!(w, "\t\tprintf(\"Error launching kernel: %s\\n\", cudaGetErrorString(err));\n")?; - write!(w, "\t\tskip = true;\n")?; - write!(w, "\t}}\n")?; - write!(w, "\tif (cudaDeviceSynchronize() != cudaSuccess && !skip) {{\n")?; - write!(w, "\t\tprintf(\"Error synchronizing device\\n\");\n")?; - write!(w, "\t\tskip = true;\n")?; - write!(w, "\t}}\n")?; - for (idx, ty) in self.function.param_types.iter().enumerate() { - if !self.types[ty.idx()].is_primitive() { - write!(w, "\tcudaFree(p{});\n", idx)?; - } - } - if self.types[self.return_type_id.idx()].is_primitive() { - write!(w, "\tcudaFree(ret);\n")?; - } - } - - else { - let ret_primitive = self.types[self.return_type_id.idx()].is_primitive(); - let ret_type = self.get_type(*self.return_type_id, false); - write!(w, " + let ret_primitive = self.types[self.return_type_id.idx()].is_primitive(); + let ret_type = self.get_type(*self.return_type_id, false); + write!(w, " extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; - // The first 
set of parameters are dynamic constants. - let mut first_param = true; - for idx in 0..self.function.num_dynamic_constants { - if first_param { - first_param = false; - } else { - write!(w, ", ")?; - write!(pass_args, ", ")?; - } - write!(w, "unsigned long long dc_p{}", idx)?; - write!(pass_args, "dc_p{}", idx)?; - } - // The second set of parameters are normal arguments. - for (idx, ty) in self.function.param_types.iter().enumerate() { - if first_param { - first_param = false; - } else { - write!(w, ", ")?; - write!(pass_args, ", ")?; - } - let param_type = self.get_type(*ty, false); - write!(w, "{} p{}", param_type, idx)?; - write!(pass_args, "p{}", idx)?; - } - write!(w, ") {{\n")?; - // Pull primitive return as pointer parameter for kernel - if ret_primitive { - let ret_type_pnt = self.get_type(*self.return_type_id, true); - write!(w, "\t{} ret;\n", ret_type_pnt)?; - write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; - if !first_param { - write!(pass_args, ", ")?; - } - write!(pass_args, "ret")?; + // The first set of parameters are dynamic constants. + let mut first_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; } - write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; - write!(w, "\tcudaDeviceSynchronize();\n")?; - if ret_primitive { - write!(w, "\t{} host_ret;\n", ret_type)?; - write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; - write!(w, "\treturn host_ret;\n")?; + write!(w, "unsigned long long dc_p{}", idx)?; + write!(pass_args, "dc_p{}", idx)?; + } + // The second set of parameters are normal arguments. 
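With the debug main() path removed, the emitted host wrapper reduces to: dynamic constants and normal parameters forwarded as-is, a device slot allocated for a primitive return, the `<name>_gpu` kernel launched with the computed grid, block, and dynamic shared-memory size, then a synchronize and a copy back (or the originating collection parameter returned directly). A rough, self-contained sketch of that shape for a hypothetical one-parameter function; the name, launch geometry, trivial kernel body, and the cudaFree are illustrative assumptions, not generator output:

#include <cuda_runtime.h>

// Stand-in for the generated kernel body.
__global__ void example_gpu(unsigned long long dc_p0, float *p0, float *ret) {
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        *ret = p0[0] + (float)dc_p0;
    }
}

// Sketch only: host-side launcher shape for a primitive return value.
extern "C" float example(unsigned long long dc_p0, float *p0) {
    float *ret;
    cudaMalloc((void **)&ret, sizeof(float));      // device slot for the return
    example_gpu<<<1, 1024, 0>>>(dc_p0, p0, ret);   // blocks, threads, dynamic shared bytes
    cudaDeviceSynchronize();                       // wait for the kernel
    float host_ret;
    cudaMemcpy(&host_ret, ret, sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(ret);                                 // freed here for hygiene in the sketch
    return host_ret;
}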
+ for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; } else { - write!(w, "\treturn p{};\n", self.return_param_idx.unwrap())?; + write!(w, ", ")?; + write!(pass_args, ", ")?; } + let param_type = self.get_type(*ty, false); + write!(w, "{} p{}", param_type, idx)?; + write!(pass_args, "p{}", idx)?; + } + write!(w, ") {{\n")?; + // Pull primitive return as pointer parameter for kernel + if ret_primitive { + let ret_type_pnt = self.get_type(*self.return_type_id, true); + write!(w, "\t{} ret;\n", ret_type_pnt)?; + write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; + if !first_param { + write!(pass_args, ", ")?; + } + write!(pass_args, "ret")?; + } + write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; + write!(w, "\tcudaDeviceSynchronize();\n")?; + write!(w, "\tfflush(stdout);\n")?; + if ret_primitive { + write!(w, "\t{} host_ret;\n", ret_type)?; + write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; + write!(w, "\treturn host_ret;\n")?; + } else { + write!(w, "\treturn p{};\n", self.return_param_idx.unwrap())?; } - write!(w, "}}\n")?; Ok(()) } @@ -903,7 +830,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, KernelState::OutBlock, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + self.codegen_data_node(*data, KernelState::OutBlock, Some(1), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } Ok(()) }) @@ -921,6 +848,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, extra_dim_collects: &HashSet<TypeID>, dynamic_shared_offset: &mut String, + num_blocks: usize, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -935,7 +863,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_blocks), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists @@ -949,7 +877,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } } @@ -1020,6 +948,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; self.codegen_data_node( *data, state, + None, Some(use_thread_quota), parallel_factor, Some(curr_fork), @@ -1052,6 +981,7 @@ extern \"C\" {} 
{}(", ret_type.clone(), self.function.name)?; &self, id: NodeID, state: KernelState, + num_blocks: Option<usize>, use_thread_quota: Option<usize>, parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, @@ -1348,17 +1278,18 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(w, "{}{} = *({});\n", tabs, define_variable, collect_with_indices)?; } } else { - // Divide up "elements", which are collection size divided - // by element size, among threads. + if KernelState::OutBlock == state && num_blocks.unwrap() > 1 { + panic!("GPU can't guarantee correctness for multi-block collection reads"); + } let cg_tile = match state { - KernelState::OutBlock => "grid".to_string(), - KernelState::InBlock => "block".to_string(), + KernelState::OutBlock | KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), }; - let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); - let data_type = self.get_type(data_type_id, false); - let num_elements = format!("(({}) / sizeof({}))", data_size, data_type.strip_suffix('*').unwrap()); - write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {} / {}.size()) {{\n", tabs, cg_tile, num_elements, num_elements, cg_tile)?; + // Divide up "elements", which are collection size divided + // by element size, among threads. + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects), Some(true)); + let num_elements = format!("({})", data_size); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, define_variable, collect_with_indices)?; write!(w, "{}}}\n", tabs)?; write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; @@ -1378,9 +1309,11 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; + if KernelState::OutBlock == state && num_blocks.unwrap() > 1 { + panic!("GPU can't guarantee correctness for multi-block collection writes"); + } let cg_tile = match state { - KernelState::OutBlock => "grid".to_string(), - KernelState::InBlock => "block".to_string(), + KernelState::OutBlock | KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), }; if self.types[data_type_id.idx()].is_primitive() { @@ -1393,17 +1326,16 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } write!(w, "{}}}\n", tabs)?; } else { - let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); - let data_type = self.get_type(data_type_id, false); - let num_elements = format!("(({}) / sizeof({}))", data_size, data_type.strip_suffix('*').unwrap()); - write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {} / {}.size()) {{\n", tabs, cg_tile, num_elements, num_elements, cg_tile)?; + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects), Some(true)); + let num_elements = format!("({})", data_size); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, collect_with_indices, data_variable)?; write!(w, "{}}}\n", tabs)?; write!(w, "{}if ({}.thread_rank() < 
{} % {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, num_elements, cg_tile, cg_tile, data_variable, cg_tile, num_elements, cg_tile, cg_tile)?; write!(w, "{}}}\n", tabs)?; - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; } + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } @@ -1587,7 +1519,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; for index in indices { match index { Index::Field(field) => { - self.get_size(type_id, Some(*field), None); + self.get_size(type_id, Some(*field), None, None); } // Variants of summations have zero offset Index::Variant(_) => {} @@ -1619,7 +1551,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; ")".repeat(array_indices.len() - if has_extra_dim { 1 } else { 0 }) )); if is_char { - let element_size = self.get_size(*element_type, None, None); + let element_size = self.get_size(*element_type, None, None, None); index_ptr.push_str(&format!(" * {}", element_size)); } } @@ -1668,7 +1600,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; Constant::Product(type_id, constant_fields) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects); + let size = self.get_size(*type_id, None, extra_dim_collects, None); *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; @@ -1680,7 +1612,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call let field_type = self.get_type(type_fields[i], true); - let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects); + let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects, None); let field_constant = &self.constants[constant_fields[i].idx()]; if field_constant.is_scalar() { self.codegen_constant( @@ -1700,7 +1632,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; Constant::Summation(type_id, variant, field) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects); + let size = self.get_size(*type_id, None, extra_dim_collects, None); *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; @@ -1735,7 +1667,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; panic!("Nested array constant should not be re-allocated"); } let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects); + let size = self.get_size(*type_id, None, extra_dim_collects, None); let element_type = self.get_type(*element_type, true); *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; @@ 
-1752,11 +1684,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; * and offset to 2nd field. This is useful for constant initialization and read/write * index math. */ - fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>) -> String { + fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>, exclude_element_size: Option<bool>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { let array_size = multiply_dcs(if extra_dim_collects.is_some() && extra_dim_collects.unwrap().contains(&type_id) { &extents[1..] } else { extents }); - format!("{} * {}", self.get_alignment(*element_type), array_size) + if exclude_element_size.unwrap_or(false) { + array_size + } else { + format!("{} * {}", self.get_alignment(*element_type), array_size) + } } Type::Product(fields) => { let num_fields = &num_fields.unwrap_or(fields.len()); @@ -1764,7 +1700,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; .iter() .enumerate() .filter(|(i, _)| i < num_fields) - .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects), self.get_alignment(*id))) + .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects, None), self.get_alignment(*id))) .fold(String::from("0"), |acc, (size, align)| { if acc == "0" { size @@ -1779,7 +1715,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; format!( "{} - {}", with_field, - self.get_size(fields[*num_fields], None, extra_dim_collects) + self.get_size(fields[*num_fields], None, extra_dim_collects, None) ) } else { with_field @@ -1789,7 +1725,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // The argmax variant by size is not guaranteed to be same as // argmax variant by alignment, eg product of 3 4-byte primitives // vs 1 8-byte primitive, so we need to calculate both. 
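A concrete pair of variants makes the point in the comment above easier to see; the types here are invented purely for illustration, assuming a typical 64-bit ABI:

// The variant with the larger size (three 4-byte fields, 12 bytes) has the
// smaller alignment (4); the variant with the larger alignment (8) has the
// smaller size (8 bytes), so size and alignment must be maximised independently.
struct VariantA { int x; int y; int z; };   // size 12, align 4
struct VariantB { long long v; };           // size 8,  align 8
struct Summation_example {
    union { VariantA a; VariantB b; };      // payload: 12 bytes rounded up to alignment 8
    unsigned char tag;
};
static_assert(alignof(Summation_example) == alignof(long long), "alignment comes from VariantB");
static_assert(sizeof(Summation_example) > sizeof(VariantA), "size driven by VariantA plus tag and padding");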
- let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects)).fold( + let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects, None)).fold( String::from("0"), |acc, x| { if acc == "0" { diff --git a/hercules_rt/src/rtdefs.cu b/hercules_rt/src/rtdefs.cu index b7378d81..6c59abe2 100644 --- a/hercules_rt/src/rtdefs.cu +++ b/hercules_rt/src/rtdefs.cu @@ -7,7 +7,7 @@ extern "C" { } return ptr; } - + void *cuda_alloc_zeroed(size_t size) { void *ptr = cuda_alloc(size); if (!ptr) { @@ -15,23 +15,24 @@ extern "C" { } cudaError_t res = cudaMemset(ptr, 0, size); if (res != cudaSuccess) { + cuda_dealloc(ptr); return NULL; } return ptr; } - + void cuda_dealloc(void *ptr) { cudaFree(ptr); } - + void copy_cpu_to_cuda(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); } - + void copy_cuda_to_cpu(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost); } - + void copy_cuda_to_cuda(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice); } diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index 9c2f99a8..a36d8826 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -171,7 +171,17 @@ fn cava_harness(args: CavaInputs) { .expect("Error saving verification image"); } - assert_eq!(result, cpu_result.into(), "Verification failed, mismatch"); + let max_diff = result.iter() + .zip(cpu_result.iter()) + .map(|(a, b)| (*a as i16 - *b as i16).abs()) + .max() + .unwrap_or(0); + + assert!( + max_diff <= 3, + "Verification failed: maximum pixel difference of {} exceeds threshold of 3", + max_diff + ); } } -- GitLab From 8051fcf9fc89d19ca0bc325819d8f793b565a6a0 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 16:47:55 +0000 Subject: [PATCH 077/109] augment told me to --- hercules_rt/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs index 04c9ef93..51fdfa23 100644 --- a/hercules_rt/build.rs +++ b/hercules_rt/build.rs @@ -11,8 +11,8 @@ fn main() { .status() .expect("PANIC: NVCC failed when building runtime. 
Is NVCC installed?"); Command::new("ar") - .args(&["crus", "librtdefs.a", "rtdefs.o"]) .current_dir(&Path::new(&out_dir)) + .args(&["crus", "librtdefs.a", "rtdefs.o"]) .status() .unwrap(); -- GitLab From 57c1e6fd37278a69cb7765077141cf31aac1ece2 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 17:03:56 +0000 Subject: [PATCH 078/109] trying --- hercules_rt/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs index 51fdfa23..459903c1 100644 --- a/hercules_rt/build.rs +++ b/hercules_rt/build.rs @@ -19,8 +19,8 @@ fn main() { println!("cargo::rustc-link-search=native={}", out_dir); println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); println!("cargo::rustc-link-search=native=/usr/local/cuda/lib64"); - println!("cargo::rustc-link-lib=static=rtdefs"); println!("cargo::rustc-link-lib=cudart"); + println!("cargo::rustc-link-lib=static=rtdefs"); println!("cargo::rerun-if-changed=src/rtdefs.cu"); } } -- GitLab From b944cdefbc4c148ff38cddbe7adb9f517cd83711 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 17:07:20 +0000 Subject: [PATCH 079/109] trying --- .gitlab-ci.yml | 2 +- hercules_rt/build.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b80dd590..c2d17349 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,4 +2,4 @@ test-job: stage: test script: - cargo test - - cargo test --features=cuda + - RUSTFLAGS="-C link-arg=-Wl,--no-as-needed" cargo build --features=cuda -vv diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs index 459903c1..51fdfa23 100644 --- a/hercules_rt/build.rs +++ b/hercules_rt/build.rs @@ -19,8 +19,8 @@ fn main() { println!("cargo::rustc-link-search=native={}", out_dir); println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); println!("cargo::rustc-link-search=native=/usr/local/cuda/lib64"); - println!("cargo::rustc-link-lib=cudart"); println!("cargo::rustc-link-lib=static=rtdefs"); + println!("cargo::rustc-link-lib=cudart"); println!("cargo::rerun-if-changed=src/rtdefs.cu"); } } -- GitLab From ce7687419da6fea031bcc0f2c8629bee92b9fd88 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 17:10:56 +0000 Subject: [PATCH 080/109] dum --- hercules_rt/src/rtdefs.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hercules_rt/src/rtdefs.cu b/hercules_rt/src/rtdefs.cu index 6c59abe2..3fa50b48 100644 --- a/hercules_rt/src/rtdefs.cu +++ b/hercules_rt/src/rtdefs.cu @@ -8,6 +8,10 @@ extern "C" { return ptr; } + void cuda_dealloc(void *ptr) { + cudaFree(ptr); + } + void *cuda_alloc_zeroed(size_t size) { void *ptr = cuda_alloc(size); if (!ptr) { @@ -21,10 +25,6 @@ extern "C" { return ptr; } - void cuda_dealloc(void *ptr) { - cudaFree(ptr); - } - void copy_cpu_to_cuda(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); } -- GitLab From 59becc070bda5854f888f41073866e6064e94b22 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 17:13:02 +0000 Subject: [PATCH 081/109] mb --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index c2d17349..b80dd590 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,4 +2,4 @@ test-job: stage: test script: - cargo test - - RUSTFLAGS="-C link-arg=-Wl,--no-as-needed" cargo build --features=cuda -vv + - cargo test --features=cuda -- GitLab From 
502189f342e41df72c71301c6b450a16030f3d07 Mon Sep 17 00:00:00 2001
From: prathi3 <prathi3@illinois.edu>
Date: Sun, 19 Jan 2025 14:26:51 -0600
Subject: [PATCH 082/109] Delete matmul.hbin

---
 juno_samples/matmul/src/matmul.hbin | Bin 1323 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 juno_samples/matmul/src/matmul.hbin

diff --git a/juno_samples/matmul/src/matmul.hbin b/juno_samples/matmul/src/matmul.hbin
deleted file mode 100644
index 046898dddc96e417ac560697d7030bf9583f77d6..0000000000000000000000000000000000000000
GIT binary patch
[1323 bytes of base85-encoded binary data for the deleted matmul.hbin omitted]
--
GitLab

From f91df13b01621d182e1653a072f6846b1bbdfb96 Mon Sep 17 00:00:00 2001
From: prathi3 <prathi3@illinois.edu>
Date: Sun, 19 Jan 2025 14:26:59 -0600
Subject: [PATCH 083/109] Delete matmul.pdf

---
 juno_samples/matmul/src/matmul.pdf | Bin 88675 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 juno_samples/matmul/src/matmul.pdf

diff --git a/juno_samples/matmul/src/matmul.pdf b/juno_samples/matmul/src/matmul.pdf
deleted file mode 100644
index 529c95335bbad3971558e34beb66f9b091600b7f..0000000000000000000000000000000000000000
GIT binary patch
[88675 bytes of base85-encoded binary data for the deleted matmul.pdf omitted]
z#lSe|2hkdhTC|a{DLM*2d3$~g88o?V$r_~3<bO(SgYBfL`YI8^OH9$vy_}0PL+--w z^N{hNkRWIn4Z0OP`XFZ(ZB>Q7K!ed@x(&}m%(l*7D{1T6t}j{sC}>SR2$v?6zXILy z>$A||0nl-U0Epaa7#}TFZ0tP!yr$HiQ_s|~ei&G*@)OBUkoz_lba6Kepebb0QU*7* z6_!gM!6|e&XkH1Ge7vRRdw>V;jCJI&vEjE}`NP3_alrj0+$JI)P8);k+sd6Sugu5X zO!+=b-pF}`o-pDzv*Z?rx^b@Q!u)UQ=u6S~WaF$Fi#zJp^S%~A8!G@i`<#)&9#~|W zn>o8d!tS7#4R0R-S>wq5wvsZo7ga5HZNDs^<9L8fk`c#>uuV6JkUC{#O4|LAK=F?K zi(Nut;o9Lmz}E0BGPuBIu1z*GwM|mACb6nBXjSpXVNeHOo`*ju5rg98!G%w=uEzCk z{vxGM-mT=1K*UzLpV4=Ofha1jtf(K?_A9z6xQoIN5ZD+KoK}HRlli1cLp4WG8}Yqn z1g5VG+oU-!KQ;Po^>m0CzoZ3YBySO|ceLt?F=^widpp8?T4;xus5R;+YKIhth);1+ ztdy2JK*2*hf&=pJxx=~t&|&{l`8kj2z%Q?3ORjCa?5ZItYgbO6SG3tB&c42xCmTSw zhqHrT-M8yu;L5|Cf7#5Or}i5&IXtadcWnHqkkP$*yMzibhYAHn5P?CmX}oO7!vpr& zi6R{-V6p#kr@YSuzzU&t9RHLwxzwe#r4X8nUp~lvH~`whGo3_Affp;-R6ofeKebLh z7#F{NFEmpDw=GfI+7YR~^-Ic{6JL(IiKk#y<!Mtp8L>z28TU{-wD#Hhwcz`evu%oa z{%XI4+OPPgTm<nt?|02RtX|c(1fskIJg;Ad=O4$Te}2_f9cDObUaE_?nkFy9T@sjW zNk^j?4KWMb0_K^W0fnKJOu6ocokASPnswP61V#*nx?3THx&B6fZ2K4u%@nvvG%|U> ze72Uky%uj`m=u>W46V~P+2}6f*9mut3bpd5=1pD&m~W{tBmm3OHo#imS2gYdR484B z5L5X^hyZZewRDdfd?lzAti`nt77URr@wYYt(Z{$u23KhniV(^-b-d%f>DO6O7xk6d z?f$?{p}>XRX+&|danF7H6~cqd#B&ZLVaE7~A=d;+JM<+sW}oI{N|ud#I*+w`dY`pi z!F4_C-?hHJYjf){y)49Dy|0UEvX)otj;VerM;DkCgOa@!+Wz@8Ez6Umv7kLts6Ley z_}%4kL`bPuWJ@?s{gV%)-s)l@4Gc@_u=7eCX=$?oJUt44Um~b-Fz4i+@jPzy|3>1# z6Bi7q6sFb9@8o!e;b@}k5Ee-Cn$=$yC*C#lN_VyUmKl$QQfmg&;N9KCO50&dk{amN zERH{R`Lt{kFKhj_I9KJoXu7=Wkwv&47`wLO@X8{{8IXI}^v!SMR21$f4BI71(hz2L z8TTNm5#zGes0q4ax}sEDJUzJ3_S+aK#CedT{LH+miAA?fm9^fJpjrsr=_o2e{y--% z`~~HPuMgmNppJ3dv?+_5?&yTblT2Z>%f*PCe@~BwSX5uzaXn6?(11Is=MvpvqPQ>^ ze&J-}o6Ha`CZb+d-#<|HrY$2nG3aSpry+W`V^F@4zf0fMAP1v5PR?OWEGv7Qv!;AN zav*znskr6syMx<fl5-}%GtulU2$w<?9L)6){{C_%KDp<#Fth~&qdnAdU4Y*gCt4f* zI|@A}8mP=ht#%Dp*vGdHHAXnUj0^jh{Rxd&<=bWQv*GleWsmdKAl&+k1K>AEk<IHs zA{Q)IL#K<VxDE;!+3Mec;}@C5ZG7J|{ECTlaoe#;fE%tBNoS%-14bR;K5;&$$M+WK zoZSkb7(=|ggOsRbOJi{%5%%LzHoq0<3v5460k}T8oJh8`@+jAwAt({n>?+H4<5hes zb`{IqJC1?(`R>ADe?Q<?Xh6;Y1S%D6OEAY39(N4GoGOL)i`J=fk@^oHe^=fUhwy1F zN7>|wjxFPKa$olE;JcB2!2LFIzDIvML~OOT*5!ZveNq=@1~4P%=bHK9?XJwHf3G8{ zvAn+nK~Qgp2(eEa#iENsPErW0NySRF#sKPw-*-Ww7KhOq#T?Q?yz7wNDMT-$?JfBw zI6sRjPlIE$OoSjK`HE1|V09XShXpnN8mfhhSH-EZJbEA2l<*>Lo7nZXAcHSnySv~5 zC?g-l8C2_3maBZGfRcVj2MOV(qE*WPWw3_V#b04YP?x|KuVoj3w!TM+{`EEBHnVmj zQm66F-LYD*n!HHh93+6~N0U)d|9p0!Pl9O{DcU$h5vTZ;(*$b@_r;S^PH4n&`}gGb z1v2>E6_MoUcgq4<d6_n@C8NzL_E+n0yZu|6;`i!+R@UBpbl3&y_7*WkYdXD4QSXaI zYX45sbSA}xnTx+XCJ(9Qm(FaFc+{JzuI&o@J5H}x;f8~o0%KI}Y~VKaOywcVdjD51 z&=qI(eAwP+<Qif#d!aU%&Sx0{j^td#D`}`BYQuJ!x%0{`KXL)<x3&u+PT)~^rMwvB zHfVK=ZI<;7)U~OD#7Y&T9uz3#(n{#UXMy8~d4=<i)0>4PND$J_@sj!mf3sfEf()Cq zX3UlVMSJ9q6H^dm7{$l3(CjnBlI2n~VwnV%K*6HWLcvMU4bgxDwn2e8i$b1H+w~89 z4dJyg7PoxF`tLRkHN#B*&UxBxw$%j7YLCoO3{~ee-~>oKHv)cqT`-3*jZRwRUa~;u zYik`Ber%uCwgl|8zwGyQ2yIL5o)Xh)N<6+7Jogg2U8gO=0d~h>^Pr$(@*R%|9RIiF z1I_OE?`PEh=Z4j>{(n$lECU((<!5&EL+F>>^PxX8x)+C=Il5H0K9%<dj5DUY>ju`I z;}-UK^TjV7(b9+9%EUcw9$n(Xrd2wBNgmv!@hTClTwpZN$Ccm`;m<bCMzH*#+H)bf z8Loy}3=u&o=ISUkQ$$a#+sNf*Og)L0>*&dIFKUa@+DGSR(>_q+dVXLQXYJ%fZ>&qx ztoT|&oAn!U%5m#--2pW#h8Y%*{MRa%SL(z|XDpjd%`H4kG2#OY)wdt;+K<l9qv3Qg z;@fcD4niLPKos3%bMD39L>T#saJ^tLf+`a1kg1Va^;Q3c^?AH|KERE9=iUUoDpW}6 z%jNzF>lVQcEBAp#segrtZz!$QvO`N`IsYNZIVgkd#;Un9UjAA<*P)Tio^#{e!HhKf zczmMVkcR@**L!3_y#2tWYKZF}_-O5%WF|TK`sG|~OirwBdfz^s@6ES9<Uvw&Ng+?l z^**$4y}6%DP{WcQsKRhmg@=GOoDmDzct|4QGK7N7DApc&T~5`+=QDjAa6{j_jYl+w zedYuBt`QX8L!~>;m`WN!SrT~<B;%lM9wg(shEzs1!v02opozzE_tDH<Y-B3;uC}7g z)~Ww=4u(B_am^ch2M{d5K0<xa3^T3&uu_{mSBOWI-)&P7!4Zbx=~3+=cU$YCMh+<E z;n(BkkAsa{E&S@Jh0&=#&v^~C?3zgP;rbt2#8Yph-ZF8P{PtSVuBUaBV35pG6uGki 
zgd#132mPfp0kvlM*LVtQ?xe%Bw?M<6gPscOtd(9Jcj}fb#;poI%OY;u6V9|Jd4Xyv z>&kCmxK7aJFcFuI`pUeGQT#z|h<M#lcJr-Ul_hsb+E1V`wf3n_z`G6Wp|fpOW(YRq zD=P_-?bFpsq3)*}e>GBwC?H%hVVQ7biFtZ->BK{#&<Fylz%NVcW`(K&K;%eHWW}nG zO@XL#ysF!>Xi!&cF{&Cy(;ENK6Z~x;u)(y|<$t!?smoi{#V>B+gMN5jQq=QKxj!sR zhMYi2(1_oNq%K+7R}*axP7@OqgS~@3cR|;NmRLYh|1RJn$su;qx`KksHW<2s>iXyU zGaso_5|BEEv;`x|P!TxTNQc6D-2Enc4s>p$aIC~p)t|VkQn)RN5(|{U^yh6hG-r|L z3tvMU<EVO3<2-O^t7iOBJ1Wx;{x15N49Qm|sb4pBqkpi?4Dp<)&G&}f71-xSuBHC? z&1)I3C^<Ls4}Hpc0bl<A_{DJy>W}+aiL5X;gzT~X^PRjaFeSYf^!)=u_w%<Uic<IT zwoHWglt<+Uf#+=RC%sa>f1Zjy<sHWq*N59(xrL4*?K}MV-7jwAlL<f0(nEGTPRc{} zr{HGnldblHTq55;+@vi~Znb2`?p~F}mA`U^lYIX>9x*1LKVQp*yW?<7y3Mr6&6&gF z$f>(uU@ZD0cVjsv6N#&u6)eUD<;23u3U5%{q)3;*W;Hpi(;a)3wSC3JVgH+*g$R*c zR`S9qJmVf)p)_fM0=Fe<*{?^*J?@6}cm<!`N@c6^0~0yXb-nzxXARf&<yCx#uXo8M zwMXd*yH{xq#HHvO{rxse|1R_|M8x>dm(B9`h0XG_gw3*BpVAH!=a>V70=iO1oF)8N zp<q!+5TP{gheA<#i^$pVq~S+FDX-aEfk{=W0-n+z*+X4O$Ye$Hj%UFd9t8UGp>mcB zgyAW6S=r@P7h*??U1LOQ(&s}zmH4dsGfPNO1dCxWN*zE4a(PvWf+;WVhPKwy`JR47 z>V?QvW$kzvf%-=%6(orlZk(9Nd-``iG(7oG2{W>k3(3&VMNQ!in%to^NnMu3QrKwF zsM(<sT4*^I61*VkB{H_Tvbi<fkN0ruiFiz83NFhN?6~~OqIu@6dHEz`d2;@{FzS_y z=I-gnDN9U1#@H)xb`ITwr0nvNJ+rFC4cs!_$cW!pwuq^>C+-(#a?SE~#`=?3dfI?^ zXhRwN5R@UGYTbFH_o-ovy#=&E7wF;D9BJ_HTJcdm9G2|N@*o4$y%nEGgF1*SyM9>z zTL!FP++V@0tWOObg2MyK*xmt`X(oZsip}C9_wgL2w<_(}s?Yv*`4JmvG?NX0RM3K7 z02gf;A{Wpjumt@aCM%iCUR)nx4%^rq+5S)pl>Rzi{K}B;vJ}aAAaJg=5@q@@L%1=m z@6g61An)WqP!5B~*AQN*e^5`kSL?ZUXK^hG{rLy#-rEG8--qn^I<1P+&HX}Wj^+7T zni?Yv1UjYfyn@x1rY9i=)d(}SPEre7aocLoVWsSJZC&!;UD;mt;J7}rGox*61a%s? zX04ZaJ-&eY-5dG}=-2}rEFgE9p8lPZfsKC+qb<Y7xIyo1LaWSMBY~AhM<Lr|{pMPq z?NvtUG8Un8;L#X*4X;V&;&shAg)ERNca?apk&4e+&Fp197b{u=Fh~RX=oQIs{Yj@~ z2Q!fVvPXM=0y>ZeM*s=#9dHSg(F@1fKN?6w)W?~|A2$<@ZwiIp7s3cp)bJrh3-O`= zSpyBM(j8C^I9*Kkw{9zj6Ac2GJ>tHzKB`{hey))Ad<?7sH|47via?eV=SJ2Km#|}@ z0b@*@Uf3b1Wt-p5v9Mou@C?dp?b=x)h0hwT^TPcOn6mfBnuQ@!<7NEtJ&h1)(GZKA z9CDjGPxr<8BirVv!~hA}H+X0ez!i(xweH|R6u$sj7kf~va&y{?YXcK1eFQc#ViwEk z*XnlJ^0t4m(l<cG!%`j3{(9Ch*$f$Z@7HJK)?15|n<Tr)Wp()#BdZL>2ek&dYUq}# zNojaubHEw%nsy&KX;R+qa!MaU5UaUcCjTbByhsen4fu!tRMRRLIH5V`S-BDh$68Rd zPj;#<9FNm<42|@?A~S#Nbn#6`T)=%Nw37j)Mt&%ST{P$$V!<K);;Z{B35WrJOUYb5 z8x^AC&;UTXz`N(=Tr3XI{(mGBM4?b9IzWA2R#w<`*+fHK`B>Diy2J`aKn#ww%~FU( zxqC;ZN`k5m<8H05WKw`AxZd{3FD^1L75j)7oQ!gY6qzCUx4%xTaLD<`msFv;P?J36 zFyB(}-Mo@R#pyQ3quO}&)DrteWDZMXfm0tbZwDU(lx7-0d?aFQW*lc3GM8Ac1XZ8| z6n4qE{<1c|qV)CU5jN6ai;YE6mE`q$`MX~C;lC$}f#y;}w(o_EYyUk3<g!0KX7TdV zpIQC!=iv%U<@XmKX6wuO!10X+h)H<h4qnNS3&6f19?Fx!3dUhZX6%Fz7P3(L)i?&} z2lV(kPJQ6PfFOSD#^s%UmcW6iB9(mY&-rUh40tfzaGDz&qtwiwFSVj8<-Y2w{Xv+p z{eJ?Z3t9Yv7KON|B}h60qLIb+3a2lqUZ_j+YpBj|1oyYeJoW_*MEA?Y-)R|_eT9AM z6udr5MXbZH_>0gVa_7TGEH*uq7&^Mz(UV(_pJY$OTw+5<r6IL}4XT-eX+e)8?acu= zunu_-tX=-W0;*Kx$v_6m@|$uYBavbYNKp}=$dOUtebeoI&UWm=RoePmvmyV#KF{Ui zaP%t9*W!6%7Qz8;xXmq*sP?QSLx%vvV#f~_IjLpgY8bI1wzlg1j?S3DRcV_18F4G> zM0soSzf->Dx+nh^<!<Ub1ljK^YTon*lHa{r8h%&KLvaZVh$$*X_Tz(t45@l@FSt#r z`jyHu4@M6-06m?mLC2I4v37D2as!S>KSx+cPJtNcC=mojZzWU%bYC;gM53l!7F84= z@}PCzuzc!j(7E#>aY*P|`9tIIJrN~r>ccIpx{k11>*(Uz*VHu+Q(~Jve#>7BJf}S= z_GQK;Jcq6Hq9$4!?!=K*U4)NQQ}oOA_EL4_b=4vJ^vHqWsOhfIq9S7oS>$mCnSq+A z!y8t~`jzc={?c9S)8OUQK|i>nt7_&BpHM~r%pqNb+TtYP9|OE|J7nc=kTH`0d0ln& zw#cs&5flFUDBTOVjT3%FaP(n*j-IZR9@(1j_1ZStZIoWMv$ZeSuGd1IWVmpup>Dl_ zc8kl}agW=Yv&CWI(7U9XdTw7e7=7uP?3H!U=sU(QiW=<aZyD;NI+%*H&lz@6sLPu& z3|G@I$!dmox@Ti6BkEG4r#uDFCYWUg&8uEsh0{L6_{hOy7Lq44qNbZk0@*8%G(g~~ zQkSr0lszGq6)A`{I*J%|jZuN2EF=v7hJi$}9#pE6tbX$NpH<4{g%)PPl0D*%YODLo znh|BI>Ji$`cfw2ft4_}7%^5|dgByehE8&L~T%8$_Aezx*;eI5K1_fUW2MnLh9z0dA zowtO6OY+I4JSW|{SxQwnaGchvu~4Z{laU(Ee5s`5V_~fxRFa!fu);L!qTfdGWsflh 
zJk>mV8p&AIT*?iMhWQ2-w*(YeKmmUtk*}g0Ah&+EsI#9F=CY--({V^qG>4blW8(*o zd+iZl<9@3lYx2_7XY|MAR%03s;Ty)%_Wzb{m8|Ih2HPDynP)U)UuX{f_T18EdRY36 zTbJ2<bEFX*%PMlP-<htY6om|tkiMj+IUS~n?O;E!F~hi1uD%T^yqjzbBo@SpB~pg* zW!UZ<mN5VaS(?L+z%Oq+aub@{!G<ogvy*Alp<-0*_-GC%i8z(F@VEQk+=cUIRnfLX zAgJaf`8MHC!+!Ht-T~rMRdS5a|K$I*jkxO5+fV%W{#-gEK6b02#(Wbta|RX?ry`O> zIr05=_TbLx2_8vtjlVJxW}JNf@8qay*}BtV1YlOip&CdLJ4|{|J)y=^98svouwd~S z605Tsp(jH6kUJy>lv*SB&|#kqP{gvQfHm?N@q9k=O59<YwH!EpF7GhKv&OZ?gFouW z0h3-#)|cb1Syqtg>y3w#aUFP}5rsXp%{J`;D-so7DX5*ct9j^ZbSM9Kpk>fq*}&n) zoP0wAJ?uj=t1A4W(BMYYF*1`f5QmPj0|JK<C=6bMPzej0v)}|G0F}7;_VJexM1nVc z@V$w+tc*h818&M<WZ`B$Y%7?yX_Vj3R)V?y9(AywcawqjMK}Tc(((%kc!GY-R^F@C z!Y98Q^4W5H`9G~e-u5uVUq)ub5-tOx1d`=R*Z=wM%&Wn-w)MJQIo~_re*6&R|85Dc z`L3vEGjKy;9t>?1fzfriUbb4~<$h((gnkHD2|zA-e$3Q$?N4I$@FRSV7w;ln7FMtc zeCRiKRr%Z*SHUc-)HpBy@jb<|%V0*Y(c<pK>#(Ia+|CwoYsmO+gQ$zWhRJhhZByJp zb1<rcZCKyaN1UrQ`)U49n~#K-mB+uJA14_krH`DIg{ygl4`f^)$wj9GyF=t_TE$e0 z(Gn1@?f&dN^S{&<*>aK}A82IeSrFh4J|<c^=XjLgmxla0a)yGzYQt6s`im+rs{P0l zfuY}!WdH}DcY*5`ONz7~%W13^Pns$0S~(~NJr>?z6xT9V+SfoYIu5siAPfMYei$M? zA(VBl90VV+Q0l@{Fv(#xKyI<=ui^Y|x#w8dF!Fp<ki7nzemj@A6`L*n5Z-^6qk{hW zm){^VTx5<Uy*=x1ETA1B)Y*FzQy4!K1KEG3uv`dMf?Qb&$(s|M8gU`e0m@K5-Qudh z3)>T-9bAV*WvbEGt|s?zT&p^xoDv@H9`BpOSfZ=b?~|Goh;HX_NJBv{Vt6+R&o`>6 zJW6<=E0oQnk3leQrAL_1_km`YD5tXa>AviafS-v94#sochO^DWIS+0Ck|Te5d;bSH zg381A%}@RRQXGzB$R>K^$LR$Xu}h1Qa&5liX-0K%RyFnfd*$sKs!Oj^YAtB@ImmD% z&QkwUOz7@(3vwV)$>OVTqnT{HYm+wl@7yN`Ol9-a#z06Fgrg{s%&Hv!CprvgdNFGx z3>p}7p)|?8hx6cQeX<tx*Xz73`uqi;tpqTRNyj;+LK;_8cxo~A=ae(zPWK=M=l8<_ z!;+fD^#@MwYH_--IWS*m7tNUEI1$}BW;5S$r**&zS;-Q8*vjw!N#g2!&n8E!x|Cn= z5cL5d&8_nHv{wj5cXBeOrq>343Qt0%8WKT=g`1;+Wl_frI54(D0blIutWt)XApT~> z1E_=8N4I*zqZ<5A=wO7P4j`m!!5or(H7>Gg#Y|%MliMD2O-ogRbWt%v#?xRy;mW}2 zFE5;_Oxf3Y&B0<$%rpjAprx~ZvjJjDRcCL*>`Ql<4_{dl558KlU+8y+-guX2wk_ir z`sIxo5!QNq1I;;wGdgm&Si0-24l_IsoKunMJvc$YVHv*>x3}cqggh@t|Fc*R=Jl8+ zDK2TOm!LT)(BYV%l4jF~``q43Jt^DUN#q#e(eLv=khfEmPH?-u<m-I4;5r@(d=M`~ zi2VjRQ39qq0^aauKblq5Qy6YfGP^|CrIx29KaIo@h>sDE!sBuFLT$NCax(H-HHPZW ztwb+9{`J5r#w%NNP}{ps&T7p%F*)0ASg$+fEZB7+xZIn3@(_&Ngl85g!|vDw`>_Ix z$h637F3n!qv2JrDpgCqs3H&2x7Ii}Z+9E~!8`%v|ZK{4wEI;PAc(wL;_P(8&kD`Hj z&6LpVt5u_Dfe3INw&PAdtLiJd(kZxj$mi@ZGK{b}H8I}~rK~_DfuZh#dP&{VS#9>Y z?_-OEWu}LgxYqkqGn(4C$*>Q`fjr;m;7fXYkRSqy9TAAId~%s@12BK4Ty@h|fWBeM z6Zu`|EsY1c$U98X0N6XtwM3qfH&FZbw$$cr<fYO=^@=$u$+6}&>7YHWyD|)mojZzl zS0oSaA585cWi8aqCL=x>AN2p;<6>;?8U29Wm4C-0#1wWV-{0;+Lqv7TDT*B}4{FF3 zl_5ekdYk)x<PW6!FT{b2qex4{<g>w{nc`V;Mb@UUs-pSRk4%vxp>F``UWev-UjZRp z?UGt&$WRtcUO$tU0QH4?B`J^JoytiQ&mPTtNE$Dsb_K7F8(=Y|yLl$9ihNPK*08zM zJ>w&#eam<x-FI+Mk(EcQ(SVi9@SglaOKqp3QJ{Q}IfWxiYdDBc>2l=K+s~`tsPWnl zJCuN261?6fMc_f<6y_;TL`3(_kg0V=My-ri@QIm_al(w@GS1n2jDK|?1m1!AWRDSL zePKsw&2kj&j*G|#CUW{0u$W)JRpp@sB)~#uW5iuiCP{;LlVAP+Ze%}$P?hTlU!|w9 z!O>;iR<Sjr#>Ge*%cbLGj_0PpWkCJof>oG3BCgQf=lAq|J@iKbS5~tAP6c6Zh0MQq z2w@mlp%kc*0`cDbK>qM5RJd%|x;3LG9szzN$2L1f24Q$P9`9_<*+#}J#Fb6)&mO=1 z9U+!GBhJ6blVA2L#HwnQQ3ab$8g`KW!UImt_Mz!Us$=tz;aR<0Nkw{Nm2@S7keBak zJHOu4I`DnqeVM9Mtl;(V9hG8S!G?ODsUU`8=(+uF0gHL~kFSsasg02Ju>c`}Z(!yd z+_Y0+M}1p1KXw--Y7+@snr4+q4eNtyKrtZt4gnL@VO^;ht6^0VKD77=&%V;q78Quh zbyY)}abp084QsW$8SF@cmJn!rATv4j-?cN9ZazB825~>cV&m1xw)66z!c8iQrsNKU zc%)+p%5I`k@F&JcBI@CDf31m(Ib=&@8M%O;41E@LVVTaY#`dfqXj*Di`2o(&klbJj z&+Eiw^SEpAz!0kS=G6C_h~$9yjls;~*9O0#dWdQA@@N}5wN0GfRB}mK3eVAR%7R0% zA6KlQv5-&-*$TaLF~&hs;_W1InaE^LwOo$zqhbrqz_76+1Y5?T7MWEcTN&J*L9A>N z@?Breck!c;Tp)`6)jJ3<&>K$$ePuE%hw0UQ!iS22f3cvvbIk@ff!+zyLlqL5oLgl` z(teX35=Jv{nxmMYS?Cs-@evV&5le&W-+?<3lXgNO=7XtewOE@h!m9GmPvNOc=iF$5 z-*+XLdqXlq1<z&6LT>SGuu%`+_5)Q&nIxG_9!7(eFz!d*SFOY&`@9xV;4_-HP}Iiv 
z2$rPGk6`+VI{;5(BB^rf&@>;k^&P@PEU$=X5?ojcg#EsDZKF9yu7G6HSIorCiFRIO z{$|3Cz<wh*P0AEpl9lyW;Gws1BZnzSEJo)V^8(Wt6s?Zp*cv!OqPRb8QadP>GfxHj zn?#QHs~KD5X-nhc_=X?Ry+N5MTOz6b7g2a}5s0a*!6li(O?D~e@G_k|b|-jK)*>UR z#Xiz~9IF!7C3sL2DiX>9&r~)Moaz+7-v-Az<G_`&705rQ2{WgM@Ke%NH(qB`yWg!= z4Ch{#>s(R>SnVdEqRiTY_n1s;im7o`5`1CNa>uKg&`RS2Ip?B&(_MA#e#$d$vE#*} z5AScfO`8F8U+Z$Jb<KAM9S8V~BlJ#8L+xSLfcoZ{M_6u2ThXisARN=9b%hj|Q&*Q8 z(X3ig@|o*@`EmzL3_8RlzZS7|G+ESJvwqLYcG)$6PBC*SbLFCu(8`UWw!DMAUgIMc zNc=mjvOqw@by~qk6dIgCvl)*&Col+H^}RR>%HTs6*#iLTfsEa+X(8b+L3YlcE77l# zYSfOTXdqEHtJY%h+eA{}gr>@QRvQGO&z<>McCZo7$B>TRs3!LhbE@eZ?wSl!$_(&H z{qrx<E<8A41BH68bCj0uGFq)FFC}|Z+IO@%lGe3+ve9R1pTI2r``Gc44#|pdY9}w> zb0U%g#s|ptLnazrpi;)m9a<ors7A}w8U<D&K>+3s;lMLhSlTT1>3Sw3EDME;%%nHQ zYqk>;fPY3w>w`JyVn))H;~iDkL1q2gl4En9gxEv<1-oi;yRg3Gx&|77?@65O7g(dq zpqSx%vUd?WcgU4mDsIoME{yZYtwu@xoUTOt@M)5v>ylBlSc-_cBPj`fB^l#N7?zT@ zz(D?}0j<JUN0N@7ld^37aN$&CC`3)MwD8KjXogl18i27%adRepedm;B4_Pa370b#v zXP_Kx(tfL@bJ1i*e>x_y18riz`bGNij7;9f5;49~xD``fD>}AB!jS#5RC8YyLQ$Hf z?%~b%F+vtXgj7g0483lfkEI<MVWU)+MmiID_WQf?o~=N!482V>;63%_c88@q{N!5e ziIQsya-gk4E`5M$8^s)UK6WV6_VudH!#HgpR#-19Z@Kx?9unqBW03aSIA5{uNUg5Y zc3<^g5Mb-sj8rL301~r5wh<r>P3D0RAl`WQ;Yg`XfNVk+g`o{#OO`Ct4G_O5@J+eU z5qYuF$ArvxQL%lcT|QN7A<n}+Q<g5pMM$2uVa+yOZ_fysuZg5b#aH?b!B1lgoEF6Z zP1o;+v=Wzg{nNT-l%dO)O@;#|A!N(CykO!`I-<P4A|r6LfXy$pF4{lOQb=SPQUxB4 zZlxwcf|C-4z-VB>_Vs6{KdG3(THKW>8aH18KnBi~pAXNejmG!x`>6Gl<~IJfDYZ)} zhjkiMW>AYjvyA$sKk?7=yu<k4rB$b*cgcr_KgJVXa&f*bBS-)6QXr5(z<J=ii_kAA z5hMBMIh*K47fCoys7M5%)5gewi8pCR+S**S=dQN4rTo}-n1R*;J=jTe{2ZBW*v7Tt zSgPjqB{C@gH*)`5@Qoy;;%pFhZM#`Vj(Lz@r^SgHLRnpWR=FS#$*D&boWImaw_bL` zIF%ff^J?x`*x6YUThw<Zt`spNO%N*vnjuK3qKS8WU`DbaS|Xk-H$OJ5wVrBuLda@s zxv;SvhLQzc47qo^sGZ9$cuBwc^6kLj2$Q}2PJwk7zH4VoCrGEq_*%gjNvkBWe{x$o zJsaNdaZGd?x<`w&gIbg3nM{M1X7uZQz7?Nw?inMMXEfM1?&^Z|)CaZIf93PO6S$d% zZR5#vM!nx{gwE;%J~GID&eS~1WhElNXwQ&j&LcDz{X#YGBfA9~ihqQSCSo2{;^MT7 zq=GdQV~wPuXbW8DTIHwh>oc}ImHR9X@FX9nKNuRQUCgblN)klq6vNC^A-RB*B?*lO z$9)P#W6Oz?D+w+pYKZM}*f39OOC8wzOIY@slM2jH1o|2AOt-W*Uer$^lWoa<OKh0w zR)S8tMb_YlT21ZO*O)49coln2bJi5u+=QL0=_VlZRX^uQo1PqD3T@E;bxR7KP7u{X zoy@^GdQ}8FN3PV@DGy6%YVj{LI*)Fn*GRj(NkD7^UIN)HYKae--fv#$B@LOn&N~}+ zI$|AX9$NXXt%}P$+6gCllT)KrhZqTF#Hs|n^nJ|G%^UgCsN=aoZFfRv<*>p^lbp0E zrTHiwVu}B#g^ya_d@`Z2DLtiX4NzHMDm6WM3fokp3;p}gS@t3j&-2Q3dc>3{2DxY` zpPb+%<(M+$ie*RAjjNFc;NuMyl(6qmgjD94`By}Q^H1z<*(^g$d@K742Q#6Mfw6Y1 z{3oWSMreF@;=ZAo_*J+LBxygTlMIjZ9Lux9>0c0{4(7Ed7+HNUH?xr}h~UWtMFRCn z4oT0s?C0r{XNfXdA>R2xGRH7L)+H8j%E9t8z~qg1@q4<nqgTY6AUt)rPPPhOZS`iL z(*5l07$O;1W@ure5I``INTkD|RF@4rG^k}SJdw}<FB!fel5@2!-~sBB{+m)$UmVWz ze+q>PYdXrV+$b?<c!y|7<are#a4@OrYS^BthH^8f=V|PDnY*!XDG?WxP$mf*0~Ki1 z%{hg#_LD=SjCY-Zzz1k~+de+sPC?;@)c~<EYR7T|P&NE`@i1vytBP<}%AoaQI1t@K zYFM%Gb(epNs)NDEf1#6JYUv4rTxgCQ|3gyXz{$MSK(8p>>|qAT(>lG@1{&8vh*11u zF`E9rShRWL)Z>xv?d+B`s&6&e7%UcQ;Xp+D7mRR#U{BlV@X}E2d?zPFIDb#rjd90d z16@+c5|LcrzaL<DoI#by>=46Z2Eq%wG+W+c&ZBzFA=udxA~XtUF=Aq|&4IiTc=drT zSgMrzuq#Fd7_Mjq$lx}U+SLFMMd@-9%+O{&BOIj;v|rbZ2)Ib}ml&wZ+HSc4s@?`v zmMN%4%e%3>;J+ucS~pH%PpR-Myi(nuq!iOy^g(8)2{1UQtcD_%B~lv6T{fT1(IJ0N zh8bFi+yL=C>BqJzP!ole$|fr+Xam7D!}~KHpW@Jg;#AQziGgl*PNMC$<1ySSFwl%c z04;d<gFq{%4)Y!f#W}7gN4PH$>oK7tjjdOM-m~{(L1~4~!^Pc$_G2)Gj4tm>dP70z z1D0?lrb75`&HOVRQ20PfsI}Q~+l1(WfYB>)QF}c`U<dPjm#Cc`1nz0gi=V_YHcxX| zl&4B%54LedrLd%Sk|S={rwlc>gNtL%CkWXmRbgb?Sz)|Zl_}p1*R>CDodm4szl4)L zjp_sKYS)#6K>FQXsS(91*gov@%-WPCgM!0HbKFq1HlIkE)3NNx8AO*rD{l=aB^qY& z*{L%J#u%5+z;1XJ$AxrX8SHv4*G14EQ_+Di=+0JzGknN378Lt}l9jN$zfXt9G=%0! 
z4zpHzhn6vq8Ix^#S=`W>`=_zM`0j2(zg?8j!I3rNe`S_Jp)_j!!BwBdNO*y9LU##W zZ+e**e8(!nR3ocK+i*D*+W!W=<`>4BiIyjFfoma<EcP^v*^B#@V#x>Qf>+iT<fmSg z%Wf~Jiot{ZWye>@u>gm)9P&ogo-6fs-22(EFrS}Uc@1?yDUvkUSV}VCLw$kGJGx_0 zjTO>T&s#870eynH5VbDGs4f&?Y$1CDvce(77MX@rqfnnNu_+?U&RWTmdgw6E5x|IM zDgXy$=z{ewu4&ljUL<A_(=Rij=2u2`NJKATT>y(YglTe*CI$szCLS6{3Tl!B&A;N4 zR5C~WF;4<t4-DF;p0_+BGzlc9P9a&`U)k&}fD@IQDJ5hLzUXNnVu*Z+d{*A3&de87 zQ?h71#rm_;{=Q`oT8RjArA<@8ofw}N+G(W<bL4F!5x8F$Drx~FqF;y3N)ze{DT3lK z;!y3}yM8schv-}U$^+EbqTQkX*#+KBIPjA8zRT9EmGS}JT}KDGv!+h97U?&)mAmip zp>y|A^=kpAQ{$ub@CSSPTgQCT#~-ZNsb+s5{H?G7Mpvsna-sP<ytT}Uv3+K|Qw1c+ z*3bzi4j+b-4NF5hGRCS(I0whBvityS9lSqzfjU6iO5Nn!j<p>ggd%*FnWe}Yo-r-~ zx;#P<EJ^Cp-wLauUY%HZU_r57hk)5Nn9D*pM7pR#Ow;16Gtt9{`JUK^xZF>vMie~1 zt3jaPRt{jCL91t<a!5_5i+Zg%kIbFVL<|=4jkmktL2XHX3y9ry@Bl5$;Ptd<08&+( zQse4$LSMyRz^O0L&LO}!>A?L;!Z`B>3Ydp#my_dAl0Zc<dnE{3afXfz8B6pTg5`f1 zD`)zYHu&-gQL5rJpaqjXFxfxgWI4cZVPiFprT{c-O0prDdr$a+8Ds8u7c-<$*d9;G z!b~ejW}|Dx%t>>{LT#E(t4JWx=@UOY>x<y2L4XfTFexvPAuH)P(imi!Yn$Ri{=|dG zbP-oktJZ~ls1_`WzwZBLRGuWD4TTM+HdVSqsxpgLCxW9g8C=JD57s{D#3iM(fYv5M zCMD~V{cgu1nW#05Y=RSiKnOmT6kIZBM2<vLYzTWX$T%WKgW516fBkEDn6Q_OZmm)R z4RL)wp>7G|g#|plS585+ST{b~Vuz+MDX)Y5b^!%b3eIj5(C;23gj5f!c)k?(rprOf z2{$-NMT*o2T4_aygAH->sT`c4CBB4kE~f0C0YfAa5Z*z&AEgyV4T<6EMG525!ju2W z`U;c;-(}@ZEGF`n@{vhW)xQe)Y*pC5W0Cl25Z}K&a!O<jl9HO<2$v%m|2_{dQ+}oQ zkno3xY$Pi18>t+6g}Y_Ypv^!S<L|Hhek2}jI^yNRKp@J%Qm<nJ6SPKB--p;BAyQ3N zjbxk{79;lqa%mGlMC^s&PsyY{V$3*BE6h(KS(^wjiECl6YwP-cX(r@CU8PsqHRO~` zuv1_(?L62acla>`n@{XuoAf;Rtq<5TQ-JYfDG8QIz`aEcsm^ADKOV+oG)N*LqMY}5 zDY&(l5|1V0((PbeJNER5nerCi%9PA>{v8l2_|i3e7manIs85GCh!PEv$=K&a0fh^c z@k%gP&*|eWfjU-fsplIY`q#e<t@>iR3pet_Ol)*?ztZ*a7M5Ft!l7^=8#%rQBul_k zzFk^Rt7dA2?8D2nE<Yt-2cqZC1XaQl7PRE`hI~2{6?#a628p(6=xSwa_<f0)$=Lx^ zJ#%28sgMoXgv^|nNOb(BMA8h50C(&4&$A}EE!BQwsfxr2lOUlez0x4nkT5e;3<$ux znL@FK?w$uSt!|(GOuA|*o6@D#AuABm`k0)3<jcalBllH=;#|SZbgUpgoBW)*tvVJ5 zi#Y!-0DfS**Ou)zbv#tWNX0PUEmSKl-gvw<7!jF~LKYtA#yi?@`a^XtDpK_epX@kk zwYc9)MIv$=Q!ZM{lei%}=35Fl4bojXKUhq8ZZuY;X}St6nW?gXPItkKd(n)0-mJ55 z#-w>71Xv<FKm?iu;q@W-k8}Qvp{b$bJxU||Q5_Wn5I%GtC{+dqH^e`IT#pXyUQ17z zinS!@Db_Lrs$ZyYLgUMX_E|}p`xsE<FZxq#YbHe~V||>OlZ`iyAk|@7RMMQGiD(y5 zbQPdL4X)xlagWCX(gH|XaVyelbYzBQoVGlXniF>($YFq=!Yw_3OvMN4EA~Z<<|0Vh zPgBycQJWlKxElkl$m-;M=m<GOcOC>xLSJf<!raRB>WeLwr0f5x>A;O#5iXhhjgjGr z7=l|C<`&5;$(>nVz=K^GzpMeH=m`ZCC(P~O;hv<E3Enw?l{2$HG?%;oH!8EnZC#Ke zsAIL1K5kX2fgNzgr^}aGVbBwtBtgkZy|rpVrhm&Puz=tJC|bMYLY<l+A0M-d&*E+D zJ7_g(BYWC(8ledCQx=5EbR*KzAq4@cYc6Co>9IP{-oN!9!54dD2GBJJjtzl4xy+(K z!f>cN<Mw0o)j0h7z<Hl$z9J>^&|UH|Ab?E|LbGn^lYB$X)u7F6!(-8}CMv;FE{T6Q zbPilPMkFLzEz}z@E$b?X$>Z9f#s{~@$`>pRjFC^xfa%5YI{gx?(Q_VYITaNoWWlwJ zj@yKH9;x)0(N&9uFvO>-<LC(<H!|Q|tcGyZ+*O+{uw46JY0*u7(aKy^WRGbUmrOI9 zI-@*6|0`7YL;|`^SD~+T#d)O=74GUDbHo@5v*hXtYJftyiCGOFEP*NKXe0u=l*^u~ z1+02?^jKJgw|kIb#=t$ffc^{0a1d~{$ABj^Zs|Ob30IfmV7OYcN9566X2BbiASQ=4 zytSkxl30sbPvCSU80Z`%tW?SAc;QQsrJOVZE^o|T;qS0-ZvP}2MUA#tt8>PIJD|A8 zcVPgZEMF3C8O;9UT?IXiMaw9P_PEZH3z23^voG$-Fa3uv3n)fbBRHQqktq>jV~U%= zsJH{7eht}3di(D!;|`k7(d=-}=zf0T*#XpAga<P+1;yZkh5c<yRS!*LQ5pnfh!sDR zdK;6&sLe2;uO!YVBP|clm!df<(EghdWvZpA$KP|4&jC_t=7lGqN(Z_d!i*pj#Pv%d z`;%dZPU-z#>Yek~cP_+PW+$*`Dx-~fuQ;Qc;B+oK)`1uL_YjTdt(Z^RgGiv3*d%in zJMbR<NQb@<9u5#IO#w!RjQM7K7A{Kq+N;buMVX3GltS<bLbjfp2|BKIkoS*-mHCjG zbL`c-i2)W(**eW_yEU2&U95h?!?G<%yNKctI|t_D>LhJK&1naOpp8+Jy6$e^;f$r1 zu4}Xg0E#Mblq(9Q$FaSF25C_|Ic=VDcQGTtuwAj_IANhqNDJ<l`!;-UjSRb*KF~<U z1#)Z3tl(tp)e26w8k%<Cmag1Yvjne87rt+(w+>FyU2cM+0a;@Pj7)8%|ND;MdyiSb zcl5^qhXClRg`qdp*McSoGgT@xbSepSyL-4|Ki-sx_0g5IFcd6`o;NW9_LF}on4u>k zI<{Pfx#9Mm5c)XEy%^Jls=dZX{*nJej4qmn#P*Vm&GvT(($NBu5wC0MIXPAMDjewH 
zP}@#=uiIy+frpSba%$4J*HTls6eU4fxJ3exJ<q?-#2~D&$V5BK>$q`l+)ymr72*7< z?XIt64tgaJzQR$=q*w~H#sQPlk43r7)8=j5ad1GUv1$V2s;rVxoPJ7O975mJ`LAcw z;Ts?#A4;7fZ*p|_T7}qy$Q2bMq_BrskG5xQ%kBP*_sP6L;y#0dMH!}l!Uffy54u6l zG)Uwz0~gfX$+u&>iD#;zSx0C|_Ev&HLm{*H#i9{1Aki!K(JRzr;i$}0z@yJ$$y|kt zSQUp{xNHO=Q75ZqUDobp12n}H?F}lJBrmrTII`)f{k09hEVs0i_9|tki*m8lZw&L{ zf*=vuI&p3(GwZ|8<sp61M58sm(S}p-%OyG(8f+l(Me<d;`ymu(LZL_to`%FlOGzrP zh=Kx+8RKvrTZj0wy&fj=(uVfD0JBHp!5#`l>_PqqKsI;nh{RpyH9Ps|+#0y(@p>cT z$pHTAA-h7g!2uCMd>39M-uXl*!Tb~cwlRvVu|5=-|HsrjMn@KP?b@+Zv5k(+j#06b zj%|0xwr$%sJL=fBZQK0vJn#9=dw%RcwMUJy*BEQBT63=Jx~qWdttb;(TW|0=ex#<2 zz?PTokNi%p%qbM(mE&_aS|lR-F0Zu+1kh4a_9f5qo*t8Iy>T3WgRPHNeO~?)g|M_f z%>-t)UXz*)(7;Kb8|HAB@#15-aGORr|9wD{%(9SxFlvqY7HTe?hJQV%I;VZ{lw?pW zBlr`_jT8o>91O0+)GH<k%HETrN+R&>gMKCokU5;IVcnL%a~e#Jf0noxZIGv7;oI^= z<jCBs3riB+%c71D>4-3X9)k}G%>bE<Kx8{vJ>fKWjV);wFF3t=nTo;K*UM)UqT4{N zI@V)Kr3v!hJ|i=%ikWVc;2W=)#&%Mk<eW>Y+ChDn?@lIv4s}OF2_>r}DV9cPnb9!K za@)veRW}688y^+M%Y9NmH?q`U;#6@zE(}TLQboD@23d^Rj_I3OcnE3PD|fk3_0?s} zQ>~H?hIygWVE<9YeSl$uCH;8)musSxilxlT4X8D?4X`Q47&J$#fW7`kacuZ=J@FFP z!HF^lh;J!n%lJ|w{fSL^A?v!7^Qr?<cT5uI5>!rx;gQ#&)b$e)XD!=8O0^gHfZdC1 zoV4Xeh!KQFWty8l`pL`2#=)14V(KVcGy|!Hibo;boG{15Jjmv}X}7GOVUdR8i<(VS zNL6oL4x_3F(ix!~5ynMH8KQyM^Z~Ry6!=zNf;S3xZYtvlZFA6tk?~oGk>)2U62_M? zhK5LuAip>arxbBEMlDE<a0ZG76_hoDo4aeaU~YfuO#LHp146|Ph^&~E3uY0DL63lo z@xNwU=GzKA)eNqgjn;q9=gw!zh9FcxuW_dpHoOc1y0fw@Wo6=V2OZbB(9sjn=8?4f zQnsAIR9w>W!8tjeWCRe8w}&XrlJX`MP|B-<vXtSZ7NF)~NSjyu2e5=X!3H^OaBgO9 zg8*Kh=s{LdsGF5EOq&lzvNn7crUVphy6Bnrx5L1h8IcKuEzrc)5fUvZGx;a9@xXM+ znh$~<Cr?7+)=p?^;%<w`i&{Q5!@>SQYi^P#w|OY)a|ONS2_*>bOe>x;AzbXO{SF~Y z5}7Uz*^Y;3%UVGkh6W{bST@ex0A<HAxh7m&SdlPN>b9TZwy@9BcedI-0`)zfa54~e zw8{0%2Gf?=)zTK$>rJfFYMq5NnX{kn@drIeLWMbMG^#=~5^y;?CfI@a$1Xn##Z+M_ zWq)j~|0Q|1zJCbW?j)t6;Puq;ff1foM<|5`da@m4FVN`o$dVA6uh+6NYGV0=U|w5I zPmlE?gI0qG>QTa_o#uN)8(5V&P(Az3`EACQbwLM_&}~EAl8eeN50>z(T|#dWqa#+i z@&QRAKK^(MuH+^Zjie906RTiIr1jYishNA{R-o)0OrPI4?I_K;HtW}gCJ2VW!<2IM zg340M*r$yy`PnhkF?}}$!quBTTJKq?3AaBDqH%)3nUZN#o5opGs*iEt$D7&sFn0f3 zUsy@)^HMQX$DK&hgViIZ4VCITGl2)U9#yuIz0n)+n0ZM?p=i2Xnd|-|^R+jD{F|GG z)ya+c7OXA24i>$vf`=?E2NQ!!KMY?P2%-V<kSoreN`kKu1!L$6=qc8QfjTo*Sa19b z-m0EpgW!u;g2IdH#a%wXF<n7FBqDfy3~SW{2}2Fqb5Qe#6(WmM1rBF>{GxelMr4sz z+KFt7geJ5=m7*6KJnAI2Vixl!dC(-wM4P6R5dc_yCc{K<;@bhW6Fh?*=358TAV!XQ zFF`&~KOVB!XvdTnXw1qLB&9l%)WU**W3Yi&?0FCLDubZQDcnZ)>rmSi)>5R!vSOIe z1fG3fVF*pC+5`i{t?TlWhO62jZpG;tTD;=*8^ip%y_7Zf8a@3X=rp5L!V(G}WQAbn zRnQm24PwDv3Tj|U#Y72|-a18vN>}*`RQXNAk(Ct!Ic^rj-a|E9EsKWS8lfZ2R6gDE zHA1Bt1?;dbkI{30Zj?)q8g2r&n>TSJgQWQ4{|GlD59^0Zxn;aU*fM#H(aDY?-CHzJ zq)3F!#LlDp)PHB-^=2HFs27|JZTI&Xsw@I8;u$QWc<7!@j>LkWGF~WRbntv2Q)vF- z_6xs3`&zsgxU;K|K;4Kymkc893~haW;FPQRt<=@*B5WZMsCMQlFOx+qcu*}U^xk+F zN5loydc{#pmI7BoHw=n!oI@~d)L9iq=SP4F7-oFS)h}(H#}A`=-tD53l*M<v^zXG> zbYHKZdp1fuEeX#{0-3_~g(c>UU&0Z)RGZ;yQ<AqkHCc)R{d+F$H^Omk=HY<`!G<cq z$s#^KWVm|gU!;1X^Ga&t!cFxq|9f8Ftoy{L+;Bj>s&5#(F{lJK1+SViDpl3eUaOu5 z$nLY(gX>j$bgB(#CwYp)a8rgP9R@NhI}a&y+Cb^TTCYNArC=eS%b(k{EM+v;g~gRp zIJO2?N7p@vINh6SX_*ZN{y5lwM@u~Wzs{h|bx?kodFb+cNzwZES>9YpqE<m$aBPSq zB3B>Hi26if^H^kk99>74B{RxZD%8&;aF!M@KLeEaAlphA@tk!qZDLb=mxiLFRjMdn zl3#MZrIiSG^hFQr8rp8dDb}Y^<6h>=6ciK1Fqx4ZB}=mk?4q4*YQI+rRQ<~u2jjam zv&*`7FOxC?S9%7^f1fXvD`Hz6pVs<IxA%=QSK*jb&W$9@oc%8gJ!wHp$8vJ#F4qP4 zSA2Azo@1veA|fx_1JaO&`5PTwO)9$i<foIeJzT>`q_*@zB}3|>MtO&_VFQ4M)!MqV z1NX0wyB#$f0}+gk--1flxq=byvJTJ?*|MuPzULQ#`sG!imA)r*Ok6KAlTt(h@`z3t zoQJ&S8A(Pbk-PA-U}Lvdg+{@)v-pg8GjE!k$t1^4&ssqiNks2m-m?u(oe=REyQ9;- zeq;|%Jlv5LlqE8^TcZ@iSx-mvk_pg8^^?0GcoD9*=0#FrJty<}&Y_nv2d$Ia*q41S z)i^oiM*&GFt_IK^cjiTMj}}jE^nVzX&;Y%s#KM*yoNy>Q)1rzW)Zinat7u>w1}EOC 
zRP^$JPkRZ$`tBmdMXBhy%#~YA^mkte)#IU<_BU^Nl>M0~q?m2OK5^+TF|bHK`ih_@ zl4ivWpwwAL3djS48^=1sx3$UXYIfR(_5*-1VaOctEE27Z_c${Ob4y^2d$s?C;B?JS z`b|l1R|HxSl(G|?KN1@oK-Yj|nY<SX>1PV|=wJcne)VZ-LPSl9mM$ODxG8&(pC{Vs zi2XF(t_eMo23L5togBkajjOFc1C;y_ahOn$Y|sgJuKY=mwqPdPD#cW>46K%(5tHHk zaahPo11YFQGUuQffwvsl#@JsN>R6MmGuH<sn(-nDv+<6FK-mL?8aEI^&>PmB{m~6y zdm|$ZtBzF9Q0tLe32Ofhh36pDbm4ZG7(rf`5e!w*5JEaGGlw0)xRKncSj1n!6qM6X z`mwMx?AacefqCYwB%3v^6)fUKHgQ64ryBGl;lD?{q6{fxn!e2=IG80U=1!dOy}Y4q zY}Q-VXJ(7?aT3U{w=?nlAt`mg2Xf_v{OFxNB);h577zS%MYEogc`~IOgDYo7UxE;f z)$29A8+R%G*_G0$Ml+ZgAA41WoCg0#YFfcImIzs9-#wnfH#tx$rHSpo%yib^Vv>#< z)u(lkw=q&KwVcu+J98LT&uOma>mOY7I&kzavAzvttEdU_1)dcU6+acAW95@7)P|Lf zaasXvc==o7rRi>B#3E-bTK-m4YU1A8tu!o#)pQ==A$_B3(?+T7VK~3vyOF8x9MC8_ z;C_Y<w~cQJ=~+<rK#C#x^!2-+##0p#beR>%a}dmqVbd(Dq^=Y6kqcu#lnjz2>>A{A z=yiCCs!TJiCYaJ;N?9#PsER+ioDuvvd!W8w({uDT+dEr1U$$!K;jHj6s==TiSSK1w zfVlgp(flXq+-FdNBA8kv^2rXCU)AfkB(N+d64{=LapO!D9D-M;WdQ!yJgQF&zo6f} z0p+bBY8f5>7(Nb;73>sS4NAX+Y9Dr}WU-(uvJctm>l)P1xEk5eg$<k8h^9~y1q~KG zW;A+zRR#E#Bbi7wX6{!|{SFzESgL!L-M}KNt(reA<MR{G-EbaY@5z9NiT5Yaa8udw zO(hm~-8<=6DQi(~yzjxloPI8zqm2lId-Pn%PkU<$)+>3X){)>(%_^;@4kTDx|50)| z)~MaU-RutZ772AQ7k{OWR-#MA8z~fqJ!)2PWlkctahhqV_~wL64!q)px&a|G2#gGb z7~h#UYgYTtz=0Ym6dLNBT{be4?uJcx8WS-_yl2<VoTs120u3L>D8!Xm_H{Ryz-1hC zHQ>l^Wlt;-+H_)OnlZ|f3!0E>*681^UwdEb{G~0&;jf=a`|q8QOASfT)&B!t$tjXf ze`;KJJMQ~|(3o3nCToVpz&#Uc?98`_D%_A8kU>Xar0TOLac1JgOiCav293qOSp<|P zU4EEVRzXNxmP^D8k+A^ThnH)Ab)IDI|GnjkD8ku7;+;wK+p#O0-7pAL1xaZyYpmrC z`$gS{4^Wo!ogz!B29(TxGrfnuu#-;T|HNudnSRG2Aa-he;1%aQ9tjoQK+<?AJm0SJ zC$bl!yijKyVhY<^+KG#mn;tsLLQ$3z!t+Yr8VT-A9{n)^<m&PuT5+LG29uN|p)}T> zFiCdaYKLr=ab&H<Q3i%1bG|e{4(w=Oq>j^Vu`>%Ay-5RI%Lhua)&sYxoVDjM;X!nX z&Mvl{^?$R%#S}Z?>51R8zLcaNT=Y24EH(}5R=qi)0U+$+s`T5uMDlVJ^%go^!loXY z@L=nT#znem+cM<QuIK9rEn;mnsyv6`4A84hWiU#`;mVh>D^zLTKRUm(u8FUzeer|1 zP!et;-aT;kUG^U3m)}1yU!E%`!#Fe9>aj+#6l%li;?5g2ok~hD-SjJwUaob{c6vy2 zFfQ{*e<m~?z$uv;Jc*;KNzc&Tx&WsjV2CSv_A+F=Yn@ffCraIfOMmSa9V+$u%T4?+ z53+O)Eex$!3|+ga+hkw>sPj<re0uQ~1Up_DqjwSgM_aWz|2Asx_EAG!6GsJX!hJ}Q z1W^oSm#kqAa7Y#+cg4GngJnx`hkee*pT-L<1eAm*1s}w^aev3#S`l+mYmCQ^qJ^%_ z3RB7tK)dlzH(q=BugZq2?1R9F85c5829@E|VgzUVu-pbpIS^=GELwVA09b`gu>sWY zRxp~A2;z;lY?{)6vkYe}+uy`gj2?k%ij|Y=3PVm`?+fE2Ow7d=JPd{T8r;(J+17P% zEv=!5yVCQSyhOu|O1c**L3g#7E6E_FQr40^%=0Ic1gK>=jC8_IR=z}I0E+;!xcs#s z<l-buj4?J3C|!o*ePj1M>-4k6y+!OJOT2_ebG7-(eZ=NvCLOY{Fa-MkB6>QUwThFi zB92q2blqB03oB|<s-%Yy5-J6ykomHpQuW5!e~dd9kf=#w^VHYlN$PNDF?Qy>T*jHv zZ~WLS1Er~!&8Wss=(c{Y5&4hunMx^kD?JVilL89@)2s3Ih_`jmtfJ0^HUUFIKwtNq zDT@dFPPdnOQ5l^dA#B;W&P`86ALQ%84%k2W9ZSk17ekFH)h36k3242V&xb6SsXbTH znJ-uuLsqpVMfI3pAKYfm@z78Sec6F)#Y|}lgczPfadaV?5zDqIED`-cn^f1~Sl1wb zcbd8-0~p>W&BX#w;)p)cbmPMr1-`dRs$*`8n{SVA9H7UL@Urth+cL0_Ap2OHFxCLr zHc5v!LWmx(e6YWu@e&RlQU4aZSkmmC`uG1VkGr>O><ypeT%PK{E688z<{@>xVA!-6 zBm@tkS>1M>xVJu-tA5Tcj1@ssUk-IBG5*a~`$1c}3-J!JXiNsM5^4$eWY90^vnDN< zc=0s@EjwK`r+ppz?;Zfhj=w4+wJnb%+o`QLtIRwly{d!rvJ+iGfBd|cIp99hAGJo4 zM2F)CUHT6X6<dhizC}0h&0()JRIR`W4koF5<9!?A7>>CvvL}dLw_)vzh%iqx@F~Nf z-j0=}_8|^avC!wuB~tjOe>JQuJAgNCR_U*n8YI4vtgDJ4j^c^>$0oi$zAG!xwwfn6 zFTL+f2ToGnz2_?ny|0rq{Uw2Xz$5s3PGqP_YQ?c*nc{DqbW_#J)PKlQ$Z-lNvn%SP zJC@qfSkFhR%Zs?-s7dW)a1G0X{&yg0^dp`e_Q?^x=k2N4tNhBlQOL*o?!&{`rl#g# zvqm1|;-=(JXneRPr%k!SP^D0JUbQCd%j}FWP`9uFxV(5?&gQI?D^jDH6&@f~C#tAI zQ=^`O@ZudUs>no`pSY`b=0DNnnp7W3)rdwyF*6qPwHV&=oRerZ|HHMwxXB*e#n(}) zyYdxKtpn{YJJmty1-AEKpjEpJL1}O$Ah2cQRHYSNL5!dVE2DM>X>9&G2x035Rorka zrPEqA6I(m%h5hNUbP2$R`Fx0P?D+2VeP#YNGA{se2a!?tP6A$tAKAnBL7NS6+xt4q zkmbm)y3(XSF72e{m}bg3w}N*JLuVw$YAEkTP;HTAcbSaO_(4ML`yriz_oiR?bq+Lq zB-`A#)edU>k$uhA`rCxuihHQo(iu=B*i7Hd%ha7Aa5vai7yipjkp6z*tBm`FU8kq~ 
zE^e?|y@F|U_ZI=eo&*~IQ1WcjjF^R0ka)(0xZ`JZKtDmOO-pUy36V`O`MUbUv48Gn zGjLZb6pB_I#sCS6M;phs0~EWb6I%r9EgNa(jTKH;{lP+C^7;Z1k((IoO`q5aYBo^S zof__`C_22o=X<+`GdOFRrnzcq1f2!e%!x7xhvNA8qz0dpvFP{Qz6k&}snFxfYYo&_ zE=v5$|5*jAmGYsWR@ZCrUs0L^&9C923a&;T=HEK2g1XxFxG{2de$jP`bMPsyVMj11 zsdz?ccR<9iMq^dz38Wn*IDl5+iB&Vjuk;lxi|SE8xVC<tUs2dghHmJ-^*!v;*)!cn z_>e|LdnAv>6SLEV6ogpjhpb)717{QXOh~BB8gRvu0k^?qVEa)b%5>g>KK-Vis9eoS zl>a|Ua%;9NGJ<<W{JN2$#F0Hg((|ZNwPVhl3SWj7)^-|%I=BKkAk>a2+k!U3+{Ij{ zn&D)tdv^7&(30<8SX0l`ebis!Y2m?MOaJgWonUrHSZ6Ag5-aY_DEIS0V^L0fKS3hL zAwy9;%%_b{34LwQKn(RuCqV?jAb*hGbsyjPrqv~z?JZ@lBNo<GB10a}Dc>XLv;PPH zuFs8ZHz@EA4^Jext(;~@N8}sBh@Ym9M!b4Fa>|s5xYPqy4esoy_(-ycE9j?&9^z!5 zAWMEXGcx=y{7AX2UeMXB7wvo1sv#x`PEKdss~o3iyeJ{^t++x+*DVA0nMSIWN_Duf zLW-zy*T=y%Lp4aaSp=`QnR<_eRXq>KEDx7PN~cg;0QerNnmg9_ILu6x{RHu=_9>*O zc{e)2ZDf6^pPZG?mGH&{Si$;)-}IYT+^W6JE^gT-NfrkerxwyeMH?xRyZQwliCyH_ zTR-dzKl6IK9t&LBj~~I6(~USFfXE;S7&c*Jmh<=FGo?9bf6^qtdzSxPp||2HWPGq= z*$V~u5hhr|t6}LO*g^~6C~LwS`kLmI5eZZD#OtNvauR4FP@46IV>zAP|Mt(d204Fz zVV3i<`j&v5bOsq-U0Ved6?yeG{~wVVy+x+R<P;6O$AcxRf;4QvC8bBQdxH}}#Ns!e z1@juTxFztUH8(F5F!6P^@+B)#qzA0{V)_&CM@_vEPGB{#08^v#T_sQ9X-Q1^zExaL z-Rey&O6K{pW&j++KKE{}=FH_E&h2)=q57lvM47hNybLl5KOIkYs2AbyGw}z5_h{}s zFXeK?p%|%%vg*yqlCy%zX8v*{p33X27Ig@Qk=mgmgs+R3f1lS{Uzy-VoHmA~M<+)R zKZy7#1K5_1&M&UHda`r8<>ukIJSR2&@E=jR<TxO5Q7duz4JDMf0jd^wPx(=rSJ(tl z!jUxfaT7dcco-x)LPPRa%bQD@ROpm_qS!3A8^x%<-^un-jF;&?OTDeyVc1Ljr)3q_ z_ZDP&)&ls;mA4P(oWprGq*CSZ@K|+Enwwt@*O0=5)NIj(K`Rt{X?&RY2ab3T0-~_} zA50Im%Jg{m@wb}`Egxt>-S3#n&P%<C2)4mz*ky3({Ikv|Tog<Qm|Sq_U&$2Gx*Ut< zV+hhDsO@`5MSdcTZPa8_c%E8+G0n4R5cE@}A!_{M?u62i=6em(AX_Gs!3>B{&``?a z8!2GJQ9$e^E{(?H#f-O*I)xWUG;i%<kDecaQHbh412Y}rp&<WEl;(H;#y(H1PWO45 z&y%`m1zgj#gl?6<0=6Nntc16ZDoJDCHaiI1;sbqS4JGc2+J0UVn3C%=fG38}CP_A% z`uIzN2`h<gMCgfJSQXM%_G~KhZFKXDtJfA!+eSo8>Lv3UPO0WhYW_FJ3~&4=>u51A zD`%}a2p^(P7wTeX^?Q?+&xp<T{#he;*7s~2GbPT)pPz_D0Oc<jN6-_Cq+!W}p?q}q zvxJ+eGRLv&KHN#-pD;6M5>Mwu;9xEoB=@y7xKWa@ZKa+4fkf22`{!%N`k0TpNMj-O z@9Am{fKh4cJ-nV~xd3;_5WTzd2la~RHszjL)%OOfV*UHJ#?>z>-gVI<J+72bYx&my zV>I@>L0?y7r~LbZlswsd@D!C2LUG=ys%J4292W^iXhzA8x;7H9RAOYu%lPp4ubmgN z!rBNLE0kL9Ya&@%J53ixVG)N{c)w@up4^(FZxJoBY5nfGm3CPAuqDmG)*~V)7!c%4 zHT_M<(g7@g3Rm5Y;x__@;`M2)ow4Dp+aQ4`a1npF5FmX?u+#npyNC2CR02Zsu5r+V z{Sz3m;e2cxD}}UDz9aNr@Xn)L>2v%PtkrI}%2wUPz4_qvTB~AF5jA(Ic^ZJdF`{hZ zbz~@r=ndO;p-U*elu9RlK6z{qSI#$$)XHr+e(SB?IxmY8u?=(`rB2)SPshz04e`K| z?COnpal727EY0xS#1t=#8KTCPs+|h6A1T$@Svuhw=qk}3F;k_|?RAjKbDo|-g!|6l zM&mSl#j3Rcto_?c<E;fLQ4rVS3KW=GUL;hbZ6aETxnv(j>iIPGSLFb!IFWc7ivByD zF|k0%;=)X5pr)In<(QQkp_Yq}6~p6z%@|cp#Pm-J%Q5XNLigVT7P+w@ix*o=4JF<! zh{cYh6TqFG1#>YH@1NuTRZ0f6EKCs|<ch>R9I*g6H)}B0N+^(RCQp)IJcwkhQGLZ? z#kH<7CyJq36v^P~3}`OpDHFeIpmQv)0_86#WG$^)*nTSCg%{fZ+oGj+REM&iwW&h! 
zL+Tf>)J*3SB4YVxBl$9$TkTx<=Bw>Rt&Vjs@Xnjc|LLecse8YxF-^bg1}Ix+zy#1- zuq`}nE=(B(gvr^-$lHqA4dzcb%mC~Y-75O`!y*wRew9JKB34}U(k=hg@Wh?`<TXco zuMQ{G$)%Pz#rK{A&!+ZKNHulTY$5WZKqU>HS7fPdB4!X)Drko@4luW3sLIPHKujqr zz1-S0C~clyrMmGAuM?ZiGn{L4FcDzGMiAr!vpiquB>S3K`M|*(tbm<*Xn59P$m+3F zcx^JZAw66u#q|af9TF=e2ok~&Gt)WtiCAwRneNHHp?+Z+T-s{MVWL)QBbUjKK249d z5@{H#X0Eq)MHn5{t0WW){!pd}<Y6m-6OXLztncOP1yR@sBWyLZw#L4PC8n*+rj3S4 z#2X&AnQJ3~U4S4KGfiBfQjI<{zD`r0R%uxGE?;QCm#b9JIkZ<L>DIrqQMZFtHrEMv zKmklvZ|q$b0b^QoJaJiP_0Fl+46m%1wia~%zSVpS_&lcOG$T_R>Rq{(bNm<>>b=Tb zeqhInD%0sn8t^K!r%B^mMZ_I!$&f+1xNtG|yfJ#x?Vyz-)|8FCD336TcW|Gv=6$pL zRo7On?=A%biYZt@7umAJ#f=Q8;X70$rJe26<LtY9J*OJz%^sP(+P|IQ=1LmvVpWjP zoz<U6XTGV2=F9gArDYBG6GiL^IEMTt-*Q$BY%tdF^PZ;k*k$eGpRA2gQl>u{i_E|t z4<S*3gfuTnwRfdWtux|Z{4gq6r|2;_2IZStYMxi(dcZT<=N*Mvm~fG}2V;m)wuS<V zyBwH(6^uu?pRZZ56{XiV!WzV<NQy=C`zl33NE&(e2|p|-(N7SS6{Je83kCMR8TJTl zg>M`oa?l>Rf_kgNlQ|L?pYjT_4*GNVU>m<*vemQCoX#aHie-zD!jUmYV64Mad&nNX z2BVWilob3@hcO2~>-X+7ciuT#j;$qJH4-_O=vt8S@Gai;%Z`U&uNZ>`m<$@(kHa0# zl;Q)axPHm-m>*Vw(EWwNU7GrQ@rlY&9_UgauwbyAZe>@)6=RNdVg;cHyzuBOgJAVX zkQ+#Rd2#aqIUvWnMe6B{Me3+CF6x#s7b0l86Bk0nnnfw%)8`761Fm830b8S4>s;)& zk+}LM2kc1a%nU|)mpT!er|cChl_66K6kT8$b@BcB@f-|iWDu^Zj-zQ0X#kr($ahlg zcHJr@n-E>ri;e7=_!Ij$6*%9v%97--XsS+EH8?vm%=!cEnDFvfqpCxMHUo4Hsp5^Z zF|{FY4F3Sv-3-A<@*8@S%6jiE*~H7Z#YRcXx5@P#!}Be4B0eJTG4A}5ISLO-$4uhU z)DITBB)&(fW(to|xZYOsZ)a!-FE;CiI2YBY#0V^uT`k0Q^YWu4D?2;e9yUl*ga|sj zk;1|R8?W<KxQ4QOyQLMavIj%Sf+o%P!UN&bS`1bj;AMOMP9XftL$~5<xz-YuknEXf z#<CX@<aw=DpIoLAdm`wvO%eoytv*v`a8%Ha+GM5vl9Xi0tytnO1mj$F(2J6y!yx*q z90k1!7#%G~`FH<2RYGPVYRr#zEe_*qxJawLbSQ-?Z~DC~gQW8(O_=4xoOrl^@QWK! z<&i9gPp)Q#n)1#f)w-PQx|BxW*ZpuOkyk#y-wIcCI0pjufJPx$S*!<laX71L7ld<1 z$Up~({IV?@I#H22b!M%5Y8T&_uS!ca-*+w^fiuGDD}vwwq>2C^OODM{J&{tlV7csk zE%k^uF`R~+jpGEjdpc#bx!UFid#|2e5=T#z#(G~3En_E+{cRpAYAyv$^y*4G_c!RD z#(!gBA`utTNfU3%#L4~fPFh|x5?1@4FoK+Uo^+xLjv!Uhf5-2tBu)Yu1RlTnpWPp> z1mo)zq|aie4)hdkB&V{xsY)Pj1;^_a9gcAQh(PXGU+`1;cy3S4*^)VZQ-iVM+ezeh z_rRyrlA<#ebl{K%nFn0?`>*?EfL`mPs#%e|4Xvq!ICkl$Bc1NyA--uGmN28ZU6{!^ z{p^Nkq%G8UI$r)jd1vPi_xO;YG_gwU&t3T&ru~0V4R4hy6iCSJpFeDtwUx=DNxs|@ zooqZi^XUrT(JhAL67F0EN+7$jsNipY#FYix!;k98ouWHj^^`h!y%KvnM3)}Xvp`## zp#CJJT+lwGgD^$NZ0b)*a3)oTwpNVA!p8U(FH&}W0Z4baNPqe&XLUr5Clq+xrQCz_ z1icPN23-Y|9mr5aFzonoJcOI6>fw9F&OFWmw6k$>f2b%MF&~EMde-sB@sLF~&OG*o z=VsP{Y9UG6%LJ;0Mia4v>FZ}6mH#(9!6lbj|KOvtZD{Hd$6GE@A(T55mH8MrMX;~N zMvn&HvJ>ekJkKCmzQnRnlgl7g{P0dahyuc;h-<=f7oe}<e!BY*CuqJC?1jT;iSphz z(#<NDg8o6jbej3+f$hjqLVOIcf#5q$7Rl;OpE>E><mg28b`6x*F)VpPKXS-~bXAaT zBH<_F$+&|wZus*C7va!)rgsCCVwE_fG5#ShDwva<VmV#~)sMt+uXC``Sq>DFc<rO% z>w`zgm;43)k8Vi-hz`4{SITib1DbLhS&TT}`XJx-QdS6>kS&HdKWh^F8<%b!*nbJ0 z3PbOv31pusPj2j|-G!{@T<>!*|3Tr5j|-w7_0}Be30w@wj655GOe`3c5HO6~s~gT4 zPl{EZmj#3DjWpC*MtP1)rx!G?yI=PUV8ud8pdN&KwFvi2)NrjUzpJBRO1n2Pr7r{6 z3%rh#2mWp5omsVaVS%&4=bfPr44~wWc0UM<uB_lt4OZfJ&2Kum+-Fy+isrt!6{!hX zU2s%*j;`<`n}9Ls?n<P5RZK>q#e(#JJaskZ$G9LMPN<Y*Yb0SNPtilrb<3<$ras#v zCkwQ$UB~jAoB$jvVUr8t!J+TjoSqnxh?j!QREkmtOfA!w`zN_FVU3u!abQUfv0$xi zt!Wn$m@;Srt1yrb^R;`MT0>jf_<v_}9_BkG(ZWGH%%nd$IiP4*hQ(Bg)|?ib@wWuv zt!@(qmbe)LdMY$@$w58s{pFkgH2I6M)@Wad7YDzR(nTa0+{&ItDKsaGKP~QB9ds}K z`NTvWIgDOBWSe5A;V){dBr_KZZMt)4`tzj<?CF$s6p2_)YpNe^9PaeHzJ%!lna|O( z;FitDp+2&o8h8(o_!}S9t)G)s!UA>n#BY+w)Zh$Ih_xT|p+p29O{FWPw5REJgMyN0 zb&YPAF$0aiB!Th`<JM4}a+HsFOMB2qR>{8rl7<|dHIMatEL-_+u&&?KyT9Icrq;Bl z{v0nsPf1$BZ9J-#%Go|l>DY@@)Z^5r^P&HD_Di0eorF8iz~FbebwCjdT}7^~oDFWa z7#L83cT(KFw!4s($x<WZ*)1zTYJ{sG8D;pS9aG--37Lx&?sGyIn%7ZUy4`tkWLuu? 
zP1c~@3?=j}Eh}(Wr7Yp#Y_Ynispea0HIf{;N$*WI?V^g5|K5-+>Z3QSwMyK-rgT)T ze3hd+L(zGRa~<Em-a2Z%7(t<Eu~-7nT6<C1AX^Wzil6iA33KUnZBfn~JO%4`r&B|( z^7{*6fK0Cqxn_PB6t1T|AIG%voZszwW0S<1k{xp#{u8`p0HhgheZG})^l+@EX`N~d z24Og*xEya|E+W`bdr+Xr;zaT<=?HMIhEAcK7JU2-mw+jz>C<4-#bB;VX$3p!td;%- z7XcU8)2%1zJZ(VV{*N{-O}rJqQEO!ytKP<3<<p+2s_!58(>3DpAK1n_j5|l($Z{Pt zFxdIf8OihI0+aEYO*a-wR*DH5lL}3L8>at)_qoMLR=EHPJ++?Xt@`jMK06m+Cqi0% zR=eW$djD?p32R&$|4uD%^U^A3ZF8TqUi_;vAhuK&T!qnj_j`_UJI6a#q4xOwT7~bg z3SRPBd6c|^)(2XZh>8c}!C?iVz1CH5X3}F6v_NCic$!S^%SA}?-1Bxwa^LMJQk9Cq zlEUw_zGclrnN6}JDo4#H;Q!+&YIb&o5^?8PVg693b2DsVwVLjwg>$es#Bl{-COedp zb9?@=lFJsM*6`ncbt~~Jyfh`1uN7z`6WCD(MgmP!B(Tn87@nyjFL0-p2&wL$i&Gjp zx(6Xe!tmR8Vknd#J1zY6=t6$Ww7E85*{a3Z)oRJ-kE8tWB)PwqLgO^ju;S!n-A556 zRx4P~o_X*B{<^?)VSINYHcY`9k|cZe%?cVsV#3Atd2}^tPd+>^#@R54RRJ;WP~}+( zfJ9N+9T_cQ5{x<-4}OxRfHO6Tv=3rJvh4E$ZckpRwK9yA@E61<-2?M6x976%&e?o+ z8Z67Kk<}=?<~{IjZ#KWjT17Z49MMF@UR@gB&a~91r+YSE6fGRJ0%i2Sy=_Wixg|b6 zRrhQ7G|yMTwpi0hs47^w+TkNnc2@F+RDswXkr*F8BaCKlK_mY&@44BmQtWX_Y3A#t z@6)(MWn$DYiKt2<r34pZa4pcRfL+{)T$A4iJq`=VL5Rt21<uT}6L1ebn|X6Se2xfv z5&Q$knv5`aNDYlO|H)3T{7-f|UZSc-6l70Pmp}LQdP#3p|38gmft_kOwove^K_!1y zNe5#6NOBr&8;}Tl{w8B|L~^Nq;YpPhF?jS0@tBBkOJkOK^9@Um|Fjigaq86hJgX-h z$AMG#;rz0DpoVI?v-;Af898cqglh-f?92bL;)AgsHTb-e9=}W4W#ysy5;g(j#s4fL zqwGBojBX{pNPVcr>oxgChYkTZ7>YaQ<F{Nmg$2qB=p!hpQnkJq$`(Ci1ojk2nb-Ey z)fbfGNJpOs;Demo9K%vn66}VM7GoEILyI3<>{BXwuoIWi?3q{+CYIXde#|1r#yf`C z+PdG8%<Ar&1?DuuDVY*gxYXbNUF*{IzSD|lVOt&zl;H2x(7N7YyQpO8D*CeJiAv~X z($Ou-;$;C|Ti~|(sMZ11KhV148#hsZe7mfDe`W_~ZFIcdeJFqbPr!Clux{16lf7!? z<P^)9PjkvO-NC8gzxB|p5JaP6Y18=%d;vu?@*r1Q@SC_$oZ@O6zn{bDzAL*p2i^Ke zIH)VPk_TI)Ebt{%QkYxUiIOi)rt~tB^{qtiV(fM53&l=t?(%w+<Pr5hq4My3y!KGy zpM=SE)fxtg@Pj?%wJ6U|SH$T8&{5si2eM~gNjY=`8(5vI+z9HIx^}aEC0pve?I(V$ zu-$V(Ym1MQSB5nGGOJ^-8krKY%xvzaNlLrG{OrWXQ;2@;q|eKR@<i{IZUk`@P9z0$ zlO~gkmEKlBhE{Y*uT^-q6cGv!Jl;85%vMf=ibvM-D>$ddC@DUuS(~ns<E|KAIV2AV zW|A8hLCdyYfc>`naY9|W$uYIYJnJDr;MTIFlWbZY;Pcr4m+Qi@jqJ)=hb~RYFUC(2 z?A0fYmy;X++wMiMdG`++Kzh!gYcIFKa5wp7N9gT{sU;Wq@#~*DEl%Nb+7>7)gh4DR zldb5yE5iE*HYaKvsXv|aI1<QNHP?t}jPUVl(9IZ>UP0T@#xkoU64kGT{HjV70yM%C zx?_c%@KAE6lc%32B~IF*d_MT2s&+r!n(K{v--g*98!dW9-hdDPe|+phP&Y=IH>%-7 zIq9nfpD2o6S`AcKdBz`TOi<#-)KF8oR+iarlsEzq9#G?T@K=sg!pRZagi70nS&W6o z*bDRo!fpp1CidA%joC7CL>}r=zJ8W!TfRns2XsiKAqe8i6|oZWU>~I-8wE;`0N$L? zjTP}=Yn<#a2?a3(gA(23gg`k-I@Hv1{J=XJn>--qxv#*eq<|9y`V|a04jWbkf8*t+ zYFFb{5{NMpIF?olIX3_iL`;e*3ZHPrfg`6XYY69$8u8B(k>5sIC|^b!c^Rk9^CQ{9 z=p>HO^XtV89_zvZjGz+(a%d=TgFD95T;-3A4A2V=NNAOWfA7u-|E?~x9V=?6<m_^< z>IRgVTP5-LOfb3yyqQ|=mw6d}yc9K+$n!Z?`uon|twSGybxNr=&Wiw;)3d!0_>9uQ z<K}a#NaNv6r?PX7c*bA`9_00*!vzNWko*i`kre`V)a~^YQiaU_O+2I|%$RB8b^6o7 zt?|hbKYvpIWKN850mBZW2DTeI-&q%S-!~!z9n)pMV=YV|)vcKDdNhmVZSSNvw@-^c zPSKrJOK$`Xy!q2(MAtwt{1Qa9hI#!`1~m6DYKsRPwiADaL;!O7_!uV32A3I9+7;ER zH<jpzX+eJjjY#1v!Ieho(uOY}y$#XR<fhy6(qRprp|6tK#vO{RnO=BEklL4>k=-;0 zG+>-G{sOHZO9|Bo+8bslXTq8YeqlEqP)9cD7BSVY??@R`o{TyrT8$6o-E4E$Cm)ei z$E6pR&Q+riJ}9)WMpH+ACr_eKWYclqux`y%>{_?FeCRGjF6?Sx>g3ZizC%W!d8esB z8Lix^aMD?+b~xuw(u&0LCDPYm71P&XG>Zk-mmmVjI{zme_q!^8Sc0v3)d;J_;<Y<i zUrVPB5F9jGkHvVHpm+jpfRx}4^}_D~F+-kBxw92QYMScpb46YZ2u}02A!#U#=?E$0 zhTKBPQo>35^Yag>Co`?zvGKhI2d=P?RdcM~@1&iFvWEcWF-RlD>m({uWTkTu3}R<H zBokVX3%%y}fs60KHI>d$AiaEit#~*TOD0=$$<*lC4dAeLSk*C_GQp@(LeiiUW((p? 
zGYNAFk@<a1**-at7Bt)R+_#g1Qf*+os2H1?4BBG~T)yQ$s%_3(x^G-?ih-)P+1h+5 zgqM4!=W7Z$usHRlF&D!!(}HxJKMTGHp5lv-aQ=Wgw+jfFMg8*^YyJ$TL`9_xbW1=r zG~`Bz%|X&{$xrPXrw|54I#78fPw5v^03z0j5|vLFu{}XJZWV^n9I_STYbVWV*?{3S z0IxIpr(?^ZahzvG4eZ4&Kg+&<l5DL2nRJPaU(PnSq50NcXo24a+nKZ3tb0s_a!u~~ zQ!d$%&MPdT_Xy|wTzg^Z8g%Neo_SRwW8w$G%~z!bZsZqbUqJ4x(r?uy5PA4_0hC1u zsmX4!aS(yq4ZWSOkx+GTPn;b_+_U;;x);)Ll{e#3Hxq)fjISe=D=e}|cQY5VbMAU* z32PTJI`E4+aqPpz4H&I0A8dmq6r%rJ#Lnqffj2ZBffpXND9#%Pg!xZh`#e?}uXEzd zAv3E@Bw=5aG&K8zfuY?fpk$#;E_*vh7wkDZ$Y1ghhWoVd0u>?Ho`yw)f1CSAK8XOz z-%oAz{gr&>wHF`V2XFTN_(n}<sr#S1wOZ)7#Jk0bC7b`uM3&&w*;1eM{OAJDKQ6rq z?9`u}`k5QJ$0kEoshr2FwtNBjF2@^0-UFP$SbUL44S8R-*I!1V7uhvXGAcY7>%p1> z_Z;)(L<VRvGkqNtU4N{(9~J&SiV<GC2a8P@)u;&$#4~pFYeq{0xz0f?3Qk{x8zH6q zbiGIL@2;ii?Yl(ihl(z;gB|uQI17L4&3|gjv8~(v-4dh^zK!^Ze_dpx-kWA?PUgW9 zt^akIf_@zRr>jt+&hqWYj~vZG%T~So?_2F9yhvHjZ$G}L+|tZc_0ooSpIqrp%FB-_ z7@=X)^szeYg1p;?m5h6vf5^6Ati#|cEw^q*tl+gQr|yx=aUJseF-n$DBKX!YCV<f@ zQT=+5M~}UsZI7f+NmNMgtutgi%V;_G+Ri1eF;c-MK&bn|+d}We5myyQD*bls{pm!V zvNL`VQA+s_cp#NLP54cd^I!w*$I=NrqgKH)#h=Z}M0{7xn8mF}(T|#ru8oJbHJeCW z`-sXvS9yJU4@Ugswxvvv<qY>mjevlV(uQK^+R;2^j2Q7z`}=BK2-t8AUmgLIx~W(< z(H{UFkgeNatl)p{+^yrm2WqLP68o1L1FV#JUA&oT^lfJ*SL6RKp)6HbZA^<6dua8f z|NF-*=4;OUniG(hQjK;_#^CEgVE03dC+cK<^POfg&1WM3=Aowj9*y@s{wgpr(Dqds z<q1nOlE~)f9TLrr##h8}&Ysw`J#=`_c*ILW__9~Fn9U&S7nPPwAkOL$CYclt>`w75 z3w4UnQ#X@Lf9+^xxJj7w(Ho6azJ2Jw;}6RnoCAYU9dXY<Tl$~D>Uo0oNX*YhE<n1N zPp1BAGXDQu!sl#D$a5;5p*l~wB;T{2zvqMTWv~~0Fo(aNA7R!CI0P50vY}SXZdvP3 zLOg^~Dbdc6pDIm@y~%nI?C`Y;7c4`MN#b~ofxx?MWKQ=27mTi~n*FHRlg+c0VzDEp zu@{}2uZRx&YY_v_aD$*3++1m(?U4?34Lm@!IvK7i=hiI`GoqZDD=3sA@+1e2-`|PC z9#=**M9eIgiZlRI%mMGE?1}c^2A`33MRRRFYA<qAKKGST+IXCY*6p74x+Ym`#L^AM z`$^eTw`61H%}epkivNCkvi;+g7n}8io3^zRjZl7)M~#PB!8KhS87J(IVl=96fODY~ zt3h-_I>q*jsHRK#FcnHSERB2KEt!xWci2B~9M4!^Z)n}TeC)EB5%j5fBpM3!FwkXB zN~*LHBR`(r&AvDpM)xkIJAm0jtmdU5bG3J?`pKos?oo~kuq=BsO;eY>F!@cgy(E(= z%eh_sE&sLRa=4w)<qeVj>Jk2zz=Nz8ww1pyI_`4)A}l<Np-tJZPmm{p2RT7e5M!Q! 
zF`RuN14*K*IGHr<<Y!$DEwA5O$RpkGx?+mx#(Z!zctT;^v(=ui=WmAO=;qlVDu>OV zPW@4jO<+iaUgN_d63-+?L^o)pf3-evUT32~-2{60cPU=&`ej@NqHvx$+egLD&ON5% z_BuZxkh=EI$bL984|*QMzSsCnXuQ$?UHGc}zyvq+V!XWiSo6U;>q6YwS|N%?dw<AT zM1Fnp>B0Gwk1TFte(HMPTCS`?dyU@sJ><>%@Vq_$70LQFg+28px#yN;f&+obFQe>M zQBBx{TrJ?R_x0p6(BpHrv=#yXITxbV(Y_q}AQWPuQ^~K502=Njs&lS)J@q;){fR~w zF^T+mm*@c)tl1GT7S)|ZFM&vGJ(fiwfMQI2!SA-|L8*_H@Zu$vo_k%>@${-o8w<oB zPtvg#$ti=+osz^FrjQYU6Q$ue$#(Bbb%JFOi1f3Fmmflzv!;n{G-cBV47BOD$&h)E zP7i4`FH9g+Stx|XuT%bvcHEkLLQ(r&KUYbn<W*mxoU`^uSZ?~+Id(Yq`u%pi9s-zA zx~Ei!YKRNII1vuMt}itF6tSNI@ltIM1_JTEx&te&4h0)3GM-8MAUJk**I<{rn08(} zm-o)2Lj+w~OrN^889jQdvYewKf@|7gm$v4Il%|_r!sVHMlk(vpUgsgw&s;<0*SbaE zYc_&ApE*BKdR;aV>H#4E2s>bw&aP#4@#mm<dcYQu+Tnp`jvmEa0r@U?I)mXhlJIqw zA3D^G-Hn5<>CCjVtJ13OulaYhZ?GPJ%_b_7I<QH^(pg4BE%4TS6?n(DSJ-%?38Skm zMgQh3!|~5lme}iaSD6j}`Ohw5BhH2I@)7S54YN*u&QLEY63GU!)H-(U_!j<bbZ!qK zq1Bsk#5T*`M3HU0T7>9#t>@K+LarMIoump5u7O#vXzFhMJA*eJtKe{!-^{P#dxs-6 zD0GQBf@_1S`kCSj5KlTh3LRf0^xRC}n1eAwbrdTTMn4N^%e~XIUMEdxPi?FPJty(a zhn6*)#9HFmdzAIPH{cG=a}Qs$8<oazTW0y_?DlNIBA`bR)qgJN$`xHDgNfICn}4$( z`+1zA4#$rM>?e=zS*@AczA`AFKMAq3GX;Y8=>r@iFAUo5ubK&F9cU@D&|$MLsIMSt zzm$FjdhoLap4xWL78vOBz+ftc1S(`BAd9yJ<z^o-<hMVgkPmh)c|ae(v-c7Br6p!l z2l>DTbA@SMLvw}3dcvJ0=Bu09l6x$g=9sjSv;0EEabtVBiJr300)gAo-k&d8&+eFV zjn9&<&tFnLFO1kp0{kvuISEfobIexC)xTqxOPliN#j29{<MANxKyFc=lR?!cG@LbN z;*Y^EY|NY7&mVS^hSZV%>d`-K7)iLKsi3l-UyDt-{P8gtZyC|Ne5`1iE7+W~FWL3E z{%xhlmT{yWQ$4idP1dlw#_0T}ht5<)lWQnKFFqqJ4^T!B()F><W{1cEQ|O+dRYHYe z+ycjtmVsjLZ$O^6<wFFgxqS)cBclMP@sFML3$Jmnd9Ykzi5!h*SBxA8Vrxi4$?Pp} z0W**kYeJb8ggxetpY%h9^=M|JP~i|Ps=$Up2QgwRAr%RtI&E)zkMvhjeIX_cjv&3o zj5<k}mOk;@bF^dGMS+nsNW2%l!cv@#J7V9hY651JWhpAzwreZS^1taMHpUp$tB(~F zTDp}wJSpQVmux7`{&$(zhdKHlnG|j>BjF9B(zZTx+^88|v=lCNELesg1B5uMx^(|F z7^bG}Eh734GFw<+P4g3eWk3;QWN4V?ZAr8Vyon+2$23Oi3s(fneRbY-{uQk{tl=M* zxN0xsk`l^cuxg1VZU-)<H3DmN@j}`HsV|0ECxR8O%d=3*iVHTL*`f;9!m~E$2uGc# zi@53#z1xSp`pT6rl#|yl)50^Zfs7tS^6nQ%JdhbQXMC}?)U<AtLmYiPv_VmpQe)W9 zCT<_5qu}GgxI4~B>S47u(i%CD=4erOaRkSt{e;W1yNU<Shd03`yZ$}p?N0$PoimyX z7{{uF4d^z(NH?P6wrJPfGwxkK2A&Ck<BCeT{Dl`1>O;%rto-%0bSF4==ocSk+oh5G z*Qj50Y1y3~%CWoHc_p8mh^>^pS#Nopg&qS^qNr{nSqZ>XLyiP)goN3xb<4^-;1JCN z0^UtAdV1n_<hBv0E0$0Q{7%~>h4r&w64~@FHdTIr-@HQczYRs?bNuk!4lvt-r+#J) z{}386b4+?*jACXMv+Q)6Z?}OL8aN-YHk*@$#5SJ#gm5Ijn`?Tjl`?VOc!f9hW&i_t zBeM%$e8ciFeS$3`hEACpzZ>!WuTO~M{fJU(v=^n${JmCInuHIFXBf$>9nIDSj?N}y z2FI3M{_7sN{h1{jJ564SU$j@|F8{vQDpf7GF7A#;OL$2ND!ZU17P$Tfe^j`dnNl!^ zhM}cJKjG9DvzsP_Q(n;!Wa*Q>DLw4WwVNg<y3cA&vg~Q7zKG2lC?GFw`TT8tDas{H z`Rh;Ga`	($kB|Ql+u~`F!IS`!DW`>eAK!U7eOqygpAs*5IjC`7gO+K=5Q;4Tby) z{jAdKe0@LplS$5WKN0$j{X~CUL4zN=`Z57CWo~8{T4PV+*cGp5YnkZOavQjVTghlO z+-BZloY1-Hy{DLT?B<5^S!r4Z>BU^K6&)u=#r4Am@}Zh20JjQTG5QKvc6Lzx7LI-v z3b~|IYR%)i!G*iTAP8el_LTRnPqdrZ?{Z9`m<gpk(W&G3IOh5fCA%o^r#WB6$Nq)H z7gGv_L27nc+uXB!L!B|H%b7V)$|aZ%hbp|JUK;_s3rqi}DziVI3)CHh_ILW~yL<NO zyKB*-zT`brL`ueM$}Fe2cdTCzrCsNeN#?N~vIiFLRao&Kn^ztvv$vPTn~4cs(sBMj z^=NNUuEuld-k}z)jc1B5<d^PXBgqT~3FNe*s4YVrKZr*4fqj@2ndEhgAPUw9%;(@k z=B631kUVX3uH-$Ao|?gl&#q~|O0!@ZiBq-$M~+hV)W5Ci5OI+j4j<RR0C&{TjDHr| z$7_%1I4ywnespg~6WSMjecD&NhwNRiIg36*1n-&JkkOg{6aomrRPhNgAx}h6g-tOl zU^Mur%oF7X@c2Gp-JA<}vSB=`?&S<0Fx*nSi59)F7&!kbXf?SnwmR6y*>1AQqnqZU zcT}VF=D$YY)^uvxKZ5t#2U$15t~YC6gjM7`61c?>F(Sn}=_~xxx(7`F%`)v|0iXd+ z%-|RauEIbMcZfE&`KLpKVQc-tc%l@?MppzSLR?lf+b7`B-qLkt+rl#MlUI;q4k)^; zZt8&QdXPpIt&XH>YSPEumW+kTDkyj{k<N;Jo?&ynG{9rZy2UM%Bc_5$8tb8JH@f;i zwO9!gJdh&EN2aF5OAZ6%lNv^(#-a`y{M><oignaQJR1AY9RzZRVl8+dtJ?-hUOVww zdVdAh!tcJ<!Usx&SXRq#_>h(~aL-;51L>?<$f`}OZ^FZs4c?MN`7aVM5S(V#p9w)? 
z86JnB)C=WN$4>}09amyj(6rDO2&JkmR#~m##w98i+ZpWfKs`;UjtdCPj*62uAt$gA zU{1aU8nB@<?R1Qg`7m;K(J^qM(s8=SkX2V9D9NG{jFgKyeVE+G^9UJBe;!#Y>%2iM zp{SX4{(nq;V~}J`w{_dLZF}0bZB*Mfrfu7{ZClf}ZFAZ)x1Z;|_j|t|RdpgxL`B9q znYq_qYwwk@HX+AmF8U)G>>lVZD0SZPh@RUed3*b5K4tB?F8YM)-F@}3ML*>Y{DEOV zNDJu7rDWRXm<B$HRe;Z~aC0$=vs2|BylpiV1V<A3^0gxAyB;hlyo`<{ED~c(Vzg8& zNiG;57=kJhR_TA<d>R3!Q3xPbT!+U-*z6Z2X422GG#zMeAu!ycJ_20v@BjHY8kTiH zq}CpQen$cL>Pv=y(KPNrk@}yffM;fFV736#A9QF`iux9wzhV3-=9Io|elcxl@#N7+ z1MxXqqp*7gqnYbDRBAyVxf`%B%Y|aI#Dng`W}=u*xtFwNjJcx9dSA$e0YB`vw1q@3 zr5;8>62PAIf~$ZEjRNKymnf}&%A}|-Z?I|}{O0Pz<OUiLzvRVv#5RIT&u$^x0jHWo zbCZAFG^6@?^Fd;xKFA9v+uE9vZq5Tg6@50t!IU=PVh%{OJ-vur;b2;JqMi57gq~Q` zhbaA~1S&rs|A7v1dc=NahQ7b_{S3tNJn$8aDvsaEOiCtF^QEQb*GZ+-m#07#6{PR4 zCR595mL8%PZz__L<#qVTdKj(zj`6SmnTubJ6vi*_R0_{(-e2Is{vQs9lCY{z9~Il$ zo}hxplGDvs1pB^GjTDz{(YxT|Aq8J6x(fOAv3eQcDF~$mSGrUn@u5UmrtD_<zWX$n zvpEJz-a@`yYGuETs;(`S$h1l?p4fB0Sfw^Ai)ZL|^Y#}F^wul+i`HoPi@M7mN5TJ2 zVIO0*^gYB*vSZgR`sVDx1Fl@{|01<$d~}l$px8+oSZkZ~)UtJ4Qkqo7_s?7=(S<!@ zu=7%xwA&i*)l;mAp{`|4KtO1sh_0ZDH<dg24_Enw=QQi&soO(ruC9)FEadsnFKH<V zo=#C^a`r2uAczGjt98F^y)7%UOr)}p*ot+;qEX7TtfwWb0lH5D{7w|y57++MWU{P6 z*OGeHG5V6d2^$WwQse3c)gFYWT@rwKbo_51|F(7eN#QvDCB-pCsmfI!(6C<0&<WT3 zBIO_g(#^ubn=p`&9{%0vG2$h_Enh$-`S%S-h6H&!(8~CVG~b^O9}%ZfT(dIOTvAKP zXO|UW_NT#{K07a5-DP+jg1RN)P*ifSw@1UwIi9|%cOS0)Hz}sSn!v~H)kCP?Ie7xJ z{sNFC`S{P&=4lmrTMSzrV}J7CMwWl)>PB)IT3a8gvZn6{#Z<h&3oszm7a_(a{2>q% z3GKVZ-0g1#X%G}gX8i1R^`R(2sOFd#EPzs=VW{<Ua%q>I#3=)B$~rEpBoY^>ur@;R z{Q-PJJ+g-b0{h}t%$IBr6;B{M2Mf8M4A%ZY1&1Kt<KlxRcRTpB!$j&g@+$fS(rhUv z@iDRq0izyDKYxhqgTM>X(!6Wn&tWx@Skj^ULhfmGig`8NjH6D-B#Mrk%@V19sK$jE zx$$Dg$fVD|_!&=Sse50mq)9uca7g}I9L3<#%PeojTVx{Xo!wO!L4M){f)63w4#rdO zXO-&C+zPF-oaYx4HnBOa9{nEQt7MYr;h$CVQ&PFI&@aF3>C0yemQ1v*{xh@u+P%O? zUBf6H6%Dq5?xa;6Ko1FM7pez|ANaatwRnU{ZFDV1acM!IkE)lz4}k28g_4d3VhWb~ z81)uXumjuHba@R#J%qnZfXMv|*^3$OATOwEtKNgzV;~OpzCT!b>}%1wf$MC@3M1bN zn^j{orneSdm|I1w^VJB9AmH*+ea?ODX80oZ{J*Ij53&aB>)UwPmz8L~0A=pYYp+CD zWz*x&CbgU-Wi)qcw#TVJKZ?H<CuoR!jtrR3XWyz0Xm-sNTj~yL%4M_V<IfMVq$Yx# zf8yQfAzIze#S7y7#q;VLu6EdbBWc$8QY7ms+PH)e!Jq15t$@8`LyFknKqwLKy`?~- z$%*urKY*bhRY(&jk^ta50?`Ow58lNmN#svkZz-gZ=nWQ?8!OVe*%n@eN+kpx5Pv^^ zcy1VO04MgIHqq+LJLA}Z&zft_GHvqPrpqUteh0dTShv+nyFO+p7-Nuo(2#4!VX*hE zqhX;J_!2g(L_=CTbN>xX&QOZnZe7|GcNCbOb{bNcKr9p$t8>^=^14HOAaQnWM^PtK zCQRc=L9=6sK8<!g3tiq9Uv`Eq`-maUsan!Ssqw83De*bP&EJu>Hi%Pyz6)13DBEMB zPn3WK7B*wSCPAar$A_y#8J#!&KQbH7=du6bQ$=&?{ZtA@6t{>XXYC#<V9KeHR5yYJ zZJvmUa*)hINeD=iqXY^)`vov)9A0pQxL>$7+&8Wm$Z4?JaJ;IabR^6VGY>`;TXuaY z@Zx)X)b{&d;!gMs8eaqsxWYQT#`M`YJb~iP?wjTtl}DTkBQ`7R2*s*v3|s4k#5MXq z5SLnt{pX4JcY!`-nn6rG0B3Y-fm`E_R}vsZ!)fDmSp!KiX9&Ij7F;)u#hL&jwYK~- z9qYa^wb*!SfSlO47n7vl=$FpKtJ*CjCgF2%08((bIkX8fb}CSm{KZDsfKeLC-cC#q zrL&NH{92a5_)$POq?J`wP}Pz{k3C!+c1W)!6<~*mFtKws;dF*kc4y_$u&ClVR2_A1 z!j=u^WmDXGP}&x^#_`wsH>AOptLfg(_1LiBzHXiWEp}b^#daI}4Y0<NZMELUB|n^E zy!})pfnw1H2a#j_5$GCgV~7jE9&D&q8eoDi$!utYO;fpYu~W7B5+cFv<=A~3?*^~} zb?Rj5IDzQaj{f@da@~7MY&bsn<$C{h?7A4AN>%j6dG64cWx8-~IgnU7z2s399d3)h z-_pM^<F)s3|G+)T;(PL9m$l}rmTcKVE4FvnN`Pmn*o?Gc-qlKQ$*Zfi=|HbY%?jLE zyb{g*-AM9P_~-vbm{3cb){mb63Ey9=-utUVn<p1S+@nT@C=oV`fyEN@jU&H=Zy%_% zOG$;3Zb-ulO$|R+FmZ>TPf9L7n-X&m!&ZRoEb8&dmjV**wm#4$K%wB`!5?kEI9c%7 zU^}`TZ7)$*H#Xlhq1EXCeguY<NC5lGRpi~3I?`U+P0L!_9F2%9=>A(FiP)}YRNg;{ zxBWVqk>LCrtABB6F~VP+`wl|>R*rxWWPhqR^m`!4Zycl?#|FiKpeG3MP2!tE26P30 zfoSTatrPFGnRZ3sr&Iiryk-4ht&ey3X@4pQMhw`M!owvJm>Qa|jdnp+D|lSO<^gT= zY7TIE>IZ^(Wz!z0Z8A=<?kQBUs@bR;c`}=RRCtemG_Ym>Yd{w;aagnr!Ihduo=nnv z^j(IQ>!l6{tqw<o=^*km#Y1|NBuXeF7jY0~BDP-kypiLG7v7=A@l3A~h`Fj8f)pdB zQ*B>WNjirig>8-mfCs1!u*{|x`YCiTC&OwRRMS<8jeuH`grb~ikwpT)23LAx$*q1y 
z{5t7)se+S-2}>^1t{ub<%%4g;Us^b($@O$A$oB+-#bq(xS=OHS#8_w9o~|kzws6_l z0~7x!OuSyg?=U(2mPv@DikFVewR3i1!UR4TFKP7rS*|cqB34s`c;FH|tPNR33Yc`b z<P;P=|0VArG~eEFG+Q{)m#j$iO=F`}3S!^(RntZMc`lqcRQtD`X$037O9z89#-)i_ z#@1Cm=5bYsO*Z^LpGWtjq><qJ;X2lbQ80M}pz14e{s6_UT;C|XceAH2(`d2G0W<$3 zdl$?M=l<nNrgFp(z$f4*2#;ZPYG0VmasqehSeaN4G^ha0e2B8HU_p0Y$Nd-h=0-dd zg8@=16rQHgQ=osP-lSWtFcW24q~$l2;N^+^pVuUQ16UHB%N|yGxCWx=@)jy(cv8|a z?D}f5a)Yox@?nfB5-l5r9A#VJ^YoPYvO;lCBGvF6u$<dhc8LP6zJ^H>du3+zbCKHf zQ)L$Yv?&NjP?{?Yq~W3K2LbHlnggz`rnGC)!z<}#j-ImxWN->RbkNS5|4m+gx<-`7 zW-7AX$88)P9@;>gpRu)aS!0{x-3G;}?1+n=S!XgP4kh1wp>66SNSa*LT4B>?w?wnt zTu?(Al)1KcT>c}2{)fPfLk+@Gcm!?iU^I~#&9Z@Jawz|`*Uxeaaf04`AM9n0!EcOO zW*SUlRtYnJFU!p$FsNTodw5GE6~cnd02Eu6>R`i?@Fp^5F)3G1ufW=@|Ky9@9DNCY zd^Aa5)j=hp`iF)xg>VGplhrIz*?`;OaFMgC@mTx&9^BZC)~M@7qj}1|Kw;zGz{x~Z zaNXJV;(P)7mU!>kOEx;m$x=3o<z+B$6b-D3U})t;?3sW}3iQeBd@dI|5E*X)Y$yfc zN)%jS#O#gkAaqe8{0r!V&Erx!BTxKyz3M=<K7MOnw6P$8X+KktixJ7B(rYyGu(RjV zH&BQs!-~FZaCWl_zYKTg1-zt|he<OjQ~km2>&{B=`ipIkP;h<uom^2yw#RZF2IW9V zpwTX=b!*;sLZ&?Ext~*dypr!?S>_nsT)O>5V}7~!3<bZz%GmQaZ(4I4?wy(!vCmzU zw!+{db<#wk#Occ&Fvi<i9R3bZ1%9P}iE5eHhs}%<P*6ESqGv9rgdI(YcE#BuQoct9 z*YGS#7`o;<ED?@M@s0(<g(m{nddxMvvV(LR_Wj>Ge;mJW<(%Lcbw(mKToSMi*LH1D zw*MtJAT%N_7Wf(dGutqNPFbN6HvcdaSvfoV7ODJv?JbCjSJc?JxF8%7EpV#6<-0G@ z0iG!s7J7d|7lW%`n4OoD#RX+aXBSnE@X>qO1W5n6ceW91lYxtgP7psaaeIn{c>SN> zKrui!iM_wck7ND5#H$n6?jZ6m<c3mF;s^88D+H+ty8}7*#*c5(veYRMy3dxUWlNBB z_1bi^WDa2y;l5|FHR@-s8L^JtW<?7;+b#8_T-kQ(c5nGtiu;jX+&6qu=Ar-fkl^@2 zuhP-Op5hq9#c5ulO~1t_AItc${rk-;n;Q`Kcu|?^oLYH2T=dPRz;R!n&~5*l#B^VN zZ$X~f#Sr8Bppy%AnUjp^TfeEAB1(3Rqo-C_#LcXKj*mzC!>le(*IX%#zoltaIQfW( ziN*EkYcYplbdpmoJB6qn@!YQ!O?tN@iDH_g_ci0UJKpkiRvsTzTXhUx+*sBpwhKJ! zeSF8??xKH{(>=k$sW-%Ctv)NnzRF!rtH_ndP$HDEzCO=*sR9cE2jUgTn%%*?=%wiD zt@M%&dADRnx{7aC*%KpIN&WFuL1smOZV(>bT<1uO(%}?p)UVfDM(;i>%zUH$P<*{W zS3;>7_NaP<wl9-q)F)#%Tkv=&Honi<xznMWc7^)5VfyjTI2%*79Pv?#0sKW=QP>Z@ zvvJ2=-@tF$75)Y)AA`w(f3NauxpZs{9gpf9#Pjb3oe7dY@1Q#|t4aTA1quuTxI>y; zl8~%Jb$yF3FrVsP=a5;5HFC|QQB6G_ha(vbj^{II8Oy`2@g))ev+p_oNEmQy+|@ns z-&;HY)I32fDO&t2Fht+Aum3fvxBs`Y>a%893p`qN1>AZ&0I}x!pTXe2RQ$;RwdtM7 z+lH^FqM)d~@CN@?9Z;-Oi%yq#Wy~idB51TPqJ$u^wW?lE)`yE9&w3b3;R-D#dc;d! zA5jPuvCdkD>^1+f(;7mkZoB4@Z<S6wK?FMKwGZZ$J}8um+P(t`t)kV*#a~xAI~SxF zC#>z&kyVIjB1&oP9_tU#N;h<V`R-#FY^@=>a7%%+?g-F6;Z$YGfAOj?sXn&OrNKrD z;O|-ynHki!%fHEO)OX#m-MopqsOu!X_1e1rfw7V6TOd8^Be$9?bP0vf91c#$nWt=0 ztS<~hwkVeol=Y4cKVPndFakAM&KlZ*us$k^wLnzTn}~HePByJ|^x>N)ie+8yzZ`lB zvIMM!7#(Q{gW(hdkw2D%_IJSxGi})U6mtkNqn$P?KeY^GP|_U3P>Wsy19&~3H3|uf zc1>^nfeu862Yru^UZH<cUkj|S{4ct@i|aoeFLB_8uO5bBFSKuvWF!uf?r$-+sEhe> zF3gG1maU_G61-N2<HWf6pqAuGL*xl+vbL4V%;JkNI7}S|UANX;fgr?enVQU|!btl$ zAg~KKN?9xNaJ`FQjmvx1Ic`@1gCY(l`S4w#UV_~C8g04?ujbg<AJ$K)adiQ$sUKuc zfA_HQ))ZZ4M(1pB@ReI`OJyw9D2sYiO;hu&7d=p_laUjOjZ}Q<G!V(Z6N&Frs5nF1 zGp*U1j<^@_R#rNh$Wzc0G{rtySCsEoAk)czn*-@1zjnzx;8);6UrR)8_He^rX&kO! 
z?f?j!cY}Z+{ujf$b1sscE?&Iuf_~eryvB&V|EObljyudP4P5PdT_jdekLB9X-hQka zIRj6E7WOT+89IP-@ehGu4#eseh6g~zW0K)nFJl)I3soM6!&9OuASe&FNb*RFT{8Jq zG0)W*rY~cP3O>?%g7=L*!%gM(%NhR+7Dlx5=M_rohq`$lhZ&^*wQ1^c^JQdbR@-L> zQ*rzo4rU5b#p3!fjCy$RZU}9%B=;M>t4PwfnY$@e6&+f(Q8)jW0*Y(6#>^6)<?4nj z;GD!9CFWB{EzE0<rQkAH9YuEms#c+I`uy@JMSp#PZWM-Nd<QKTri|f%BQ|^I9$hR> zc6+1h4Y@tG-It80K#YlStwwHq5a~Wk;jzLF-2*B6>wK$G1ev1BBU!zIfHLcXNj9qA z95hT+#1%ALk-G-m;`3DALfW&lIM~m7jZXGEYU6Al|9al}yGn{gujZ?h=-wyC*0>~- zLWRtKTuQq9317^5G`Y~`fX1-ycv47mlz1?Kh7mh-E<c@S$P1t<pp!<NjBt2*|7Zvm zK*OR))A7rktcJ<v2L<@yw`&Cq;$dpH6_vgC_;=Bf#H;H;yf6nz&I+ybg^#HaJ7G05 zso(xo<%&rUu0KB*Wi(>=u1cOT!3>G-QQjcxUP*}-x)7P+l9N%`0A=g@TGWo;R`NdG z)Hvp0zVHSzD|r~zJ>rtJnZ>M=auc{7rw&z*Yu85-zi~u+uX<<c9!rp^U5#83jy1K- zk*sqF;SPTgOQkquc|eIx$EQq!d{VCl4_JoMwfEQe4*TM(wh7dTkbnKM)ff9pMv>Wa zGo@%|`$r$hKOc2A^b33ahm*w>s;<Ou{>C_)Htn(oXurp*tH`{ci0X*VxBrovhQwG$ zn(pU2?{*Pp#aU?H1r#nie<qn5_W}<NdaUB+&cMzKQw~N>tivT$Q$pOLBdNf8vq8=; zcW!@4<-LHZM}gcskY^UQBWVy#ILJu3Qm1QP^P#YTV)J&&xkUM~kHS`IMJFCuiaHub zNJ7q>h~1`<yTmW9wMf}qgA|FwxK-LBU33UE1;?6Xb32fVOsuEW=w%JBFfcXd4pqe! z`l&<bFkJi}8(G8wJwNs)q_@zfxN@!s1Q!n^%5UxdmI4FqbmbF#1Nq)aS1)~3-@4MQ z_iD9$^QU;WXY9~=u91WQ>CSIbHGnm>Jpy8Xs`@?mNoW~Z6djoWzd9dv7-K?c$t$p; ziBia>0DZ}WKWw{k<@~#N#h!5gW&b$C$}`3^O>(-^o__3=*_d&K)H>M1lOpu~rgDX0 z*2H>M(@TN2|H#ta^Q5=@Pvw7DC1V?492S16>CZoFUdRMhBJ7tOt*ZPJT>yL;HIHho z_LSmRx>-@~ZhQD{pv;Vj6j-J!eV6d>TxZJA4qd{+BBq>oi6;%Di#|~wG$d5qqDlNM zpQ^mrnAr=-BS3(=kcs+%Dwn6PmCm8cPzT*XI}j1LXWT9^@cRTnmKO^+m%}DYu74Ia z|F_zcZ1yjqbkel`F9`XUdTC;DH2X`EQvWncSYyp8YLJ2CGPb~$D7{!<FF1@GNfDk` zUQKjb88WF=CSqOh0vf7Nl_I89EoWxSNp{CwRUp<srEEw?bc>NkBjEFs@!>+iH5yn6 z>Q_to2xRAl?$-*W;NRxg?|l^f?o>w#LF{*oM8vNJHX%_P^bQ4{dzGUND@dT8z1*HI zD@gZXRGs<z;T;tpNA4QfOx^EMR?4n+>AQd6`}?UgU)169Gg=MpNAw>bxS8pLCGN>? zs@E4IA!qiiallaoLSc|k(^iRSdn3x;rheF!7T3I#G&LU+V;2$2kCzMN;^->;dgOcl zbJ!|wdTIx^2qnKbyDJSE@7^&1Z+F&t+$er+ubJ=fZC<3a^<}j+BXDO7cKG^7Su_$B z>jcC)sxYP=R4}l2>)`nD$D%AqjeCBxW1a`}<!ruqU^DY*r*@Op#QeP1rw=HqU~g|~ zxX1TC!2$hn8K{Y7f?V?CFcF!otQDLjd-0%71tS6SvY`2>5+52*;Gb~QO|1lS=$)3R z`GYG~Ij}IESDcF^UMiExzxhP7;HmJI@JG=*0DApa?Gs25b1X0zM_9e0U?E@v%Hb98 z0YolLLX@f$=<XR)NVlI^tqVY!)(dK3o)_nMQ-lNHHr`8;7ThmgBgUJPIgXP|T7MGC z0fjA<8fSx6O^-3j8-wwj8Ho-G>_X5L8wpQ>%mj#f#<n6dYa4<I{t}FmH+^10{lQ)G z5FjjXvRea!(O?!lt%%=7*O~vqc;c4KLvxIyivLn8ISVJ`Z%?cm_s~pq*Yf3}dM3vl zNW!nIi}cWO8s+NE;&d?p*20S*h;vU1$2UOSCQ^#_$u&3!YU9#XrA@%E_>1rFwn}Tc zd>3fMzmnsBazF69NU*=seB+o2qQ&h%ffC0~$s&PD^OEEnXp*7>6reUxhCtc|sGyZ6 zRzOH0K9$CK8JI(puqsrJp88$R&inAe&;D6B%TC5m2rF^;0JF?`e2Rs(fflGzP%w~D zvK}c5P6>vMlL(MAd?mHm<NfV;Q8hGuWUJ+(lz0VsgI<mz<H0OZ6xP6WN(_-JK6hWW z=;-?PIB9EmM)8?_tdw~6E^$*}?@2D2Q_e>ktG9C1Do?S77n^<Vnz+e-bJ+is@*d4| zUiqDu`ydAeK%v8GWjzjp-Usp_UGgR<B8>;FQP2P#DbZG&2ceu0KU!J0h|d|9N}XDm zSy?zp%l|k*+9us06%~6~AyM^Pt%gt5C^XsLLyE`;6EU3Vn%u%Fser4-=vNhRg<~ZN zw2y!r0hbLmL}X?3@|%By&o|j#2d0X+2UbKm1gC14#mpClgi~~KF}(y!9=-#jir5E9 zUONR<Z2TL^14Ef_lK1=4A7$$V5|DTs*Muphn=jG46v8*BxGeOpzEmP%DUvYjs##p^ z|JSXIeN|9v7a*L(zRaW?UzW56k6ZNa-gE!j@O(+zw!3#(0N@I0g8ewaNegg*#%K`? z1n`}A+db-`6U>jA+!6^D?75$TfYKh22}QPg7$$=v-1&qJvE_RADL2cFQ8lN+mna`{ z!N_EcQPG)h#SEGT5ERX6bLcKT6f}EVWuG^DquoSp7<fu_eJ4}~2G^6X$3=HyCBhj4 zWc)>oe+HpXzq`BEkhxv805g$?@SmLFl&=_E1ChPC<dy9Emj3+?o^)prwjNYK2qI(E zm^Ao|bA=|K4G|0g95-LSrxw_c3r$|^U{ToYG?13gNeKfo!Z=J~7d)ijL}(sO94kc0 z<b9Ms&3}t=5(OexEEpb=*WErFW=hQV7@(5~+50laTD`%9u>>1C$`a#6k)qt69$*C! 
zZ0Rj;uzy%wqtAG6Jd7>sM-_dm(2+bF|D=`j$FiRxLoB@!YEc7u)<s;9<Z52?(7MD> zU`EtaTEd!Eui9<6ZI6hb?U+1l<s0moctp{S<0NeU{92A+iga#iXaeI0ftCp6NR!)@ z3s{IbU9L>MafH$wr0+eWv(0Oyctxi>x5@Th`}5VNijDNbn=#n}x3ggnwwMilOSa!U zsG7!v>92lcjoGc*MeT!6<=+2C(;IFOC+rIJXZ`e3?a{uv3USuN7PsUY8Kx=`oh&e& zgh$C(LK&xzA4p$G8p9GYORg6dN{$)+`6~DU&y+H3Grn<}$Qb9!MK)e7x)e$?4jt<< zs8(MU<Zgu@3CRg^&g5T5;(#t!PsJ=P#*_-H3_fLkuM9=>3~&@*(gz6E{7eC!4c~)# zB#J2@PSXIk#6uDIwS<a}qU=)DqQ2@nkj4Jv&w&M>pIxq+Y883_eRQhzwkjz&lkp1{ z+dv8xmVzGS-cCONxDzWb+c=_toAX>Vm9+N*m~lsS+{*Q;nPKYilu@*qL#77Q0CbPd zPVvlnOr!hRRDW`D<-vkhL{-NY2V<**7arzK#gqH7=C;<V<X3vvQBhqk%*aJEk7Cz< zu`=jm#PTia^7-v-pGLq2T{7QEUJ4`eibPLFOEoD0;u^Lu|Do7tx5l$i^~Ia^@~VUh z%8NURRVXNOa|)steMds+l~?o|T|AH;%y1Wp@KB{hsMJPhya`=SQFeds6!u+lbl;Ew z0^OaRsF0YTd-Nzs=w|LzG$zW&5?oE9f3mhbk)C8r{$!Q??Y~rG^w$0}dnyf75t(ZT zdt{KB;&gLcK$g{vqYr#K#nyt?ezOtQRmvq3f}=AF*brz$zUeTGWcA_u*N^CC!AzRO zhXoBBYjCW4TN;D{6(SuEFVZs|>)4kXx+&n<8-E~X1oXrC_m69}Qw^15_r4K|`Hjeg zel9$tq>@w-8x(SzvNe+cITu$k>nOR(7Ie~@vR_gwPLXo&|1Im5oqvd!J9=@ZW*kRT z@|Pm1HIBsSiM5VgF^*hu{`dR)dhxE3sXNdLgBIsfiuoi`9FFTF^`vHVJMRhm`ysAs zW{gIvlKo)`DpFI#2!{PRiRy%&{t!WR8H5=mMuOCVrAnyI*@G|g-ECWtU!*ETk}t_i z<5&kpVnwWQild~qGQw`8xX6^W!UV!6DG=G>*($u1EHPJ59wCc?B6?&TDu9&aet%#c z%=5;o_ydUx6V<MwdR;nOPX4sgsX15R_F9L&-WUTVw0PVlen?MqqVb1){&>gDGn$r; ztQuUqLBo*MxhjHn-*)SIdVzM-r$}%&!Zk2jDMu!d`bYpvmNELEElHnXxcRTQQYQdR zHX;-xitn0aJ+_hyc+o!ol7p*gZ@_L)$ynyfgrJ7#Ji7WJAzg?u2suKM6Y|ns$IGv1 z1~Vy+@<i<s8Kszc52_!8LIFSU-jHJ8G9c6EW6U*L8@nQH$m{f<bG5fzHqI1Fe$JnL z3Y-6<!tMVFu*REzFldA6i5w6ocOoW)8K?abEj&h;g^wb`mE;ZUj%ouFk1{pTgWs`j z`jaIK(g)oI6v@6WUq=+!xACgVb`c!GoOY74^2!~4I`pskN6hk9UyIx}SotgqJk@kC z99H30|6u@6*vO}Khqg>=A0W70u}I>@J`kXsjJ_5K>UoEHPS3E*7uKQys*R*{CAWtb zJBKQ=Pk(*b5AUhN){Yl?ue}|54BOyM@}BHpW0>L`J@yW9f}_c=yK7PNh$O@REE(y3 z|4c8dv?6{cKiFt^4*h_F?8)zlYqHYb0e{qe;@AULi=6R?01O)Vt|iqND-MHycV;w2 zM;$hG6G;!um9?a<03%Y`X4K<f=HJeaw{bbZ!=z@a(>!tBO|@aH8d600I^YWM6+Xs5 z+ZTzVjII>a6xbNT<j&L>fFbAsneE>^nT`9<@mR%)8s+Df9Z`dCjuJUk#f?gH?WSgq z0%W4$TIzriTzTwB(duSd`qJDfm;1Z&;wkFgz)?cupLe13EB;{&J!jXzLLXuubl-sL zXSS9r(?PIUYxcZ?W@U0B?vmN07CUY${Gggd3Fq}Xp4xzlAFXG6d-{TuYgDE<Gq@_} z_7Qg^NqQ9$Nfq1HDup|}bbDCKComF=mYpJ43uJKOr?TZb#cClKMb9$<&z>rB7x@+z zfV2T%trfN3c3$b@xXt>q0B~X>3*Xy9SQ;_I$*<<~VU+&$W_V;0vu$Sj1Y|p2*BEGN zgS_h0a8mSp5o{k~6k||TGbpSXXv%>BQ?O*P9oQ>UD|>XA<Q%$+Jo<5W=H&6zxW(cq zCAAerX#Qw7RDZkEG7*u6vbB0k9x%J<zYE|FHZ3MNA;(<B`6M|IX-ACE&6PDB!npQ6 zi;RphyCK#%prQy~JChfiJ1=(tzauzdu9UoO==rSmu~tFe?P{Sa7XgEzp12~$mj!UU zJqQc5zfwnsbSt<@PoK`Tn_|hJW5+ae{f1fxblPKgv2~*!Ji;-h$)7OrQ--?7hnt+Q zZA$P;#h?)jkO3iR25R=;YAv%4BQ)q|<f3Y3Kln~6^Z@S;Q2d&6RRlRGi^RkJ4W2;m z`B91-^k@zcPW7Q^f%SQ9(cN;vT?<U>OE!%>G(3PzYwWGLqInWYy}m;z0>d~^Tvn-G z1>cvHhmQxK%~K6YmpnmZN!C}vmOO1`2{g^;#3h&tJcO=J#z-3>Fb`7Pw-pjfNf5-& z{z-`T7v`r-)NC{nqce0CtCyWWt1UrZz-69mi0JW5QA@6SC&1Xq1F?1C)|%fnp$GXb z+UQKi+G*f&q5!vbltOT>-P{0n5RdG3k{jq#dnPkQw_op_|H`>(ay4d?x=t1%wg9oh zO42mq8EA@@^;<{zj_3!-O87jd|3%gt`V=UlfZP0XBaSjNVq&1c^GRJbhZdEGiH=I@ zh~%pj8Spq;xnSzSoi^Ff8%+{&h67#V7g`t`exYcU>$*~d)>T2>ybzP|IwHk_({HK? z-*hTUHMnXvIoI;#Aw*Nv=n)UFd0ipq6{;XHQVG*NmR40bX0L9+n5wZ-+$<4ic7<g# zLGjo>3>Z&Y>}^B12$V1PkH>e7YE?=)5Vc6r8g=BM`cFB(bh^&gN!_5Z{&ql;i<3Tp zCWt`bW_+%0?$R;De3A@W_^biB1LDlk;2{QwDR|LDWNzDXxXN1Bk&Ek|Rd7K2@mT<& zZv#ke9>5;5uuwx=3O!o%ntm@w?H;GB*%c(8YfH^Qm<Sm7b}L3V)9!VNNSBh*u(l}0 z(>;DEZ#QZuVH#2xNtbGs9i)XCmv4{gNc0#;CGG(U86cH1$jHatl9DH$MsW8kHN^Fe z#P_0$(d9q-?~V;pJ$j{Wi0JGHNTkY^Xwxo$DZVTdZ?~|*i0OAH0}*bvqkY6MZ^*6! 
z6T%s8JfQqp&LB(E+QAp{xF6+&O$F_psG{QDisA8@fxuEwl!dn{I%^Irv+o@|R(|dG zS`r*auy{;c>L%p+cs*CJcqT@%sjLPpR>_3j{{Hc|$fl`mb~Q@Yoss21?B>OzNvB=k z#M>tUYe}<;^o!%1dPl64c2$44;g?J1CP`nW8#ahahP2)dn<>TB%9>!pZ8DSv^5d_m zBzD=$@gr6}b%!ti!C+;lX!#)~H2`R~7Ss%*chKk;qSz|deO5`5)(Y7v0ngzwscAyi z&QIiKMPlSljP{|SB%(i|1q-9v_Z~KAYY8w(uEkCwdO#CRcO3q6HE{S-tO!UE;lr){ zcsS6#W!0*~z+O^t6i+-u`U43qa+KHNLLKqNx>oJKLV{T_oj$2$=4^F-lS=U>c_?kX z)qiRt5moK7JC9#padVC3J#n3Sw8Kz|u0lJ$gUAfu&T-lOapN%X>0PsjShuIsL4`9M z$;T@I2N>}N1{e<JX+jDnB~9#;7TLYQr{qFptR6>WYGdC<`$2iS_%gCY87DwV=MY(j zS_KjIVfU3LoUswwF?#7lbes)x?%1rx!<XE1KvUmp$43J8%CiN+bc+7ECdoye4vAz# z?3tn}@cfMK;Wlk(l|X)QB!rnfG3wLG!WqJLuceIetvtBUo!z5yU=``iGm*l+OmqH9 zaAt!$ndcf3Ee+nQV{+V&P2RKO^9#jFPC$t_hpts-3oHX0>YS(>+JhnR@6?XVvkwb~ z{u_+uN#VB?PoEBv;CjS;Rg6b&DCnoK9vGjmX@Zm&pB8e!9W0XH>W@3N2Lu)sa&@%E zZni#FEBI755F{&jl}RUH$}PNxlWaZ6Xym$tK~3L1c-Ol~%l06dA2=NLee`S$%5@_h z=yQP&-#|LBIwlBn2P`#=m;W$}vA;-^Wupl(|BgC<4c*;;(N-HABGRFrj)`NGBzkbr z?ET}0MBt^FQYbD4LG*<(U?Q4Qs1b}8I=CQU_u_e9sJ^jSFqA^ow3N9#<1lA}47t3r zYlIr7hW~2ZB)NET{t2+sW>4l)jA4C2XeuR0BHa=5+}%jc9j_}ChEs#Y@Dy=rZgw1z z8dX+%DMeI2A_unrTG=>9jT;hj?l2YTs9&8Vb!kZh(rLzyw}lwV8-6^2a>O<cNh92# zs#;?M7Eff|9~~mo$mcqeBiw<rUo=qZF(hOZ5WG2rG7r%Ukt{PHYLVFdx(yIoY0vSS z8=J4YRrIAHU(Ol|I}r{Oe*`?+{lhGg3etS=;IpgJd5P%8FbJfiNWQu<WFwhKQ`r>_ z$=x9-rPOm2v<@F!DW7IM2m8@v;+tR8Xk(>6O;J5Gik@R=$q(Wx5rp)j(ljS9i!^9p zGS(<gBd0@q)ZE$>vx{+hE1rNJewEo6Zg~ag<QDFMSj%crpgY1fCFJrP9&<>+OQ3LE zzbiAyEuI-v?4Vb~QZvvbdtYHgq-%sZW@r2r!zQk#QV=dwYUS9q<Mp*6zOCEx4bqDP zPQ?zv>zlSL2;aEX2cUP7P{6V$h#Agd!D>ck%N%(GU_=yOQ;>PCg5U(Wk`c2b!EhU0 z&Dm%49NZrCXNS<PD3F_VXl)9Q3{%H4x6F`RaW}21*muVfGjCYbJE*VHfaSL1<riXH zNQDIRp=h*=;FGRs;|Rq7(E&2qseC6t0Ye63%ilh$luK4yzC*cT<emP#xX7YQwoHX- z4F1=<>ad^S&2E8#uHca@(Qr&~thn`a^xA?hHMIVeeaqBzTlt!i0a0JcZiY%=tV~Xj zIfL9-SjJ!;nNZJFJ6pbFRAKKpWPii==M+G^$uTy+mv;eLeA}MS^)b%%)ABDTNECH3 z8sVdS5aBB|i>sDfE8d?^CHCxg)*YA1%M*N`p_RIW2=jN9D@FIOj-=h{-^X5Ug_o2& z2?hvwn2~3bnM&fYN@n(3u#`29imD@F>m7IIpyjmtg>vr~BGs}7Sv3MhD3w=k19t-v z47tQ&WgjFwumvOOUo+&+$6}2hns*MgeX+j|IGnSTGMghK>vz+t#_HKI)r<Zdhd<Fz zK|L2l1_4fv;!NhT0S`o{gT+aPJd=(`WrQ4rmi__UZ)hPNDk!-&AVd~TcgmYbarY(E zV2WNO-i}glxdR>wja@WR#B`_tLDEtg&HUDdjE#U1G}3TWQpf1!yrshriz|nop{lGN zBHTM}ZW=GG)WGerR;PbRfSU0TSCQ#2Bl`2!s@N=s@1}0_aO@)Y$ZBgve4Dq37fT1% zpUYjkgzLbdqGhz4_aL5(KYY4~8f7i0%Y}>Z+JTcm;6*#mp8il!TZD-Mm@OO?`fA?p zp}?UbJQ9{S&AnH+wK7}lh+i42xH2Daz~#8kw9YtOmeuZvf%k=jHY2G?fqWryeS>WP zD^x3r;O8<R8G!A%)*~6fgr~$$r>uk`W1XN)ngEQzRx3uMPrEoY)+0I6C03amrOJm{ z;Z-C1LRcqOX_Sz;?137z45*JzB8xNE)8#lCe@MZ<R4Xl;A<_yZr)gXC77s7AP$J!E zx&UA7k8<gYfZwD=+xgIru$w9ucExleo-9Y8cFKN@Dti8W(<~Uot{4I%)do+-)Wucq zhFwarxj|5DixI<Aq#Dc@pT=(c#M|f)7HdcqBT6L!HpP}llI0**W(g#Av@$ls&W@p) zGk44go<xVM3r>_k_7Y#_2`P_0<{@py02lDrYapXMA^Czm!<3qG*2}Lt640CvtDZaE z;n>3-yQtJ=Zi_b@D)0d77KT_3WlrDf3Rwf7$i7h<8B6;B<^}P>QNXTeXwgJC$FU~* zQ1{E-U}tMbuQjJHT7tLzy46|P%$RJUZOW}V7OZA?7bD_?BmL;}h_;BsL8#3sPPH_Z zi!>Z>(lHORX)+=T0aN8+Jwp;D6T(;fG^TtzheaO2cCdrIL7J)Sw|7cPfp^Hil7-Un zrqBu@vE+Tw_OwAn4EcgEUGbcJk~?jDRDTXij6nJ%Z>K5DxgH|Embc^)Cb(FyfAZ?Q z`taalMzm>5lF~#=N#EM>8@CThgrE|-eoEy@)Q+tp1uo$aTGulj5RvL#Va}bU21)&( z1RL(~R({h6C?>uynb}hkrl0oFS*nJV&yWIBZL;VKBeA%g2Zy*v&N~may%_AK=6~E; z2W6W#GeVUa)vi4^JkWv&%}qREQauh^X|lGeG34%N2ip`pAgs?53S4-8%hdYHl>eSJ z8Oy6+gG&d#0Ty(WkiX$MXVxU*Dgs5(;8@iY#UdrB4SRZn3b;5u54hCsoOM=J2L>ld zr!ovHO{5)1JZ6$b=THM7Bp>2Yim)lTGVFMBuesC|(=}`>nd$-yC(l{M?(|&ORZM%5 z$}1L{2S_hUb)5;-U46`u#;tK^)djJfM#dh1$-6fvV^<bWgNZsr?x6`>ZnQx$%(7@K zY*WsQo?qYG<Wo#k#E|_2cgOt^(*P#Ke25j`HXxo>B+WD3(0JY#7Zjv};p})O1^#)G zDyxC2VEI<)cmY-L9$_4&)R0=HJ`BdRcr^Izz1(yy!b%H)qblaoueQk!8w7DNiX-)K zsMdzFbQOh9rfM$GODQnpO79O9tfXwy(yH7qlHwMD*S{yP)X>Tb5QxWlJfP-)#;3p# 
z=fvc;w5p5Etr}BUbv0^{wui>>Ye08tp=>8!N7nS1rNG&09rW6TUA~{Og`Q?v;DlVS z{IFd(g75_8_5_F<Zs&)l%+BL&z8i63P?j#KeA*xp?s|D~tS-}dVmV3TY6suT2rJ`& zm4hUlA&!BE?iOLh<HV5aMKmJzGujCJFJb=N78s+v(<c!*biH=PZaXL?Zl}_{aUDod zN1hXoA}$?)<0{mdXamU0{jk^&02%so<Qw19*P*&%4MLxrtGrt?0N>CBDTD+n>=6n> zC;DBOOY{9G;&|-Vh@%=O?=_Q(-qQ@eIXn`X1r9wPYQK{gH<18lo!5PInuG>2tykTs z$zjxtc$K{}BZ0o$?L@G`x-2Noz)88j5MdZ{ueh(<5DXOXdrwMIwCTtqA{|B$MynKt zh967HOme;OnDM(3ursI}ifm1}G~Zg22SrAja_9nF$XI<fND~8=Xu#=RE}zbe5nNyM zI!2SZTpjKvkTQxVCfdRs{gRU2uR*+kIXK4qrYDp~x*=RN<{L{Pi6E%2%X(OkNd7^Q z8#X&?rn*XkwkSRCdg~wn-zBbpf{Og3`dpVjpP^_X4?;;M8YYMpXTj_KM+8UU0X}R& z0zjP;Q=ta9r-Fd2Tx1YbPi=`AQC$$!ZQ4QI1TI$&b>y2i0ufyd2C9sJzvloVjg6<r zpyvV$&*iDp5$C2<gB{EW#=;OVoYVzLLWcUIn>H|vTs8F#g?nW_!Gx+JV4BC^-nZp* z0%VX;;ZV0{!a$Hpe7`|WZqWh=9=O}+k!C3S1)y3UmPK?p94jFs|7@cbuDk_TaFy<H zFj8ah;Blniex!&h`jr!@a<1UVA-In40IAHN1X@Z4CY*?R^Kq_#<OvIv9G3Y>{C=<@ zL<}@0oS~UB>=f6Bej6MkM#<@p2i>{>PvCXsrpZ~#-b#gy3bbT10EW2`=1*LHad#4u zY9WR_PM&0Yn&8y-60(gBIF5YuBC`B%H$=KSVyJTC@}N}j032z@y;2X8m6u3UNhAzb zi@T~FT}%|u^}_^w5FxB-<Y=gF-~Pw;fI(T45e4&%KApZEvw}^o)AAuGjFpoU#<av- z1&IbdCXXTku>>+H@o2F))za(mqkJ2`L|UK#yyAfQ5aPIKT~5ZVcHsLGP>uIkLU+V( zKV?)ISQ@%>J>`weDs&QFFbPA_I&sUlLe{c`$?~_kClB_}*CdiQ)w`-Mej7`p+7mmR z1Qj}W5a+P)*(eFCbe-TF-N1w58C~FVwv__i^e=r#lpgL>-K30Ah4FP1m`4te$ZE;q zVOqFg#N+ZB)9O`;e~2{SopNI9m?`?cBkD7;LG@q>ZKe__3@PI2KW@0x`PRWrt-a~5 zK=nEr;H@ZBEU+U#V3|~xLa6o_nAq;CcKrM|Xf4R|nf2iKxWUoG1j3M~NuSsosD_AF z-8B*O&OUp3EQdRiPlhh&>3A3-KN{<gTuzu7sE1>4uMetr+AV3QG*g|!hmsP01TFyO z21*Usw~$Dr*b9htf#D^*s{0t}xmGrA%~J?0YZd;;iNdktlJX60n=fS~pUrXnC3Y;T zD@_)wyob3NY1v&Bzow+*Xr$PvxT*%iF$LAvW9s9`l$SE15v)dVSw$CiWDL3jkpova zTU7Tj+v7P4kX~ZgX{sqtg&xdF{9+eRcr&cd4{sdysl8Vkfogpt&d|kc>pihlPGzes z9sAM8o(Szdt2QbGOqZe#J9@`L%49FZ#nC~8URz?H3%ke@6s2!+u}2vzccRyxPM+BH zFu{5rvW++(!k>oO^B0LdocfO|Wl3&&m>0r8F@Z!IIZuY@1XU<EY3|?aWMWG1LjvW% zB$&AF=@CIBkZ8s$O1MP-WBRW5LudP58Bk3UBD~I?DTXF&MAV4`5Z;2uLaGq_W|_%1 z4rdlg_eMH-V5o3wy5B6C0xKhFwnpF)h}bk2L|>$szRBY$nF$qfhyDQyP=8%A1;jPO zE^z%_IDx`_z$7SO`|tk<IK)t{z~>h|q&(z8#gJ|VyT`bGh0RDf)bBLohRpWZ>X#o_ zaN0T(`?omy%A6$#bhCb+wvMH2)RBdKreyKuQUgzZgU>!bt?j(;!4&e|8LcLU(^7(@ z0Xt_D9OlYu#~DVCp&BqZ2lNx#@+wq%KcfEI8>;6kg?J<%3Ji+5;ZhM39f=@FJJ*s| zM<Fln+P=juLh4xXp;NMmC!v@0P-4kNna?~$ruI1kFYWH8^vNk6WCDr?t)EZ}LL1=S z2QQH_h#nd|9TT8<m9EmF$GbAE1-AhXeahZDppo;pT}fCOm*QD?*f1Q3D&Tu76~x;Q zjgW<P)PqTM6XW4uW2y+4pl58{kd7A-L}(o0X*OiBV8HF3iYj{*v)^LFkYd>E*a)34 z-eO9Sk%jxaud~J!AdAf9+A7n0Hy<kX%JITmT-kEk&}1c9X`2prDbWYN%Vgj>N>6Gv zo(zv-+HGuuCn_y2zDpOg+%=htqtw;?mD7Bye;uG4M)cea@HbgVHW>@?cx@&d@mftN z23SrY7}_n}g7nw8Fp}YI)$IZo=ZlTua<C06G)QV5o4EAK)W<~_pi^6&cKOVE)AK`J z@kRH!UKxJgk8i&&y7w~u8FzYR3iOx{yASjH;{BulEUrk!8zVr_h3oge!u$sY3cHu( z&-lPAQ}EiG?<cxwR_^F-)r`jsU)F&5H49&;F1%_=^F1b-J5<&Tj)^FYaYLrGjoP{G z7yY3P%#AkRn*umQ2kU;iTY<1<L8uVPeQfpH>5hYDTGr@*s~CqR7r^dn+{|qHNB$z* z86sV~hazDp2kV*OSr{odHHS!cM%P2E4a4D(ayD-UWl!Emtv9%A%JX!4ot-&c=?lD& zu6jn&CiFcw(>&&)Ucocaa~#1=6&SJqDs>ERp^O%SqOVKTr*tTB(N-)ci9h$9o9nOH z9>X{E$5faD3gJ<O)P_UqXkwH+CGsgGU{90`&fn(xm<%2s3OG_<*!=!Ww#z}E(L0vP z(I78yyl96#{uUVx4_hyBSxoPUo9M1Qx2?45J-fCV3%@@H-9`XALvAC{vqNrTt0$IS zUh}vqWCeV-8k}x@diOe~ZYf(KpEg~At{?4s1&>2+;7}uvZqJ3RcAh=^!#=Q9l{Iq+ z*xYu8m-H3MBS%oAS^_9>&0X-Ysnl22Am<T25g1pH-ANme;n+0u0YszT4WMM;U}T^N zs`eBr#}rnlH18R~7!GRIHweV84_hDLz!hvPfhLXB=qcY`<mmz?8|XHsX&GrTY{8Ir zx^FYJHsGu>>w)Xy7)rMp66<Bl=1B4Xh<fcsRhO=929!k8&8$O=i4&1^rj0PPM$ep7 zQtFCf!D5-uW+B0#PzGSyqr;#N{?&k`>Asj~GuVlH>xE#3@^O3^2=u7HhXFSp+|klw zk6dn2>JZNgoq3o*6|;_O=(){sNey*+p+I8LH1ylZ$4m@Yue!TYMl*rFix1dPYIZ?D z;d*GSP~_mjGhZ_N2ED+y$CHzc5p*0`XSG^j!q2fvAyv^c{NLu$;#SLRz|mX!-xeyS zgvT}|fT3CMh26%*%v;wkhJV^$|FQk(V|LQIPxMO=)$pO8R|CopZ|{;eK(n`^G8a@u 
zes}=Cc{CA4iPbTjbIz_3f))NGC*jL*6uWjw>qo4QBgeg`fSmPOx(!A6jm>||Z4<>n zM;WG2k;H;12#HnA(A^6C*?ks0AN#=WFW1l>oR4RtQ5(id1EzyLdK?Un!D!K;lwmeF zTPAze<y{*?z1m_0EfE8Yw(J#1tu)9F4kae9ExhcG@w<cd20H;ZIAb4FYlBl`gA$tk zrrg3@Bb$f#8LqC3DGiK*)`n-l=WhAE5cnL<y6S}>!bw4vxNXW?JUOV3^rEoy^mM1b zR@RnE2y==}BtgySTJEtM0;WnTeOXt~OI!Cx0a_SKfPIh<#G=j9CR%%xmX%065n2`D z5PvzVb#r?U$5jWbaX`=#UPTB1<}f6`V1qAU1OspBti~ug%?g>yg$LI?@5{QJV{b{R zO41@248Z{r<zUCL_lFnIO92>XuZ3mK#3TjKdZzOvm?|kK;m~78`BT&kTrhg7E;$hL z9JBy`R6LcCel_(K19`S;&3iEr@-J`@G5@z7uI}D7v1<fF0aaSAwh*knBDE1#IV6`s zk9}1AC$oosA|OE`?@>}I3S<1N)O*2WKg!FY`e?}BMa|B2ff<i#cpY+V|69#b&?Tgf zNV6f<Z>NiL&i#~okDHh*j%CIb8;?=Y?1{>J+0=W*0q9zklzUBY3P#``7emn!sl!j) ze(TtXXbO)RM>+R<WX+rHECl}`hh>nV{JUhasV4?dj+_k+lkmJLW<lS?LihlM;@e<d z6g$VSalvb@wN+6FH=}Wf;Lpy1!)ZL_YPAWMEtq_8qhEqs7+{FMxTZ1585>y81`vc= z@WDHa!!L`kq@r-BC0ko)?l6cfAOoZYXJEE6<<Z}aO1PkmowE<dq7gX*Lnk-m>su@Y zUcY+@a9|9fTLcsJYMqiov__GimNqFUcn27I+4wy(l1!L^7dnVa*TRPl6rzzY{pWiK z$?jGnEKIH#e3Xj2g`e=K_(BKHIg8v5NzV%oUe^=1HR0|IX{y@vNE`gMW@gGpl!Goi zen4t0Y`aVbP-k&OJDH=wmbKB%tw@GSN;2c$=1ef49Sma+E9%5kt3b1K!%bBTH(~{? zVO*7Z{HBh#_OF#zmg8$brx}r_v7}sZMK)S8MhEtb;9wZn;K&d@kK|ACJ*G#A-5r`P zu^`VHHyz-C;!U-jB}z*{Uz0~$W~q7=Gmivuqiv(>4OF3uPeOZQH$fpvW9m>wqR1)a zMq5fxseI$3fOK^`+5<P8W&@rn^<QG@<V5j7bmWZJSzToO=Z&$Ek%W<d?d6e-NMr^{ zFH~zI6qaF+BaD>i2^!#~BS@R~M6E;6Hed^veVXxF1}l*UVUlkOIqNA@bbP&5vff}p zF~*93O>y+qkLN}<#G~KQu9KEpxDp#Cv$6=sm7**ly4}r<+@%al)zYPU*<3lCgA6tL zM@2uEAegdJKJFG@(j4M!!}+Q}vVzfT(H#aYry>~}X6PpL>;<C99^@}ndOXXA0T4_V zqQ`$~)enWUZiaPmf4T_2=tNP7@aMCEbsE5k(tg`!(r1^St@6G<FTEkc7RYE5JFOCd zcVDPbwe?4?{tsF20G&zjybH&+oi|Q4cw=vDn;RP&+qP}n+}O5lY;1dPet&%De&^1a z(=$`kGhN+hR8v*Yqvge@j;=+yXu|KgXv%U!Eck1^2$k4*Km9^dk)zPHI)r#}qS=I4 z5zG{e180Q^bxaJ!LME9bKsU{Rzsl3wDwW^)sCJQP+=L;`s{;m<<{9MSbe#jfl3a6u z3tWwz?rUW{WaZlp0>SR=m#{2?Z|FrOE*bmhaKHdCG~<@(N=7og3wL;AxMyLYF}!S1 zpYc<%6$6U2(0a)CtkcIhQZ>Yd>fSF0|A%S2Bu-f{$#4Tpa~5tey~Eq)^zhd4XS4a_ z6k@)Bg1C_mB|N!qgj<k;ulkNB-ig#UfsZ1d4S5vhcdYRXIh1Vk0^9Me#~e1$z2`0M zS2Mw$Qvui>D3caUj8+JV8G|7L$Of38TiG9)%TM}5B}#E7>1e}Z;8@G)Lx(85tYPNC z<^pOh_r}iwmB4eAO1o1@8&ummtMZ(HZCNnvgxj+Q;*W*9?P9@2Xsfp-=g7YoikPZJ z3?<iM_NbP*b;kYD^0_<*G5$D_%nrSveUCy#N$B7T@U|^rujHRNN$kY%K>vP{1}q@V zHZ-LLl=5C@f^cMcao0^iLk_uZZ$d(t5i4v#6<HN#KD9L}15|8r&&c(jr<182GswTX z7&(do)60<4Y3ex2Th1#qUO_FfT%P-SmBK+lZ9b87abr)8bbORcI+&(d>x|6yCVy&1 zV${(i#-Y&>9M&@vYnsgXPikTC2+}IBYrqoOx-_*Zh$Zl@;DG!|0e~L*^Ykcu>9|`% zk8w?b1ri}Q&0WiI6^h#Iub+Z>!6AIA*B_7AZcIae6ucYrP48_zmR5`HVx73MV@f2d z^rFXPhUND@z2cfsU<KYJoKczRgp=k$P_?PeZ4jIfs_NGwKC%H!Rib#d?#6n^hZ~aQ zJZLYC0U;tn>yP2YzFr(vM<ZWkTHCUAL5$EW6_0TNT6QAKukAcK!F5VINbCBOZ-@Th zWz~Ug@n=@jD999Z=NpZHSah&6uU&nW)a4Eld=~Q+>6hIY*^I1kaY)qVdIOx=B323; zch0NGS6I(Upa&TZ$Ah$^cwd$qks^r%R}Ik+W?R*6)h`0_nzUEYPmVUayv3*ZCxY(L zAIIBSpMgUl7!;?`Y+s$Hqo-N<9VA@bSN!tX_A`>!WyXd0(>lN^pMcvTHh|PFlOh7R z3x_q#@IOYi6wEii*tA5ys$GR8t0*yWnK$dGPU7|=CKKsXZHvLu>DGg)&5w3mn%sXF zk(j|pr%L+9Ah@LUVFnF0bL{L&%g0}!nZ9a+KD{%G;A5M0H*pQvx%*D6zwGcm^?4tM z5Wy+sZ)0~~+ehtCeQ?HG+o?gwYW<v{m17Lc4?zYS*td5W4sl7BJVIW?HR$o3C3pKp z_8^^?NaO^SLNlJul=U|4Fj?p!B@<|Rz#ou9omh>2x!_H(qPj<Zo0ww}gX?_14n}K6 z4aHRKEqQ7p2<rBk5)7l<2<RlpOBV!Q-!DbYjx4|xp)Js?O3OpjD0{*@aM23H>QN|G z*9=`?rSuZHuLBoVp)v@!k!4;1nW5$9Ybx8nxWL>HRQV6bHV$6e?uki5k8%8@oECJU zz3hyW9TmS~V9hD<K?=<tkuO%Iesm9?RU>OGiGC)UC(0Jy#IdtZ$Y?T83E$W)uVj2X zid8<#?}?4il9eua({>{h5MYhVHzfpcOYW8vWKh7UxwcsrL-<xDdToPXhsA@N0I9QL z&k4c<8iq=kMljHQPRrl9?XAZZ`PnG%Oifrz!~!wmoiw?Y%71vL$dkO~Iu9u?HL~H& zk(Xzj7$Qvbv47QM3YC*C;MBcLcia+-%8iB_24~Ohf4Ws}eTR-;U?(?to=!(jZ!!!i z#sMHnik{!fZxYbl@>SQ@R4N5bkTkNaXwla6g<4}kc3s6_4;deoSy(D3pt7898J|QI zXQHFie@#*d?&`<$y8K1UF;Qm)FZ;JxA+0DeDv^*Ur2(6^_VTKsKkBfm32|R9VfAnW 
zZw=3BvPm|WFh)-ZKZ;C(7``ijdtJilIfu^??Zi=Ckg-J!%^NkJD$B~jr6l5$Q4MDF zV;ww!;#Lp~J^^_UmyHJfHU*wb2Ct2OXj3a4RZlO1pw(p{R;R5InT&yMNKlrk93f(v zSlxb9DovG9gb6u_GMG<)BaZ0pa@c6Hf8flyj5S`x5os|YKcj9e4sMG?{?8-te}e9n zUsu?ghD}?^C<U&?mF!X91k8PN5T?Nr&T}PMg`l!F9*@c0rtAZ}$vynP_|Rl{XgTAN zL~U$blIu^=KZu+TTPM5&-oG;i?cWlN%Vu75gU)?lt$z5v?)<|$*ZaEHGkTfu6;%57 z@6Afa_m!U=44`5-n~Mtamkn$;*H^ly*;h07-vDCJ@ST}HDio@N-)^JXJ#m70%vd3) z5q&&;e;V8{0P5VfCH3GWcZJPoMYTOjU|B;XT0`M}u9QHzdjOdb|1GDL3(@vt@jZ6q zmP{f_fRM2$tzmvpLX%mej8vP?23E2TB+geTA_&vJtSiSBvHrl7wK%Iq+M-98NEi=F zo@z0ZcOAp_QU5-2`-SlGjZpZb6PIgJfYn^6J<Y;tl1X9zP+)$&g0Wi6E?EsLN8X$G z2fvTFB9=dpk=6kol_i4cz}u-{oRcH0oM;i+dGyWkx?`sgSaqu2hgL|99KsD&(j0Hb zN2VIS>)2&HoL799$g095B-B6q5-@DLiP!2}8{+6j7(Eg@y%#)u$+6Cuj6rl*BzhBJ z0}+9xBW`gy67Ii%6JVRyHg^#OH_8mDPYI|;lSu=D#s~;vs*Hu@K*{OlV<|f>u<2F- zn}o|wMYdsZz4Y(IieM_~!kILF954zOB8~O@_6{99T(48gMfm!yk)qxV*XE(IstR=- z;zFAr#h6*PP>D2^ZsJEw{+i#E^&(wVHU%cv|L15-EG2F<tprbie2uqt)Mk6rSX}-5 zlm!$1sU^j`Xmq6E5LVJ?UpR=?D)W@|p}E$~XdE(~OI(~a6HMGANeIsYjPhTcshWq^ z>lCSDp0{)k->$~wEEB#Mb~)OV=Ca>~Aai3;->MUb1mq#N#HOfj^9Rzklbld-o0Tdd z$8jDbhstSuWspU<BC?KM`lzq((xU;G+~**ix>`{c9#^o=H+=wDZkmtqp=5wbo|TS^ zkf;RAYut>Qvx;K>dw=KUQ5wzRxOi^R7-I4S91=zCmhYK+OHOSt8t*8Ec3@Y(sLTAD z-!42olC{!q5vgB}4|_rZAMKpd4QaqY$TgM?b0i28J#MyQs`Ls|gfc0seai`6u($%y z8y$$rMG3}5#OFK2NHoeh@#A8rDgt`#Pne<7Dj~m9)M=3)_xDGg$JNJ65*XpAWl$Zv zPo1EHCgCqNXw0LRpc{AeVD|%;FMnc=XA(Td>Jvr%XOX(xE8uhpn-efopC|9jg%NZR z!jPXcccGB5vBYtWE+W}}O+=H{o4&eGHX&-+2$;DlJNhb^6sKlN<AsSiU5i<vGpN4F z^^S3?no3RN2P9q1NqE<kzo7dRXTpkKTAU>}z>ZNdnxKsBa9$f!*F4}!U=I<>7DR<l z`J%O)zYeRtiIf0^3!*bJa=Z^rAgkG;&%D(5@y*vtm(-+P#Z~Hlu@9%BqFRmR9o+RD zz*0Xw?B-3M`on#oIRuF&G;e;^2{IupD(ZP!wbDUpiv=3ehwG=5b;aShmfq@MfaV3< zE+;BF$^7P+%D6-iKF!HWmN*)@?E^qNq2Nb(-ASoxV|AXb_@h~?8%|N~l;SSqOT`@M z7%^Q+bw?u=$`mqTp=Q*oyDQa;=rwdE61DR1r6j7Z`2xvr`hM4;(8PPkIFp|F<XF<> zQ9t&n=b@4JP)0b?D_H%J(V_{$3`sP)EM~bj!H6oNj*B?^w~FThhro9Gqjl})6FR6K z8+>Vs%T|WJ_=;Z6QaKRGzD*Kr{xj7JI8FAo+V*uc%~a*@a|M>uKBKZGlkNVGRz2~@ z+s~Os#e9aJh#HlsQ60)dhty_<4f%OD*3q75ABN^NdY>B)25Dj5;Fzm%kaFD5A&`m& z?#XPrN|`j8qNF4%cGJa!Q~Qocx8lXJz_(B#+r=kJN2D>r`SWHTuAJs9rSnW<53P`h zHN(L_<;`OcHOmWTW4D_hR<Lf=LzZKb;S{;3z4kY9In^UG(CIl}PPcNTA+lV{iuCR5 z$opf>aY!sG9o2_kZIdh}na^RKqsFXCF*8u==|UfXABMMND9VL&JsUKms?|E}_h!`G z&x^MP8vdI;Q+2w5;@#`UpZ8}4Z}(`8=0i-=jI7cP`e;J#d+<3;+*QOeq?~PW!NP{P z)tu0?$ox)PZjS?`<NHx=XCC~0^*X)O14fLh&Wn2eIp7OG6UQ>px=D+rStdp;Af{ID z0_;+R&8TVrI{p^%0i_w}QVt%?1f7m}N$M|}l}R#0k1_<~#CSx!)zjZ&2L|Z@pHLd0 zmrP3YQe*O@2d|-=jhPsrZ$C#KzuA7MApge6X$e%$o~z+U3XkGpHN;eX)~Q!_lsRg} z&iO?JK@PE9pIU)8uYy1hDEkkUrpzoiK7ORP^M1!}*hKXzxqQ&=o-@@i25b__yEPH< z@5o$__N2s3`q5~HF!@qBcz}#Zi#MB;GW0so5=m=Z|D}weGSu<Jp^RmsaQ=EX)lxDK ziMk1PSDLhF<6U0A>s7xWIzAIQweE`t4~T7#*^IS=$9#ykDkXwDdnN_zpAkbBs(^8- zMM~iLDCcB@9lFJr$uIFojL}QX%AiQf<=#Q0OcCi5myS-(EqDie=HDk8<#5CnwY@`h zJuBGnufuO|E2~^m2^$q&xSy|r5S9trn<Z-AD}B~aI#(~`@le#@qM?Z1$k$K-E;xQO zx=Ct`kP}g9l^<^d19~-}_kK{-#Pxjg#yT0Xipn61Ss(uhG;Z}Bo-rl5clk^vRldLI zj%6iwewSz_PnkB9zdu@8ja|0&gO1^OxI*sPKL^!^S0hRP=l=4{UvWVj((^&()!yF& zBKFv_xAQ{!u~E|ptprJKR<BB<rbk^EX{?Q`Bu;XV4KCz1G}#2e8EesooqgLZ5oYjn z|3E^QAZ>NCnRI<@mix}0C`<;s+w<6c5IyebYGEAd-0o#PLExfGjFSR%3o+ZaG0TEb z2T`H&P_q%MX$e{@fihxTRfhG7tVp5-uIFjVS7eR*F`VcD?q%Kc^tr{T;RK;t6hO@H zJ9_2*ICxU*!u#hQu1`uamW$VGn3o$<=nF!+ATz$GT;y%hV24Rt{V^u4E}cm=D5Xp< zcI!64)At)Tf(0h4=IQc+Ui}J?pf!(dhcQAi)8`VUl2xR+>CoTKuQ~hKv1c;o_YZ+A z=5CzM9V8nspr7%1@Dtj^dQck&M_vy0ZS~gYXR(|_M3Yod;t#h~ui>&?13`4qePbmd zS!7F~B=m7Tc(^Oo9VH1E>@q%#4=#EsK<+yee_96`PB+0)_yPog|FPBTiME1yzI!O( zmLbw?QVFq(y3E)I3k`bCEd=X|6!To*x*`mjk1V6%Qui9=i1Ic$E<=gGQg7#tY?T_u zmgC9KtBs;a#{xaW&U~Y%V2U7ZHx2M%3PI?lhMad4@rfWcwddCg+QAcc%+6l`JJOVj 
zBP$BN@Br07UC6DkZFw^s(HEkZ%!<ehM@d<fG*$_*54Cs`t^`{e;QdpI`ndVN=PGVN zU}y|Cu05&}b(ZYsCmFp@OnFIDjjI4nsifGx@0BDQ#k4e7q?mU<OwBipFt!SYGLX3w zJ|3LiIgss@VyqD5kuLGFO)f0UW+X$rIcg@&)M%22A)uIKU@A$9>;spSDvvu72k94% z?Z3QcZ36V*kBKM82ECUrt~Wl!U6jQwAlSgfk|;3h(;FbozDEX`mb@A*6V#6I>{Z0b zDxMT>>jaFlmisRs4q(?=D!d{|1Lp5Fre|Q`X?^fo$X~k^FvtGlNyq`NXrWAQh3+b5 z=Vh4-GIDha0OY%*L&fbEOZYiJQ&s@}jS2jP^QI3(QPN?2a79E%6ntlOQdGY_*%-bl zVT6WK9y_@v+MDV^-}w0>tBp{<QFfBzn5ALl2IgYPygqic<)@RBiQ0*>B%KCx0s(X< z`UN4SSx9vZJiiVXkjZQAJ@}J~5d%t$9i`WK@Da|8biyAp;Qlabo6|aGW8;f9JgP0h z?fG1S-Sdh&fiAGNaTU+`-pY-^U<U;Gf9Nh&F8PnF<}PVZ|6OnR;{I!jz2x8heUMQV zb|zNHKQn1$_*LLkqEK9L6Q1U`)Wq$61{k`9ypL6>UnM}j;;<p+WoH&LRGp1b9^&hY zfjO(eJZ`)8J~U}6d*kSKIyH?Yrw~DkVlZ=94~Q0EP{R3{9S<WCR}Nl4PJ7yFkgM$9 zD0O$DIE|{BDY4-oOUlP*W?{CkDA;%@Ii<L2frpxq)StqfrZ4e;C)?{-X%Ch(JW9mr z7>-$*$JV?F{a4f>yi^S11VJJotNck_hA}9OMSvrp$Cq9a7Xz+ITL40SkJRlixwR$1 z%@Bq7500Q24jchVi7Dh40Y+_N1v{K>0WjkNmDlYwUCO+?$4Z8a{Ace6$Th8mTj)b* z$eR#<nA)^(t}2D#1VDPHy;MV$stXZMlnW%EdTWKI3Y?NC34(Qk+F{+3G&-)Ej994+ ztIc=F#URg03Hmg>DSdm@Z>Z5aLt()@&IwWx((&#d*?tACrR5CJ)Y9H8trnn%&)K3% z{P9AFzs^_iv!D@n+0K$+Qwu+4wSlu{F#B|pJp@K-cdE7XzxRo4<m&--V=_N|^pM4q z<$(C=U@{y*Oj7UM8r(4D2=npYnKRubN}v~CPyW(={mSQ5Ou2wn(w~KgdP!$bHHAc! z$tWZwqE0A~tJv^Z9NY0hDG^mErjXtTas(`_1)TUEaYDK?DFG;#VT9)lUSk~Fv^~v> z#!Qt?Z4->)%$sOc9!0~S2+O5BV)juYb#Wp%a#e2R=zJ^SNz`m^KpX7>(I*=OD$)`j zY4YDi`48-_ia&&^7}4V2EGEND0KcUFbfZhJn=MELGQA52!!3#hE(az)cb|Q7-e*co z`MYIAbTao0i#Q|;xCpNMb@&JSNl}k*%d;D4+DnitZv`}27Qm`3=Fbjpr+8|)O7?;1 zQ`SvZkp+MzX16$x56-XdOu&za7Rkk>36g3nlN*kJnQGV}fv13AzbvAnn7>dyL>B4$ zx4^Y~lS0@0g*+AhMqx@Qmi?8DnpC72=k3iZf#@*Bv2Ic8Qt*=$Fi->w5Z13Ist<$V z-M1(4;FPG3$vl}I5uFUIjx_wV{OK)6U8D{YAY1e|zBSKt2oB145|xa8wF)ONAO`+u z<=E;MGsz7AQo6=I+WrHg_;fU0UEvXdm962CsL*@qRkaR{1;>;~A{Dc0Q(U2A)V#20 z5uq*l{qfaS2{o_&%y&vP<YdX47`A)<1vO6wv+7XOT43h_ZzJLsf7bsT$@Mu@?>1WB zWXVw4^;vIB<B}M+QpGi|HYj@Mg{u0)&%*-Zmnns_k=+;rN-H%HQ`nk&VU=SVP=-zW zPq~<~rJ_<6$WP@lF%vwCPmqI&KCF7|uIp-5njq}U2xb1j{5KgB3dP~-rL{4w`d$F| zW`DV<2JS+c*u~U3tCF$OF~riGrU%D}kB|;wNMV4a5N72_Y+N_I2E9VQmo-gVh?avA zbNW%)crc8Ka=)mw8Y16w`iC0P`)DsIk*Fqo5W5L0Kq$}ie2RQT|L>31^>CR#c)O+D zvGOfU`dBPh(&<6J?;t>y;%=5yj7cfcSH_wN{Xt=j%1MnsC6Y(!Xm!2F7LW2SWGO1k zY)kTBMU&w1P+_uAp}>)0kKUu4AR16p5_M{OXi2!(7nBg#;@VSDA1zV)(+=Z56f7mc zsKhYxroK_TKTYAv)ut({#Kip8Ivm_kA91SIdQtBx<7&u%;tCBB2>kdnYjIYdE-Xwg z<Ylr9V<cMlRH*9sK3XM68~%s5uo@l3!FEZ7&q>6^u7nasVi1trj~2*ga{dR$5$@zC zC%Z=uWr^w_f#2hk%VlU8_{uH`b-1K=(Gm44;n0J8uqy<+JGmFhasX0LRMRZNtmaLx zc}NrB2rn@vRB_IDKtuOm+kM~^z6wh0@8NM#g?gM~9bOBmhJNj;V$z7whkq>L%^fJi zxp)_3?W^7{W*`*XX&ingYwh8!SY&&g%MIu$Kb?IDYSK}Rk<<sFIDcVU9@_oAgP_V) zb74*Pe|(dR?J`T<D#kL;PtO@#=+)xOn|a%k+7M&o%mp_6o-k6YAZ0G~;uwq^{X<H< ztFnwyDrwj!Fwvc*Cyg{$k1<drMkBqF84#aDPa4pc3;wfSARPYt7oojkw;)|s)s>*? 
z6VqykOfK~t$h36v&gq%~rAue>&v_b;rjQBx0K8ZVq%L@P5lt?&g93eR<4I+Qk=z=~ z#>_l?k&~thdp|X}1zF%vz?iE3qJcGcGIlnxn50RH-nkLh{-0x;^8+9MvhvZSpop)l zr*o*c)O9^k)=fI4L_X-z*+D3~=Rb+|FnhnzvvAtW{{pzMM1gVBEkC1lc`_%_UaTJO zm^Bx|%z(!mVwy~<nsFJ5!Csbd6noyt$${Cm_<fZp;z53OoOs(e?W@tk`(Ar_TLMu~ z*Gd#`LwD#PBxopO&};K4ZbD^YW0GV(rYV64<n$3jO>vr@p)Kmf^*n$gkt4-C0`wwx zushr^-`x63wN|SV2#Y$g6vJ@z1ZqsOm_D0=r$&Ede#GB}H*jE>YqOcdYSJV6MTBe9 zOJUreZ(EJAz_)E}Kqr6erG6ZvB99vr9jt$T=w+PB!l<Bl5E>4Ccv0-S#0nlgZnPa+ zI2jpkd5HyX!(aEE#>a<OkcWEMKBJ{C<(xno(B#F!)i=`VDU2}Cc|2n#0s{$Uk|2q3 z{Th;89g}Jn7X?np+$rCy!(M5lM?pdoB6-YNzmc?u8AjlkR$!H@4a4x)=Bv+FqNnEm z(^6md=(!OIXg5@mzOhab5Ar3811Orz8sQj(om*H^BXMG4H8}*i;L2bSmp0F_Pg4#b zF_$Y_d2JZKT%ka4C%2g-J{2q>+|?M2x~xUSG&7V)M3mG4ir<?3`IF3?@$IKtB7mUC zBtvQ{mGdk|E6z9|IQ}#YJLCn9W<t<9^aaY^f=T|boy!IJgd)&Is?3>O1wnCzHNkns zLLqkf-b`raxdni!42loDDbY9?II+x4I3s~)bMVse<ImhFD;1br)c95W3{DGWt)_Bm zVqqY7N~9JSglM;+{QbR%)KF5!fZ#u`!iLz_KOAo%f=514$mq^q#f?^h(EJcf^ERVW zO*&VfdIsi@=V7!+KpU#5qRdnhM!&HHsr%rKx(Hx~+yimL%7}WGXvDw^D|n}Tg_!#g zeQ8Sx%Fhe=hbc`6hNb@OUF0!tKunV&w4iB?wEcmnOH=n>M%3okweQeMAl{&5-bRlh zl#LJ0Vx9tKQzNGx1`<d(jpG^!eFO#q1k9?qCk8U@ii{YahauGGeV+)XVM5eAZibe7 z;kdR3))Z38w$>q|>o_ES-nHoUyLT)t-l-WX8&(6JJL=pFdyNCcW`q!Sl_a$l=D@M) zZhZ6}x<wv!Jj{YJNC$k^BPTKuZ$um|&o)HRXp)a;ys<t7ZH1On1vD%vicK$gd{R0= z7)-jpg&yr1`AzNvABH;w@{@Xl?#pM{^{l>B_vMBP98#*xLer=~P)4M1%(vYN<Z83X zV%N+gAR8#*0D&a`B-lFfEf?)5QQ$wapx6W<f&w#)9^BN)FK#v{+k`qSg5I*cF!T;z zP<S3Jo`2U31+}GQvE`J6b>zqpHQJ&MgJ+yzBC%c1d~WT(O{S|V!Vvitwe+r^+b6_9 zGYlq>ckLJ@5w}K}Zc_IYMtuTq)mnXu;4gmu+Ia6#sahGn8Edg9KI}?Jfi1y+>2~nf z(M?;d4ew^bZk!&PTk%10)Bz_tmLicJX;$neJo*HXSvrk80P2hURHH*Ph?60S6D7-Q zrZixnAX^wPsytbUN~Auax9XqgGcM2v5UjkNs^H{T3O6&KOH-rbsM;TW(a-hhSq+?T zTFP9y`^E7TZykTT*bIr_Z-?G|HIXy(2l`bn@e=rbX^^kwI1q+o;jU;}Gqu06)npKN zfD&<D*}3-o!_?aeL|gkc>h@L}0)|9<El7i>`O1`4;1^FIN7vSeY>Ytg@D~9IEuQhk zqa5Y!)lP|oi`dB7#&6L?R5WPXpuixRfU=cWXmxZ(y^HuM&s2Ab@da7lR?=fJ!4a#7 z0Ta>zJ8?63w@H>ba)^`Vj3d2b0o5n8`SXPFxE|7EqC75^;7VL9Oh+3<mo*}f`PC<2 zVi}YTl9)XIPg*6;1GeTD>?j$w&<}sVX*Pi=51$;S&{-6BPsMYDg>*0>!J+z91jvR_ z5VQC|?lPaLhrk?ee!fjh<*YZo_ciuHQz{yy64<FA*hY$Yoq1wqFr_&hE687xc0Ria zU{p*~9!1um-ib5PEFez@3@}P&_ms*LJv`4W{Z9N%>FD8++TzB`8pS!^7h7Yw)Ku^Q z4L*kk7LOD;58R~3n-uvMqz_mx={jqwf5DlJhfPJeE<56l!%c-*GZw7oYf{bk)8p=7 z`)MUZvZ0;<{#pjc`<6_B2M^@)DR}ZHcO5md1z)0$r~(`)!v1|RVk!-qKOEZZ5NGmz z<TNs}L&dHt#xx#}42^+5uFRT#XCo_-JbUE>W^p}qp$qv7@x({iK4abs8APIFaVWq7 zbQFO9`^7WnoeV!%1TGb!mp<B_n^p*I1Vs;HK~WIL0fSa5^Yt6Ik^itzA7Nl|umG1X z9R>ovoDOBeO4a@k``~?_gY2!E+alhO7$Gc8N?$`(dA?9Pz9SaR%@1%+B1Q<f%&bya z8)_V(e8<sWSbTAu+$;sr6Mevc0Y=#!@d{OXg%P|T8724{*iygD!><)wVtU%!#or2^ zjiU5qZYz!yhfq14pjU|k8Npz%0<n$DHnXa6C!;#FS~~b|?^Q{Mv2I(&Dtu8C$jd3y zg>j1EC4Utj(3T_HX*JTNVx~HlX-Fl+krQ@V7H3e0HG~WO`!qQYJx(xDuFh_l4KP;W z`C!oE9J4yisR|1xv-j{%n<3J-Tn31;T{)be2nBS+F8$KS8S#<(vVlR$Uq-$im#1y6 z#!w(@!*SF?-{6r`>T=Q5@;hJ8nRxrQk$_bLHYe+>1Y}->_Hnk@BB?5+$)i8Mq0LeI z8^k8R))D@BI`_nIioN_Rzuc_H5Avs8Z^UEme{0$nT#loZecH)*MunK<YLz^G(TMS% zGcR&?H1jnJ^LnKByE+XL`e$Sw)Ito1zWIV5x%sB?BO=7I3@ZFg6mx5{v30dGu%n2i zpP<560?7a=9}ia!UtAvB(mA(qmj@V|Hr8!uLePLtLEuM{^B|WqxLYk<vZ%8kOFF=5 z?fWu_=Xx>iGL?B*)<l%v70@V<*+*F<C>Xj&tLf8uzkZ|1sy;(F2%fOou@9Nk=UX8% zbFAto_@*LYbt4+~zKD$j+dK)@G7tA_cIIT3wfWV^nTKhtaU;Sa`;IHs4rmgr=)1(G z(KncSlYR|HWU9BD7hR+&NY<>^bdsm;2I7P!ghW}w*3gC1AQ&!HmrAH+etEW(ucw>a zu=Y`*ks<(eWA`{Rqw_9CBP!>pdSG)~Sov>AJq<f%P`1=Rfo?3l6o5gocDsfkwq`-U zK?8vt;aOKYsoq}RU#lB`ln$>4^sZ~ih;skL3Ra11!$$q01rfnbU*Jo!8bt4}`P(hA zcG1dL`JITinp)gwPP$p+Unk|?EQuGcSp32M=0>|V4ek{3XL3038|G;sz!(H7woQPS zZlCfAE^i#KkeeIFS6~%htp_8pt0^wvo68pNSfPzyOaxgOUTC!PUKqPDcVXoLtV@p$ z63)cN*wM+s#NhXT)@%(e;n)C-fd4E3000{&7YFPA%31zL&dd9sT|iMcCvjyb11A%} 
ze+&L2GhqDQ004yr0Zag(k->M#$M>Jg|6KybY;BzWuiB>cjP#5Ej_=bd{io9R1T>{* z`WA8iuju=@@A-f9;`;yE2LM&v?Mwhbc|%KOr{Djjjf{znnUgtyiSd6A1<C=K{%enz zg@dCLfc1aYt({C906;P8Z@ooKjBJff{?ow!^xrLuMVWCo;wa)T7zh+6sLua8H2-z_ z{}~%5_WxCkiTVH1?0*&c&uINuw}4_6|8JxJ>Fa;jWCF0Tb1*Xg|I|q8gw4Vj>f+Q@ z^`72ZaNODAd7i$SyqNwDYp;?a<Cti-;>0G<i~)fgzp9|7Huwo^X>8Q_2rIeh_7|FT zH-M=IHX}{qkYd>UE^3hf)?%Ns^COJQ*zBJw8%vn`_5Huje?qRR>gvxb>fYCdAP^uY zeT@2k4M!2G;MZHN$F?B9ZdE~M@7T0!)PAh(<Ur})(|`o&)#7XT4k72l>{8Vbg8SKO z_E?;^v;5O-$LrbC1Y!Ri2?5q>y}wV7a)7M*K(#dg?nS)Ib@m4FDu&e)Xa8mY=*+K< zh`e{M>*Z~@aKu*Wmv4Xy!h_-Uxr|v^gTJfyw<of8?0&p=HWfO8_-hrSw~i2Gg3d3G z7-V_|lvfMXi9d8V^5oqQ8qDJrWOr5Z;WH3n7QV;<1}t{e5qR(p!^jBhWJlJC(W*RO znvI@~EosKxLB%ZyyHecvvY4P+UBMP1KRXl92*={0cF=1kU~roX22=Io**M8pm-IU1 z2s5S8B;)vK(GZ38Y(a^xb<q$}RL7tjN0`m)IBO^vY5x})nj&0n+8iqraYd;~y`tFL zrEBHF12|t$UbJ#?>R7Viv!FR|#BIyEe-Fm<+6?)4Z1;B|uxKIR4c_i=Zbs;XP99aQ zW?VsR>B&L))7;oXhCgeQ^dXbiXNoCcruls=^7fIq3FUM>=EPfMd(Ni#x^CW8BYAEt zw&Gf$p$NJAOYo6Uip(9l`>WX$po6K>;j6|H_e?D(U*f#Hsdxk~o8D+%FZO1PHIOru ziG8gxNvGRCV_`3zi6CT#PUSc6X2F{{mb6WqsytF0&U6ROQhv5`t8&((Zcpzl!yPg+ z&qO0Sv0}HDAY1G;bL-U1eS3bpfe1!Lr`0K9*D_g8D!G3vUSFi3)H&1-l$-mb^6uop zmaQwR=ht(93`~=0O>ID!S=d2NFx4XIvil2su<?9n?u7C!h;DVra2ZJMS?GYrdZ2F{ z7iBg)Z)*$^mqY%5_4B#ERg%O@9oBBJ>nXRIYe?<87>n(t^D@4P&;B@xTe-CF*ix?W z_2!$(#g=>cTpL4p7PQjqDc?+C?u+Lu3zK3#Ardu1W^bYjq;D<2#ls>GbUfnFNEBS5 zHsZsPqX=A69Cl*l{rR1aO1s{4v+wr&Alf&?GGy%8q9#Kr0uVW-)0D!ZTsL0ChsgOG z#MP+XV3XQC^Y>0_IV?kU*RMfOaGaXXJmtxl8#6uOcPRMf`-UQqg`*2Fn@F<`u(-7m z9nvA*j(U>PuK0X%xN6YBva{7wWs5=YtLtqT>WNYF4YA#RB2&T3xwuAeu*n-pi(XnG zdMw)5&3i8#)p%Gd^3V04?T53Li8|{37^E*DKm0$f;P8}-sOpR^62qpnf<2AFvbX%T zdHt9-130Pd)`4h_ei8+_zCH&%Vrwf~tBGcAou$SFgXWH`=`3OTL#>V5t^hXT*kF7& zK|9UMCH(4K4*1R+cdO?3rr|=a)ebeA?Ko|RK%-9`oqH(NQpp_`!QusWRp(PpJugXv z1F#?h!bVzp_y{Pu-)9!Ibk<KzPO2p-T3y^ECyyiavpGV!Rn3EVD>a=pl~(p6h{^QQ z7-rOH_L2rxz)o*KHRchXQgV&-LMszmy$@_lv$=oCQ-qOwCJ^mbRtEvt)g$?+K?<Ih zLc(b4khK<Up-J27m#$uDDiuCopCC57w1+;gIHxZVStvG-2RTeoObjvMC^rwO_b(+O zn;>5sNeIKC$+m`iR>@rM0CE|F(1$!5T#$V&X$qf>(5GAx9%6JY!WQ@2q)w0%BkLGI z+-h2q^=vIRM-e81XHFy4uVPT2qjKhD@yG6XQ9I+LgdeG7)wjO*DFnXFEf=r5VSTfc zBYfA-PEDJ?{~a{BW$=@%^zc&O^NQUrbT0;sI4+w|$)a`e>;t|P@`KJqZ6E-tx``HW zHWdJdchtx5$ve;~r2ZZi<Bm<oR>iSd(IM&^+1D`Y-iRJ7MJrvVH|g(X#=Vy1i~8n= zio+yw9x)u0Z%)2*bxAETtV=e3Q@%4w-Lz_G9G!hNH|E+@l)rkQd%AsL%6vs=bND4C z5pWpUn5{+CpZjx`cQYyj-fdefeJvxIt!N~}z|j}uP)5v%FP7?yj)JKTd(Q&}RPiqI zh%_vuU|=qux>8hLe9jI-jAEYVjWokwv&6{ZLZIvfC-}TM52$R7%Ic_y>Je<<9}nTC z<<|v<VJeepU(6G%Nj}`4uk!qoew)N9sKC6$6@Qjg@sE$*=*r!6Uqus-m@XPwHVK+Z zl+2q^wPt=3JPI|yVTV(fw&52B$uUOB+dx*ex6}s4;JHXwm|L@;g@9L|f{GuT{yq4b zDQx;j9SLtT?OPuPQ*E!%HsGBisu%HPvCs7l^b3<W3`1rQz3*svco-j6Xh4t=2AWBC zM#CkKe87R2uC#YHN;-wmx=$~H<n9nO94mBc$OY1u$J7Eks)(~37eTbekh=Y5k81Xc z4dc;`W&^!w@jxk7nvlU+H12I*kN9hv8hEo8qCpZXDd|Zfe;P1gQv1iCVZUM?Q-%sg z(mw8~6_bB}=b{yM>Q$@gG9Y;cI+6d=__SY%HH5j|-1J}0Z*1Ae36E4!W^Qii<z}be z_e>8{hV+Kr6xynsOorS!=KJ<<<Lw*|FN1r&`t^u{Y-~2TT${Fbhj)n~jk|P{;~u|m z^VH%DIIRO)Mqrft0bCsG2J^Bj2zj0JqcjRaGKa`S6yxZ#FC2j4!mnPRgxLD6Y9mf- zR<>zCrff_;vx;DtIAmcao_`JE$4gC^%W2Qc;!`!;xPzalJ7Dg?$MVz~1wonpaKTx7 zH83G)u&!O}7B*=>)eH*Gs^K+jpH4X4DE#J_L45bpg2*?js5TWI^z0zkZ_9<hH&m1y zmBeiF>y_&75W9A--oa3sWsw{K74yhoxP!Y4Hv=K9(UG)xjQW}@MA{8V34SE)o>Y*O zKXf&+hptQK1;L}&A?g#vE(7p_=_ySOPd(%7?8vQUfY6miE_CC-1i-cC?MZSlx-2$f zj2h5cF93DvwQ$>gFoQIFYtnIzsE-qzYc3msinoy7`vn3A0;8D{x$61bZWYR3w&AiR zdQEOsZrx40=U=*xNF2R#jl^8NS4os6iI`?oGJ2_0wn$iSi=1F(CCm+p(Fb^-Nff>b zV^TlCOon4nHN@Tex+5>>utpyqgZ}JqtI+!$AD&c&Qq6<Cz4TL)Xg%AWinU~8z48yk zr@?=xnJF?=x{2CtP`?$jQ>UW6+u+pA?=V>UHo*(1wF7C*vc^Y9iyG)yslzj(-2PAs zLs>EJ(@cq(NB}QSlFNC2tbih^e@T89(*&f1e@DOH*t`7=MQenA5hK>7pfe_+1P584 
z<B(iY9s&2dXPmmqGA?zTkCK>ZvFbyjlhk3x8lgkQ0hCXX;G|?{h!K6iV?=abs+ZL9 z@+3fD&R`z|;M5du8xF^%hs0`gn%qqkqi8UmFuH$G4d0mm6UvvT4B&1P{ekgj%tgeb zhc-w>3a%u`$R;LGAJlilnV+r!H5_)Z0LLgN!b7o5OdR$Npz1E{Stgv0pHXS2JGYGP zG%{c*&xnT{Y0s$x6TyNh%nFF<*UXSbE>MgO^@-KAoyA9}5Gw1zy$GEJgWe^2p)}s3 z_K=JmC$beu@%&B3&Kf2E{u=?5rx?l&=faSGRq+^J^%Ozb8^I6f3&g)w2@?ah54cRu zB=ta%yJPsM-RT3+^i~2<wk7r7awDTQI+GEom%`Bb)v6FW=tW0~v54ojU+LzrX{4B) z-`aLpdPL7Lc}8S?*D@2l@GezO+a58I?n&v>W)`!9NA%Hm6YTAZGyq-N@}}>!zqhyj z2U4Q7j8K#*O4l!#7~U1zuD4d*-~q-?)~5mz#%V2@7sE17%9%*QnaG^{T!K0;{zS6~ zR-G>oB%Ib(85Ks|$1{xA=uo#mj%HsNgdHIlDpO}6K9$D~9P3KC8i#ho%BjAjNuo9z zI?6s%Mt@iiHV2VEr1=VOQxOb$?FadCGrW;1VNvn|Mr)K<Y9rLIL+G1Y!|VVw)*X1V z8m1Ks?1O%{J1+~DoFpxAdrP8rq_pKfkL`I!XU>luP`}7(x>^hMWuSsz5gBn04-7mv zS+}s7W(I}P+<y9@Ce}Y?RFo;FWe_v<>ylk?AmNBdc}AB;XWjbU_qJF$!QER`*ramL z2to}m!oVMtV-Qq;_i@3tQwj{6|9s_2R-(CN3*Fof!qjT(;LGebmI9#Nx-ghFg&w;V z-+<)v+ux9O1av{)?EdSXofg2hyDwuehr$X28-$M!B~WWn5>cjpL#jhy86RW=pILK< zx+SR4JqKw54;hNOpmrnynGA-;fs|e8OBVn~>4L}cqV@}BX>J<sLu*Oj=tFd*3uWwr z${Nl`MS61hBsd$Hk~o0kgy0Mo-c)QtVZ7;KrK-E3a%)Q1nJS$6Kq9(Lg_&R&T5u|f zs<-&CPX9nuDjH9M9_h*nhpolO#&K|ouyBpoVj$#EKOIo<hDrH73wrA-gEL)YO7wYw z_yGQLUz|)!n+goIRX@G&AB#<+*o2ye!zKXrpo7AV<ZiIb9^QA_qWiQKRtJACL844l zPSEP+|7ZGT{AKF_dbsDb6?60FVIiI@zVLKq;W~6D_KWf}G$gig*xk?pWBb*Ml*C`r z>FgC<Gwko;2T4d`)+`s?GXb1=xeOm8oEa}LJs})>4@w`J`{AE?h^~o-AlbNX0W)*k z@v~702b{Pa52~AZ#<O8Lb_=l<%>_yvV}wubX*2I;Ch^2KK27j~?yNO%BW@~@2QKi2 zOrcm}tF-8DxAgDr-7c;}KOR}gpWDS31&dDJm|KG^rGh_BlsF688r_13m8fhL4ZPTF zBZeL(Y?F0Ul?Jl`Ruv@OcxTQQd>wcpGX6^%NO=vEuVjXlB)<axk}#%8t*w_CAJ@Ul zIrR5i>#PpU6P4&dR>7nFX8e^FQo+0z5_4?x`ZNz8YeP)b+Hp6cW5rxCV%0T``TLiv z?<&63mTUZcquEvTC$g>n@pb$2UC!TU-gSqUZ@gAME*2tlv^F$bHz#HJl_UW}4+Y;- z`k&{nyox$I)FJ=)!2ZQ8J=pB-D@M>TA1i>=74)KF?C$<U#sGnUbc+D9Mucp2r1oQk zD`JI_wqw}@4P`>#(=hh`i4uRll~!Mu2^%@KTax@@L->WxOLUFH2^$CX)(5i+uL0Y` z0Pg_`-wV<#qD!h2_0kjphe5?C_5nl%GlYfi2Ya1)@_vG9JihqB9^0)Z3G{azcvn-w z3rE@vmj^q7gqrm`T51!7AcOb|)e!CgmUu?_pyQ0bU$ed$74z+q!`~g39EWfUf{oi~ zy=xpOYMDNyR7rnuSkVm5bfWsUm45#iY{fK0aC|JwTq68Y$ZkE{xHTiJOV;HUl@5%< zzCr9nOuT8Szw+#)>r$_|jj5Q96kS>gJ3YZ6xpqb}RHBAfJWcOB)>l`rX7=`IE^uGp zcpbt+2JJ}|KNJU5ScK{m<GWU#lFwN@B{e`d;i43Ii412Zxie86KB$pqyW&2u?;zW| z$3d<pf?h0e?ySI4Yl}O=4e$LL6)+}m;BesXr-|Q&nIEMQ!jZDF%1Ke?t}t=@qZLxv zv^Y;>yPb`&e#DWMAn5L+5$#z*E9hJ@XVetED^CM9tB+yb;3ER<EDFW%pfBc+kx{Io z&yE*O8B4RjtR%`|qWzoj6_w+{dBEq{&vUSA#e03K4LNAWNc%vfh%0b{WW1}cB9dSB zEh}8H0%Ss?iSkz!rm}y=u-SX=Zz0AtPG?IaqvOR|H?0zN(@BZ<vTQ1Y5SJ?khr-5s zdrQ%5g%pg>q=`P&XLg!nYYzCjtgg7VSV;@NHQmG}^aAuqI5BuATK$sX^bGyU7`h`( zh-MTJG(J@7x;1;?-KAP3=89h*b~Btn98CF8%h*DZL)Zhl2G~|n2ra+6|L9-q$UXrW z+WLkR=>w-eCKsh%FCbeA7hlNVB|JJPf&rY4XYddN!2xqwXvI&__>V9wk~(tOh&nxe zzgZvxyeFhuYvw0bqy_pb4X7_36XOs}*xF@2;7d+vNH5Yq){CBDQf}UuJv2ZSWRQDH z1M{oQRGm+ap;YTs<5r@a_cC6+eImza*48(3k2ivt_VcSi`5s%?Smh1({@&0W3afRD z;>gZ@y|a!f`}vk=i+Q7XI=SZW6{apIGq>(+xY#sP>`w62p(~zkstuwS{?7wGV^8=l ztJh{Hs879N%4c|up<cZf>F0<WqX`Q^<9z*gNT&9}L@H2zccsjolFaC%6Az)tS<)>_ z)*zQP7k8qjpCz6${6C21FIWD-d{S{+@D{d2QbE83p@ZR(Q>1sG@>;w@>aI?&3of4m zgqz1i-HVMvjgZxtG(!iT;j#8HdF^|3jl=r}#3Edq+8p91Zz=dB)I{8bohZFTBhhVK zuI#Q{n5S7>-s@l*_rt;(_HAyx<KqV_8k+QYY3}wHOrhajP+JW%Sh=j0C%a6c(kL1Y zP31E4)1yQ}kLP*GI13{iP8WA)zkO%`RRTuQRK`*{$pg3O5EzN#kh&itbR3z4b(`W4 zVn5IkF==FTnS)PiSyBiO>j^@j-l=8tExGHj2|&TV*o0fn7c(L$y-<GN2ozAOzzaPi zy;P`opLMM#cY74q>Ykn?n=WG#CX}8iR*9nX)q#L0>(#wJJU*_ujqh#fh?w<F>P#CP z##SV4aoqHTSgl<B^09Ipy-kEB`gh<<=yTE>o_!-JcUNVh?Z%SvHoT->+cw>O?uyd= zE>8XNa2jJ0PqQBF_QOmr$B=6>RVWYFdFh5HCI3%4`8B1+RAiE<!U8;UlAp4?RyI`d zZg}$9s@R#M{mR36kCj-3t+H;(OZXiDAVkUw6J{(3QIcKOOiW|1x4l5l+|saY2=kow zKqI#1oL0m(WtiHVU*707Au1Ov!rx%D@XH@$5$qEb0|xiT3PCQSO0I(T09C13ET0?U 
z76nz5D^YHq@8N*acc10-$61Nxq%nfy_pR0ws)&71S1Iv`yMr3nSH@)~$0xje-mk*F zliTLs<=>G_IDIh@WdcFZk|hW@u>J-@bc`GlEq{kFr~yfq>F|m13I76p;)(WT>DM@N zf}?>u@`XT0#6dT6{_%6Xc%1u_(j>D1`a&@WD+Z>P4bQNE-X@Cy`Zo)qf2(6VKTaBM ze=P1QC3JSP0=l&&98(AiW0_cm;Y5tjah+psA}p79*68bqF;IR?l3V`uJqwJV>}O$x zLaEZ$0lKhBfiyZ(>7pb`fwA)v4q_RWO3>r5QWZ|@@L&;`5*;5!P#GA08zrOyqpadk z24`qp6dcsvtbV$mRZ?vO1wwI+kWSZlcB#AeLfAK}6VshIOUUt#>)kc~=DLyM=5`XS zR%ZRvr4_r1T<PmHTe?Z3+Qm+j;pYWBpXZ}0XX$&nZ>ID^-OTno<x-zs==CH4^^^6> zt-Oz@@xmCtx@n`sUy^+TJ}?w4M%mteKlUo-YO~|Xp{%FcY*l;0gElP%&DX}~RXdX> z(*EldgRZ|^RV}T8U|CCewR6Ac^sKripKTtDK12~Vr>%`zxNM#4vke1|)9R@9_Ha## zVMME33DgW3u^M|7Uz(4%G+^<AKkl*6Q!MkL`ezHqpn&F!c=5}tJFP9Z#$+2OTyUkJ z#BHk0UdF#Oeh>axp<LyaEch)@@jHv&RX(dd>;!-7z-iO13&s(bxZ*AU*U%3K0@fQo zB7X;BomNU;6)NZtLH_+}MP<4Qc%w_9yK1Nk+!fNdRm+pq<RuNSJ}M#WX&M5REk7g{ zGL&l!MV5ZV+cY+y__{+}m4@sRz$vutI_28S)5!Sn+_kg*D}>`p<MFd2|L-+E)pp$R z<n)ww`+4Ptk#rlPqVi0OPm|Tu*r)u?>jTBiK(7I3OM|uH(RA=<PY2`uD)`KYa()98 zAB*_S=D9!Yhm<s@>@1jRIxBpkrhXwRH$<4*Bq&#vBMu^EFP<!2Y`i<h8=EG|WdI-o zf%ZlWHArUmfz%7a>KLk3oW(wFC7c)NP&GH&4RdTTin7N&mA9aY(;B)q0x-q+l64$F zONr>}1$(FWq^F{f3SNPrcAFI6(`hc|3`~cad*}Z4FZtZ%i$oz58<tpxB~XTNSBug< zB3{WTfBLD=W5FwCPH0%-2gKB|S*yY`hQf~$aDBWCBG0m^^XZ27vbd@e1)PVZXHIX2 zcXJ6|9>YwI+`NG0pu>vv&Vg5ofT=)pP7A^%j`2w8iyuQW(T6l?_*&au7)M2GN;bMv ztRlGhZE2A2`%%4daT@$}?u3$Ee3s2O0gVbGLqr`U8{M_QD)7<k#eQJwq&E%6Gqj1F z2y{zsu7M)i?!a50YCU4z*McxaT&CJjq~lTpkKL}%aA1Iwjean<?wm_W-6+qnY$k(~ zy;MrhY}PCvmXZL?@I79dcqeY#+pGODd_=bC++VHQ*WXy`b7EJLahar>oGKLO5BRB< z(S5u$-Yx$l^!oFY*0<8@VY90;Bqw+%mZ<wjrEXo_9n6xfyb<UAZ5CJGXNHdBvk}vK zQ6{9#RP-Oe4G?DemcEHyr@TN<LGWc%Nu9w*MxU2!w?`nj+$Pwr9xKaWmP{xh)P!MB zU*rq_icdaq$~4lH-k6?;jtoahBXYzULv*)%{h@@rqxe=?Y3U<qg;;CGJ%Wh<<*-3j zG9O<0TtmJV{=9V0YiZeC67o^r{w%nR!-M3#(AMXVqq!7{Xd(4%1LEh%&uA$|k#2jp zJJ&Q@TL|S@B|4q$jUY@tcxJ}leoBD^>BnB8qDGgf{<Zb+KQzB}hev9hY9Fc|Y94O0 z!dU``>ye%;2bM}Ir?lz7`}}7GOaUFzol|uSO$%O;CT3@nPh(l+nGPkL<tU?&-=sKD zrV1>T5XlP6Ygw64rTJ)?kH&#(C^>H*kil&wZPAek(lXM;6k`<S0x%(QK<pnc>sW2r z?pW^cTEH=#gq|Wh$&EV~79&RT-%xtY)(YGul#PVKcX7Yf4?*|*Cj-~0Sd7Be@${sl zowKg$^vLCI{ijA-kh@_~)$U1nBZ_+%rc~U@Acaqk;($23cmR6$8DSR$n{2VhEpyh= zsAyC6l0r!05Ok26a4gEKy7Spa1$g#D5(ERopKt>iN%h^<qSxB_hFfOdJL4MSbJN&D zGf>KC%O?_{vpK9@T%X4e#u<`Uj&NFUvR-|DO$pN_BlFg8o_6fi{cE#2+F#5m+#MsK z(@{qGj^>6i(wM5eg(+uvecKjtrITsAt^N;V>Gssoxwxgx%t=u8U6#BN#*o^irsng1 z1?JHv`oa)%gXp>!Iq)5O^{u6biiZiB(h{Q;xw60wLuLdme-_w`JBk(I+mf@x(Vbeh zvxPX9JkDoeXkU>{r(@tin9xmIiUn>c88@aFA`wIQzvr>ZXt0(%GnCSXmE~wAvjx|& zVT-6@p~Q+BI-_IsnH{B<{qlKAK-80!eSdj(b^3huRKCwA_V@`(g7?V4(`50!vca>T zJknjid+5sXJA41G!o$XhqS^7fn1>-J<$S&SD#DY&=fs;(9nN&OXXo?r>;C{rK(@bI zmbb2J3jXlwI~U#ki-ISVsoI`Qm@vM;>;nmSsIHMV(l`P><g&)8H9hVQ!CHva`4L5- zdN@<49?pa-f6}Z(QVr37rx0~NhR%77LQrDdPS1;2h;@S)@d5S3H^+CxnRr5Q8yNwV zj+4kCO!|omSggkl5jWo$v6o24r1QN$Ve+~des;5NhYsoR3*0;&DHTbrRBR<<Bn~K^ z4B`*e=0Peikb9v}WAbx9B7Q%}l#U-SV+pnD@b+n&6{J{NQL9Bmn&l=|&>G>&Mi@#4 z-EkU+b6YqbSEk2;@tHtxXtdfD39{~CV!Duw(AdXdt=R3-ujHY|Lp~j9gpI%&357xm z99BjY%%UY_zcQwrP&lPO`NWl9^v(>O8mt;Er17Egvb7o`Qv~N3FNQF*_4PqVRjwcs zVlA4b7X4BWeyQ6o7F@NpeQrJyUh0q?%~_l2!fBP%>;{jXGljjOxE?x~eLwrzjHY<o zEPL|O%De?}UnCN7(m0hhdoS=)BL>1W9<V8H0oHMv;B`kp8Vm5>=qO~;*SN3tUG2Xi zF_6C3FKl(c5ILIojqf-9Uq^UPTuLWm#b_};HIYp(k6#uaNDrqCuL0=wC;eUiUwMAx z<DN{wpG4ku{W|h){GG%fBfMV=2NI$=<P+5f81e~f*b?$N)G!DrO^ty>DcloA;jrLn zOgNnmBnqMoyplKT)w}~<&O48WuS_cyKpJZ4{b{rZ6PFX1<TOE^fFz_!6{Ln1*kCbJ zeauW7&0cDKd9*n_3a@=$UA4wdW128A?4F=!iCC6CEqBkrYHN&X{@6#0@xIz(S*b9I zmr5?y1DX7hq|5J)CSpk!Av}J3ihD4riTW^^sWEpfD9&57Rg;1ZYA8IFZ3rq;fvSW6 zq?8AwZlnsq5Q%z%<rE}~oU4a8RU?_g1+~>#1BAG<CRCE8hmZMV-T8?Zb4#LjU%Wd9 z|LgG2M}G6;mZ9l|F9kN-KlheJxn6XA<(lD8Q#3jybTzYy)VdCAd*Xz7j$Z%G;br%C 
z*(j}8RGE)%!1}EXbZFuGtlzPs+l;%cNdJJ&3Lbzq2!LMqq`9FNgZr?3QPl`MUUM;0 z*@z4tIxw9>1qrk}cpg(iOb*cv_kVfh5!m|a^^abpdgpJiY>qBi_jP#tTd)mIUfDSF zlgfRszP<1E?>vOzFFJXO`3>gPJ`jSJYL|$dAiFu2%*tjLZ?y=L*%dUod8<*7+@_G; z$6F18<kNFHz*|Kf5G0SujT|<{?lL=FX5<JkcDKok96r6@r)TUC=#YbD>_DgY`Kso0 z9afQbW|xax;dk2IZj;05^mx2p2oY&?F?O3R5b*n1mU%(IN_;RF3UM4y?llXN#bPuX z45BFTW|K*;*8!K??ehV>WVPDuj*@w&>$|3%#Da<3qsXMW^gDgJoxbma5)!bIWI+&J zqAYui%q8_JNoTM$bn4UcsfjaZsLK{qJ-6qd>A#83t7rbF)}35BMU3xhHz_Rt@<*&T zE?c;evtgyqmSf0~bA<7sGGUv_*i@T(Rr#40XH5cVTd{Bjym-~Zl}{g;Jy|(jzIa9D zyDP7R3+Gmz@xvc|2`=r0KdKbT(OY@yOVt|kz@-$X(io<eP?#!ektEwF&gB>kOjZn3 zHUObcj{!>Iv%JPL{IH`s5nlu<Shf|g79~3*AQal#<U&54M(GE>y!(;Ln}550Pq*sn zx`A8U*tPEN%C*0&{G<Y}jL!Na-1zD*_uu}+Lv##HSjK$@%eX<fLAyzC8;UNs|AM?* z!zT|pS%OYyQs@-s2~P>UrmSF>iz{5q-5bTLtyjyB82-)tp!He9v*y>i*Ihq$zw3I} zeNy=c`wy4H0Ryau^Eo_Dk1OC7bS{J25XgJxcy_sVD}vjDkjv{adUzA#K^*U<t|%Le zb+8SZPN&(8CBjTbnVe>nIPY!`+~e8jL7t;b4hzm-dmbWV5NpR+A?H8nv8}Xiw(YR7 zwoxc(HjS!Yilz)JOh5J2d0vG7z?zx~YMOl|+KhIf-RNcXCVC(J8HvafI!`Mrcc1F4 z>NMqU8kj;%8YEbtG%;A^yf5t5!IyP!>acn|*q1s*WH2h3W2%Rwn)<xm^ED4qpUs^+ zC2spMGjVi~_?0rG!8wpo3jkErULo98*Np`vsI6^8@>zNIBwPiRN3UGFCl>X*`S9cK zW#>Qfk7;oArlqsJkgI$ag%`pHp1SdgYX*<J_SVSy_1`*FIXy*cX~r}H>mJ+&%%c(@ zkC1jCLR~;nO5S%iK>60#tKnChnR$`#G$VJ&m0lNN@H|Ij(K%ol+>ACyu7}s7t3p>P zTN<v5-VS#v4>UgmpNYN@`(E>8#K9}Kz&j(i#2<=04j)5LMD{hm-26`V&&`v~CK>BG zUL+?lA8P4HcVyQ^F3ac}MdbHGN62SU8$dMS17gT$R>RJa&##6x)D(?G8W6N21S8KP z1qqGG$EkVUMTRR#La)#-FvLuZ1mJttpC5%^*DS3GOmL9JY{rWw$||vbE-R2io7GbQ z>OK|qsC!kU9+KLi);7>~qK#?Gi<I|?bRI;?`@{yPgYrHHZFErH=XkWR>In3JnsKYn z_@wd>mQGSB!uK*&zL%--y&B~i$FL$7%R`wIp)ww?G>#>6N?o8_^kP0LVc~o%Md6_( z>n5DBxg{75MVrH!9Bc{VQ@Wuk2f~r8(n@6Sn#@gN12Ol*2elPPsRJC2PM$tsFD6Jh zpdEvqhg44=l8RZ0$lS0R14cp!Q>qH90ek-+OjCeZgM_W;Y%v12p>kiPkW)-S$sb!# zpzPD(C+zc2?;PL#?Ps97|MtyiFR=M_KX`S|%^hpdHUwd1%NLnw>8WeBkH#w3-?q$% z?txF<v}2ErWItR`na%tL%iL4JJkZD7uiYd&d+(1u*v5cnX$9KSxMd*%jeMH_(j5w0 zYVTRGx&4~hz>3}MZtiB+E$-cg+o#<;Yj@XeJ@>lqbwAiM${yhkx(>Qu&%fR^w&KK! 
zlPgZI@Oc$SPAb^jLMyoMi1XV@KHy~9)cHQ(nIT&xi`ir}=yf`q&8`!Nqfm~Lh+x@{ zv4mJTG36npW{?e~J<)y9m!nK{6h3-rS!!5?7|MRDnMhxG4+gZCRYt8nLreUKiu-Er zk@;{wp<(ki+%&(5=mYb6?a)37Ma{NZggZoBvEtW=1^&SqaK<Rpq8UB&^%)QB^$cS= z_Y(R!;DL_m2A%kpo)<h9!;71mEZyH{vLFkB_*?|tOjZj?S-3g7JG&>#WZlFEmZkBC z*+MbR3@?I<$hVm=c;m`X4oY@f`7QN?E+TP|dJ|R)7DYn|m>?hRbb0f;6R;;SkQhsx zNU#Yr>6xf2I|-uW$OLB+*C;EpE3_4R@tkuj$fJIPF~7pR`@Y$5HjUez-J&?5#W~=7 z)5$nTC;w8_jhsf}ysO4cI!Wx7?ZFZ(wPcxI2K6!!FcLAL;i>cl^0dTnW$IpOQh9-V zBXimE6-VK9K!y6}c46H5lmI*yO~;2OXk|P#bV^DMeo8M=L&QUq8kA09gf0(}V8g28 zX5vHQ#wlSP1EYGmj0@5qcgFhd!8g_SRaEXvVeK1ZQHr!g-;d&!A%gHVs(+61*Q12D zbuI0f6)E@wE;r<2(bkq+OTLBSr^k9?>1bnYNpz7P`lkl{psUcWfD2(s0T*y3Kj>}l z_Jc2_7AbJ1d$u1gjxY7YB})SxKHS|m6)b3(ufVSPg*FY%P%wL&#&-JQMVX8JU}5rN z1<Z2I@KeKUs;;fqPBpE)_ScxYk^Vu<iAX>tbw6qPw1i=_Aju@U`1EsfO=qv`;?=AI z%$@jfxGG@B`KslX{5xmUCBeh^AAJf|RsC8J3FObe!q<gG%a8B9x&H?#GsAO?C3WqT zV~@|A(-cy({(+xeP~Lp`BcHu;YnQ=V5LV_>Md+Bnc4oeJ!RlGL%HK2Dj<qixd?uHF z=y&j<<Tv`hdQ9VZoy)7|__+hahwZVV-Kq#I!|6-|Um9F<_tMrjw>x^FZcV5q6uuPg z+_LS_r56ru+q3+_vp43JMYEA<JLcw{PL{`fa~77Zk6^lF0Rd#JSp)obBTwVxD2Fgo zwqcY=dV$f)5~w6MNky}fMwWJCT~W~>ML{(mi0bjRu!qkb%sO(P^z)L3eKZ!6wNvHY zZj@+ToJ51-Sem^+sw^8c8mo?Cnt(&16{O>-njGlUX34`2_#NkA=d18_-Lb&CI$r)y zJ)EnX<y`8x72c`aWqH>pgtXQIOC7{}Lhv=m>t3XV;5@ND8kr@SoWk(X!$OUPC&;;% z?Pmwr5q3Yzv!56VLDP(TjL3M-H%cPyiMKP=wLh`2Yk%*>%bzm_=RFr<=Y47UvX@A1 z0bubFnj9mEKQoqn4|thYzydqd`myw}@4QQlnSGyIlG|WFj+$dA>W}H8e9UUGD<A;9 z3Uuml%`M=X%_J$%$KaF0;8K8x!>4mL<LV!ctslbp4~Y*}vtEO);kW6xnYYQ;Ij?bF z;}^?)<*MaM=a;NSA3hue5e!vCAfXWi0r(o2x3#$%2t>;@fJWfNS2k{WbH|(8)^C4f zVPWHidv03wmCNQZ`ySo7@A|XDkKgg#ul(cM>7_?+_;Ka8dw+Q5&VC|u8!;pN8-}$U zI+v4IpfSf;Dv-2PRdUhd43p%xG`R`~B1w_jp+stj5~&?ZbVkTyPAa=NsrS@bC)Un- zqKlJI?HK9q1OfVRfW9a|UlB;sX!#^HS0$^V=d^N~z9m_=ZzV~Vo}N6B1b!6B68EdF zMI+AGmR8gEnGb-3j|O~1$(@KL(kghH5XFK_2FpjEd?LM<8vk#$t@ASBh>5Fi=lVS6 z{?d$-ax}^_T4<dPpr!hLOGKo2BhsKBk><aNPU@s}(k3U3Z+1Ehc@UsI1GFJPZwBax zQa^4zLMTbn;z*BVvXDP7*naE_(Wx2Y*{xa22(<aMj)K-$5DO&SIa}y0^cMySBL%J* zqY$kR<E#Azet+Ra0qrlqe%vrtU;?5u8MKTt7R}m_Oa>$K8^mPLJU<*r2E(I_Sxbjo z;*Ha@!IqhR5N^%UFOG!67K>T$az=y^5$+eEMH~?Kh;NFlNI0r5nF~Z3L&@G`e{vwn z4kt&F`;!bvN=cNYfem(NKG~mtVh7PCQ?*F^iE3GwXGv<>xlrhuB}aC97@m!K7?&Sn zg^lyp6VixL7-QEUR&Ry~#=!rcidHkmVbSvZ#?S3@IoS2hySp|iPP3uq!pc;emeaG- zyRY41Fq4!r`|K7=sGd<a{)4V1oi|jrE)9A7kw`pd>4Dd7AH1m&C_4kbNMz1hxaje@ zUShC=VE*LqI19HCL?F^G)CGy-A;V-s_gNkce%tbG`LN{$*&qfX7Jj!g*E_CrevP@^ z`ESg9-e;K?8J&?avnVi^>0`KzC|M&uFbcUt$XD|m99ACU5`G4bqW2G3Q~M<-jWW{@ z?KbT(A=4<6(K5tCfR%XIDm}Z;3PaYC6<NJn490YwiW^$oAvbbUj^dsdUAv~LaFmCr zNA1(0!S3-hgIG+N7(7!x^I>VcZrhh$r)!}&e7sSJdSix|Gs^pP&A{luC(*+-L%qw? 
zT=%F~6^N?-0BvDvbVrzY)#5+cutI2>9H|=JF#~7+JrtVu;Wu~wdi$2~2X6Vv*3i1j z=@%>ej@*72mR|bC?nc>X_Zqm3mE4<$cU9hcf3)(Kk-;bJho1cB(X&5=i(Z`TwE41C z+xptcQ|OnN#<j5d8jXwRsBJx`5nhQPjp~3@J%tvZSBx7;dt9}UkVA6MLU>~<xo>F^ z#JELe<rD|Ea&Qv|xo8GYZKL41HVD@QVKAzAVZV34i@dS{l#XF(u3XOG5*K9>dO=7I z=EBF{IxfAH`eIH`t7?ghY@;(Mr#aNvB2-`Fk-Ip!k-MHlT(nV`8H8(tR|iorDjOgf z;a@Z_AsQA-uGK4=DgBAdq=?6Jt+hCiW7YB)iR3DiLppY>T#}B_tTH?(!nKpSCQlQR z<+NrfHYE&2x4qA}Jod13PlVG8`h-5&pBu;x=XgtQ6e`+IOj~|x`l<O?<XH4q;kP62 zHhsu`82&Kwag#wVHI<vLY~J3q8}3HCnPJDUci1=V-_^W3Z6Y2$JqBpruW$Ns!|P$u z&p7R}-x=^EeN7MQ9@Iaqd?Wmgh(S)75>4})dU7js*CnrOy3PD#cwg=#=0m?RDYgW` zODG6KFar@Bg{cGJrSvHDYSu<~(DRZn=nZ+H<W=y@k(-{EoaAPMEJwm718a%Xf(ybQ zfpn&^1pqRs-miE(ZjvErcV@`MqBmp+Wg>Y0ndnuFT{8?25Bh**#KKrcVVmZOdp+q8 z)}5O6#39Kj8IChbJR3*xqfh~@P<gJa?noehGfLJbNL<#W3d?=P4Ce9&CLyj81>n>f zlKDZSBu|~^P{T5U9#hze$zV5`40ZQfU)8@>9{PfZ4OeRxpR}Sg<*~|LwRtC!Aw{zC zLddH6A)gfeK#{P}FC;iWtYRjW!>J$bGvQO|Q|o65w%iAYs1t3O=Gg=HpgqhU!$YPK z$B1{tH{yS=;r{TRW+N6>Qbckm8i{7egfo#lnjVfk+{BgphzMg%D4wD&;VD8*Uqtxu zRYSzQG<sHFOyi~|dgzKqDJYlB3ONyTg0DymPcc#rv<p`YBQ6fxicM}?^<8rHT^6kP zXfpl@x!9!0<eAf&#e%z8ii~8!zhfdVIIYPh{5vMx2On-LJ^nQX@r!@3nv+PJhLl=S zr=hE+c@Sn1w&v;v{z&|MhannKW7l3Xdx;WSdG}9Wx@OTP)!{O!s{hf|vzD%^{I<FI z;p^MFb5=<<GW#k&{>J6=o2Mj_={ak@wf(`M-V5j4dDq3oS(lD<6qgP@;If$An1(%B zxg7Ol{vm)HHMgdNK+rKP7h{kl@G$^ooy+B=k-8d<)IHEaieW9>UXU1t8DNGPc9<Dq z&|U^IJ2`$oguSRAA><Ktqp;<9_2<i}gJqC<z0OpFjq9o6-4mT;j|W&2G0IT2st$ah z{>of9TKO$(s9Y|<o_{=o$Jtf63^8<^w`dVfw^xVybr6q~7h(`d92-N7T+%@@TnN#A zo~sU)6j~4umQ%O^k_$4-jc{l$rmj1)U?0|r@HoE*TYw4{K>+AbiB+_g6LXP7Mp`~^ zsMV1UmTh>L1$=bjan8l#{4KZf@0A{YE#(VWR-S<mfFA=FSg6JOP@n6VlhL{QJtsU2 znP8T+h%z`VYeoaxVR3{U!w$wV3L7;;$g<LcEFSkGkJT)-6J?6Vausi@X*sq+8x_lm zH?W?+N`U9e^@BP=Fhpg0OGj7Rh3j`$o@r{>-D@-H?7EIz%j~OGu79q|w^mPn#BAjj z02Kzc8IQ40?h9TP+`;YOcLeTazZMV*sGu%p7As5Djs7j%R{u_PyZ3hgx0xq(d&4Kg z780K&S!J8U=@RWGl9vGenpIKl3d1U@*XL&hH_PGHJ<ltOYC8)50o;sD!-4?(0H6<4 z6>CD(SgtvThN<Cm6#iAyYhkE``@<-V>FPfYOK7hORq_%|r)bh%2}z!Yqwrq%F*RJC zDtAlcQkldN(bSStpqf@$8m9>*&|0LC17bSGVbTUZkNeV0LvRQUDL28J&`k<Y1Hg#% zK-D9|Ms~BjHaNfy1UQnO3<XtStE$m?b6t<<!m2)Gwq8`(&<Ax7-@5dci?7<cZF4&8 zjc2;LuX*m#JFfU1WVr=TAC5oz)zOWIhvV%FTm30X%|Ex}`d@Z53&=v&g2aqJj@QBf zUO9qw$dk|8eA76R@tndHj|e%YQFkHGO#P<4rk}vq(L3-Rbkbz-0%(B1q?s7RvFs@P zhUR6Eona8mm^f`tf%^dRxZpp4c(O;~gNOGTpuuD0j-rnMgZ@)90#;&);=7mS*zcha zfw30yM7%GjDD0gfeu0!Uo+?pa(oXaCV|CU3YVK<O7VZ|FtxX@vTg6n(raE94s>Dkj z{~7vKrE>t@Te)K}yC@gn7R3JfefCvfy5CTZ)w&pe_mkWPj3aSq)N<Fy-{pSQ@NS&l zz;5NXi`#V98m}{LwOy;+A>M4$>%`qjG*#r{ZZ+=am|&Cz0(TUy0dA-rjFYtZQ8+`> zWulv-C>jMp67Xf_Fb}`;pvwhJZUSvyXn6t1lB~##JPN<8$snmEhm(wkb%?#mlSwuS zN$jDDyJ`BD^+@kYd?ANvyqd%`QH8WpEvQgBQ%#|-`R^%jZ1hD$t1%{Fr9K=Bnbg%F zU?IU@BCaWh;A-fz;!}ezdVVNjU5YKch>xqio~>R@U_u!BV9noKH#Kd#>EypB|82+a zH`ZPM+IOzK`?s%s^ZO_#U%0iq@3y|_E7M=`N6|GfvhU09y>Q@;r*41dvkxj;Z@e5G zx#^-+zq@YlqrbRz3EB0B0XJY;a64WH4~T&r+@>AvgHV?X<r?FgbK4q*4Z}tvW!)4V zj@_Pn%Kf<aJJIKj2fZ)EUW~u0f7S3SlT!eC$eWN?7k8Rm-l!>R?t*v1n@zWxp91Eo zpaXV+E;ui-60V3}lDiyS4lhINW0%J_<gSM|#J4ovkRx{QVPROj$$FE#+rHcR0DGT! zkM%zJVdoRE@5aBI8)XlR9~=H){Mh_){NvW7VA91qKoL%9<z|Y&=#8`VB)Mp`AI~+D z!e$Cg*Fg)=;T0uE7S|=XPLn`ED<Gn+?k$`wu!ZpV@C^n-M<cXo`m9THjkp+>Cw~<F zsYYnX%AcV`21`@b*d)5zuIgI1rZT|>tCJNSQI!j?1|cD^8aCM*SA(>i#f!`;t6`7? 
z{H2`Dt3k%vTwU{Xu}0NJC)!Kh97(+u&U4}bZi}|n;zx;1(pKA(plW#B_094dPd)bI zP0#EvF8K9xuWVYf6}DWbZCST&xX{wJu=i_MY`Q5n2R(Dk-X*uZd|+t7qZ_|^(YnFi zKi#_Os^!nUy=i;TW!G-$$#2M1{&V)@{Wm_mZRy<N<u(5r;SWxX4UE=j@EHQPN%u;> zl(;R@7W+==0o#L)*L|-C-jYPORkjC$jNpJfy<ZKYgvf_{08@aF&!mQ3swb2%n@z}* za5{nL@9dFb6$P`hCUf%W<ZlntjIMd%T095l0ZJGSgbANv!m5jUv|QBY=%R|G3lo=W zbUkh4sa1%7v|-hne{pBk;y9?fSx<l-#|nQvX2Tn_IHagO7PR=6K(7Oz0@lzH=(Bm2 zoENi!mHwgfVD5_^C54q8SUOVUco~3I0z_iVEr~dZ_Xs6m7GChmv#(UH`OS_cAHmkj z&rUDDD%!4I#cbN4G(~T(eE%1f-+%wD)qXe|x}XQn3{-itu`-)E!`uT}QA16awCi-K zMh09Lhw*^S+o>*TA7wr`Y^7BzsUliMq}ocWt+@Ie;Awhg>J$90lsQhcLzwy8G8@#T zK)`NA+gf2Oka@s|w~^nl*zGw0^0}J%6_nm9A1jrl_sS<uj8$pdeyMBGvX_8w^6$Vi zc^Y^pPd}%ZYFiyWqa!wUbL!qCvgOmxwQV<ZxAI7*<7CkzdUYwgH>Qio5pOIt1-8ir z-yC^^ZiD_Z&pPiK--f2^#I5?Ro@>2V`>t!+rQhXw06d_(-+N!`MQ|ejdp@kwiK$dm zW1}95SV6OS>_Ho7Y7GKeZw<;ZQSo@a*+#t`cW6qbB0ABIXAVEo=+&`$u?d$R%tFMl zO~%?6;Nvul1dleJ2^RyFJQf)}#B1f-t%vXHPwP>?en9_cJ)_@V()H+8>KNU2aTJ=h zK<aIa0xilO1u45%Ho;6&sR=cCa`~so_DEE89O^z*9y~R1rc82UCTenS_r$4G)#!9C zqEW1>T<83!cz&_=Mck+V{VYggr`TDQ{Zhog-q(j2ZIuz&Y?KbfFjP8-j3Pk(ygf~b z%FqspoDz#0;WLirX7&B!RzYk?!NzF9t@BjwXxn%3)CKKXwV2Qc=R~GgUa+Vh$(3X7 ziN*u*S(R4!&tyW@8BEbA>sHOBvsd2w)tOC=Ij3b>-yZaQC>=IRMhd^PF+ZQr?86k} zuLndH)^{#K-Y=jl#bMqVwdjO?bwEWb&1S<mt_DUY-=f)Uxbl<3#9SL_A*n7TnXOzp zR&z2OBTy_;YH?L#a~_0=o!@0z$|1jP5xbCE$S)F>`Ih;G_1qS27!0cieXlAfl#}51 zoUR?tflJ(r{VT)$?tcFk_mKZ~`7Yatb;SJyd<^XiKM!AlKN5cA`B*&V|ATS{x_LBT zUMk-ax<eTbpAHLF1%7YxBv9}Xnml;`1OQR&vyuw?)nOF@RZ<nTS0&#yqV84qt7Ga3 z^`v@QHL2?Y?^~edN6x5Dz@PWd0eg`Y+7!8nKhmJS5i-Ia<8C7|W+a-8*AE832-puu zCa#V&An^26-kZIs*9-S}p?4IEO!8@xBr5UM;2Dme(J<o(`g)aT5!+gMXmDb%d}?rL zoTda~!FZ5b4Nl25GN4}=SQEG^z<eVBNt_49Z<3cnGgHb~O(**ICCd&1$xXE4(^x^~ zB#F4?#|}tjPoA;oq+*SgVO4ez)-tU?Jz5b%WmQL^?5e%3otYnf=jKN~g7DzZ|I^Ym zHE1=2!_(GYaPc>Ht-h!|4=*|NL&(4X4m9uXj%8wwEurB2)!%&Vvl;2F6uvNNLf6r_ z#8<S<D$Ti4>8@C+mMD1CRo1m_V#tTo26xCOs|}uz52|5Z$Y)i<vK1jHx;=E)JP*y# z^04GlPlIki92QTC%p^&r>lOP&W~DeLo)DRu2ZFK}k!*KDLn}rnD_Vg3w75zcz>9uT zWwLb18M4-B)qmjG$5TxCP^vsgVm&D{qOpTjD>FGq&yP{4@)QT1DPlRsXp63!crm*u z<~Hd=P1!7()v_?=G3k|5HX4n#DBGA#>s62JruDP;(0Y}J^k9K*8+Rv2A+f%HP&1vW zW}3O&Oy|p&f|8ksqB&XHQA0Kp4~$Gu&Med5$ZDV@Pm<Ae%m<oYj6{Pj04zxh*+xj_ zMUW|tk4a;t<L9=_7<-R|*h;UFe_!xz903;k3?Q5A1d@CNze!Bi_RXN4jN0xQWEk&M z{eWiLj<0`BBYDl{=B9eW8TC@oNZz_raM~TpgN_Gd%uHscajxf9=2j#35DPQSJJb<= zMA#$l(LE|XYTe(glXyu$D;rm)kY6+(42pL*z=J_ylo7R1IJhVHau5Zrk*Eu%dL<}j z8<Vn?7X`iKgT7JtrRR4y!{$-+=>gc78ikT(N+h9du}XJaEHFZnDWC7}&(pG_qgs|q z)iTmTOU>z5^CM<RQZQGV2h3yU6K3A*X*$aAj8Ky_t4WjH<MoKXPF#FgK2?nq?(Ccx z>MTuIi{(t+TM>=how2AR7Ih~4Aa0NN;lISw(5*Yp1;vn5=WroM9DG%^g6`)+_s-)T zj+_HN;g3#RIPqRGaiQnHfn|pVFI(1;54v*mL!nq&^MArDn0R8ip(zqc%v_C@&+Xjx z{cC16PYD*(D{Quw^>1G|7o+IpKPvN?qnHoGLGhd<lblMe;;s5dQs{XCUV6990~97{ zqd;g==*D$QeLo?x6A|^z(Zop;O^##|DVa6%-uebZ2DitZj+)#0u(nA(b%1*602Bbp zjx3F5#;H?|itCxGGj41QgQs*X^MyRgBcOHi>>&bpt$Je6qsfkX{nQRTItr&Y3ZXWo z>L=vMp#G%!XuhO^dnI^t96*m555_=7Lk2*g6U$E2dj`*CKA)(iKBoxaenmf(aIc~? 
zPg)`Eva+`|!KqE9sa;Jgnl86q-gK3?)w;Fm7V&Z6WAPt4Q+DdITwi`up4FzpjL0OC zvQ0?_J-0R3NW6L&#MPd75X?k!D#5U6sST2mAc2g@<2JXpg!Cgsuo>3x!-CBp6=Wk> zkUmA}C2V+Dg@k=oSr(`Ecl@BMzJ*nFCn3T_h|7#**t^bU*fVB{q(oNTHJO575~KN; zF_w)MgjNM+O!$=3wJEU0kXHW1T|?tDFymmNImfweIw;qO*HiOz&i2o@o8hWXH9|CN zrY{7&u{pbYZog#ks{>EZZ%eegid_}O(;l}uq;Sw3g?XL%iiK;ZU3`hQESrfi#i6&i zuG)0VTjLM!a9ElvA77daMx)SaXj#jw?#sH(J1S3a4tFfOXx)*Y4_@S!Np9rtC;z}b zjd@NSzM^&AE<<@Ygh=Zt>_$)yAQ(r@wkfvjY!9IK(IgUV4Gpps^6AwEHRSWD4NS<V zQ^WR<&x=`*Y=sCl$PIQ`ZfF>V-_k7cXQ5uFgUII<WgUY?(d(K~UTC!{QdZI=Mj|dE z5=bP~o!z8D)g|ds>g1HLx|D=U5)$hLO4{t?6%JL+Ho&n4*g(au28le`Pz{5^k2ZJ` ztG-p&l*(13Y+X~r4Y>XxP136d8jp|ftl7`xA}qSe+6hu83G*JBu}n+oWRIMLB~X-m zz<hZnST1h{m&@Dahv8H3MR-X5Df}mt|BN7s3hx7hDL6>u`_SZ*&j;laBE->YD#^t9 zjxkj87fJO%t?<#}u&0PQH>tiu?D4WwE+WZ6!vSqXkTMhzNu8&~-w)Y~Nc-IQUbm|N zYHnv#Sl8cbyf*x=QTCJ;`rrU_fj`=tf$wCoP8T_QlP}hjtt2u_BhIN8Obtxs7Mv9r z^Y&0<G@4h2KHJ64Jo{4pirF`-$)>C4-7x=`y9>0T-8Cv0m_p+pB)wHnX`bFq0~_4l z+9o7_uWv$fvbuGjn}#d6V|4Rh(hyVgg?KYmSrVb3(s!uMG_s+Y&`r|XTn}Yvu0sv3 ze4<&X5J2D83>DTgU5{y~T#aJ@34Fvs9#c?Df$oS^XajL;pot}M4P@Kf)Hn@o_+nH8 zN!V$Xj-~4RWqmPqXIY-^aZ;dX{EP8CekpmSHLj<`R8LvFo|0F+n>M*=liOI`uw+U* zRB5wHn^oGZHrFE<n#tB*<OZ&srF)z-H&1E*--a^yw9FK(v7nzqA{i`EOKf<`$P{+} zl(8u%rZ6cU_D<=aGC-QNDNqsJ$)L3wzR;Wu#^*QalR;^ISWO0F)$oNvJUu-hEX;&T zybaK=#Imo|D(OA$h;Bp=_v@hrYm|HRZ|Ye+-OQ~SsF8H2xwpB$d7zmcZXRhy`<o$X zmYT<!Pc*a5{q5Dzh4TMCbiwZRb0Q!0#r&Md4+YUH1kT@0caSEQ4AR&I>;JHk?&qmQ z=k1r<{)O$nx;Ni6{h|S%&8*LAm1z#GRnLTGW?L?w?<mf$bX*X&yDcHFBV&d#ch|(~ zZL^kKqCH*t-crR)V;H21;LQ6j&E$J3e!3T~O+RG`b3rvXo#0Cn0QLn;YZb5`vsNHM z1t?H!Sc=x8Ytik>gUVCN5yc2G?Yv9Nnb)>0MwbK;mhTz0!P)M!UeKTq`6M;0gzEd5 z{HNb4A%7S#B6t>V!u0i+mT~@11|^+NPw`DpTlEwj^^dBn%AZ@5Br3(9IYUF`@gkfm z7pvJna43b<au@S2F~^P=&(~&`h+XV`>eZipSPw8p>ozS_BxCE%YaaQ^26(Md8Hr9& zu4Xn8^H3BvYFp1fyD;Rir>~)OWE!5A{Z#MyFRf!aT#l9pmIpV&jc8+FV^GYfCACL= zfV<!KB=>|*Krj$=l6h#*k$Cj5;0}Wjk}RS+ipDgX4yJ(SGM8iveqArvM_dCap?O6e zofaLP03DqcU4zRRN(HH<gggO)QgCH(Z;%ZhMG4@<G==UhUW>}YZ+~7{Tc*)AsWW8~ zLLS6=qM<-uaKK>6W4KG5k~*u?NPWUUGZgSqzx5%NaVIdeLg{r90Ro8@Y7194==v8z zP3Si-gl+6MEir>Fw0_acm_}zNULgeg+ba|K`9e(M7F2$)DALjX*_k?VW{qasrc2;7 z>VKGAQ<)8SbGv~7B;m0dg*GH?R9&)#$Zkt#4ieI;tM93*t8WvfBh1KF-A$^x6P2Dc zn?hJSq>tdrC-r>E|EOnBzJrBRk~j(YB$ZZ;2`a57l1|^xB#;?DPImK<-acO64l`BX zMk@6h78Vab;)T2irpS7f+6$)C0Zh2GRBvh|^`!a9z}^(E;EUlDBjJ`4DaI=%;>z@R zFfr3ZJ_!r2UR|TdrzDMnlbEth5&%X4|FUI|4N_6Cv$47<lQyToq*E@J*J!NmF-5lo zAzzBmkx*!an7|;(IytQ{ibAE}(aFDJ-I7Sg2O3lVu96*^Xi{A*+mWn#)s@<PCN(}J zjhE?0npIx0pFi$9NOy-CAL=7!+S;A~a;laHUfrhBA2eG6QNJbRhe5NC!~(<rx4dtU zkFq%Te`e<0dp5hdkZg80*;_UjHeoj*2nj3MT)BiwxbLDi2^YDDCCa60LE^Cp1}#>p zidH~-5ELz&jbNg-rXICkYQa;rYOUZATdLAhtDagQ`}@wjOMrgt`TTyL&+otU&OY<Z zJkK-tXXc%G=Y40-t#mNJW2y5`$2(o{7%Xq>Kb`Va76^v}qr>5zmv>(tt@79%SzR96 zn7s36J#ACjqXXe#_nE6cJ>KZ^RhxD5{qujwZ@)Ki{;Y0Pq9(4nnw;olwzLjqGZz+; zh2$)9QlX0#IeG5SQwtW-0gUosF+Fz{35(vDS4ggI*26esdJC7UYPqTO`jOXzLAhwb zUH8wQv0&=5ZrJ?73s!(!Iym&17K_<xEHb#-J=#&09IHLc?r=D5g<ADn5x@GaNPj3= zqvF6A6(`D5WOqr5Y%fZY^Hp+URO$nhd#yP&W{U}t*IK4oTCG!yW_V(j`PR9)(V`WW zrPgId>#RM>=FDEpX6xqc+lsch9<V%M-I4V`(V%6}`ZN1r(QB4htuGh8TJ$^1JJwGv zC#)xmzP9|``gP&gMHL21n}fTGn5-l!EGlvuGK~&Ho-^N(r{_9{9)qT1)b&Lct9+Tn z>CLj{W_4#V`9VXboZV*eqFm04T!njxQWZDA4n{KcR*M*wm#5bo^v(hHkBGs7FnmvD zBx`_&`ezieq5=L{B-1U=KsuEvGWWW#?p1HE1@^O7p0wHJCi2Pi-?@BorCjyM*J9+{ zck`9<A?;?(h8F_=@)7N3>xLJ?x-fp~X7c$rJ+6aZdelc?^7sPv{NAxFb(Vp;iErZK z(X)TM!aH%{@Z7ltV<)lS``A0RSI$2Br<t{7-}%F5?4`G7l)8dCzu#gD-K|`4`o1kQ zHGaP`##7P3%)I#Q`|=*)CDMM5dqjx65;<Y9u}sqEXBL>tEajHbN=TP|DZ4Znv#nyw zY}W*Du-(h<558u5&vt@+ij}U78S}NF){wZwc1dWmP2`12Z6zV0wP`~6`658&@J*z; z{94<%f^nhx>KWC`=tf#+yP;rBsF!ZDZ3^w6dqa=XBca{Z@#<gYzhWDzen0<R+mY&% 
z`JdQ6DL7XB75yXszeD~hY-)aMZ~=?u&kL@|zrNrl+l!&MZEuG@wtXDRw8*uR*Dcpd z-Z3smna9iJr@=m~r}8{LtfxvnzI>a_D-S=k6=1P#vq^oJ6bj~sZ26&}Ey#k%d%nG( zAfFrbdZJLMv`inm9Nn}aIL7OCU+_d$>)9EpX4OnxbD6Ccw*|{A%g!qG@g45?@?{S! zisZp1XTz9j1&8I|8dxv*08>83A9=YzJ>F0{Y^g?<_pU3i#YO4|O%X@XnwwG2QkJ#W zX3MIzS+i>i>zaHlUJmBh=7(}?Q=iSHRx9$}=lO<jfaPO;#x8hPcmbP<*0WzY{Owq( z$lG9UX4`CZkk8n0)+3iLaz%4?C^X-fclJwV?U{8WE0%Q&$_h*Ur+=rY-ZQ-;jNa2* zrEWQrAL=$hw`xUEYJ@vvU=zwqa^?1uT$1rp?lgoPiYc3$^m3vtUq!69o;`wVNZoac z6d1H-U54Ih&>4*(ZLKal)0R`4k$&5%UvH=Z?vdCj-z+)-$0dew!D)t=5;g2KXiKyK zeTAtkqb#S)UhWuOR$6t5w$@$~nyhWswV9?k=4hk3s6J+lW<>4L(44B}+Qqt+re*eJ zj;qJ6Q`TwM>DC#qH+?VTd-m%c8=Tj>*9JE!x9fYITY_6cw^iMx+hMvp=Wg2$`+bh> zWj_jT5AD-GW_Zl>n0=q)QRicYdxQPDe*II%0sE7oUxvQce{DKb__cf5vfz@?vZ~t* z$^^&CqE)W%R4Pk!OZ3YOqRlYfHKnXAsKgxegEK>-UDvK(U=mnN8HLH@%nOcomb<ES zwI;*4gZL<WVx1#oa4M#(RLwfF^*R$X>1#`~xzxORIUc<zuaWl*9FYox)2YYQ)#-F# z9MzMSIVi`T>&Ph!mOIL_GZ0H@QHi6pwrYZ-b|Bf^?=Ts0pI#No4e53643o+0fV9Kz zbQT$mM)lEw!wG^jSg6-~WxomqtF$_u<TyiB@K)tymzI^))@Bno8I5|q&M<MOc25=B zED;%3m3rMuqDV<)s3ug^Q*~#Rm{HYH)mhc8dX80{s?t}T(Erge%j7s{H$BbWWXEW2 zl7GQ)BtvBEt(!Q&SM+=2;2e;zp%hq;+pK55P(Moyoc+Vd2z|j9&ho7kVA}-_ecB;6 z*aH80%)WSwEi)`P*>u*hdPZsbqAS&hR!rRqiR6}*<(cb?q~#95a@n#?^{J;?G5Kc( zqrB<{oR8sGlHQX`?b5NF($dr}+=WDzdWXn2PPbuPV^MBk_|~%F*M?s&9=@hBBe!KD z`^q+MLIpE@SmqvOH|G=-<dk!3@r0U6rZ8Snm{)SCX1c$m#`pcx&xl259#pQnDZj+; z_lLZ`o6hR^=Ibu6F3B-xV@c=_m5<$c*2O<vAIdM&XZlln1zE}C;-uJ4ZvKmN{n|kG zY^^$5$pW|m{M=?TmjCs^?2C4ht87Mt0~+Y{Ae<>z{b(UKN1HWTJ(wd~%T)?zTDR(O z%XnT7od2ahD+~Vqb*p-2T*Qz)+ZeT7ULXq82Rq}uQkA+!jms^_wfhWSqbJLqU2Jm~ zxb1a@T4P<dydl-ur|G8}nvE^CmV&AF<@z7%cNiYD-|yJveU$d;_ZS{7co^OI=lX+& zL&ih4rwX37KjRqkzGeH$_?7Lny>gd<dDU}GJ8M)Ns7m3Y@)VvtIfYA0Q`qNA;jAnb zM<NBz8jE*5$v4WnHS66sYc^$V^BU^(HO3lSt>Y!_kmnt{Zj13Y+vWl>K6{Fd=h)=d zwZom>x*ka0ijHhAaN7zBLIz_l`m@7sS2wT?dYx9IDC&N9YTMdgV6qLc!bo<9ky(w! z#$Cok#y5=`;|7C6PT;K(ZE&}KQ2%Sa&~GplthLL(J#mu(`L$%%7}6gcN}Q-3C(%<G z<H#^%;D!PA+##!%^>|aY0V#<OS#oMT@~;dEtbxF_*L|VRYV2oie<;8N$@Yc)q{P?R zPE!53QwdJW|1;Q}-q{}Mf#1*pQ(f%ZbF(D%$P3Js(8&%O-RN2Rf#iv&fDOeac@*ri zL{6<yJ{B|P)au>1Z2;2W|FYC+kKec*p6ZP9p<ZbgJK!uWANAJT`FfMLh6QSJea_)$ z%7+K@%3N91V!OY@?F$WSxp_iirom#u1S_kk^~|4z=8~Y*pr=%i5hVYOErS$(8E_G1 z|AI1UBPH2v*2S)8H?h0;OZ<Izi96)3b3g9!CX@1i(r7oE#X7;?kml!rUpvD8*B3(? z@3Z^akJ(O`-D&d@%**WM5vfrlu3XAfu{-UzAcy&+LmoY3#vHL&zSo0G3rLkoe#YP= z|40mFs$EG{6abcIg4igF#;DZlv`i7erPhO70TGpA1#P5qDxeB7kUVrqBWDd{r_pd{ zQz2?ek-N9JO8&?4{x7{WKLAW87r=?}*g=&15O5rDVt88e4Az&v;T6eaLbk^fX&|)P zJ+upLdWu!i^E5=$=n-n5cG^yp=`bBYtKYz0L(TiBnf8&NxzJjzl+QG@gWjbp=sNnC zj-kcd=mVCGcv`3%`cs?y6YMtHk{pDTk(%hI^fX(^W}{`BXbP`j0i?VwIRworOTM0b z2fhdCV^*B(qbYFx0s2ykKJi1e{0e$Cc?LNxriHYZt!ICN9(K}JrAFyZUQH9}Aic%f z;QAKbpn1n|kX9npPgp)1N*+!Ak$$c)T7sDVjkX}i2^!*KM3ZJWw7Y~ZrOT)b-0#u5 zEQeKzNU}89nA`#CUizE|_)9{Ed<1AJb<iF35NhLXI!<3Q6C1}KWcy*>V1L%UgS@rT zTC8Szkn=|n_Hi0yRji8V^L*4+KD2r+c-v?X((k7utc}Ik5c`GLqX`YyCv%gdl7B=W z9!*i?a2NdoX?(##kU<K<E7mAQ${J1e*_%<q#q?u3LT?}+AE4g9q`$M#Fdy=p_{QY? 
z<i6y`B>SX`CeTb;K&xmSeU~0at3FRJ&|leUZh+)r<webnnp4TUP`4%2h`i5$^lZev z6|Iz@0hqT@$}HwaDJHPXFzzg6+t@u=B)`kv<y!9H*YZz9T)ZaUS1!?Ll68nF4?Wri zEtyZt&;mE1w(mlD_tA^=3LC{rSS3pIHo||!C-P>PpYX%{1F=bLQ_g5^9X>Yv>G0`f zFGiVW=v@@G`50>9FDwsvC}%6!YW5LwekXrYWD2YBiE*Mq%n>oMMQj%@i(f0(Df^Z8 zG*dNQn*F-2;qMH;k!(wTpXAu5MZQWgrqs}Q=+9NqrmK;^ZkX$6J>5*b^aJS2U9_9_ zqukHYEA$rqo<2hh5c43<%aQsu(7H|R2QWL><LnpgMfM8&kbNZ$_rjF%OL#qR;;nos z-vqOrAK`EF6T&GLiH)KMW~VqL-c?9Zl%%E_W{PI3X0P@&U72o*ZlV5HXHK3SeKvOX zgJBxB4_`if&+spX|CpSYyg@zlzY-(odfJR!?tl*Nfq4v?c!*x2U(s*XynfC&(?H{G zxKE8{6==13HknO@`4*cAGZ$t)TL9C=7P4h9@+|*<W4EyHvpd+2RC6E7wue2+4#7Od zo`!jg9c6!DpRmt4l$Z->vmcAJAg@J9oA_itgU^Cl%2&a3^XvFJw9sDO&j<P2B1iZ| zrRWmZiXGyo;(76=_*&sgg%VW4$~<MMa*J|Uc|&<eIjwPNS~SZvJ2lU19oic0T<r?& zecA)s6WTMl?Y8R{>elPt)Ft(Pj8?xyxes16<OQ{d*=kL$ay>taeqj^cn$2u3YD~-L zh?U|8;y*Q4u~Wj$-ebLDxwty{6Vb~5E>^L5{5j?oE=`@dif+e!cR&A-f5HE#jAC>6 zpIDjlL-rJ3C7QTa9an#&j8bmVoWO1Bw^YYBup$1UxJBHO{2A40cCw?Io%{`QE5~>a z9YwF+%<n~@U-RXBD@BzW&1qVWdVf@NJ?eWB-@-<VH<g|AvGDP~u~Y0Gj2f@AX-YBw zHm_y-G2)(OMRbx~OWo{86k*S>-?IT?Y@gW6rt=K6R-BvJcuXK(7asPeFj7qBk(KaK ztev0YbH%gTBVrt;c}M6!=|(142)c9b8m8}{2W;o17}HuXn*4@U<Ia39#``aZpOquF z<{iye=-ES}f@a~aekFg6>d-enhKbUxxOYDd9o#}8ejlw*_OQhm_rHbllhXiOK|yB1 zXqk_^ZN&W|k9#o|$^%vZj`9ChjQVZt&-7j9M$a0eGDUK3S6VPKbz=P83bPo?r03C& zcWDo5enT@@K9S-c-U%&#pT3QI&PPb!PGRJ40X?KtAlGh;%-15skB6sF1m;$Hjd9w5 zJWoPDZ&#*Z<h>`k0wr9Id(?E?6JMd_$$O~@Z8s}<OL8l9Bp*s%K}%_Nav#Rub;$%> zLYp-)K2H-+YB0vW!d}4r=yz-@#_cKe9>ya-v(YCoKSlm0X`Z28<+m6+>yx)9-@+WK z4D+KM7-yzo^ty)rjCz|QhG^{YWxOxhD!OrpI!ZH>dy_6^q-DvK7<r$iJvt4>o*pXF z?1AoXRj%S8<hPviSP;}JG`ol*jdSNj>L*<qo>*5qLB6^$wz?`598*~l7+qdgTH-JE zc|C4dQK8deFR<n3<>s6}yG5P;3(lyb#n;;Dj+b=Cl@i~SDV5Ui>jI(c0zzlp4N~jH zGI4jOD(1deG6Kn~{z)>Dl8lT<GOIgG;mQhki_aZ@z1im;U<+nO;l88U7jwr?s_t*8 z?mJbt8E%gU0o*ONWzFt5>vXroTh}e?ZRu=AbbTgcldoxsv9f~tj3ziuaK-a|-F+;7 z5>p*Kzoo8^lirL>#qGZ4mUw}$S>_-v{4HIJ<Lxt}EzJ&(CstVzXHAQI3*+Q#j9UV# z2sNq6#I;Rvotlh$xhxHBb@x>a_1->UrG=e=jK#jiT~|coqAMoT$O<6c=6L>%$8G1m zh&;O~y7@w`L-e-Tmb<05w|BEUzI$f$Las;Ju^8frbAM}RZz~eM9aY#i+l{pNrdTx2 zHX%W`EQc&ps&q?yEt1l?!W}pG8hy)pS9GFb?7eZCb%Q5iw?_t($H?B|?wu3$dE)gB zU#zRy*_TVbvu@}wh`0+b=2TYnS+i2r+LvibJ2K1{I+l#^RF^6)U2U^Q>XpeH`KCf0 z;_gLm<Rt1tnI=eU$pq?MGy!6`Vho|;i_sp-<A$crUTYobRvA97@mqcF-Y*I5?>qU~ z#gwizMeDbINzy5`WJJ-xKj)4I0`bwKrGn_1(4xrWB-KBzvSQr;kNdi<ZXi@2wWDsk zVs$}OzQ-e*ck4ie7Qz$nnHf#_-L%k=phz$fi}Oy&89K)qHCJ+a&T&S9clw~mPpYfE zQE`3A$j@TU%V}9w7iW3@1A9q|-!|LVHgiGL-O}5cuG_Xb7x_~B2_w9;i=`N-@VMef zO-=Pd^=2)IN&<cwf2*%$dFK>#HRLg#(-ajB9!t5nL#R<ff3FycS$d)wGA_lhRrP=I zfKCquRw>Ng8n<>%N!c-@$Mb&~Xdrn?hEmb_VCgc&>jD@1Ctl>gD0>;b0vT3Hc-x!> zy}ibZc&!+$dV5=a?$+MU-mZaU&qANu>gyd8Q4#I!Zs|PN`~%6Sw>sjjx5rS3WvmXW z&8e}^$F|Jui?A)T7eoiGSQKxW6HRc&n>rg~eZ}BK2i+K-RVtTMNt7P9^x&R{UXkE> zRo*ceA?i`Nib_!Ziv}2}H2pamqeTNeMYEoxanO_$EuzxoB}a;;InfJsK<!wuN_A2o zCog>4<;q<x9hUHy`hwJA{o#)apO^UXlY@_*UUT+#E9o=gH9($GI-Jfk+(PrMbb8G{ zj#?>AKY!I|YnfA9OIhL$n#yY_SHa{6_)_H-^20q_V)(=0Plvx*q2ynHqX0Kx8sL<& zntJ$SWCr#E&jHO3JO;QzGmm`0mjYS<8SqaNg|ry_eSkJj^c~O_0%ZI%0qZrtRKsK- z%~r^a1<XLYUu%Au{1|kz@)4B-Ch^+jBFI{Gh2+)UK@RY;(&f4y71oTZK(m<cBgzaD zS#4lXAj$#GokCd55!sIs6+Q_#MdVtEBJ~h?;P)boe>+j>3q<8ZM57T`MJ@pJG0TXm z&;a9h6HUk<nz)82jPMNz*LVd{6L?MgP}~}#)(WD@pig}n3rpnTTUh<JLuMw@k<~Qs z2vH1iTmjydbBMnEI8i6!T7>*Bd6Q^4{43vATdjsZ$BgMgtgGZMbCBktg#V>{N*dyW zG)H{aFGjoS8%Bxa)G7W%yTr$I6rhk5jh%Ts)ZN?14Oz2PN?DuRlAT%1m`TQ(P<CU< zQiGwf%*aUD31yASl7zW$`@SzFr4U&nOGM~agh&#}GJZ4TR?~C$JkRfC`<`>IbA8Xb zuIqfh-#^UjeNfhRB?Mo8o58?KH(+?8&s3P7fjidZ;$OuN{fr?8d|Xy~TCQ(iEUvW$ z3K_gdn!jL;diJdP-7R+MQ`<8gb6H$EOMorrrdHNTDQ831d~zjtFi)Th2)f8?4Z3;h zC4R|2nh9~T#eb_+Vz42D_c7x(dWqv$9AsFQooE3Q-ORbdaoXfCpABpOy|ArR>%oC? 
zfwNC8u&wTI0yjyAXco2DkFX81AQ+6>8AqAf7@c*ahZR@FGX3|BGpevzF%PpC8%M;4 z2GvxzqAD1x)Kcp0+}qCdO4LR-pf?wreBNHyXcMqLvFCA1Lm!znL}Ht)6v(>i+w?7G zsga}-vS^YySKkufN514xB7ip#SmZ0>J*nWPwSNt-&IWut!ad|;KIZfN!<lk^3q{$% z#PyJ(&@w%Y`OvlSw!GSx*T|b!=f?8;xboMz^6yi>zj(@2kneS^C!Wc%;z?aeu%`UV zPfD<g`)bko!LPi8L0O$NZkaVMp~~&g=Bb?%d~`KEWUNo}QH-D5sZu<z>;bHT)ayR- zi0-8=s8umo+aDdQfmx|tMunNkh<Z8;*ksQiNNUKQNcf}R^!h@CG-pAPAf#1vT_nUz zVctQVi^NZgJQ+1f`iQ&8Z<m#)o9iYvAqAHagz4v}V$~!I3+xm#ZJcBDIkZS`2KPI^ zQ{fuBCK6UI@a9^=9}X9kFIZs49viTwutocUwY=F@jsWb&tiIO=SJvX^({smi>v8Sg zE!k0S*Q-xew^ip=-yKj!EQ}a>)YYEy&-1B!1yn=5P0z)f#p_ua;LX|(f@{*06of~| z%Jt+D8-;2Q^>Wqol1MjH)Gx;nn2z@f2<Jx=I@QL*3_e=}PQ6936o>XOBKN+RBb`;R z3DGZ?$P?~mpD`rXh}|P%vNxrl0}`*MvAZ6dTRk#w*2jE@C6gZo4RF>K{@W!%OV(YP zBn&hk0W(yytRn5Dx)=9cK$E4>(?W`(&0nB5OZ)!x;!9BkSE+Pbi`z~Z3okwGc=6)X z=SyoU9miLtiUNu@i#jY8Oq|!8*IW|2hqDGY=C)R_EB=Wj#TlMt=5JC8nVpL=#08UD z12xdr{n_hzvcmnA1``Oo*c5+(DE0iat@e*wSb3RO=a!PMMQt)f_XGS7zm-x?*z2>; zn8%9QcnvVDK;D!8WnfzJ0t<<Wq!B6k{xjof(zM>XLXxE2tsce6y$3g!w8w_|x%xrx zeHeTIdnSW~Hrvp%i62As<C{^reJ<Hkmy5g)&l53wSn8SQKy?gb1OUr6^8jFm<-$z) zo%poo0Dh?ZVUnOH11XXjwA>m%-qZDN^P<Uquw|T=NyZ!9_7{CNg2gA8j-PklFyBB8 zJ~F8Jh;BgKLUwNsDlPZqr4rJe6(Yh?FrftJQS>M;7<MPq9n^CHl%jlRzs;jJF>kZX zZDc)G3tZ3TjvTsa8~@!c{yRZr`JC!BtLn6y>hx(<$UWY`p)MkNRNTWS-qpx#;a22x zY|wLT#v&yCa;jjLyQJ3KK>o<d!i5C*#-`rsstC@W@}2^IaD{gLn+WN&Xe-EmJcEE? zxFl1$LHK3HEwTNNMpxuE?iSQ3t)l;0t1rSF+_-9h&fX^AHZ9>vtCWA&wCC7li%31X zHo^OMTF$1?K6237(PL%y9ctPOt`uD;)DH#;e`<@0j?i5R)o$g|T?r~QHfvNixhOQy zz<a0;FqV{W#a=D{cM8AlO}2O+m{UcLc~{Q7XxB3*!`wWxdspvRG8!2&<Mqtjjk_VC zfLDp1(SW25TU2MjK<Tj$`fCH4e7B6^p6a1s{oG%Q5Vx%dy3(E(IRrkv40U*knpOL> zzW5nh`Sz<smAKCXlpo(z8tY<kMM2b7sWJ=pl7#JbS(ZE72_J_Zx!5Z5@s#_r4;i<H zh{S%7VL_<O%xOuq{n7dQkD<`$oKndEnalA!wn{6do;e=@6O2RLYMbms-G_QQ+<ReC zODD=558bxyZ{DygAp=-^#A91Tbu7~tk6X&O^Rf&Nkhe+;CjqHR`zD8j+hGFI$!i0} zUBcT+M^7EEg`56qlVy{imX}tLmM>UTG;se?#PK`@%gmCxIp_+%UzUB@(A*)_q1z}+ z)k-bH`^1{LE}J4<8<R1MQu1DZ*BdWVGqM(2h}wdLw6XTd*B%4#JTMB?3n!g^5pBd) z-MX@na%$tqh6d*&#KhI4XZNHIn%{yW-!DgLILOt{a$j%xs64hmrlNMOSJc&J=0-3~ za0=v*p0flSks{bNmZ1DiEj-+om95fo8&{j7+z$(*on&Jt`r#@9!NnlHvg8|1m|}U9 zHR|<|_d{RM^4(z%ZT_<IP8e%1^GKDLoqskx9C^foeQaM=xwdI>=0nrihsEa(rIki& z7&OXv2ig<#XMEvk4voMJ%ZTLMNg163kwtc*kxQa77Vl)HPnvSBCq#F7VVw9|<XoMS zIZefg`hTJx9~C_mB;BVx&J&8-zLM0qePZ;35mr(s2qvr+Q!oqivCoy0V3ODE^i*|X z+*T`-h}-6QTKxFqR%^`r<s8Fvod?ui0zN*#)?Vq+tZN`FZOpP7Nu=*REPuP+O>Z-B z;@0$<!?~)4GPjq?<%jirQp)bf^VM`kt;$Es;S9Y`J~$g-*IhwA1&7zzCBE_y!@)c@ zueHNF+V47@jjs-5YA~ogdJ|Xkyn2Mv))q18cTF2?9~sFC^m9#=3@xb7OpBf2N#`~y z2KS7p>MaQIs@A3n8Oer^CQI*&aaj|1=N#gMhMU86`TX*nupN?-xa5W0D3P{)_j7l# z%*f6mT|i}x96{uJiG8DATzHKeQ~6MFO+htmraI%b$CAbOhhjoJbuD7)-QX38h??0V zX@LsVVU*Bx(Nn?Z(WebhS)bmmadQN^sC`hKpAsHNkH?LBjN6Sf*_Sxk?pHm?zW9aQ zD*H+^ZCJoozT5S#Tj<j5ahRKBXG}tqtGdknrV(|s>L;gTK%>|@sg|bPO{_=u-r?%) z4Q?WHy%lSkVBi>I+`u;(F>Kuul{txx@$juW1J2FZOW+;c`t#x-FH1;6ss7)u_nOvE zRUYJ4HXjY)U2%-#;yd$j>L}W>zbNDzLnZ?m&>QJ$Qlj$+o>Nmnp$Io61b#p8UI16Z zU2L*FeEa#Febw=Oq`?A9asIRFTW;$yD;dLcHQ|NoVGJ6&Ii<-@Ug-yXy2Z)ay3HzC zCS3g+O3f~*heDZwz`-!&4^WNLZ$LFv+|U0Gs-dUj*xl~NkAn7vFZywa+|~l72D>se ztKYN_nBkdQ1&F&&aYRalCi4Vsc=^O0oOv=W>FRU2{_HD4n_`>q$eArM#aZz*{yg7y zFFQOkYIf>G_uz}W82!<AdfWXuHboEiWPTblP-yeDE!-L(Q0wgcn7lYWyIlVDi!Hz7 z+@f#7iEfwWuNw<3CH)?D6H}tg&UfcLL1D2HV?+3@S9UD}0-Ih!9UHiY$BE^^(dWLQ zV+TJ7%jaMdK8{E{9}7r!ols(}I@+csvHW?W;WOK8+t<9#K@|7N5hbkjL$O7UI{W)_ z5ll$++xdDC!mp)ODw+0LlY(=|hq_LfEQ5lz$sn1OoEDvDIv^Ogz)*r9DDe0iXR)Ah zUCmcqg;8M0c#x^WY0RAb*W_`zhsbBg<0M!;-Dj(q`(46&+Mzo4UPALMdEWC0`AyG0 znJ21e%5ON=5ay$_jD*~f172UJ2L!iN*1pY04J~RK6g6VUA|GUI*nfN76B$%HuNRYX zW&PWF+#NO7Aiw3$d{(j~i~0On5@uU<ebHATp2)n;^Le5YZw(5M$2B7+#{;^2A57Wm 
z*ea&Hfam(%oq9(u>5P`z+F~|*GXwZd!O$U}lwis~$^}kWE<3~}D#1w`OLX+Yd3qDP zc9=?(E<=n51;=E3`mC9*g{(Qw14}f-UN$9oVDQFf2IfD^bTA$`cVD0s%>qa<0n%)w zcMRMyE<_*{^21tFlW+xS1C)os5kPq;2o8jRKp+a?iE?E}Gt%?MxZ@l(@GkCHAc)qZ zL3G4Y5K{=`4*H3@QfgZsLV-0=kxN<_PhBj|<$^cO4Xw+}8|z^KMC{C`HgtCz)V)&J zO;`w!26!^1Ls3zgRa6WV6?XMAFgyM(Ie63HROVinu+%YHuO^0wrO~hcpCD0LRazG? zUb}HPgZZZohWtab`XvECFf~Oxr}reDg0sTmU4V*~IJ^d)h}-R;Ex;1z<b8oi2`3eh z1p>k-k=*@)LZJ#EkTQa@OS((yC<L}^L$&{JeyH{^I8*@%S3)RJ)#jH51@;AjD1hP0 zNH`Vd1&6~GkVps^f%vt5=cfN-EpRYM0Rcr)y&@1uiW3kR2LE*)3<g&K!yr)Fjp>}z z_0xHw-UX=yRX`{y!zrGAbJJf|uoE#Q`U!Lm>L^{4Zsu;(Aj-;=^+P~VFp%z&662r! z%0Dl6XYJgT&K>mU-sxuUn$yBhSqklZP)d`4#^b1vGa#6n3;GHM)0glrv+M_V?YGP_ z`bPn@7YadxLYfb1DKH3!F&+Aircj;jqS6pQB<0@_w_nOs8|Und^}^yEv6NZ`0Tn@Z zbf*qN-QLcz<Dd)yX?vjG6a@UkU>C0jgizsZ|K;;7q2fj}kn`H*jHiw&<_TRU*@bmZ z^=^Z2R2cxbwOl#9+7s&3G_x`W0OA|lYRitkv#8@=*x$#6&4NRho%lbLmunpH{3>23 zP^^qA>+m=x9MR?Z_yv334cmPNOkXlukxLxctGJj%m>Jq-ZJdn6gn|^dss<IoC{Im& zlJ@A;lPPV1b89)0d#vA@xm11X_v;vb{LVj)?NULJi#V^`3ll*n1F>kG0zQEcXG5JP zs?(3$`%d{Jeo`TN>C;%;$fIx3j(pSivgb+K!y@0p%V4ehlnd{vwS2=5{5w6gu>6xA z2n4j7pWUSWpTb5FH7HF~sEW3u(mSVzk}ayrQo9W>-d;GWd{a{b0?P0F*%{x>I1o&& zhMn9{MbFra;7FvX{Eto=x{oG@JM#QP4R@M9HBA8m+i`7*B@!-qQL2X`&D6@FEQkIW z&<cdUV!-qjv!moj|AGoae!e0Z{s0M5p@NWq!tfiwAN^rL-+bcQ4BDr?NRGarc<lPf zE%zZq5K=tUSpSK5W0FX?sE}4gta!@wVngM-L{Y2Zwj0jLQ|E8$-ZW$G@fC+;w-U|K zy5nVyI;NdE4Q%RPMlys^z$VX@50O63GbgT8zmAO_xw}lNZq<DLO5pRIl!~Uw$-u=| zdXT<n9ppZl!-Au7DgDY8rzh^>6&s&9Jmx#&WSf1j?2G9^qv8N3cVyaT>r*Cc)<F3Z zZlq@A{)e+f*}CcLlMp{^lhTCJ{H_WWBgX6Vr3fqN*ZjiX@E|Y#*;qsEl6l*sZw4+s zQ~d1HORUQ(Xe8s4P$IPe)48Vjk<=$L3kl<@M&jl{9E&sD+k9{I6d#`<j>h9XZQ`#5 z0S-(NoPSf4bnhV$1bDY7A^(04u`<Ov(`s#3x<JUC()&#+s3l-V`NShdYH2o>KpP5l z5eWIGVh99EB6sAGYN>~JCQuSft3moCI-Wm&er-E5Dl{}L6mSLthMZ_<X=-Y3YT^;i zTwshLGQN%ty{zFh3{O~F27-JKi63~x*%id)uz?PHE)9g7=el!(Q&ej@tlgLeB+YUx z?1>ZmaMyR^QT`|5P%+1BcHI+w6vqr)D||2i9T=iF#>@MP7iG@~$}<qgAt7<nNQdJ; Dlcg*@ -- GitLab From cf02d08bc93f6ee4c47c19d5af255592175b6400 Mon Sep 17 00:00:00 2001 From: prathi3 <prathi3@illinois.edu> Date: Sun, 19 Jan 2025 14:27:27 -0600 Subject: [PATCH 084/109] Delete test2.jn --- juno_samples/test2.jn | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 juno_samples/test2.jn diff --git a/juno_samples/test2.jn b/juno_samples/test2.jn deleted file mode 100644 index a1fc6e65..00000000 --- a/juno_samples/test2.jn +++ /dev/null @@ -1,25 +0,0 @@ -#[entry] -fn main<m, n : usize>() -> i32[m, n, 64, 64] { - let res : i32[m, n, 64, 64]; - for bi = 0 to m { - for bj = 0 to n { - let tile : i32[64, 64]; - for ti = 0 to 64 { - for tj = 0 to 64 { - tile[ti, tj] = (ti as i32) + (tj as i32) + (bi as i32) + (bj as i32); - } - } - for si = 1 to 63 { - for sj = 1 to 63 { - tile[si, sj] = tile[si-1, sj-1] + tile[si+1, sj+1]; - } - } - for ri = 0 to 64 { - for rj = 0 to 64 { - res[bi, bj, ri, rj] = tile[ri, rj]; - } - } - } - } - return res; -} -- GitLab From 5a4973affa99c31f55797554e3ad599c5f2a14b2 Mon Sep 17 00:00:00 2001 From: prathi3 <prathi3@illinois.edu> Date: Sun, 19 Jan 2025 21:51:01 -0600 Subject: [PATCH 085/109] Delete antideps.mod --- juno_samples/antideps/antideps.mod | Bin 1583 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 juno_samples/antideps/antideps.mod diff --git a/juno_samples/antideps/antideps.mod b/juno_samples/antideps/antideps.mod deleted file mode 100644 index b4abaef48222ace6f66264d6746dd2d7924b9039..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1583 zcmb7E{c;;M5Z_(h>2#-)e3s;|IEihN*s;?z{z#gH4$wF8K$|~0Q%Hs~3_KnT!vN)f 
zUV*oO)nP*8!c4gt?d`7iOFyl)XuN*;`Q59V^`~Ea_438d-D?6szzhl~Fi#{RXsD7D z1Yw%N<}<HuZ}0A1kd08XM=sK6kCl)>n8=NF^Z9K|@y)%%qV|Y6a#3f{x~xP70+WAo zyyLap&eN`hdOJ_+>zgmXU2i8tt^!q^D~r{2bx-#zkwe1Sr26|v(A)xIJr@8ed)pcd zXOIvrVeXA$mRwpZ_^6E}FF*^OaQIwj1=;(ce=&^*aIF`FlH=}jT;FeUlwi?k1R^9D z60nwqUKSJv1+G=^YX`nn)@<9O>(pGc0b1!o=bB`X$S^@GyUZcU0x8Y&ZdKs!Ry7Yr zzm3;oy%cCH7kJGDx+_k#Z}$lNrC@eaD<&3)T(Aroh74?<g^MC+3>y6I2W&R7vxAJz z8==Bbfjfzwb~XhQ2%Kf9WqljGJ!1CIpN!+anGvuJ`uT#zrA{<ys~ft~O$OI#uY)$a zr3-mZ*k_WQ$_uhxCMP|)qV$Zp&X_l@9x)jlGnwol<qU^n?3ylLJua}I>xT&ZAL$*m zmg^N^gTb1{+dSrUIPlcx-=2$j4Q$c}i@R^3h5H$)4O+It!jTOP3=MF@=w{sr{Lt6F zp4kvPV9$>B(M4njHtoi1?`X+r#H&9d(Pn}jIi>?44_QPq+*f6VbeQMkUV+p1KV;`$ zU(xzJfvV{_%ricy#p9HKg1{ia8WyfLz2ML0T;$ww`yu*;j)I-fz%8S~+M}GVH(lXB znX&HF#iO2jKjmMj+kNI%%%9kH2khWcMhCsHgJB2!`Goade^LMJlPA3WIHRY-Fv2hb zJ7Hjk+$bY%nFGD6kGX-2p2{ows9=JI$;YL<jN)bFp3-WMNo$|Ucn}{9Qo@h>(jmn* zZuavJ4`as^;@SHH_#c+tJSm~%!BH}DmPZ9IB$3KYF&8z0S7HHEUXoYFOCgkqCBmsm z4Nogk2&;Ki@Vp_JibMoK#u%$m{D`7dh4=xL7T=?&#ExkQX)4n+&Ulq5qe>1a)N}Qj tnyELa{`l^<GxaN~-!qk|7djzIbsMGT@se{2gak?;c!Tf@!p{gl;h$>7iFE(~ -- GitLab From 6af2e9ece087d83d3f262a8506f87426ad3588cc Mon Sep 17 00:00:00 2001 From: prathi3 <prathi3@illinois.edu> Date: Sun, 19 Jan 2025 21:57:39 -0600 Subject: [PATCH 086/109] Delete gpu_matmul.jn --- juno_samples/matmul/src/gpu_matmul.jn | 45 --------------------------- 1 file changed, 45 deletions(-) delete mode 100644 juno_samples/matmul/src/gpu_matmul.jn diff --git a/juno_samples/matmul/src/gpu_matmul.jn b/juno_samples/matmul/src/gpu_matmul.jn deleted file mode 100644 index e719ba9b..00000000 --- a/juno_samples/matmul/src/gpu_matmul.jn +++ /dev/null @@ -1,45 +0,0 @@ -#[entry] -fn tiled_64_matmul_with_n_1024<m : usize, l : usize>(a : i32[1024, m], b : i32[m, l]) -> i32 { - let res = 0; - - for bi = 0 to 16 { - for bk = 0 to l / 64 { - // TODO: make these all the same size, clone analysis should undo GVN's - // combining of these three arrays. - let atile : i32[66, 64]; - let btile : i32[65, 64]; - let ctile : i32[64, 64]; - - for tile_idx = 0 to m / 64 { - for ti = 0 to 64 { - for tk = 0 to 64 { - atile[ti, tk] = a[bi * 64 + ti, tile_idx * 64 + tk]; - btile[ti, tk] = b[tile_idx * 64 + ti, bk * 64 + tk]; - // TODO: remove setting ctile to zero explicitly, clone analysis - // should see a lack of a phi for ctile in the block loops and - // induce a copy of an initial value of ctile (all zeros) on each - // iteration of the block loops. 
- ctile[ti, tk] = 0; - } - } - for ti = 0 to 64 { - for tk = 0 to 64 { - let c_acc = ctile[ti, tk]; - for inner_idx = 0 to 64 { - c_acc += atile[ti, inner_idx] * btile[inner_idx, tk]; - } - ctile[ti, tk] = c_acc; - } - } - } - - for ti = 0 to 64 { - for tk = 0 to 64 { - res += ctile[ti, tk]; - } - } - } - } - - return res; -} -- GitLab From 32ad1e8299a1b903b2855c72aa9809e520280f74 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 20 Jan 2025 12:16:55 -0600 Subject: [PATCH 087/109] untested --- hercules_cg/src/gpu.rs | 149 ++++++++++++++++------------------------- 1 file changed, 57 insertions(+), 92 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index bb28db8a..be797b2a 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -80,17 +80,17 @@ pub fn gpu_codegen<W: Write>( .collect(); - let fork_join_map = &fork_join_map(function, control_subgraph); - let join_fork_map: &HashMap<NodeID, NodeID> = &fork_join_map - .into_iter() + let fork_join_map = fork_join_map(function, control_subgraph); + let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map + .iter() .map(|(fork, join)| (*join, *fork)) .collect(); // Fork Reduce map should have all reduces contained in some key - let fork_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); + let mut fork_reduce_map: HashMap<NodeID, Vec<NodeID>> = HashMap::new(); // Reduct Reduce map should have all non-parallel and non-associative reduces // contained in some key. Unlike Fork, Reduct is not involved in any assertions. // It's placed here for convenience but can be moved. - let reduct_reduce_map: &mut HashMap<NodeID, Vec<NodeID>> = &mut HashMap::new(); + let mut reduct_reduce_map: HashMap<NodeID, Vec<NodeID>> = HashMap::new(); for reduce_node in &reduce_nodes { if let Node::Reduce { control, @@ -124,11 +124,13 @@ pub fn gpu_codegen<W: Write>( } } for idx in 0..function.nodes.len() { - if function.nodes[idx].is_fork() - && fork_reduce_map - .get(&NodeID::new(idx)).is_none_or(|reduces| reduces.is_empty()) - { - panic!("Fork node {} has no reduce nodes", idx); + if function.nodes[idx].is_fork() { + assert!(fork_reduce_map + .get(&NodeID::new(idx)) + .is_none_or(|reduces| reduces.is_empty()), + "Fork node {} has no reduce nodes", + idx + ); } } @@ -155,7 +157,7 @@ pub fn gpu_codegen<W: Write>( (NodeID::new(pos), *data) }; - let return_type_id = &typing[data_node_id.idx()]; + let return_type_id = typing[data_node_id.idx()]; let return_type = &types[return_type_id.idx()]; let return_param_idx = if !return_type.is_primitive() { let objects = &collection_objects.objects(data_node_id); @@ -186,7 +188,7 @@ pub fn gpu_codegen<W: Write>( // Map from control to pairs of data to update phi // For each phi, we go to its region and get region's controls - let control_data_phi_map: &mut HashMap<NodeID, Vec<(NodeID, NodeID)>> = &mut HashMap::new(); + let mut control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>> = HashMap::new(); for (idx, node) in function.nodes.iter().enumerate() { if let Node::Phi { control, data } = node { let Node::Region { preds } = &function.nodes[control.idx()] else { @@ -237,12 +239,12 @@ struct GPUContext<'a> { bbs: &'a BasicBlocks, kernel_params: &'a GPUKernelParams, def_use_map: &'a ImmutableDefUseMap, - fork_join_map: &'a HashMap<NodeID, NodeID>, - join_fork_map: &'a HashMap<NodeID, NodeID>, - fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, - reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>, - control_data_phi_map: &'a HashMap<NodeID, 
Vec<(NodeID, NodeID)>>, - return_type_id: &'a TypeID, + fork_join_map: HashMap<NodeID, NodeID>, + join_fork_map: HashMap<NodeID, NodeID>, + fork_reduce_map: HashMap<NodeID, Vec<NodeID>>, + reduct_reduce_map: HashMap<NodeID, Vec<NodeID>>, + control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>>, + return_type_id: TypeID, return_param_idx: Option<usize>, } @@ -318,7 +320,7 @@ impl GPUContext<'_> { (1, 1) } else { // Create structures and determine block and thread parallelization strategy - let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); + let (fork_tree, fork_control_map) = self.make_fork_structures(&self.fork_join_map); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); @@ -422,7 +424,7 @@ namespace cg = cooperative_groups; write!( w, "{} __restrict__ ret", - self.get_type(*self.return_type_id, true) + self.get_type(self.return_type_id, true) )?; } @@ -536,7 +538,7 @@ namespace cg = cooperative_groups; // need to pass arguments to kernel, so we keep track of the arguments here. let mut pass_args = String::new(); let ret_primitive = self.types[self.return_type_id.idx()].is_primitive(); - let ret_type = self.get_type(*self.return_type_id, false); + let ret_type = self.get_type(self.return_type_id, false); write!(w, " extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // The first set of parameters are dynamic constants. @@ -566,7 +568,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(w, ") {{\n")?; // Pull primitive return as pointer parameter for kernel if ret_primitive { - let ret_type_pnt = self.get_type(*self.return_type_id, true); + let ret_type_pnt = self.get_type(self.return_type_id, true); write!(w, "\t{} ret;\n", ret_type_pnt)?; write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; if !first_param { @@ -1267,16 +1269,11 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // If we read collection, distribute elements among threads with cg // sync after. If we read primitive, copy read on all threads. Node::Read { collect, indices } => { - let is_char = self.is_char(self.typing[collect.idx()]); - let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); + let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_type_id = self.typing[id.idx()]; if self.types[data_type_id.idx()].is_primitive() { - if is_char { - let type_name = self.get_type(data_type_id, true); - write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?; - } else { - write!(w, "{}{} = *({});\n", tabs, define_variable, collect_with_indices)?; - } + let type_name = self.get_type(data_type_id, true); + write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?; } else { if KernelState::OutBlock == state && num_blocks.unwrap() > 1 { panic!("GPU can't guarantee correctness for multi-block collection reads"); @@ -1287,13 +1284,12 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; }; // Divide up "elements", which are collection size divided // by element size, among threads. 
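For reference, the device code emitted by the write! calls just below amounts to a cooperative, strided byte copy over the collection. A simplified sketch of that output (identifiers such as cg_tile, dst, src, and data_size are illustrative stand-ins, not the exact generated names):

    // Each thread of the cooperative-groups tile copies every size()-th byte.
    for (int i = cg_tile.thread_rank(); i < data_size; i += cg_tile.size()) {
        *(dst + i) = *(src + i);
    }
    cg_tile.sync();  // make the copied collection visible to the whole tile before use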
- let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects), Some(true)); - let num_elements = format!("({})", data_size); - write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, define_variable, collect_with_indices)?; write!(w, "{}}}\n", tabs)?; - write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; - write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, define_variable, cg_tile, num_elements, cg_tile, cg_tile, collect_with_indices, cg_tile, num_elements, cg_tile, cg_tile)?; + write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; + write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, define_variable, cg_tile, data_size, cg_tile, cg_tile, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile)?; write!(w, "{}}}\n", tabs)?; write!(w, "{}{}.sync();\n", tabs, cg_tile)?; } @@ -1305,8 +1301,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; data, indices, } => { - let is_char = self.is_char(self.typing[collect.idx()]); - let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); + let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; if KernelState::OutBlock == state && num_blocks.unwrap() > 1 { @@ -1318,21 +1313,16 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; }; if self.types[data_type_id.idx()].is_primitive() { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; - if is_char { - let type_name = self.get_type(data_type_id, true); - write!(w, "{}\t*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; - } else { - write!(w, "{}\t*({}) = {};\n", tabs, collect_with_indices, data_variable)?; - } + let type_name = self.get_type(data_type_id, true); + write!(w, "{}\t*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?; write!(w, "{}}}\n", tabs)?; } else { - let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects), Some(true)); - let num_elements = format!("({})", data_size); - write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, collect_with_indices, data_variable)?; write!(w, "{}}}\n", tabs)?; - write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; - write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, num_elements, cg_tile, cg_tile, data_variable, cg_tile, num_elements, cg_tile, 
cg_tile)?; + write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; + write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?; write!(w, "{}}}\n", tabs)?; } write!(w, "{}{}.sync();\n", tabs, cg_tile)?; @@ -1508,18 +1498,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; /* * This function emits collection name + pointer math for the provided indices. - * One nuance is whether the collection is represented as char pointer or - * the original primitive pointer. For Field, it's always char, for Variant, - * it doesn't matter here, and for Array, it depends- so we may need to tack - * on the element size to the index math. + * All collection types use char pointers. */ - fn codegen_collect(&self, collect: NodeID, indices: &[Index], is_char: bool, has_extra_dim: bool) -> String { + fn codegen_collect(&self, collect: NodeID, indices: &[Index], has_extra_dim: bool) -> String { let mut index_ptr = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { match index { Index::Field(field) => { - self.get_size(type_id, Some(*field), None, None); + self.get_size(type_id, Some(*field), None); } // Variants of summations have zero offset Index::Variant(_) => {} @@ -1550,10 +1537,8 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; cumulative_offset, ")".repeat(array_indices.len() - if has_extra_dim { 1 } else { 0 }) )); - if is_char { - let element_size = self.get_size(*element_type, None, None, None); - index_ptr.push_str(&format!(" * {}", element_size)); - } + let element_size = self.get_size(*element_type, None, None); + index_ptr.push_str(&format!(" * {}", element_size)); } } } @@ -1600,7 +1585,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; Constant::Product(type_id, constant_fields) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects, None); + let size = self.get_size(*type_id, None, extra_dim_collects); *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; @@ -1612,7 +1597,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call let field_type = self.get_type(type_fields[i], true); - let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects, None); + let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects); let field_constant = &self.constants[constant_fields[i].idx()]; if field_constant.is_scalar() { self.codegen_constant( @@ -1632,7 +1617,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; Constant::Summation(type_id, variant, field) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects, None); + let size = self.get_size(*type_id, None, extra_dim_collects); *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = 
dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; @@ -1660,18 +1645,14 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; }; } Constant::Array(type_id) => { - let Type::Array(element_type, _) = &self.types[type_id.idx()] else { - panic!("Expected array type") - }; if !allow_allocate { panic!("Nested array constant should not be re-allocated"); } let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects, None); - let element_type = self.get_type(*element_type, true); + let size = self.get_size(*type_id, None, extra_dim_collects); *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; - write!(w, "{}{} = reinterpret_cast<{}>(dynamic_shared + dynamic_shared_offset);\n", tabs, name, element_type)?; + write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size); } } @@ -1684,15 +1665,11 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; * and offset to 2nd field. This is useful for constant initialization and read/write * index math. */ - fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>, exclude_element_size: Option<bool>) -> String { + fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { let array_size = multiply_dcs(if extra_dim_collects.is_some() && extra_dim_collects.unwrap().contains(&type_id) { &extents[1..] } else { extents }); - if exclude_element_size.unwrap_or(false) { - array_size - } else { - format!("{} * {}", self.get_alignment(*element_type), array_size) - } + format!("{} * {}", self.get_alignment(*element_type), array_size) } Type::Product(fields) => { let num_fields = &num_fields.unwrap_or(fields.len()); @@ -1700,7 +1677,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; .iter() .enumerate() .filter(|(i, _)| i < num_fields) - .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects, None), self.get_alignment(*id))) + .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects), self.get_alignment(*id))) .fold(String::from("0"), |acc, (size, align)| { if acc == "0" { size @@ -1715,7 +1692,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; format!( "{} - {}", with_field, - self.get_size(fields[*num_fields], None, extra_dim_collects, None) + self.get_size(fields[*num_fields], None, extra_dim_collects) ) } else { with_field @@ -1725,7 +1702,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // The argmax variant by size is not guaranteed to be same as // argmax variant by alignment, eg product of 3 4-byte primitives // vs 1 8-byte primitive, so we need to calculate both. - let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects, None)).fold( + let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects)).fold( String::from("0"), |acc, x| { if acc == "0" { @@ -1880,16 +1857,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; func_name.to_string() } - // Check if a type should be represented as char*. Must be a product, - // summation, or array of product/summation types. 
- fn is_char(&self, type_id: TypeID) -> bool { - match &self.types[type_id.idx()] { - Type::Product(_) | Type::Summation(_) => true, - Type::Array(element_type, _) => self.is_char(*element_type), - _ => false, - } - } - fn get_cg_tile(&self, fork: NodeID, cg_type: CGType) -> String { format!("cg_{}{}", self.get_value(fork, false, false), if cg_type == CGType::Use { "_use" } else if cg_type == CGType::Available { "_available" } else { "" }) } @@ -1938,12 +1905,10 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } fn get_type(&self, id: TypeID, make_pointer: bool) -> String { - match &self.types[id.idx()] { - // Product and summation collections are char* for 1 byte-addressability - // since we can have variable type fields - Type::Product(_) | Type::Summation(_) => "char*".to_string(), - Type::Array(element_type, _) => self.get_type(*element_type, true), - _ => convert_type(&self.types[id.idx()], make_pointer), + if self.types[id.idx()].is_primitive() { + convert_type(&self.types[id.idx()], make_pointer) + } else { + "char*".to_string() } } -- GitLab From ae1863d9976aad58abc1023e667424bf762f8aa4 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 20 Jan 2025 12:19:10 -0600 Subject: [PATCH 088/109] still untested --- hercules_cg/src/gpu.rs | 4 ++-- hercules_opt/src/pass.rs | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index be797b2a..21d284b3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -22,6 +22,7 @@ pub fn gpu_codegen<W: Write>( control_subgraph: &Subgraph, bbs: &BasicBlocks, collection_objects: &FunctionCollectionObjects, + fork_join_map: &HashMap<NodeID, NodeID>, w: &mut W, ) -> Result<(), Error> { /* @@ -80,7 +81,6 @@ pub fn gpu_codegen<W: Write>( .collect(); - let fork_join_map = fork_join_map(function, control_subgraph); let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map .iter() .map(|(fork, join)| (*join, *fork)) @@ -239,7 +239,7 @@ struct GPUContext<'a> { bbs: &'a BasicBlocks, kernel_params: &'a GPUKernelParams, def_use_map: &'a ImmutableDefUseMap, - fork_join_map: HashMap<NodeID, NodeID>, + fork_join_map: &'a HashMap<NodeID, NodeID>, join_fork_map: HashMap<NodeID, NodeID>, fork_reduce_map: HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: HashMap<NodeID, Vec<NodeID>>, diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index bb70bf08..9b4c09aa 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -984,11 +984,13 @@ impl PassManager { self.make_control_subgraphs(); self.make_collection_objects(); self.make_callgraph(); + self.make_fork_join_maps(); let typing = self.typing.as_ref().unwrap(); let control_subgraphs = self.control_subgraphs.as_ref().unwrap(); let bbs = self.bbs.as_ref().unwrap(); let collection_objects = self.collection_objects.as_ref().unwrap(); let callgraph = self.callgraph.as_ref().unwrap(); + let fork_join_maps = self.fork_join_maps.as_ref().unwrap(); let devices = device_placement(&self.module.functions, &callgraph); @@ -1029,6 +1031,7 @@ impl PassManager { &control_subgraphs[idx], &bbs[idx], &collection_objects[&FunctionID::new(idx)], + &fork_join_maps[idx], &mut cuda_ir, ) .unwrap(), -- GitLab From 0b1d162f4affa4ddf102b99412db99f83e11862e Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Mon, 20 Jan 2025 17:09:47 -0600 Subject: [PATCH 089/109] untested --- hercules_cg/src/gpu.rs | 88 ++++++++---------------------------------- 1 file changed, 17 insertions(+), 71 
deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 21d284b3..11a91c9a 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -134,51 +134,6 @@ pub fn gpu_codegen<W: Write>( } } - // Obtain the Return node and if it's a collection, use the collection objects - // analysis to determine the origin. Also save the return node id for later - // conversion of primitive Return into Parameter. - let (_, data_node_id) = { - let pos = function - .nodes - .iter() - .position(|node| { - matches!( - node, - Node::Return { - control: _, - data: _ - } - ) - }) - .expect("Function must have a return node"); - let Node::Return { control: _, data } = &function.nodes[pos] else { - panic!("Return node must be a return node"); - }; - (NodeID::new(pos), *data) - }; - - let return_type_id = typing[data_node_id.idx()]; - let return_type = &types[return_type_id.idx()]; - let return_param_idx = if !return_type.is_primitive() { - let objects = &collection_objects.objects(data_node_id); - let origin = collection_objects.origin(objects[0]); - if !objects - .iter() - .all(|obj| collection_objects.origin(*obj) == origin) - { - panic!( - "Returned data node {} has multiple collection objects with different origins", - data_node_id.idx() - ); - } - let CollectionObjectOrigin::Parameter(param_idx) = origin else { - panic!("Returns collection object that did not originate from a parameter"); - }; - Some(param_idx) - } else { - None - }; - // Temporary hardcoded values let kernel_params = &GPUKernelParams { max_num_blocks: 1024, @@ -217,8 +172,6 @@ pub fn gpu_codegen<W: Write>( fork_reduce_map, reduct_reduce_map, control_data_phi_map, - return_type_id, - return_param_idx, }; ctx.codegen_function(w) } @@ -244,8 +197,6 @@ struct GPUContext<'a> { fork_reduce_map: HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: HashMap<NodeID, Vec<NodeID>>, control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>>, - return_type_id: TypeID, - return_param_idx: Option<usize>, } /* @@ -416,17 +367,13 @@ namespace cg = cooperative_groups; }; write!(w, "{} p{}", param_type, idx)?; } - // Pull primitive return to a pointer parameter - if self.types[self.return_type_id.idx()].is_primitive() { - if !first_param { - write!(w, ", ")?; - } - write!( - w, - "{} __restrict__ ret", - self.get_type(self.return_type_id, true) - )?; + if !first_param { + write!(w, ", ")?; } + write!( + w, + "char* __restrict__ ret", + )?; // Type is char since it's simplest to use single bytes for indexing // and it's required for heterogeneous Product and Summation types. @@ -536,9 +483,8 @@ namespace cg = cooperative_groups; fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. + let ret_type = self.get_type(self.function.return_type, false); let mut pass_args = String::new(); - let ret_primitive = self.types[self.return_type_id.idx()].is_primitive(); - let ret_type = self.get_type(self.return_type_id, false); write!(w, " extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // The first set of parameters are dynamic constants. 
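To make the generated wrapper easier to picture, here is a rough sketch of the host-side code codegen_launch_code aims to emit at this point in the series, for a hypothetical kernel foo with one dynamic constant, one pointer parameter, and a primitive float return; the names, launch configuration, and shared-memory size are illustrative, since the real output bakes in the computed values as literals:

    __global__ void foo_gpu(unsigned long long dc_p0, char* p0, float* ret);  // kernel emitted elsewhere

    extern "C" float foo(unsigned long long dc_p0, char* p0) {
        // The return value is lifted into a device-side pointer argument.
        float* ret;
        cudaMalloc((void**)&ret, sizeof(float));
        foo_gpu<<<1, 1024, 0>>>(dc_p0, p0, ret);
        cudaDeviceSynchronize();
        // Primitive results are copied back to the host before returning.
        float host_ret;
        cudaMemcpy(&host_ret, ret, sizeof(float), cudaMemcpyDeviceToHost);
        return host_ret;
    }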
@@ -566,25 +512,25 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(pass_args, "p{}", idx)?; } write!(w, ") {{\n")?; - // Pull primitive return as pointer parameter for kernel - if ret_primitive { - let ret_type_pnt = self.get_type(self.return_type_id, true); - write!(w, "\t{} ret;\n", ret_type_pnt)?; + // Add return parameter, with allocation if primitive + let ret_type_pnt = self.get_type(self.function.return_type, true); + write!(w, "\t{} ret;\n", ret_type_pnt)?; + if !first_param { + write!(pass_args, ", ")?; + } + write!(pass_args, "ret")?; + if self.types[self.function.return_type.idx()].is_primitive() { write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; - if !first_param { - write!(pass_args, ", ")?; - } - write!(pass_args, "ret")?; } write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; write!(w, "\tcudaDeviceSynchronize();\n")?; write!(w, "\tfflush(stdout);\n")?; - if ret_primitive { + if self.types[self.function.return_type.idx()].is_primitive() { write!(w, "\t{} host_ret;\n", ret_type)?; write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; write!(w, "\treturn host_ret;\n")?; } else { - write!(w, "\treturn p{};\n", self.return_param_idx.unwrap())?; + write!(w, "\treturn ret;\n")?; } write!(w, "}}\n")?; Ok(()) -- GitLab From 58639fc70b216345e0c12c229c2454474c42b306 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Tue, 21 Jan 2025 05:50:54 +0000 Subject: [PATCH 090/109] address mr comms --- hercules_cg/src/gpu.rs | 43 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 11a91c9a..fb9526bf 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -372,7 +372,7 @@ namespace cg = cooperative_groups; } write!( w, - "char* __restrict__ ret", + "void* __restrict__ ret", )?; // Type is char since it's simplest to use single bytes for indexing @@ -512,26 +512,20 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(pass_args, "p{}", idx)?; } write!(w, ") {{\n")?; - // Add return parameter, with allocation if primitive + // Allocate return parameter and lift to kernel argument let ret_type_pnt = self.get_type(self.function.return_type, true); write!(w, "\t{} ret;\n", ret_type_pnt)?; if !first_param { write!(pass_args, ", ")?; } write!(pass_args, "ret")?; - if self.types[self.function.return_type.idx()].is_primitive() { - write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; - } + write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; write!(w, "\tcudaDeviceSynchronize();\n")?; - write!(w, "\tfflush(stdout);\n")?; - if self.types[self.function.return_type.idx()].is_primitive() { - write!(w, "\t{} host_ret;\n", ret_type)?; - write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; - write!(w, "\treturn host_ret;\n")?; - } else { - write!(w, "\treturn ret;\n")?; - } + // Copy return from device to host, whether it's primitive value or collection pointer + write!(w, "\t{} host_ret;\n", ret_type)?; + write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; + write!(w, "\treturn host_ret;\n")?; write!(w, "}}\n")?; Ok(()) } @@ -1426,12 +1420,11 @@ extern \"C\" {} {}(", 
ret_type.clone(), self.function.name)?; Node::Return { control: _, data } => { // Since we lift originally primitive returns into a parameter, // we write to that parameter upon return. - if self.types[self.typing[data.idx()].idx()].is_primitive() { - let return_val = self.get_value(*data, false, false); - write!(w_term, "\tif (grid.thread_rank() == 0) {{\n")?; - write!(w_term, "\t\t*ret = {};\n", return_val)?; - write!(w_term, "\t}}\n")?; - } + let return_val = self.get_value(*data, false, false); + let return_type_ptr = self.get_type(self.function.return_type, true); + write!(w_term, "\tif (grid.thread_rank() == 0) {{\n")?; + write!(w_term, "\t\t*(reinterpret_cast<{}>(ret)) = {};\n", return_type_ptr, return_val)?; + write!(w_term, "\t}}\n")?; write!(w_term, "\treturn;\n")?; 1 } @@ -1542,10 +1535,10 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; }; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call - let field_type = self.get_type(type_fields[i], true); let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects); let field_constant = &self.constants[constant_fields[i].idx()]; if field_constant.is_scalar() { + let field_type = self.get_type(type_fields[i], true); self.codegen_constant( format!("*reinterpret_cast<{}>({}+{})", field_type, name, offset), constant_fields[i], @@ -1573,10 +1566,9 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let Type::Summation(variants) = &self.types[type_id.idx()] else { panic!("Summation constant should have summation type") }; - let variant_type = - self.get_type(self.typing[variants[*variant as usize].idx()], true); let variant_constant = &self.constants[field.idx()]; if variant_constant.is_scalar() { + let variant_type = self.get_type(self.typing[variants[*variant as usize].idx()], true); self.codegen_constant( format!("*reinterpret_cast<{}>({})", variant_type, name), *field, @@ -1851,10 +1843,11 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } fn get_type(&self, id: TypeID, make_pointer: bool) -> String { - if self.types[id.idx()].is_primitive() { - convert_type(&self.types[id.idx()], make_pointer) + let ty = &self.types[id.idx()]; + if ty.is_primitive() { + convert_type(ty, make_pointer) } else { - "char*".to_string() + format!("char*{}", if make_pointer { "*" } else { "" }) } } -- GitLab From cd747c23d0972f6004fd4f321ecd502fd9906929 Mon Sep 17 00:00:00 2001 From: Praneet Rathi <prrathi10@gmail.com> Date: Wed, 22 Jan 2025 09:13:44 -0600 Subject: [PATCH 091/109] origins --- hercules_cg/src/fork_tree.rs | 0 hercules_cg/src/gpu.rs | 98 +++++++++++++++++++----------------- 2 files changed, 52 insertions(+), 46 deletions(-) create mode 100644 hercules_cg/src/fork_tree.rs diff --git a/hercules_cg/src/fork_tree.rs b/hercules_cg/src/fork_tree.rs new file mode 100644 index 00000000..e69de29b diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index fb9526bf..df95f63f 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -28,9 +28,8 @@ pub fn gpu_codegen<W: Write>( /* * We assert the following: * - Fork node must have >= 1 Reduce nodes - * - If the returned data type is a collection, it must have - * originated from a single known parameter. Can relax to allow - * one of multiple parameters. + * - (Later in code) If the returned data type is a collection, it must have + * originated from (potentially multiple) parameter(s). 
* * We don't assert but assume the following: * - max_num_blocks in KernelParams is within constraint of 1D grid size. This @@ -165,6 +164,7 @@ pub fn gpu_codegen<W: Write>( typing, control_subgraph, bbs, + collection_objects, kernel_params, def_use_map, fork_join_map, @@ -190,6 +190,7 @@ struct GPUContext<'a> { typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, bbs: &'a BasicBlocks, + collection_objects: &'a FunctionCollectionObjects, kernel_params: &'a GPUKernelParams, def_use_map: &'a ImmutableDefUseMap, fork_join_map: &'a HashMap<NodeID, NodeID>, @@ -247,7 +248,13 @@ impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { // Emit all code up to the "goto" to Start's block let mut top = String::new(); - self.codegen_kernel_begin(&mut top)?; + let return_parameter = if self.collection_objects.returned_objects().len() == 1 { + Some(self.collection_objects.origin(*self.collection_objects.returned_objects() + .first().unwrap()).try_parameter().unwrap()) + } else { + None + }; + self.codegen_kernel_begin(return_parameter.is_none(), &mut top)?; let mut dynamic_shared_offset = "0".to_string(); self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; @@ -271,7 +278,7 @@ impl GPUContext<'_> { (1, 1) } else { // Create structures and determine block and thread parallelization strategy - let (fork_tree, fork_control_map) = self.make_fork_structures(&self.fork_join_map); + let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); let (root_forks, num_blocks) = self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); @@ -308,14 +315,14 @@ impl GPUContext<'_> { // Emit host launch code let mut host_launch = String::new(); - self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; + self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, return_parameter, &mut host_launch)?; write!(w, "{}", host_launch)?; Ok(()) } // Emit kernel headers, signature, arguments, and dynamic shared memory declaration - fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { + fn codegen_kernel_begin(&self, has_ret_var: bool, w: &mut String) -> Result<(), Error> { write!(w, " #include <assert.h> #include <stdio.h> @@ -367,13 +374,15 @@ namespace cg = cooperative_groups; }; write!(w, "{} p{}", param_type, idx)?; } - if !first_param { - write!(w, ", ")?; + if has_ret_var { + if !first_param { + write!(w, ", ")?; + } + write!( + w, + "void* __restrict__ ret", + )?; } - write!( - w, - "void* __restrict__ ret", - )?; // Type is char since it's simplest to use single bytes for indexing // and it's required for heterogeneous Product and Summation types. @@ -480,7 +489,7 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { + fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, return_parameter: Option<usize>, w: &mut String) -> Result<(), Error> { // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. 
let ret_type = self.get_type(self.function.return_type, false); @@ -512,20 +521,27 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(pass_args, "p{}", idx)?; } write!(w, ") {{\n")?; - // Allocate return parameter and lift to kernel argument - let ret_type_pnt = self.get_type(self.function.return_type, true); - write!(w, "\t{} ret;\n", ret_type_pnt)?; - if !first_param { - write!(pass_args, ", ")?; + let has_ret_var = return_parameter.is_none(); + if has_ret_var { + // Allocate return parameter and lift to kernel argument + let ret_type_pnt = self.get_type(self.function.return_type, true); + write!(w, "\t{} ret;\n", ret_type_pnt)?; + if !first_param { + write!(pass_args, ", ")?; + } + write!(pass_args, "ret")?; + write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; } - write!(pass_args, "ret")?; - write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; write!(w, "\tcudaDeviceSynchronize();\n")?; - // Copy return from device to host, whether it's primitive value or collection pointer - write!(w, "\t{} host_ret;\n", ret_type)?; - write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; - write!(w, "\treturn host_ret;\n")?; + if has_ret_var { + // Copy return from device to host, whether it's primitive value or collection pointer + write!(w, "\t{} host_ret;\n", ret_type)?; + write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; + write!(w, "\treturn host_ret;\n")?; + } else { + write!(w, "\treturn p{};\n", return_parameter.unwrap())?; + } write!(w, "}}\n")?; Ok(()) } @@ -585,8 +601,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; panic!("Expected fork node"); }; let fork_size = self.multiply_fork_factors(factors); + let reduces = &self.fork_reduce_map[root_fork]; + assert!(reduces.iter().all(|reduce| { + self.collection_objects.objects(*reduce).iter().all(|object| { + self.collection_objects.origin(*object).try_parameter().is_some() + }) + }), "All collection reduces in block fork must originate from parameters"); if let Some(fork_size) = fork_size && fork_size <= max_num_blocks + && fork_size.is_power_of_two() && self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { (root_forks, fork_size) @@ -1206,8 +1229,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; )?; } } - // If we read collection, distribute elements among threads with cg - // sync after. If we read primitive, copy read on all threads. + // Read of primitive requires load after pointer math. 
Node::Read { collect, indices } => { let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_type_id = self.typing[id.idx()]; @@ -1215,27 +1237,11 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let type_name = self.get_type(data_type_id, true); write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?; } else { - if KernelState::OutBlock == state && num_blocks.unwrap() > 1 { - panic!("GPU can't guarantee correctness for multi-block collection reads"); - } - let cg_tile = match state { - KernelState::OutBlock | KernelState::InBlock => "block".to_string(), - KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), - }; - // Divide up "elements", which are collection size divided - // by element size, among threads. - let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); - write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; - write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, define_variable, collect_with_indices)?; - write!(w, "{}}}\n", tabs)?; - write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; - write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, define_variable, cg_tile, data_size, cg_tile, cg_tile, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile)?; - write!(w, "{}}}\n", tabs)?; - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + write!(w, "{}{} = {};\n", tabs, define_variable, collect_with_indices)?; } } - // Write is same as read, but when writing a primitive, we need to gate with - // a thread rank check. + // Write of primitive needs a thread rank gate for safety. Write of + // collection is memcpy that we distribute among threads. Node::Write { collect, data, -- GitLab From 70c06a3b461800b509649fd0ed94ac8b11de9847 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Wed, 22 Jan 2025 16:18:30 +0000 Subject: [PATCH 092/109] cleanup --- hercules_cg/src/fork_tree.rs | 37 ++++++++++ hercules_cg/src/gpu.rs | 126 +++++++++++++---------------------- hercules_cg/src/lib.rs | 4 ++ hercules_opt/src/pass.rs | 52 ++++++++++++++- 4 files changed, 137 insertions(+), 82 deletions(-) diff --git a/hercules_cg/src/fork_tree.rs b/hercules_cg/src/fork_tree.rs index e69de29b..da7f640a 100644 --- a/hercules_cg/src/fork_tree.rs +++ b/hercules_cg/src/fork_tree.rs @@ -0,0 +1,37 @@ +use std::collections::{HashMap, HashSet}; + +use crate::*; + +/* Construct a map from each fork node F to all forks satisfying: + * a) domination by F + * b) no domination by F's join + * c) no domination by any other fork that's also dominated by F, where we don't count self-domination + * Note that the fork_tree also includes the non-fork start node, as unique root node. 
+ */ +pub fn fork_tree(function: &Function, fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> { + let mut fork_tree = HashMap::new(); + for (control, forks) in fork_join_nesting { + if function.nodes[control.idx()].is_fork() { + fork_tree.entry(*control).or_insert_with(HashSet::new); + let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0)); + fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(*control); + } + } + fork_tree +} + +/* + * Construct a map from fork node to all control nodes (including itself) satisfying: + * a) domination by F + * b) no domination by F's join + * c) no domination by any other fork that's also dominated by F, where we do count self-domination + * Here too we include the non-fork start node, as key for all controls outside any fork. + */ +pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> { + let mut fork_control_map = HashMap::new(); + for (control, forks) in fork_join_nesting { + let fork = forks.first().copied().unwrap_or(NodeID::new(0)); + fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(*control); + } + fork_control_map +} diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index df95f63f..d960de89 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -22,7 +22,10 @@ pub fn gpu_codegen<W: Write>( control_subgraph: &Subgraph, bbs: &BasicBlocks, collection_objects: &FunctionCollectionObjects, + def_use_map: &ImmutableDefUseMap, fork_join_map: &HashMap<NodeID, NodeID>, + fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, + fork_tree: &HashMap<NodeID, HashSet<NodeID>>, w: &mut W, ) -> Result<(), Error> { /* @@ -133,13 +136,6 @@ pub fn gpu_codegen<W: Write>( } } - // Temporary hardcoded values - let kernel_params = &GPUKernelParams { - max_num_blocks: 1024, - max_num_threads: 1024, - threads_per_warp: 32, - }; - // Map from control to pairs of data to update phi // For each phi, we go to its region and get region's controls let mut control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>> = HashMap::new(); @@ -154,7 +150,18 @@ pub fn gpu_codegen<W: Write>( } } - let def_use_map = &def_use(function); + let return_parameter = if collection_objects.returned_objects().len() == 1 { + Some(collection_objects.origin(*collection_objects.returned_objects() + .first().unwrap()).try_parameter().unwrap()) + } else { + None + }; + + let kernel_params = &GPUKernelParams { + max_num_blocks: 1024, + max_num_threads: 1024, + threads_per_warp: 32, + }; let ctx = GPUContext { function, @@ -165,13 +172,16 @@ pub fn gpu_codegen<W: Write>( control_subgraph, bbs, collection_objects, - kernel_params, def_use_map, fork_join_map, + fork_control_map, + fork_tree, join_fork_map, fork_reduce_map, reduct_reduce_map, control_data_phi_map, + return_parameter, + kernel_params, }; ctx.codegen_function(w) } @@ -191,13 +201,16 @@ struct GPUContext<'a> { control_subgraph: &'a Subgraph, bbs: &'a BasicBlocks, collection_objects: &'a FunctionCollectionObjects, - kernel_params: &'a GPUKernelParams, def_use_map: &'a ImmutableDefUseMap, fork_join_map: &'a HashMap<NodeID, NodeID>, + fork_control_map: &'a HashMap<NodeID, HashSet<NodeID>>, + fork_tree: &'a HashMap<NodeID, HashSet<NodeID>>, join_fork_map: HashMap<NodeID, NodeID>, fork_reduce_map: HashMap<NodeID, Vec<NodeID>>, reduct_reduce_map: HashMap<NodeID, Vec<NodeID>>, control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>>, + return_parameter: Option<usize>, + kernel_params: &'a 
GPUKernelParams, } /* @@ -248,13 +261,7 @@ impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { // Emit all code up to the "goto" to Start's block let mut top = String::new(); - let return_parameter = if self.collection_objects.returned_objects().len() == 1 { - Some(self.collection_objects.origin(*self.collection_objects.returned_objects() - .first().unwrap()).try_parameter().unwrap()) - } else { - None - }; - self.codegen_kernel_begin(return_parameter.is_none(), &mut top)?; + self.codegen_kernel_begin(self.return_parameter.is_none(), &mut top)?; let mut dynamic_shared_offset = "0".to_string(); self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; @@ -278,11 +285,10 @@ impl GPUContext<'_> { (1, 1) } else { // Create structures and determine block and thread parallelization strategy - let (fork_tree, fork_control_map) = self.make_fork_structures(self.fork_join_map); let (root_forks, num_blocks) = - self.get_root_forks_and_num_blocks(&fork_tree, self.kernel_params.max_num_blocks); - let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &fork_tree, num_blocks); - let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&fork_tree, thread_root_root_fork); + self.get_root_forks_and_num_blocks(&self.fork_tree, self.kernel_params.max_num_blocks); + let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &self.fork_tree, num_blocks); + let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&self.fork_tree, thread_root_root_fork); // TODO: Uncomment and adjust once we know logic of extra dim // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); let extra_dim_collects = HashSet::new(); @@ -295,8 +301,6 @@ impl GPUContext<'_> { None }, &thread_root_forks, - &fork_tree, - &fork_control_map, &fork_thread_quota_map, &extra_dim_collects, &mut dynamic_shared_offset, @@ -315,7 +319,7 @@ impl GPUContext<'_> { // Emit host launch code let mut host_launch = String::new(); - self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, return_parameter, &mut host_launch)?; + self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; write!(w, "{}", host_launch)?; Ok(()) @@ -489,7 +493,7 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, return_parameter: Option<usize>, w: &mut String) -> Result<(), Error> { + fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. 
let ret_type = self.get_type(self.function.return_type, false); @@ -521,7 +525,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(pass_args, "p{}", idx)?; } write!(w, ") {{\n")?; - let has_ret_var = return_parameter.is_none(); + let has_ret_var = self.return_parameter.is_none(); if has_ret_var { // Allocate return parameter and lift to kernel argument let ret_type_pnt = self.get_type(self.function.return_type, true); @@ -540,46 +544,12 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; write!(w, "\treturn host_ret;\n")?; } else { - write!(w, "\treturn p{};\n", return_parameter.unwrap())?; + write!(w, "\treturn p{};\n", self.return_parameter.unwrap())?; } write!(w, "}}\n")?; Ok(()) } - /* Create fork_tree, a map from each fork node F to all forks satisfying: - * a) domination by F - * b) no domination by F's join - * c) no domination by any other fork that's also dominated by F, where we don't count self-domination - * Note that the fork_tree also includes the start node, to include all controls - * outside any fork. - * - * Second, fork_control_map is a map from fork node to all control nodes (including itself) satisfying: - * a) domination by F - * b) no domination by F's join - * c) no domination by any other fork that's also dominated by F, where we do count self-domination - */ - fn make_fork_structures(&self, fork_join_map: &HashMap<NodeID, NodeID>) -> (HashMap<NodeID, HashSet<NodeID>>, HashMap<NodeID, HashSet<NodeID>>) { - let dom = dominator(self.control_subgraph, NodeID::new(0)); - let fork_nesting = compute_fork_join_nesting(self.function, &dom, fork_join_map); - fork_nesting.into_iter().fold( - (HashMap::new(), HashMap::new()), - |(mut fork_tree, mut fork_control_map), (control, forks)| { - if self.function.nodes[control.idx()].is_fork() { - // If control node is fork make sure it's in the fork_tree even - // if has no nested forks. - fork_tree.entry(control).or_insert_with(HashSet::new); - // Then get it's nesting fork- index = 1 to not count itself! - let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0)); - fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(control); - } - // Here the desired fork is always the first fork - let fork = forks.first().copied().unwrap_or(NodeID::new(0)); - fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(control); - (fork_tree, fork_control_map) - }, - ) - } - /* * If tree has a single root fork of known size s <= max_num_blocks * with parallel-fork schedule, then set num_blocks to s, else set num_blocks @@ -808,8 +778,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; &self, block_fork: Option<NodeID>, thread_root_forks: &HashSet<NodeID>, - fork_tree: &HashMap<NodeID, HashSet<NodeID>>, - fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, extra_dim_collects: &HashSet<TypeID>, dynamic_shared_offset: &mut String, @@ -820,7 +788,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // First emit data and control gen for each control node outside any fork. // Recall that this was tracked through a fake fork node with NodeID 0. 
let mut state = KernelState::OutBlock; - for control in fork_control_map.get(&NodeID::new(0)).unwrap() { + for control in self.fork_control_map.get(&NodeID::new(0)).unwrap() { let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -834,7 +802,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // Then generate data and control for the single block fork if it exists if block_fork.is_some() { state = KernelState::InBlock; - for control in fork_control_map.get(&block_fork.unwrap()).unwrap() { + for control in self.fork_control_map.get(&block_fork.unwrap()).unwrap() { let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -852,8 +820,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; self.codegen_data_control_traverse( root_fork, state, - fork_tree, - fork_control_map, fork_thread_quota_map, 1, num_threads, @@ -875,8 +841,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; &self, curr_fork: NodeID, state: KernelState, - fork_tree: &HashMap<NodeID, HashSet<NodeID>>, - fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, parent_quota: usize, num_threads: usize, @@ -902,7 +866,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } else { HashSet::new() }; - for control in fork_control_map.get(&curr_fork).unwrap() { + for control in self.fork_control_map.get(&curr_fork).unwrap() { let goto = gotos.get_mut(control).unwrap(); let init = &mut goto.init; let post_init = &mut goto.post_init; @@ -925,12 +889,10 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; )?; } } - for child in fork_tree.get(&curr_fork).unwrap() { + for child in self.fork_tree.get(&curr_fork).unwrap() { self.codegen_data_control_traverse( *child, state, - fork_tree, - fork_control_map, fork_thread_quota_map, use_thread_quota, num_threads, @@ -1424,14 +1386,16 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; tabs } Node::Return { control: _, data } => { - // Since we lift originally primitive returns into a parameter, - // we write to that parameter upon return. - let return_val = self.get_value(*data, false, false); - let return_type_ptr = self.get_type(self.function.return_type, true); - write!(w_term, "\tif (grid.thread_rank() == 0) {{\n")?; - write!(w_term, "\t\t*(reinterpret_cast<{}>(ret)) = {};\n", return_type_ptr, return_val)?; - write!(w_term, "\t}}\n")?; - write!(w_term, "\treturn;\n")?; + if self.return_parameter.is_none() { + // Since we lift return into a kernel argument, we write to that + // argument upon return. 
+ let return_val = self.get_value(*data, false, false); + let return_type_ptr = self.get_type(self.function.return_type, true); + write!(w_term, "\tif (grid.thread_rank() == 0) {{\n")?; + write!(w_term, "\t\t*(reinterpret_cast<{}>(ret)) = {};\n", return_type_ptr, return_val)?; + write!(w_term, "\t}}\n")?; + write!(w_term, "\treturn;\n")?; + } 1 } _ => { diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs index e41f0205..fbab6dbc 100644 --- a/hercules_cg/src/lib.rs +++ b/hercules_cg/src/lib.rs @@ -5,11 +5,15 @@ pub mod gpu; pub mod device; pub mod rt; +pub mod fork_tree; + pub use crate::cpu::*; pub use crate::gpu::*; pub use crate::device::*; pub use crate::rt::*; +pub use crate::fork_tree::*; + use hercules_ir::*; /* diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index 9b4c09aa..295b8bcb 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -68,6 +68,8 @@ pub struct PassManager { pub postdoms: Option<Vec<DomTree>>, pub fork_join_maps: Option<Vec<HashMap<NodeID, NodeID>>>, pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>, + pub fork_control_maps: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, + pub fork_trees: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub loops: Option<Vec<LoopTree>>, pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, @@ -89,6 +91,8 @@ impl PassManager { postdoms: None, fork_join_maps: None, fork_join_nests: None, + fork_control_maps: None, + fork_trees: None, loops: None, reduce_cycles: None, data_nodes_in_fork_joins: None, @@ -204,6 +208,31 @@ impl PassManager { } } + pub fn make_fork_control_maps(&mut self) { + if self.fork_control_maps.is_none() { + self.make_fork_join_nests(); + self.fork_control_maps = Some( + self.fork_join_nests.as_ref().unwrap().iter().map(fork_control_map).collect(), + ); + } + } + + pub fn make_fork_trees(&mut self) { + if self.fork_trees.is_none() { + self.make_fork_join_nests(); + self.fork_trees = Some( + zip( + self.module.functions.iter(), + self.fork_join_nests.as_ref().unwrap().iter(), + ) + .map(|(function, fork_join_nesting)| { + fork_tree(function, fork_join_nesting) + }) + .collect(), + ); + } + } + pub fn make_loops(&mut self) { if self.loops.is_none() { self.make_control_subgraphs(); @@ -985,14 +1014,20 @@ impl PassManager { self.make_collection_objects(); self.make_callgraph(); self.make_fork_join_maps(); + self.make_fork_control_maps(); + self.make_fork_trees(); + self.make_def_uses(); let typing = self.typing.as_ref().unwrap(); let control_subgraphs = self.control_subgraphs.as_ref().unwrap(); let bbs = self.bbs.as_ref().unwrap(); let collection_objects = self.collection_objects.as_ref().unwrap(); let callgraph = self.callgraph.as_ref().unwrap(); + let def_uses = self.def_uses.as_ref().unwrap(); let fork_join_maps = self.fork_join_maps.as_ref().unwrap(); + let fork_control_maps = self.fork_control_maps.as_ref().unwrap(); + let fork_trees = self.fork_trees.as_ref().unwrap(); - let devices = device_placement(&self.module.functions, &callgraph); + let devices = device_placement(&self.module.functions, callgraph); let mut rust_rt = String::new(); let mut llvm_ir = String::new(); @@ -1031,7 +1066,10 @@ impl PassManager { &control_subgraphs[idx], &bbs[idx], &collection_objects[&FunctionID::new(idx)], + &def_uses[idx], &fork_join_maps[idx], + &fork_control_maps[idx], + &fork_trees[idx], &mut cuda_ir, ) .unwrap(), @@ -1082,6 +1120,12 @@ impl PassManager { 
file.write_all(cuda_ir.as_bytes()) .expect("PANIC: Unable to write output CUDA IR file contents."); + let cuda_text_path = format!("{}.cu", module_name); + let mut cuda_text_file = File::create(&cuda_text_path) + .expect("PANIC: Unable to open CUDA IR text file."); + cuda_text_file.write_all(cuda_ir.as_bytes()) + .expect("PANIC: Unable to write CUDA IR text file contents."); + let mut nvcc_process = Command::new("nvcc") .arg("-c") .arg("-O3") @@ -1109,6 +1153,12 @@ impl PassManager { file.write_all(rust_rt.as_bytes()) .expect("PANIC: Unable to write output Rust runtime file contents."); + let rt_text_path = format!("{}.hrt", module_name); + let mut rt_text_file = File::create(&rt_text_path) + .expect("PANIC: Unable to open Rust runtime text file."); + rt_text_file.write_all(rust_rt.as_bytes()) + .expect("PANIC: Unable to write Rust runtime text file contents."); + } Pass::Serialize(output_file) => { let module_contents: Vec<u8> = postcard::to_allocvec(&self.module).unwrap(); -- GitLab From 923d7ce91049c2ace287961abe78a1bbc56c62ee Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sat, 25 Jan 2025 06:07:13 +0000 Subject: [PATCH 093/109] untested unknown blocks --- .gitignore | 1 + hercules_cg/src/fork_tree.rs | 32 +++++----- hercules_cg/src/gpu.rs | 61 ++++++++++--------- hercules_samples/dot/build.rs | 3 +- hercules_samples/matmul/build.rs | 3 +- juno_frontend/Cargo.toml | 2 +- juno_samples/schedule_test/Cargo.toml | 3 + juno_samples/schedule_test/build.rs | 2 +- .../schedule_test/src/{sched.sch => cpu.sch} | 0 juno_samples/schedule_test/src/gpu.sch | 47 ++++++++++++++ juno_scheduler/Cargo.toml | 3 + 11 files changed, 107 insertions(+), 50 deletions(-) rename juno_samples/schedule_test/src/{sched.sch => cpu.sch} (100%) create mode 100644 juno_samples/schedule_test/src/gpu.sch diff --git a/.gitignore b/.gitignore index 749cea40..87af5349 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ *.swp .vscode *_env +*.txt diff --git a/hercules_cg/src/fork_tree.rs b/hercules_cg/src/fork_tree.rs index da7f640a..64a93160 100644 --- a/hercules_cg/src/fork_tree.rs +++ b/hercules_cg/src/fork_tree.rs @@ -2,6 +2,22 @@ use std::collections::{HashMap, HashSet}; use crate::*; +/* + * Construct a map from fork node to all control nodes (including itself) satisfying: + * a) domination by F + * b) no domination by F's join + * c) no domination by any other fork that's also dominated by F, where we do count self-domination + * Here too we include the non-fork start node, as key for all controls outside any fork. + */ +pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> { + let mut fork_control_map = HashMap::new(); + for (control, forks) in fork_join_nesting { + let fork = forks.first().copied().unwrap_or(NodeID::new(0)); + fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(*control); + } + fork_control_map +} + /* Construct a map from each fork node F to all forks satisfying: * a) domination by F * b) no domination by F's join @@ -19,19 +35,3 @@ pub fn fork_tree(function: &Function, fork_join_nesting: &HashMap<NodeID, Vec<No } fork_tree } - -/* - * Construct a map from fork node to all control nodes (including itself) satisfying: - * a) domination by F - * b) no domination by F's join - * c) no domination by any other fork that's also dominated by F, where we do count self-domination - * Here too we include the non-fork start node, as key for all controls outside any fork. 
- */ -pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> { - let mut fork_control_map = HashMap::new(); - for (control, forks) in fork_join_nesting { - let fork = forks.first().copied().unwrap_or(NodeID::new(0)); - fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(*control); - } - fork_control_map -} diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 1a9b6869..a6711a33 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -3,6 +3,8 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::{Error, Write}; +use std::fs::{OpenOptions, File}; +use std::io::Write as _; use self::hercules_ir::*; @@ -107,6 +109,7 @@ pub fn gpu_codegen<W: Write>( .entry(fork_node) .or_default() .push(*reduce_node); + println!("reduce_node: {:?}, fork_node: {:?}, join: {:?}", reduce_node, fork_node, control); } Node::Region { preds: _ } => { // TODO: map region node to fork node @@ -129,7 +132,7 @@ pub fn gpu_codegen<W: Write>( if function.nodes[idx].is_fork() { assert!(fork_reduce_map .get(&NodeID::new(idx)) - .is_none_or(|reduces| reduces.is_empty()), + .is_some_and(|reduces| !reduces.is_empty()), "Fork node {} has no reduce nodes", idx ); @@ -158,11 +161,12 @@ pub fn gpu_codegen<W: Write>( }; let kernel_params = &GPUKernelParams { - max_num_blocks: 1024, max_num_threads: 1024, threads_per_warp: 32, }; + std::fs::write("out.txt", "debug\n\n").unwrap(); + let ctx = GPUContext { function, types, @@ -187,7 +191,6 @@ pub fn gpu_codegen<W: Write>( } struct GPUKernelParams { - max_num_blocks: usize, max_num_threads: usize, threads_per_warp: usize, } @@ -259,6 +262,8 @@ enum CGType { impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { + let mut file = OpenOptions::new().append(true).open("out.txt").unwrap(); + // Emit all code up to the "goto" to Start's block let mut top = String::new(); self.codegen_kernel_begin(self.return_parameter.is_none(), &mut top)?; @@ -281,13 +286,17 @@ impl GPUContext<'_> { // If there are no forks, fast forward to single-block, single-thread codegen let (num_blocks, num_threads) = if self.fork_join_map.is_empty() { + writeln!(file, "shortcut to 1b1t").unwrap(); self.codegen_data_control_no_forks(&HashSet::new(), &mut dynamic_shared_offset, &mut gotos)?; - (1, 1) + ("1".to_string(), "1".to_string()) } else { + writeln!(file, "no shortcut! 
fork tree: {:?}", self.fork_tree).unwrap(); // Create structures and determine block and thread parallelization strategy - let (root_forks, num_blocks) = - self.get_root_forks_and_num_blocks(&self.fork_tree, self.kernel_params.max_num_blocks); - let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &self.fork_tree, num_blocks); + let (root_forks, num_blocks, is_block_parallel) = + self.get_root_forks_and_num_blocks(&self.fork_tree); + writeln!(file, "is_block_parallel: {}", is_block_parallel).unwrap(); + let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &self.fork_tree, is_block_parallel); + writeln!(file, "thread_root_root_fork: {:?}", thread_root_root_fork).unwrap(); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&self.fork_tree, thread_root_root_fork); // TODO: Uncomment and adjust once we know logic of extra dim // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); @@ -295,7 +304,7 @@ impl GPUContext<'_> { // Core function for the CUDA code of all data and control nodes. self.codegen_data_control( - if num_blocks > 1 { + if is_block_parallel { Some(thread_root_root_fork) } else { None @@ -304,11 +313,11 @@ impl GPUContext<'_> { &fork_thread_quota_map, &extra_dim_collects, &mut dynamic_shared_offset, - num_blocks, + is_block_parallel, num_threads, &mut gotos, )?; - (num_blocks, num_threads) + (num_blocks, num_threads.to_string()) }; // Emit all GPU kernel code from previous steps @@ -493,7 +502,7 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { + fn codegen_launch_code(&self, num_blocks: String, num_threads: String, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let ret_type = self.get_type(self.function.return_type, false); @@ -559,32 +568,28 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; fn get_root_forks_and_num_blocks( &self, fork_tree: &HashMap<NodeID, HashSet<NodeID>>, - max_num_blocks: usize, - ) -> (HashSet<NodeID>, usize) { + ) -> (HashSet<NodeID>, String, bool) { let root_forks: HashSet<NodeID> = fork_tree.get(&NodeID::new(0)).unwrap().clone(); if root_forks.len() != 1 { - return (root_forks, 1); + return (root_forks, "1".to_string(), false); } let root_fork = root_forks.iter().next().unwrap(); let Node::Fork { factors, .. 
} = &self.function.nodes[root_fork.idx()] else { panic!("Expected fork node"); }; - let fork_size = self.multiply_fork_factors(factors); let reduces = &self.fork_reduce_map[root_fork]; assert!(reduces.iter().all(|reduce| { self.collection_objects.objects(*reduce).iter().all(|object| { self.collection_objects.origin(*object).try_parameter().is_some() }) }), "All collection reduces in block fork must originate from parameters"); - if let Some(fork_size) = fork_size - && fork_size <= max_num_blocks - && fork_size.is_power_of_two() - && self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) + if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { - (root_forks, fork_size) + let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * "); + (root_forks, fork_size, true) } else { - (root_forks, 1) + (root_forks, "1".to_string(), false) } } @@ -597,9 +602,9 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; &self, root_forks: &HashSet<NodeID>, fork_tree: &HashMap<NodeID, HashSet<NodeID>>, - num_blocks: usize, + is_block_parallel: bool, ) -> (NodeID, HashSet<NodeID>) { - if num_blocks > 1 { + if is_block_parallel { let root_fork = root_forks.iter().next().unwrap(); (*root_fork, fork_tree.get(&root_fork).unwrap().iter().copied().collect()) } else { @@ -768,7 +773,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, KernelState::OutBlock, Some(1), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + self.codegen_data_node(*data, KernelState::OutBlock, Some(false), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } Ok(()) }) @@ -784,7 +789,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, extra_dim_collects: &HashSet<TypeID>, dynamic_shared_offset: &mut String, - num_blocks: usize, + is_block_parallel: bool, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -799,7 +804,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_blocks), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(is_block_parallel), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists @@ -911,7 +916,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; &self, id: NodeID, state: KernelState, - num_blocks: Option<usize>, + is_block_parallel: Option<bool>, use_thread_quota: Option<usize>, parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, @@ -1215,7 +1220,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; - if KernelState::OutBlock == state && num_blocks.unwrap() > 1 { + 
if KernelState::OutBlock == state && is_block_parallel.unwrap() { panic!("GPU can't guarantee correctness for multi-block collection writes"); } let cg_tile = match state { diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs index 4cfd2a87..8657fdc1 100644 --- a/hercules_samples/dot/build.rs +++ b/hercules_samples/dot/build.rs @@ -4,8 +4,7 @@ fn main() { JunoCompiler::new() .ir_in_src("dot.hir") .unwrap() - //.schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .schedule_in_src("cpu.sch") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs index f895af86..735458c0 100644 --- a/hercules_samples/matmul/build.rs +++ b/hercules_samples/matmul/build.rs @@ -4,8 +4,7 @@ fn main() { JunoCompiler::new() .ir_in_src("matmul.hir") .unwrap() - //.schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .schedule_in_src("cpu.sch") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_frontend/Cargo.toml b/juno_frontend/Cargo.toml index b6d9a71d..648daf5f 100644 --- a/juno_frontend/Cargo.toml +++ b/juno_frontend/Cargo.toml @@ -5,7 +5,7 @@ authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" [features] -cuda = ["hercules_opt/cuda"] +cuda = ["hercules_opt/cuda", "juno_scheduler/cuda"] default = [] [[bin]] diff --git a/juno_samples/schedule_test/Cargo.toml b/juno_samples/schedule_test/Cargo.toml index be5d949b..c783217a 100644 --- a/juno_samples/schedule_test/Cargo.toml +++ b/juno_samples/schedule_test/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_schedule_test" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/schedule_test/build.rs b/juno_samples/schedule_test/build.rs index 4a428247..749a660c 100644 --- a/juno_samples/schedule_test/build.rs +++ b/juno_samples/schedule_test/build.rs @@ -4,7 +4,7 @@ fn main() { JunoCompiler::new() .file_in_src("code.jn") .unwrap() - .schedule_in_src("sched.sch") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/schedule_test/src/sched.sch b/juno_samples/schedule_test/src/cpu.sch similarity index 100% rename from juno_samples/schedule_test/src/sched.sch rename to juno_samples/schedule_test/src/cpu.sch diff --git a/juno_samples/schedule_test/src/gpu.sch b/juno_samples/schedule_test/src/gpu.sch new file mode 100644 index 00000000..edca678e --- /dev/null +++ b/juno_samples/schedule_test/src/gpu.sch @@ -0,0 +1,47 @@ +macro juno-setup!(X) { + //gvn(X); + phi-elim(X); + dce(X); + lift-dc-math(X); +} +macro codegen-prep!(X) { + infer-schedules(X); + dce(X); + gcm(X); + dce(X); + phi-elim(X); + float-collections(X); + gcm(X); +} + + +juno-setup!(*); + +let first = outline(test@outer); +let second = outline(test@row); + +// We can use the functions produced by outlining in our schedules +gvn(first, second, test); + +ip-sroa(*); +sroa(*); + +// We can evaluate expressions using labels and save them for later use +let inner = first@inner; + +// A fixpoint can run a (series) of passes until no more changes are made +// (though some passes seem to make edits even if there are no real changes, +// so this is fragile). 
+// We could just let it run until it converges but can also tell it to panic +// if it hasn't converged after a number of iterations (like here) tell it to +// just stop after a certain number of iterations (stop after #) or to print +// the iteration number (print iter) +fixpoint panic after 2 { + phi-elim(*); +} + +host(*); +gpu(first, second); + +codegen-prep!(*); +//xdot[true](*); diff --git a/juno_scheduler/Cargo.toml b/juno_scheduler/Cargo.toml index 1c837d4a..3d81ea96 100644 --- a/juno_scheduler/Cargo.toml +++ b/juno_scheduler/Cargo.toml @@ -4,6 +4,9 @@ version = "0.0.1" authors = ["Aaron Councilman <aaronjc4@illinois.edu>"] edition = "2021" +[features] +cuda = [] + [build-dependencies] cfgrammar = "0.13" lrlex = "0.13" -- GitLab From 90af973c972a82995ad3c51961e5777c079485d5 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sat, 25 Jan 2025 17:04:17 +0000 Subject: [PATCH 094/109] boxdistract --- hercules_cg/src/gpu.rs | 6 ++- juno_scheduler/src/pm.rs | 112 +++++++++++++++++++++++++++++++++++---- 2 files changed, 105 insertions(+), 13 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a6711a33..074281f8 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -534,6 +534,8 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(pass_args, "p{}", idx)?; } write!(w, ") {{\n")?; + // For case of dynamic block count + self.codegen_dynamic_constants(w)?; let has_ret_var = self.return_parameter.is_none(); if has_ret_var { // Allocate return parameter and lift to kernel argument @@ -584,7 +586,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; self.collection_objects.origin(*object).try_parameter().is_some() }) }), "All collection reduces in block fork must originate from parameters"); - if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) + if true || self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * "); (root_forks, fork_size, true) @@ -973,7 +975,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; reduct: _, } => { let init_val = self.get_value(*init, false, false); - if parallel_factor.is_none() { + if parallel_factor.is_none() && KernelState::InThread == state { let Some(nesting_fork) = nesting_fork else { panic!("Expected reduce to be nested in a fork node"); }; diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 452c1995..d01fef73 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -183,6 +183,8 @@ struct PassManager { pub postdoms: Option<Vec<DomTree>>, pub fork_join_maps: Option<Vec<HashMap<NodeID, NodeID>>>, pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>, + pub fork_control_maps: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, + pub fork_trees: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub loops: Option<Vec<LoopTree>>, pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, @@ -214,6 +216,8 @@ impl PassManager { postdoms: None, fork_join_maps: None, fork_join_nests: None, + fork_control_maps: None, + fork_trees: None, loops: None, reduce_cycles: None, data_nodes_in_fork_joins: None, @@ -332,6 +336,29 @@ impl PassManager { } } + pub fn make_fork_control_maps(&mut self) { + if self.fork_control_maps.is_none() { + self.make_fork_join_nests(); + self.fork_control_maps = Some( + 
self.fork_join_nests.as_ref().unwrap().iter().map(fork_control_map).collect(), + ); + } + } + + pub fn make_fork_trees(&mut self) { + if self.fork_trees.is_none() { + self.make_fork_join_nests(); + self.fork_trees = Some( + zip( + self.functions.iter(), + self.fork_join_nests.as_ref().unwrap().iter(), + ) + .map(|(function, fork_join_nesting)| fork_tree(function, fork_join_nesting)) + .collect(), + ); + } + } + pub fn make_loops(&mut self) { if self.loops.is_none() { self.make_control_subgraphs(); @@ -464,6 +491,10 @@ impl PassManager { self.make_control_subgraphs(); self.make_collection_objects(); self.make_callgraph(); + self.make_def_uses(); + self.make_fork_join_maps(); + self.make_fork_control_maps(); + self.make_fork_trees(); let PassManager { functions, @@ -476,6 +507,10 @@ impl PassManager { bbs: Some(bbs), collection_objects: Some(collection_objects), callgraph: Some(callgraph), + def_uses: Some(def_uses), + fork_join_maps: Some(fork_join_maps), + fork_control_maps: Some(fork_control_maps), + fork_trees: Some(fork_trees), .. } = self else { @@ -497,6 +532,7 @@ impl PassManager { let mut rust_rt = String::new(); let mut llvm_ir = String::new(); + let mut cuda_ir = String::new(); for idx in 0..module.functions.len() { match devices[idx] { Device::LLVM => cpu_codegen( @@ -513,6 +549,25 @@ impl PassManager { pass: "cpu codegen".to_string(), error: format!("{}", e), })?, + Device::CUDA => gpu_codegen( + &module.functions[idx], + &module.types, + &module.constants, + &module.dynamic_constants, + &typing[idx], + &control_subgraphs[idx], + &bbs[idx], + &collection_objects[&FunctionID::new(idx)], + &def_uses[idx], + &fork_join_maps[idx], + &fork_control_maps[idx], + &fork_trees[idx], + &mut cuda_ir, + ) + .map_err(|e| SchedulerError::PassError { + pass: "cuda codegen".to_string(), + error: format!("{}", e), + })?, Device::AsyncRust => rt_codegen( FunctionID::new(idx), &module, @@ -528,41 +583,76 @@ impl PassManager { pass: "rust codegen".to_string(), error: format!("{}", e), })?, - _ => todo!(), } } println!("{}", llvm_ir); + println!("{}", cuda_ir); println!("{}", rust_rt); + let output_archive = format!("{}/lib{}.a", output_dir, module_name); + println!("{}", output_archive); + // Write the LLVM IR into a temporary file. let tmp_dir = TempDir::new().unwrap(); - let mut tmp_path = tmp_dir.path().to_path_buf(); - tmp_path.push(format!("{}.ll", module_name)); - println!("{}", tmp_path.display()); - let mut file = File::create(&tmp_path).expect("PANIC: Unable to open output LLVM IR file."); + let mut llvm_path = tmp_dir.path().to_path_buf(); + llvm_path.push(format!("{}.ll", module_name)); + println!("{}", llvm_path.display()); + let mut file = File::create(&llvm_path) + .expect("PANIC: Unable to open output LLVM IR file."); file.write_all(llvm_ir.as_bytes()) .expect("PANIC: Unable to write output LLVM IR file contents."); // Compile LLVM IR into an ELF object file. - let output_archive = format!("{}/lib{}.a", output_dir, module_name); + let llvm_object = format!("{}/{}_cpu.o", tmp_dir.path().to_str().unwrap(), module_name); let mut clang_process = Command::new("clang") - .arg(&tmp_path) - .arg("--emit-static-lib") + .arg(&llvm_path) + .arg("-c") .arg("-O3") .arg("-march=native") .arg("-o") - .arg(&output_archive) + .arg(&llvm_object) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .spawn() .expect("Error running clang. 
Is it installed?"); assert!(clang_process.wait().unwrap().success()); + let mut ar_args = vec!["crus", &output_archive, &llvm_object]; + + let cuda_object = format!("{}/{}_cuda.o", tmp_dir.path().to_str().unwrap(), module_name); + if cfg!(feature = "cuda") { + // Write the CUDA IR into a temporary file. + let mut cuda_path = tmp_dir.path().to_path_buf(); + cuda_path.push(format!("{}.cu", module_name)); + let mut file = File::create(&cuda_path) + .expect("PANIC: Unable to open output CUDA IR file."); + file.write_all(cuda_ir.as_bytes()) + .expect("PANIC: Unable to write output CUDA IR file contents."); + + let mut nvcc_process = Command::new("nvcc") + .arg("-c") + .arg("-O3") + .arg("-o") + .arg(&cuda_object) + .arg(&cuda_path) + .spawn() + .expect("Error running nvcc. Is it installed?"); + assert!(nvcc_process.wait().unwrap().success()); + + ar_args.push(&cuda_object); + } + + let mut ar_process = Command::new("ar") + .args(&ar_args) + .spawn() + .expect("Error running ar. Is it installed?"); + assert!(ar_process.wait().unwrap().success()); + // Write the Rust runtime into a file. let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name); println!("{}", output_rt); - let mut file = - File::create(&output_rt).expect("PANIC: Unable to open output Rust runtime file."); + let mut file = File::create(&output_rt) + .expect("PANIC: Unable to open output Rust runtime file."); file.write_all(rust_rt.as_bytes()) .expect("PANIC: Unable to write output Rust runtime file contents."); -- GitLab From 58b7928b8e7cd5d197b06cddc81674dce3bf8468 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sat, 25 Jan 2025 18:22:09 +0000 Subject: [PATCH 095/109] things work --- hercules_cg/src/gpu.rs | 150 +++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 72 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 074281f8..c3cb6634 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -79,6 +79,10 @@ pub fn gpu_codegen<W: Write>( * - Add float8, float16, bfloat16 dtypes if they come */ + // Temporary for matmul (both true) and dot (thread true) test while we don't have schedule annotations + let block_parallel_override = false; + let thread_parallel_override = false; + let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) .filter(|idx| function.nodes[*idx].is_reduce()) .map(NodeID::new) @@ -109,7 +113,6 @@ pub fn gpu_codegen<W: Write>( .entry(fork_node) .or_default() .push(*reduce_node); - println!("reduce_node: {:?}, fork_node: {:?}, join: {:?}", reduce_node, fork_node, control); } Node::Region { preds: _ } => { // TODO: map region node to fork node @@ -118,14 +121,10 @@ pub fn gpu_codegen<W: Write>( panic!("Reduce's control must be a join or region node"); } } - if !function.schedules[reduce_node.idx()].contains(&Schedule::ParallelReduce) - && !function.schedules[reduce_node.idx()].contains(&Schedule::TightAssociative) - { - reduct_reduce_map - .entry(*reduct) - .or_default() - .push(*reduce_node); - } + reduct_reduce_map + .entry(*reduct) + .or_default() + .push(*reduce_node); } } for idx in 0..function.nodes.len() { @@ -186,6 +185,8 @@ pub fn gpu_codegen<W: Write>( control_data_phi_map, return_parameter, kernel_params, + block_parallel_override, + thread_parallel_override, }; ctx.codegen_function(w) } @@ -214,6 +215,8 @@ struct GPUContext<'a> { control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>>, return_parameter: Option<usize>, kernel_params: &'a GPUKernelParams, + block_parallel_override: bool, + 
thread_parallel_override: bool, } /* @@ -298,6 +301,7 @@ impl GPUContext<'_> { let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &self.fork_tree, is_block_parallel); writeln!(file, "thread_root_root_fork: {:?}", thread_root_root_fork).unwrap(); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&self.fork_tree, thread_root_root_fork); + writeln!(file, "fork_thread_quota_map: {:?}", fork_thread_quota_map).unwrap(); // TODO: Uncomment and adjust once we know logic of extra dim // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); let extra_dim_collects = HashSet::new(); @@ -586,7 +590,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; self.collection_objects.origin(*object).try_parameter().is_some() }) }), "All collection reduces in block fork must originate from parameters"); - if true || self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) + if self.block_parallel_override || self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * "); (root_forks, fork_size, true) @@ -693,7 +697,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; && fork_size.is_power_of_two() && reduces.iter().all(|&reduce| { self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) - || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) + || self.thread_parallel_override || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) }) { // If there's an associative Reduce, parallelize the larger factor @@ -706,7 +710,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // restriction doesn't help for parallel Writes, so nested parallelization // is possible. if reduces.iter().any(|&reduce| { - self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) + self.thread_parallel_override || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) }) || fork_size > self.kernel_params.max_num_threads / subtree_quota { if fork_size >= subtree_quota { (HashMap::new(), fork_size, true) @@ -1069,8 +1073,8 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } }, Node::Binary { op, left, right } => { - let left_val = self.get_value(*left, false, false); - let right_val = self.get_value(*right, false, false); + let mut left_val = self.get_value(*left, false, false); + let mut right_val = self.get_value(*right, false, false); let id_type = self.typing[id.idx()]; if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And | BinaryOperator::Xor) && is_special_reduct { @@ -1079,14 +1083,14 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // supported. We need to use CGType::Use not CGType::UsePerId // because for parallelized reduction we only have one thread // per ThreadID and the reduction is over Use, not UsePerId. 
- let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[left.idx()] { - right_val + let (reduce_val, non_reduce_val) = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[left.idx()] { + (left_val, right_val) } else { - left_val + (right_val, left_val) }; // Special reduct is only enabled for thread parallelization // so don't need to worry about grid and block cases - let cg_tile = self.get_cg_tile(id, CGType::Use); + let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use); #[allow(unreachable_patterns)] let cg_op = match op { BinaryOperator::Add => "plus", @@ -1096,57 +1100,60 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; _ => unreachable!(), }; let id_type_name = self.get_type(id_type, false); - write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, cg_tile, non_reduce_arg, cg_op, id_type_name)?; - } else { - match (op, &self.types[id_type.idx()]) { - (BinaryOperator::Or, Type::Boolean) => write!( - w, - "{}{} = {} || {};\n", - tabs, define_variable, left_val, right_val, - )?, - (BinaryOperator::And, Type::Boolean) => write!( - w, - "{}{} = {} && {};\n", - tabs, define_variable, left_val, right_val, - )?, - (BinaryOperator::Rem, Type::Float32) => write!( - w, - "{}{} = fmodf({}, {});\n", - tabs, define_variable, left_val, right_val, - )?, - (BinaryOperator::Rem, Type::Float64) => write!( - w, - "{}{} = fmod({}, {});\n", - tabs, define_variable, left_val, right_val, - )?, - (op, _) => write!( - w, - "{}{} = {} {} {};\n", - tabs, - define_variable, - left_val, - match op { - BinaryOperator::Add => "+", - BinaryOperator::Sub => "-", - BinaryOperator::Mul => "*", - BinaryOperator::Div => "/", - BinaryOperator::Rem => "%", - BinaryOperator::LT => "<", - BinaryOperator::LTE => "<=", - BinaryOperator::GT => ">", - BinaryOperator::GTE => ">=", - BinaryOperator::EQ => "==", - BinaryOperator::NE => "!=", - BinaryOperator::Or => "|", - BinaryOperator::And => "&", - BinaryOperator::Xor => "^", - BinaryOperator::LSh => "<<", - BinaryOperator::RSh => ">>", - }, - right_val, - )?, - }; + write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, cg_tile, non_reduce_val, cg_op, id_type_name)?; + // Setup binop between reduce's init and reduced reduct. 
Since it's associative, + // we can change binop ordering + left_val = define_variable.clone(); + right_val = reduce_val; } + match (op, &self.types[id_type.idx()]) { + (BinaryOperator::Or, Type::Boolean) => write!( + w, + "{}{} = {} || {};\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::And, Type::Boolean) => write!( + w, + "{}{} = {} && {};\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::Rem, Type::Float32) => write!( + w, + "{}{} = fmodf({}, {});\n", + tabs, define_variable, left_val, right_val, + )?, + (BinaryOperator::Rem, Type::Float64) => write!( + w, + "{}{} = fmod({}, {});\n", + tabs, define_variable, left_val, right_val, + )?, + (op, _) => write!( + w, + "{}{} = {} {} {};\n", + tabs, + define_variable, + left_val, + match op { + BinaryOperator::Add => "+", + BinaryOperator::Sub => "-", + BinaryOperator::Mul => "*", + BinaryOperator::Div => "/", + BinaryOperator::Rem => "%", + BinaryOperator::LT => "<", + BinaryOperator::LTE => "<=", + BinaryOperator::GT => ">", + BinaryOperator::GTE => ">=", + BinaryOperator::EQ => "==", + BinaryOperator::NE => "!=", + BinaryOperator::Or => "|", + BinaryOperator::And => "&", + BinaryOperator::Xor => "^", + BinaryOperator::LSh => "<<", + BinaryOperator::RSh => ">>", + }, + right_val, + )?, + }; } Node::Ternary { op, @@ -1175,7 +1182,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } else { self.get_value(args[0], false, false) }; - let cg_tile = self.get_cg_tile(id, CGType::Use); + let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use); #[allow(unreachable_patterns)] let cg_op = match intrinsic { Intrinsic::Max => "max", @@ -1248,8 +1255,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } _ => { - println!("Unsupported data node type: {:?}", self.function.nodes[id.idx()]); - panic!("Unsupported data node type") + panic!("Unsupported data node type: {:?}", self.function.nodes[id.idx()]) } } // Since reducts are responsible for updating Reduce nodes, we check and @@ -1404,8 +1410,8 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(w_term, "\tif (grid.thread_rank() == 0) {{\n")?; write!(w_term, "\t\t*(reinterpret_cast<{}>(ret)) = {};\n", return_type_ptr, return_val)?; write!(w_term, "\t}}\n")?; - write!(w_term, "\treturn;\n")?; } + write!(w_term, "\treturn;\n")?; 1 } _ => { -- GitLab From 5364c8f5384335e86c04d92f51884fe8b182a8af Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 26 Jan 2025 21:31:14 -0600 Subject: [PATCH 096/109] no xdot --- hercules_samples/matmul/src/gpu.sch | 1 - 1 file changed, 1 deletion(-) diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index 9067a190..d17453e3 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -12,4 +12,3 @@ dce(*); float-collections(*); gcm(*); -xdot[true](*); -- GitLab From 694358d3f50e133ee296c8f72688216051e46315 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Tue, 28 Jan 2025 22:13:38 +0000 Subject: [PATCH 097/109] mmhir --- hercules_samples/matmul/src/matmul.hir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hercules_samples/matmul/src/matmul.hir b/hercules_samples/matmul/src/matmul.hir index b0c31da4..f9d37afc 100644 --- a/hercules_samples/matmul/src/matmul.hir +++ b/hercules_samples/matmul/src/matmul.hir @@ -1,9 +1,9 @@ -fn matmul<3>(a: array(i32, #0, #1), b: array(i32, 
#1, #2)) -> array(i32, #0, #2) +fn matmul<3>(a: array(i32, #0, 8), b: array(i32, 8, #2)) -> array(i32, #0, #2) c = constant(array(i32, #0, #2), []) i_j_ctrl = fork(start, #0, #2) i_idx = thread_id(i_j_ctrl, 0) j_idx = thread_id(i_j_ctrl, 1) - k_ctrl = fork(i_j_ctrl, #1) + k_ctrl = fork(i_j_ctrl, 8) k_idx = thread_id(k_ctrl, 0) k_join_ctrl = join(k_ctrl) i_j_join_ctrl = join(k_join_ctrl) -- GitLab From a4ded058cca94c3140b25c5ef5cc63f0c36d4cba Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Wed, 29 Jan 2025 09:29:02 -0600 Subject: [PATCH 098/109] matmul dot chngs --- hercules_cg/src/lib.rs | 3 +++ hercules_rt/src/lib.rs | 26 +++++++++++++++++++++++ hercules_samples/dot/src/cpu.sch | 7 +++++++ hercules_samples/dot/src/gpu.sch | 6 ++++++ hercules_samples/dot/src/main.rs | 32 +++++++++++++++++++++-------- hercules_samples/matmul/src/cpu.sch | 2 ++ hercules_samples/matmul/src/gpu.sch | 2 ++ hercules_samples/matmul/src/main.rs | 27 +++++++++++++++++++----- 8 files changed, 92 insertions(+), 13 deletions(-) diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs index 6910df9e..dab4dbac 100644 --- a/hercules_cg/src/lib.rs +++ b/hercules_cg/src/lib.rs @@ -1,6 +1,7 @@ #![feature(if_let_guard, let_chains)] pub mod cpu; +pub mod gpu; pub mod rt; pub mod fork_tree; @@ -9,6 +10,8 @@ pub use crate::cpu::*; pub use crate::gpu::*; pub use crate::rt::*; +pub use crate::fork_tree::*; + use std::collections::BTreeMap; use hercules_ir::*; diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index db2dee77..a23ab3e9 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -147,6 +147,19 @@ impl<'a> HerculesCPURefMut<'a> { #[cfg(feature = "cuda")] impl<'a> HerculesCUDARef<'a> { + pub fn to_cpu_ref<T>(self, dst: &mut [T]) -> HerculesCPURef<'a> { + unsafe { + let size = self.size; + let ptr = NonNull::new(dst.as_ptr() as *mut u8).unwrap(); + __copy_cuda_to_cpu(ptr.as_ptr(), self.ptr.as_ptr(), size); + HerculesCPURef { + ptr, + size, + _phantom: PhantomData, + } + } + } + pub unsafe fn __ptr(&self) -> *mut u8 { self.ptr.as_ptr() } @@ -174,6 +187,19 @@ impl<'a> HerculesCUDARefMut<'a> { } } + pub fn to_cpu_ref<T>(self, dst: &mut [T]) -> HerculesCPURef<'a> { + unsafe { + let size = self.size; + let ptr = NonNull::new(dst.as_ptr() as *mut u8).unwrap(); + __copy_cuda_to_cpu(ptr.as_ptr(), self.ptr.as_ptr(), size); + HerculesCPURef { + ptr, + size, + _phantom: PhantomData, + } + } + } + pub unsafe fn __ptr(&self) -> *mut u8 { self.ptr.as_ptr() } diff --git a/hercules_samples/dot/src/cpu.sch b/hercules_samples/dot/src/cpu.sch index 58a7266d..4c684da2 100644 --- a/hercules_samples/dot/src/cpu.sch +++ b/hercules_samples/dot/src/cpu.sch @@ -6,7 +6,14 @@ auto-outline(*); ip-sroa(*); sroa(*); +fork-split(*); unforkify(*); dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); gcm(*); diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch index 956eb996..a1a51088 100644 --- a/hercules_samples/dot/src/gpu.sch +++ b/hercules_samples/dot/src/gpu.sch @@ -9,5 +9,11 @@ host(dot); ip-sroa(*); sroa(*); dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); gcm(*); diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs index 335e8909..4e651fa8 100644 --- a/hercules_samples/dot/src/main.rs +++ b/hercules_samples/dot/src/main.rs @@ -1,19 +1,35 @@ #![feature(concat_idents)] use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; 
juno_build::juno!("dot"); fn main() { async_std::task::block_on(async { - let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; - let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURef::from_slice(&b); - let mut r = runner!(dot); - let c = r.run(8, a, b).await; - println!("{}", c); - assert_eq!(c, 70.0); + let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; + let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&a); + let b = HerculesCPURef::from_slice(&b); + let mut r = runner!(dot); + let c = r.run(8, a, b).await; + println!("{}", c); + assert_eq!(c, 70.0); + } + #[cfg(feature = "cuda")] + { + let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); + let a = a_box.get_ref(); + let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); + let b = b_box.get_ref(); + let mut r = runner!(dot); + let c = r.run(8, a, b).await; + println!("{}", c); + assert_eq!(c, 70.0); + } }); } diff --git a/hercules_samples/matmul/src/cpu.sch b/hercules_samples/matmul/src/cpu.sch index f7891b9b..4c684da2 100644 --- a/hercules_samples/matmul/src/cpu.sch +++ b/hercules_samples/matmul/src/cpu.sch @@ -14,4 +14,6 @@ gvn(*); phi-elim(*); dce(*); +infer-schedules(*); + gcm(*); diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index 2bdcc83c..c9d6b336 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -14,4 +14,6 @@ gvn(*); phi-elim(*); dce(*); +infer-schedules(*); + gcm(*); diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs index 8757a0fd..762644f1 100644 --- a/hercules_samples/matmul/src/main.rs +++ b/hercules_samples/matmul/src/main.rs @@ -3,6 +3,8 @@ use rand::random; use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("matmul"); @@ -21,11 +23,26 @@ fn main() { } } } - let a = HerculesCPURef::from_slice(&mut a); - let b = HerculesCPURef::from_slice(&mut b); - let mut r = runner!(matmul); - let c = r.run(I as u64, J as u64, K as u64, a, b).await; - assert_eq!(c.as_slice::<i32>(), &*correct_c); + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&mut a); + let b = HerculesCPURef::from_slice(&mut b); + let mut r = runner!(matmul); + let c = r.run(I as u64, J as u64, K as u64, a, b).await; + assert_eq!(c.as_slice::<i32>(), &*correct_c); + } + #[cfg(feature = "cuda")] + { + let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); + let a = a_box.get_ref(); + let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); + let b = b_box.get_ref(); + let mut r = runner!(matmul); + let c = r.run(I as u64, J as u64, K as u64, a, b).await; + let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice(); + c.to_cpu_ref(&mut c_cpu); + assert_eq!(c_cpu.as_ref(), correct_c.as_ref()); + } }); } -- GitLab From aa1cf4084f932e46213c85e0f663fe0275cc521f Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Wed, 29 Jan 2025 09:37:40 -0600 Subject: [PATCH 099/109] tmp change matmul test --- hercules_samples/matmul/src/main.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs index 762644f1..9421c773 100644 --- a/hercules_samples/matmul/src/main.rs +++ b/hercules_samples/matmul/src/main.rs @@ 
-10,11 +10,13 @@ juno_build::juno!("matmul"); fn main() { async_std::task::block_on(async { - const I: usize = 256; - const J: usize = 64; - const K: usize = 128; - let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); - let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); + const I: usize = 4; + const J: usize = 2; + const K: usize = 8; + // let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); + // let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); + let mut a: Box<[i32]> = (0..I * J).map(|i| (i as i32) % 100).collect(); + let mut b: Box<[i32]> = (0..J * K).map(|i| (i as i32) % 100).collect(); let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect(); for i in 0..I { for k in 0..K { @@ -41,7 +43,7 @@ fn main() { let c = r.run(I as u64, J as u64, K as u64, a, b).await; let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice(); c.to_cpu_ref(&mut c_cpu); - assert_eq!(c_cpu.as_ref(), correct_c.as_ref()); + assert_eq!(c_cpu.as_ref(), &*correct_c); } }); } -- GitLab From d391d16354863a51c1d4ee8e3de4bce44b54402e Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Wed, 29 Jan 2025 16:25:22 +0000 Subject: [PATCH 100/109] mm dot works --- hercules_cg/src/gpu.rs | 29 ++++++----------------------- hercules_samples/matmul/src/main.rs | 12 +++++------- 2 files changed, 11 insertions(+), 30 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index c3cb6634..ce52a20e 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -79,10 +79,6 @@ pub fn gpu_codegen<W: Write>( * - Add float8, float16, bfloat16 dtypes if they come */ - // Temporary for matmul (both true) and dot (thread true) test while we don't have schedule annotations - let block_parallel_override = false; - let thread_parallel_override = false; - let reduce_nodes: Vec<NodeID> = (0..function.nodes.len()) .filter(|idx| function.nodes[*idx].is_reduce()) .map(NodeID::new) @@ -164,8 +160,6 @@ pub fn gpu_codegen<W: Write>( threads_per_warp: 32, }; - std::fs::write("out.txt", "debug\n\n").unwrap(); - let ctx = GPUContext { function, types, @@ -185,8 +179,6 @@ pub fn gpu_codegen<W: Write>( control_data_phi_map, return_parameter, kernel_params, - block_parallel_override, - thread_parallel_override, }; ctx.codegen_function(w) } @@ -215,8 +207,6 @@ struct GPUContext<'a> { control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>>, return_parameter: Option<usize>, kernel_params: &'a GPUKernelParams, - block_parallel_override: bool, - thread_parallel_override: bool, } /* @@ -265,8 +255,6 @@ enum CGType { impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { - let mut file = OpenOptions::new().append(true).open("out.txt").unwrap(); - // Emit all code up to the "goto" to Start's block let mut top = String::new(); self.codegen_kernel_begin(self.return_parameter.is_none(), &mut top)?; @@ -289,19 +277,14 @@ impl GPUContext<'_> { // If there are no forks, fast forward to single-block, single-thread codegen let (num_blocks, num_threads) = if self.fork_join_map.is_empty() { - writeln!(file, "shortcut to 1b1t").unwrap(); self.codegen_data_control_no_forks(&HashSet::new(), &mut dynamic_shared_offset, &mut gotos)?; ("1".to_string(), "1".to_string()) } else { - writeln!(file, "no shortcut! 
fork tree: {:?}", self.fork_tree).unwrap(); // Create structures and determine block and thread parallelization strategy let (root_forks, num_blocks, is_block_parallel) = - self.get_root_forks_and_num_blocks(&self.fork_tree); - writeln!(file, "is_block_parallel: {}", is_block_parallel).unwrap(); - let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, &self.fork_tree, is_block_parallel); - writeln!(file, "thread_root_root_fork: {:?}", thread_root_root_fork).unwrap(); - let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(&self.fork_tree, thread_root_root_fork); - writeln!(file, "fork_thread_quota_map: {:?}", fork_thread_quota_map).unwrap(); + self.get_root_forks_and_num_blocks(self.fork_tree); + let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, self.fork_tree, is_block_parallel); + let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(self.fork_tree, thread_root_root_fork); // TODO: Uncomment and adjust once we know logic of extra dim // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); let extra_dim_collects = HashSet::new(); @@ -590,7 +573,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; self.collection_objects.origin(*object).try_parameter().is_some() }) }), "All collection reduces in block fork must originate from parameters"); - if self.block_parallel_override || self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) + if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * "); (root_forks, fork_size, true) @@ -697,7 +680,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; && fork_size.is_power_of_two() && reduces.iter().all(|&reduce| { self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce) - || self.thread_parallel_override || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) + || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) }) { // If there's an associative Reduce, parallelize the larger factor @@ -710,7 +693,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // restriction doesn't help for parallel Writes, so nested parallelization // is possible. 
if reduces.iter().any(|&reduce| { - self.thread_parallel_override || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) + self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative) }) || fork_size > self.kernel_params.max_num_threads / subtree_quota { if fork_size >= subtree_quota { (HashMap::new(), fork_size, true) diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs index 9421c773..7b6cfe79 100644 --- a/hercules_samples/matmul/src/main.rs +++ b/hercules_samples/matmul/src/main.rs @@ -10,13 +10,11 @@ juno_build::juno!("matmul"); fn main() { async_std::task::block_on(async { - const I: usize = 4; - const J: usize = 2; - const K: usize = 8; - // let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); - // let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); - let mut a: Box<[i32]> = (0..I * J).map(|i| (i as i32) % 100).collect(); - let mut b: Box<[i32]> = (0..J * K).map(|i| (i as i32) % 100).collect(); + const I: usize = 256; + const J: usize = 8; // hardcoded constant in matmul.hir + const K: usize = 128; + let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); + let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect(); for i in 0..I { for k in 0..K { -- GitLab From d4a8a9488da6f6d49e8a098f26e38a2a28e38e93 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Wed, 29 Jan 2025 21:58:34 -0600 Subject: [PATCH 101/109] not fixed yet but switching machines --- hercules_samples/call/build.rs | 2 + hercules_samples/call/src/cpu.sch | 19 +++++++++ hercules_samples/call/src/gpu.sch | 18 ++++++++ hercules_samples/ccp/build.rs | 2 + hercules_samples/ccp/src/cpu.sch | 19 +++++++++ hercules_samples/ccp/src/gpu.sch | 18 ++++++++ hercules_samples/dot/src/gpu.sch | 5 +-- hercules_samples/fac/build.rs | 2 + hercules_samples/fac/src/cpu.sch | 19 +++++++++ hercules_samples/fac/src/gpu.sch | 18 ++++++++ hercules_samples/matmul/src/gpu.sch | 5 +-- hercules_samples/matmul/src/main.rs | 10 ++--- juno_samples/antideps/build.rs | 9 ++++ juno_samples/antideps/src/gpu.sch | 18 ++++++++ juno_samples/casts_and_intrinsics/build.rs | 9 ++++ juno_samples/casts_and_intrinsics/src/gpu.sch | 18 ++++++++ juno_samples/cava/build.rs | 9 ++++ juno_samples/cava/src/gpu.sch | 18 ++++++++ juno_samples/concat/build.rs | 2 + juno_samples/concat/src/concat.jn | 10 +---- juno_samples/concat/src/cpu.sch | 17 ++++++++ juno_samples/concat/src/gpu.sch | 17 ++++++++ juno_samples/concat/src/main.rs | 26 ++++++++++-- juno_samples/cpu.sch | 19 +++++++++ juno_samples/gpu.sch | 18 ++++++++ juno_samples/implicit_clone/build.rs | 9 ++++ juno_samples/implicit_clone/src/gpu.sch | 18 ++++++++ juno_samples/matmul/build.rs | 9 ++++ juno_samples/matmul/src/gpu.sch | 18 ++++++++ juno_samples/matmul/src/main.rs | 41 ++++++++++++++----- juno_samples/nested_ccp/build.rs | 9 ++++ juno_samples/nested_ccp/src/gpu.sch | 18 ++++++++ juno_samples/nested_ccp/src/main.rs | 35 +++++++++++----- juno_samples/schedule_test/src/main.rs | 28 ++++++++++--- juno_samples/simple3/build.rs | 9 ++++ juno_samples/simple3/src/gpu.sch | 18 ++++++++ juno_samples/simple3/src/main.rs | 24 ++++++++--- 37 files changed, 505 insertions(+), 58 deletions(-) create mode 100644 hercules_samples/call/src/cpu.sch create mode 100644 hercules_samples/call/src/gpu.sch create mode 100644 hercules_samples/ccp/src/cpu.sch create mode 100644 
hercules_samples/ccp/src/gpu.sch create mode 100644 hercules_samples/fac/src/cpu.sch create mode 100644 hercules_samples/fac/src/gpu.sch create mode 100644 juno_samples/antideps/src/gpu.sch create mode 100644 juno_samples/casts_and_intrinsics/src/gpu.sch create mode 100644 juno_samples/cava/src/gpu.sch create mode 100644 juno_samples/concat/src/cpu.sch create mode 100644 juno_samples/concat/src/gpu.sch create mode 100644 juno_samples/cpu.sch create mode 100644 juno_samples/gpu.sch create mode 100644 juno_samples/implicit_clone/src/gpu.sch create mode 100644 juno_samples/matmul/src/gpu.sch create mode 100644 juno_samples/nested_ccp/src/gpu.sch create mode 100644 juno_samples/simple3/src/gpu.sch diff --git a/hercules_samples/call/build.rs b/hercules_samples/call/build.rs index af48fe64..7f5816ce 100644 --- a/hercules_samples/call/build.rs +++ b/hercules_samples/call/build.rs @@ -4,6 +4,8 @@ fn main() { JunoCompiler::new() .ir_in_src("call.hir") .unwrap() + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) + .unwrap() .build() .unwrap(); } diff --git a/hercules_samples/call/src/cpu.sch b/hercules_samples/call/src/cpu.sch new file mode 100644 index 00000000..4c684da2 --- /dev/null +++ b/hercules_samples/call/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +fork-split(*); +unforkify(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/hercules_samples/call/src/gpu.sch b/hercules_samples/call/src/gpu.sch new file mode 100644 index 00000000..1e654e22 --- /dev/null +++ b/hercules_samples/call/src/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.add); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs index f04d48c7..c98d0551 100644 --- a/hercules_samples/ccp/build.rs +++ b/hercules_samples/ccp/build.rs @@ -4,6 +4,8 @@ fn main() { JunoCompiler::new() .ir_in_src("ccp.hir") .unwrap() + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) + .unwrap() .build() .unwrap(); } diff --git a/hercules_samples/ccp/src/cpu.sch b/hercules_samples/ccp/src/cpu.sch new file mode 100644 index 00000000..4c684da2 --- /dev/null +++ b/hercules_samples/ccp/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +fork-split(*); +unforkify(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/hercules_samples/ccp/src/gpu.sch b/hercules_samples/ccp/src/gpu.sch new file mode 100644 index 00000000..d8f6a2d0 --- /dev/null +++ b/hercules_samples/ccp/src/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.tricky); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch index a1a51088..4adbf530 100644 --- a/hercules_samples/dot/src/gpu.sch +++ b/hercules_samples/dot/src/gpu.sch @@ -2,9 +2,8 @@ gvn(*); phi-elim(*); dce(*); -auto-outline(*); -gpu(*); -host(dot); +let out = auto-outline(*); +gpu(out.dot); ip-sroa(*); sroa(*); diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs index 4d8226f1..1986a746 100644 --- a/hercules_samples/fac/build.rs +++ 
b/hercules_samples/fac/build.rs @@ -4,6 +4,8 @@ fn main() { JunoCompiler::new() .ir_in_src("fac.hir") .unwrap() + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) + .unwrap() .build() .unwrap(); } diff --git a/hercules_samples/fac/src/cpu.sch b/hercules_samples/fac/src/cpu.sch new file mode 100644 index 00000000..4c684da2 --- /dev/null +++ b/hercules_samples/fac/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +fork-split(*); +unforkify(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/hercules_samples/fac/src/gpu.sch b/hercules_samples/fac/src/gpu.sch new file mode 100644 index 00000000..1885854c --- /dev/null +++ b/hercules_samples/fac/src/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.fac); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index c9d6b336..9a714789 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -2,9 +2,8 @@ gvn(*); phi-elim(*); dce(*); -auto-outline(*); -gpu(*); -host(matmul); +let out = auto-outline(*); +gpu(out.matmul); ip-sroa(*); sroa(*); diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs index 7b6cfe79..abd25ec9 100644 --- a/hercules_samples/matmul/src/main.rs +++ b/hercules_samples/matmul/src/main.rs @@ -33,15 +33,13 @@ fn main() { } #[cfg(feature = "cuda")] { - let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); - let a = a_box.get_ref(); - let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); - let b = b_box.get_ref(); + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); let mut r = runner!(matmul); - let c = r.run(I as u64, J as u64, K as u64, a, b).await; + let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await; let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice(); c.to_cpu_ref(&mut c_cpu); - assert_eq!(c_cpu.as_ref(), &*correct_c); + assert_eq!(&*c_cpu, &*correct_c); } }); } diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs index 7ed716a4..92b30c43 100644 --- a/juno_samples/antideps/build.rs +++ b/juno_samples/antideps/build.rs @@ -1,6 +1,15 @@ use juno_build::JunoCompiler; fn main() { + #[cfg(feature = "cuda")] + JunoCompiler::new() + .file_in_src("antideps.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + #[cfg(not(feature = "cuda"))] JunoCompiler::new() .file_in_src("antideps.jn") .unwrap() diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch new file mode 100644 index 00000000..d3f4a6c2 --- /dev/null +++ b/juno_samples/antideps/src/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.simple_antideps, out.loop_antideps, out.complex_antideps1, out.complex_antideps2, out.very_complex_antideps, out.read_chains, out.array_of_structs); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs index 16d5c7a4..e43a2ac8 100644 --- a/juno_samples/casts_and_intrinsics/build.rs +++ 
b/juno_samples/casts_and_intrinsics/build.rs @@ -1,6 +1,15 @@ use juno_build::JunoCompiler; fn main() { + #[cfg(feature = "cuda")] + JunoCompiler::new() + .file_in_src("casts_and_intrinsics.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + #[cfg(not(feature = "cuda"))] JunoCompiler::new() .file_in_src("casts_and_intrinsics.jn") .unwrap() diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch new file mode 100644 index 00000000..b2fb3449 --- /dev/null +++ b/juno_samples/casts_and_intrinsics/src/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.casts_and_intrinsics); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/cava/build.rs b/juno_samples/cava/build.rs index 929d3eba..03d54160 100644 --- a/juno_samples/cava/build.rs +++ b/juno_samples/cava/build.rs @@ -2,6 +2,15 @@ extern crate juno_build; use juno_build::JunoCompiler; fn main() { + #[cfg(feature = "cuda")] + JunoCompiler::new() + .file_in_src("cava.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + #[cfg(not(feature = "cuda"))] JunoCompiler::new() .file_in_src("cava.jn") .unwrap() diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch new file mode 100644 index 00000000..07f71c99 --- /dev/null +++ b/juno_samples/cava/src/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +gpu(scale, demosaic, medianMatrix, transform, gamut, tone_map, descale); + +ip-sroa(*); +sroa(*); + +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/concat/build.rs b/juno_samples/concat/build.rs index f7784b99..c91df94e 100644 --- a/juno_samples/concat/build.rs +++ b/juno_samples/concat/build.rs @@ -4,6 +4,8 @@ fn main() { JunoCompiler::new() .file_in_src("concat.jn") .unwrap() + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) + .unwrap() .build() .unwrap(); } diff --git a/juno_samples/concat/src/concat.jn b/juno_samples/concat/src/concat.jn index 2471671e..b9806c93 100644 --- a/juno_samples/concat/src/concat.jn +++ b/juno_samples/concat/src/concat.jn @@ -18,15 +18,7 @@ fn sum<t : number, c : usize>(arr : t[c]) -> t { } #[entry] -fn concat_entry(a : i32) -> i32 { - let arr1 : i32[3]; - let arr2 : i32[6]; - arr1[0] = a; - arr1[1] = a; - arr2[0] = a; - arr2[1] = a; - arr2[4] = a; - arr2[5] = a; +fn concat_entry(arr1 : i32[3], arr2 : i32[6]) -> i32 { let arr3 = concat::<i32, 3, 6>(arr1, arr2); return sum::<i32, 9>(arr3); } diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch new file mode 100644 index 00000000..680adaeb --- /dev/null +++ b/juno_samples/concat/src/cpu.sch @@ -0,0 +1,17 @@ +gvn(*); +phi-elim(*); +dce(*); + +cpu(concat, sum); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch new file mode 100644 index 00000000..8ee4ef0e --- /dev/null +++ b/juno_samples/concat/src/gpu.sch @@ -0,0 +1,17 @@ +gvn(*); +phi-elim(*); +dce(*); + +gpu(concat, sum); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs index db3f37fd..d0929fbf 100644 --- 
a/juno_samples/concat/src/main.rs +++ b/juno_samples/concat/src/main.rs @@ -1,15 +1,35 @@ #![feature(concat_idents)] use hercules_rt::runner; +use hercules_rt::HerculesCPURef; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("concat"); fn main() { async_std::task::block_on(async { let mut r = runner!(concat_entry); - let output = r.run(7).await; - println!("{}", output); - assert_eq!(output, 42); + #[cfg(not(feature = "cuda"))] + { + let mut a_data = [7, 7, 0]; + let a = HerculesCPURef::from_slice(&mut a_data); + let mut b_data = [7, 7, 0, 0, 7, 7]; + let b = HerculesCPURef::from_slice(&mut b_data); + let output = r.run(a, b).await; + println!("{}", output); + assert_eq!(output, 42); + } + #[cfg(feature = "cuda")] + { + let mut a_data = [7, 7, 0]; + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data)); + let mut b_data = [7, 7, 0, 0, 7, 7]; + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data)); + let output = r.run(a.get_ref(), b.get_ref()).await; + println!("{}", output); + assert_eq!(output, 42); + } }); } diff --git a/juno_samples/cpu.sch b/juno_samples/cpu.sch new file mode 100644 index 00000000..4c684da2 --- /dev/null +++ b/juno_samples/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +fork-split(*); +unforkify(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/gpu.sch b/juno_samples/gpu.sch new file mode 100644 index 00000000..9a714789 --- /dev/null +++ b/juno_samples/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.matmul); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs index 75c1afc4..dc134e59 100644 --- a/juno_samples/implicit_clone/build.rs +++ b/juno_samples/implicit_clone/build.rs @@ -1,6 +1,15 @@ use juno_build::JunoCompiler; fn main() { + #[cfg(feature = "cuda")] + JunoCompiler::new() + .file_in_src("implicit_clone.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + #[cfg(not(feature = "cuda"))] JunoCompiler::new() .file_in_src("implicit_clone.jn") .unwrap() diff --git a/juno_samples/implicit_clone/src/gpu.sch b/juno_samples/implicit_clone/src/gpu.sch new file mode 100644 index 00000000..443fc778 --- /dev/null +++ b/juno_samples/implicit_clone/src/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.simple_implicit_clone, out.loop_implicit_clone, out.double_loop_implicit_clone, out.tricky_loop_implicit_clone, out.tricky2_loop_implicit_clone, out.tricky3_loop_implicit_clone, out.no_implicit_clone, out.mirage_implicit_clone); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index 926fbc33..ff3e3d8c 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -1,6 +1,15 @@ use juno_build::JunoCompiler; fn main() { + #[cfg(feature = "cuda")] + JunoCompiler::new() + .file_in_src("matmul.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + #[cfg(not(feature = "cuda"))] JunoCompiler::new() .file_in_src("matmul.jn") .unwrap() diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch new file mode 
100644 index 00000000..e85dafdf --- /dev/null +++ b/juno_samples/matmul/src/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.matmul, out.tiled_64_matmul); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs index fa5d1f04..50fe1760 100644 --- a/juno_samples/matmul/src/main.rs +++ b/juno_samples/matmul/src/main.rs @@ -3,6 +3,8 @@ use rand::random; use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("matmul"); @@ -11,8 +13,8 @@ fn main() { const I: usize = 256; const J: usize = 64; const K: usize = 128; - let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); - let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); + let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); + let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect(); for i in 0..I { for k in 0..K { @@ -21,14 +23,32 @@ fn main() { } } } - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURef::from_slice(&b); - let mut r = runner!(matmul); - let c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await; - assert_eq!(c.as_slice::<i32>(), &*correct_c); - let mut r = runner!(tiled_64_matmul); - let tiled_c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await; - assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c); + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&a); + let b = HerculesCPURef::from_slice(&b); + let mut r = runner!(matmul); + let c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await; + assert_eq!(c.as_slice::<i32>(), &*correct_c); + let mut r = runner!(tiled_64_matmul); + let tiled_c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await; + assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c); + } + #[cfg(feature = "cuda")] + { + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); + let mut r = runner!(matmul); + let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await; + let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice(); + c.to_cpu_ref(&mut c_cpu); + assert_eq!(&*c_cpu, &*correct_c); + let mut r = runner!(tiled_64_matmul); + let tiled_c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await; + let mut tiled_c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice(); + tiled_c.to_cpu_ref(&mut tiled_c_cpu); + assert_eq!(&*tiled_c_cpu, &*correct_c); + } }); } @@ -36,4 +56,3 @@ fn main() { fn matmul_test() { main(); } - diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs index c5c7ca6a..2352ddef 100644 --- a/juno_samples/nested_ccp/build.rs +++ b/juno_samples/nested_ccp/build.rs @@ -1,6 +1,15 @@ use juno_build::JunoCompiler; fn main() { + #[cfg(feature = "cuda")] + JunoCompiler::new() + .file_in_src("nested_ccp.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + #[cfg(not(feature = "cuda"))] JunoCompiler::new() .file_in_src("nested_ccp.jn") .unwrap() diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch new file mode 100644 index 00000000..021a05e3 --- /dev/null +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -0,0 
+1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.ccp_example, out.median_array, out.no_underflow); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs index 423b66fb..412d56a4 100644 --- a/juno_samples/nested_ccp/src/main.rs +++ b/juno_samples/nested_ccp/src/main.rs @@ -1,6 +1,8 @@ #![feature(concat_idents)] use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("nested_ccp"); @@ -8,19 +10,30 @@ fn main() { async_std::task::block_on(async { let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]); - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURefMut::from_slice(&mut b); - let mut r = runner!(ccp_example); - let output_example = r.run(a).await; - let mut r = runner!(median_array); - let output_median = r.run(9, b).await; + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&a); + let b = HerculesCPURefMut::from_slice(&mut b); + let mut r = runner!(ccp_example); + let output_example = r.run(a).await; + let mut r = runner!(median_array); + let output_median = r.run(9, b).await; + assert_eq!(output_example, 1.0); + assert_eq!(output_median, 18); + } + #[cfg(feature = "cuda")] + { + let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); + let mut r = runner!(ccp_example); + let output_example = r.run(a.get_ref_mut()).await; + let mut r = runner!(median_array); + let output_median = r.run(9, b.get_ref_mut()).await; + assert_eq!(output_example, 1.0); + assert_eq!(output_median, 18); + } let mut r = runner!(no_underflow); let out_no_underflow = r.run().await; - println!("{}", output_example); - println!("{}", output_median); - println!("{}", out_no_underflow); - assert_eq!(output_example, 1.0); - assert_eq!(output_median, 18); assert_eq!(out_no_underflow, 7); }); } diff --git a/juno_samples/schedule_test/src/main.rs b/juno_samples/schedule_test/src/main.rs index 2e63babf..1505d4e5 100644 --- a/juno_samples/schedule_test/src/main.rs +++ b/juno_samples/schedule_test/src/main.rs @@ -3,6 +3,8 @@ use rand::random; use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("code"); @@ -26,12 +28,26 @@ fn main() { } } - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURef::from_slice(&b); - let c = HerculesCPURef::from_slice(&c); - let mut r = runner!(test); - let res = r.run(N as u64, M as u64, K as u64, a, b, c).await; - assert_eq!(res.as_slice::<i32>(), &*correct_res); + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&a); + let b = HerculesCPURef::from_slice(&b); + let c = HerculesCPURef::from_slice(&c); + let mut r = runner!(test); + let res = r.run(N as u64, M as u64, K as u64, a, b, c).await; + assert_eq!(res.as_slice::<i32>(), &*correct_res); + } + #[cfg(feature = "cuda")] + { + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b)); + let c = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&c)); + let mut r = runner!(test); + let res = r.run(N as u64, M as u64, K as u64, a.get_ref(), b.get_ref(), c.get_ref()).await; + let mut res_cpu: Box<[i32]> = vec![0; 
correct_res.len()].into_boxed_slice(); + res.to_cpu_ref(&mut res_cpu); + assert_eq!(&*res_cpu, &*correct_res); + } }); } diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs index 94760025..a0874af7 100644 --- a/juno_samples/simple3/build.rs +++ b/juno_samples/simple3/build.rs @@ -1,6 +1,15 @@ use juno_build::JunoCompiler; fn main() { + #[cfg(feature = "cuda")] + JunoCompiler::new() + .file_in_src("simple3.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + #[cfg(not(feature = "cuda"))] JunoCompiler::new() .file_in_src("simple3.jn") .unwrap() diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch new file mode 100644 index 00000000..e97627d4 --- /dev/null +++ b/juno_samples/simple3/src/gpu.sch @@ -0,0 +1,18 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +gpu(out.simple3); + +ip-sroa(*); +sroa(*); +dce(*); +float-collections(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); diff --git a/juno_samples/simple3/src/main.rs b/juno_samples/simple3/src/main.rs index 4f9fe6a7..8eb78f7c 100644 --- a/juno_samples/simple3/src/main.rs +++ b/juno_samples/simple3/src/main.rs @@ -1,6 +1,8 @@ #![feature(concat_idents)] use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("simple3"); @@ -8,12 +10,22 @@ fn main() { async_std::task::block_on(async { let a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]); let b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]); - let a = HerculesCPURef::from_slice(&a); - let b = HerculesCPURef::from_slice(&b); - let mut r = runner!(simple3); - let c = r.run(8, a, b).await; - println!("{}", c); - assert_eq!(c, 120); + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&a); + let b = HerculesCPURef::from_slice(&b); + let mut r = runner!(simple3); + let c = r.run(8, a, b).await; + assert_eq!(c, 120); + } + #[cfg(feature = "cuda")] + { + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b)); + let mut r = runner!(simple3); + let c = r.run(8, a.get_ref(), b.get_ref()).await; + assert_eq!(c, 120); + } }); } -- GitLab From 948fe3b976233754ff4eecb6aa7b7b1fcdaf33b5 Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Thu, 30 Jan 2025 19:46:06 -0600 Subject: [PATCH 102/109] before get exposed by forkify --- hercules_cg/src/gpu.rs | 9 +- hercules_cg/src/rt.rs | 2 +- hercules_samples/dot/src/main.rs | 6 +- juno_samples/antideps/build.rs | 9 +- juno_samples/antideps/src/cpu.sch | 20 +++++ juno_samples/antideps/src/gpu.sch | 5 +- juno_samples/casts_and_intrinsics/build.rs | 9 +- juno_samples/casts_and_intrinsics/src/cpu.sch | 20 +++++ juno_samples/casts_and_intrinsics/src/gpu.sch | 5 +- juno_samples/cava/src/cava.jn | 20 ++++- juno_samples/cava/src/gpu.sch | 10 ++- juno_samples/cava/src/main.rs | 86 +++++++++++++------ juno_samples/concat/src/concat.jn | 6 +- juno_samples/concat/src/cpu.sch | 8 +- juno_samples/concat/src/gpu.sch | 9 +- juno_samples/concat/src/main.rs | 10 +-- juno_samples/implicit_clone/build.rs | 9 +- juno_samples/implicit_clone/src/cpu.sch | 19 ++++ juno_samples/implicit_clone/src/gpu.sch | 4 +- juno_samples/matmul/build.rs | 9 +- juno_samples/matmul/src/cpu.sch | 21 +++++ juno_samples/matmul/src/gpu.sch | 5 +- juno_samples/nested_ccp/build.rs | 9 +- juno_samples/nested_ccp/src/cpu.sch | 19 ++++ juno_samples/nested_ccp/src/gpu.sch | 4 +- juno_samples/nested_ccp/src/main.rs | 6 +- 
juno_samples/simple3/build.rs | 9 +- juno_samples/simple3/src/cpu.sch | 19 ++++ juno_samples/simple3/src/gpu.sch | 4 +- 29 files changed, 264 insertions(+), 107 deletions(-) create mode 100644 juno_samples/antideps/src/cpu.sch create mode 100644 juno_samples/casts_and_intrinsics/src/cpu.sch create mode 100644 juno_samples/implicit_clone/src/cpu.sch create mode 100644 juno_samples/matmul/src/cpu.sch create mode 100644 juno_samples/nested_ccp/src/cpu.sch create mode 100644 juno_samples/simple3/src/cpu.sch diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index ce52a20e..a266deea 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1280,14 +1280,17 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let mut succs = self.control_subgraph.succs(id); let succ1 = succs.next().unwrap(); let succ2 = succs.next().unwrap(); + let succ1_is_true = self.function.nodes[succ1.idx()].try_projection(1).is_some(); + let succ1_block_name = self.get_block_name(succ1, false); + let succ2_block_name = self.get_block_name(succ2, false); write!( w_term, "\tif ({}) {{\n", self.get_value(*cond, false, false) )?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, false))?; + write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ1_block_name.clone() } else { succ2_block_name.clone() })?; write!(w_term, "\t}} else {{\n")?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, false))?; + write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ2_block_name } else { succ1_block_name })?; write!(w_term, "\t}}\n")?; 1 } @@ -1590,7 +1593,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; size } else { format!( - "({} + {} - 1) / {}) * {} + {}", + "({} + {} - 1) / {} * {} + {}", acc, align, align, align, size ) } diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 3b35f73e..4237cc84 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -387,7 +387,7 @@ impl<'a> RTContext<'a> { { write!(block, "backing_{}.byte_add(", device.name())?; self.codegen_dynamic_constant(offset, block)?; - write!(block, ")")? + write!(block, "), ")? 
} for dc in dynamic_constants { self.codegen_dynamic_constant(*dc, block)?; diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs index 4e651fa8..8862c11a 100644 --- a/hercules_samples/dot/src/main.rs +++ b/hercules_samples/dot/src/main.rs @@ -8,11 +8,11 @@ juno_build::juno!("dot"); fn main() { async_std::task::block_on(async { - let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; - let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; #[cfg(not(feature = "cuda"))] { + let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; let a = HerculesCPURef::from_slice(&a); + let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; let b = HerculesCPURef::from_slice(&b); let mut r = runner!(dot); let c = r.run(8, a, b).await; @@ -21,8 +21,10 @@ fn main() { } #[cfg(feature = "cuda")] { + let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); let a = a_box.get_ref(); + let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); let b = b_box.get_ref(); let mut r = runner!(dot); diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs index 92b30c43..8e261270 100644 --- a/juno_samples/antideps/build.rs +++ b/juno_samples/antideps/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("antideps.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("antideps.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch new file mode 100644 index 00000000..9c2c44a8 --- /dev/null +++ b/juno_samples/antideps/src/cpu.sch @@ -0,0 +1,20 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch index d3f4a6c2..25dba2e7 100644 --- a/juno_samples/antideps/src/gpu.sch +++ b/juno_samples/antideps/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.simple_antideps, out.loop_antideps, out.complex_antideps1, out.complex_a ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs index e43a2ac8..5d25fbba 100644 --- a/juno_samples/casts_and_intrinsics/build.rs +++ b/juno_samples/casts_and_intrinsics/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("casts_and_intrinsics.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("casts_and_intrinsics.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch new file mode 100644 index 00000000..9c2c44a8 --- /dev/null +++ b/juno_samples/casts_and_intrinsics/src/cpu.sch @@ -0,0 +1,20 @@ +gvn(*); 
+phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch index b2fb3449..f051ed8c 100644 --- a/juno_samples/casts_and_intrinsics/src/gpu.sch +++ b/juno_samples/casts_and_intrinsics/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.casts_and_intrinsics); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn index f3096ec3..ab4fbe59 100644 --- a/juno_samples/cava/src/cava.jn +++ b/juno_samples/cava/src/cava.jn @@ -116,7 +116,25 @@ fn denoise<row : usize, col : usize>(input : f32[CHAN, row, col]) -> f32[CHAN, r filter[i, j] = input[chan, r + i - 1, c + j - 1]; } } - res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter); + + let tmp : f32[9]; + for r = 0 to 3 { + for c = 0 to 3 { + tmp[r * 3 + c] = filter[r, c]; + } + } + + for i = 0 to 9 - 1 { + for j = 0 to 9 - i - 1 { + if tmp[j] > tmp[j+1] { + let t : f32 = tmp[j]; + tmp[j] = tmp[j+1]; + tmp[j+1] = t; + } + } + } + + res[chan, r, c] = tmp[9 / 2]; } else { res[chan, r, c] = input[chan, r, c]; } diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index 07f71c99..bb91af72 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -2,13 +2,13 @@ gvn(*); phi-elim(*); dce(*); -gpu(scale, demosaic, medianMatrix, transform, gamut, tone_map, descale); +inline(*); +let out = auto-outline(*); +gpu(out.cava); ip-sroa(*); sroa(*); - dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +16,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*) + diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index e05808f9..482bbf8d 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -9,11 +9,15 @@ use self::cava_rust::CHAN; use self::image_proc::*; use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; use image::ImageError; use clap::Parser; +use std::mem; + juno_build::juno!("cava"); fn run_cava( @@ -27,39 +31,67 @@ fn run_cava( coefs: &[f32], tonemap: &[f32], ) -> Box<[u8]> { - assert_eq!(image.len(), CHAN * rows * cols); - let image = HerculesCPURef::from_slice(image); + assert_eq!(image.len(), CHAN * rows * cols); assert_eq!(tstw.len(), CHAN * CHAN); - let tstw = HerculesCPURef::from_slice(tstw); - assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN); - let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts); - assert_eq!(weights.len(), num_ctrl_pts * CHAN); - let weights = HerculesCPURef::from_slice(weights); - assert_eq!(coefs.len(), 4 * CHAN); - let coefs = HerculesCPURef::from_slice(coefs); - assert_eq!(tonemap.len(), 256 * CHAN); - let tonemap = HerculesCPURef::from_slice(tonemap); - - let mut r = runner!(cava); - async_std::task::block_on(async { - r.run( - rows as u64, - cols as u64, - num_ctrl_pts as u64, - image, - tstw, - ctrl_pts, - weights, - coefs, - tonemap, - ) - .await - }).as_slice::<u8>().to_vec().into_boxed_slice() + + #[cfg(not(feature = "cuda"))] + { + let image = HerculesCPURef::from_slice(image); + let tstw = HerculesCPURef::from_slice(tstw); + let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts); + let weights = HerculesCPURef::from_slice(weights); + let 
coefs = HerculesCPURef::from_slice(coefs); + let tonemap = HerculesCPURef::from_slice(tonemap); + let mut r = runner!(cava); + async_std::task::block_on(async { + r.run( + rows as u64, + cols as u64, + num_ctrl_pts as u64, + image, + tstw, + ctrl_pts, + weights, + coefs, + tonemap, + ) + .await + }).as_slice::<u8>().to_vec().into_boxed_slice() + } + + #[cfg(feature = "cuda")] + { + let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image)); + let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw)); + let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts)); + let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights)); + let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs)); + let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap)); + let mut r = runner!(cava); + let res = async_std::task::block_on(async { + r.run( + rows as u64, + cols as u64, + num_ctrl_pts as u64, + image.get_ref(), + tstw.get_ref(), + ctrl_pts.get_ref(), + weights.get_ref(), + coefs.get_ref(), + tonemap.get_ref(), + ) + .await + }); + let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() }; + let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice(); + res.to_cpu_ref(&mut res_cpu); + res_cpu + } } enum Error { diff --git a/juno_samples/concat/src/concat.jn b/juno_samples/concat/src/concat.jn index b9806c93..d901e7e1 100644 --- a/juno_samples/concat/src/concat.jn +++ b/juno_samples/concat/src/concat.jn @@ -18,7 +18,7 @@ fn sum<t : number, c : usize>(arr : t[c]) -> t { } #[entry] -fn concat_entry(arr1 : i32[3], arr2 : i32[6]) -> i32 { - let arr3 = concat::<i32, 3, 6>(arr1, arr2); - return sum::<i32, 9>(arr3); +fn concat_entry<a : usize, b: usize>(arr1 : i32[a], arr2 : i32[b]) -> i32 { + let arr3 = concat::<i32, a, b>(arr1, arr2); + return sum::<i32, a + b>(arr3); } diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch index 680adaeb..7b87070a 100644 --- a/juno_samples/concat/src/cpu.sch +++ b/juno_samples/concat/src/cpu.sch @@ -2,12 +2,12 @@ gvn(*); phi-elim(*); dce(*); -cpu(concat, sum); +inline(*); +auto-outline(*); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -15,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch index 8ee4ef0e..71bed4b4 100644 --- a/juno_samples/concat/src/gpu.sch +++ b/juno_samples/concat/src/gpu.sch @@ -2,12 +2,13 @@ gvn(*); phi-elim(*); dce(*); -gpu(concat, sum); +inline(*); +let out = auto-outline(*); +gpu(out.concat_entry); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -15,3 +16,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs index d0929fbf..78932421 100644 --- a/juno_samples/concat/src/main.rs +++ b/juno_samples/concat/src/main.rs @@ -10,14 +10,13 @@ juno_build::juno!("concat"); fn main() { async_std::task::block_on(async { let mut r = runner!(concat_entry); + let mut a_data = [7, 7, 0]; + let mut b_data = [7, 7, 0, 0, 7, 7]; #[cfg(not(feature = "cuda"))] { - let mut a_data = [7, 7, 0]; let a = HerculesCPURef::from_slice(&mut a_data); - let mut b_data = [7, 7, 0, 0, 7, 7]; let b = HerculesCPURef::from_slice(&mut b_data); - let output = r.run(a, b).await; - println!("{}", output); + let output = r.run(3, 6, a, b).await; assert_eq!(output, 42); } #[cfg(feature = 
"cuda")] @@ -26,8 +25,7 @@ fn main() { let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data)); let mut b_data = [7, 7, 0, 0, 7, 7]; let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data)); - let output = r.run(a.get_ref(), b.get_ref()).await; - println!("{}", output); + let output = r.run(3, 6, a.get_ref(), b.get_ref()).await; assert_eq!(output, 42); } }); diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs index dc134e59..a464568d 100644 --- a/juno_samples/implicit_clone/build.rs +++ b/juno_samples/implicit_clone/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("implicit_clone.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("implicit_clone.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch new file mode 100644 index 00000000..ebf9d8fe --- /dev/null +++ b/juno_samples/implicit_clone/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/implicit_clone/src/gpu.sch b/juno_samples/implicit_clone/src/gpu.sch index 443fc778..0f7c8021 100644 --- a/juno_samples/implicit_clone/src/gpu.sch +++ b/juno_samples/implicit_clone/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.simple_implicit_clone, out.loop_implicit_clone, out.double_loop_implicit ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index ff3e3d8c..c7f18a99 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("matmul.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("matmul.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch new file mode 100644 index 00000000..412e8cbb --- /dev/null +++ b/juno_samples/matmul/src/cpu.sch @@ -0,0 +1,21 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +cpu(out.matmul, out.tiled_64_matmul); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index e85dafdf..dd2dc14c 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.matmul, out.tiled_64_matmul); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs index 2352ddef..ec111bc1 100644 --- a/juno_samples/nested_ccp/build.rs +++ b/juno_samples/nested_ccp/build.rs @@ -1,17 +1,10 
@@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("nested_ccp.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("nested_ccp.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/nested_ccp/src/cpu.sch new file mode 100644 index 00000000..ebf9d8fe --- /dev/null +++ b/juno_samples/nested_ccp/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch index 021a05e3..69e18343 100644 --- a/juno_samples/nested_ccp/src/gpu.sch +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.ccp_example, out.median_array, out.no_underflow); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs index 412d56a4..99ef150d 100644 --- a/juno_samples/nested_ccp/src/main.rs +++ b/juno_samples/nested_ccp/src/main.rs @@ -8,11 +8,11 @@ juno_build::juno!("nested_ccp"); fn main() { async_std::task::block_on(async { - let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); + let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]); #[cfg(not(feature = "cuda"))] { - let a = HerculesCPURef::from_slice(&a); + let a = HerculesCPURefMut::from_slice(&mut a); let b = HerculesCPURefMut::from_slice(&mut b); let mut r = runner!(ccp_example); let output_example = r.run(a).await; @@ -23,7 +23,7 @@ fn main() { } #[cfg(feature = "cuda")] { - let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); let mut r = runner!(ccp_example); let output_example = r.run(a.get_ref_mut()).await; diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs index a0874af7..bfd37cb5 100644 --- a/juno_samples/simple3/build.rs +++ b/juno_samples/simple3/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("simple3.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("simple3.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/simple3/src/cpu.sch b/juno_samples/simple3/src/cpu.sch new file mode 100644 index 00000000..d933f69c --- /dev/null +++ b/juno_samples/simple3/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +dce(*); +float-collections(*); +gcm(*); diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch index e97627d4..d27e5831 100644 --- a/juno_samples/simple3/src/gpu.sch +++ b/juno_samples/simple3/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.simple3); ip-sroa(*); sroa(*); dce(*); 
-float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +float-collections(*); +gcm(*); -- GitLab From 2171e023b7f1922176a7911fc3d8ab93bd4c40dd Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Thu, 30 Jan 2025 23:35:32 -0600 Subject: [PATCH 103/109] things braek --- Cargo.lock | 12 ++-- hercules_cg/src/gpu.rs | 61 ++++++++++--------- hercules_samples/call/src/gpu.sch | 3 +- hercules_samples/ccp/src/gpu.sch | 3 +- hercules_samples/dot/src/gpu.sch | 3 +- hercules_samples/fac/src/gpu.sch | 3 +- hercules_samples/matmul/src/gpu.sch | 3 +- juno_samples/antideps/src/cpu.sch | 2 - juno_samples/antideps/src/gpu.sch | 1 - juno_samples/casts_and_intrinsics/src/cpu.sch | 2 - juno_samples/casts_and_intrinsics/src/gpu.sch | 2 - juno_samples/cava/src/gpu.sch | 3 +- juno_samples/concat/src/cpu.sch | 2 - juno_samples/concat/src/gpu.sch | 2 - juno_samples/implicit_clone/src/cpu.sch | 1 - juno_samples/matmul/src/gpu.sch | 1 - juno_samples/nested_ccp/src/gpu.sch | 1 - juno_samples/simple3/src/gpu.sch | 1 - 18 files changed, 50 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 623fc35c..303b1b78 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1206,9 +1206,9 @@ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libfuzzer-sys" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa" +checksum = "cf78f52d400cf2d84a3a973a78a592b4adc535739e0a5597a0da6f0c357adc75" dependencies = [ "arbitrary", "cc", @@ -2163,9 +2163,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee" dependencies = [ "indexmap", "serde", @@ -2433,9 +2433,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.24" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +checksum = "7e49d2d35d3fad69b39b94139037ecfb4f359f08958b9c11e7315ce770462419" dependencies = [ "memchr", ] diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a266deea..55f8f83c 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -149,8 +149,8 @@ pub fn gpu_codegen<W: Write>( } let return_parameter = if collection_objects.returned_objects().len() == 1 { - Some(collection_objects.origin(*collection_objects.returned_objects() - .first().unwrap()).try_parameter().unwrap()) + collection_objects.origin(*collection_objects.returned_objects() + .first().unwrap()).try_parameter() } else { None }; @@ -568,11 +568,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; panic!("Expected fork node"); }; let reduces = &self.fork_reduce_map[root_fork]; - assert!(reduces.iter().all(|reduce| { - self.collection_objects.objects(*reduce).iter().all(|object| { - self.collection_objects.origin(*object).try_parameter().is_some() - }) - }), "All collection reduces in block fork must originate from parameters"); if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { let fork_size = factors.iter().map(|dc| format!("dc{}", 
dc.idx())).collect::<Vec<_>>().join(" * "); @@ -977,34 +972,44 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // Parameters emitted at top Node::Parameter { index: _ } => {} // If the constant is primitive, it's stored in register so we repeat - // for all threads. Otherwise, it's stored in shared memory so we only - // want to "allocate" and initialize it once. + // for all threads. Otherwise, it can be inside or outside block fork. + // If inside, it's stored in shared memory so we only want to "allocate" + // and initialize it once. In either case, we then parallelize memset to 0. Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); - if !is_primitive { - let cg_tile = { - let KernelState::OutBlock = state else { - panic!("Expected constant to be in start basic block - outside any fork"); - }; - "block".to_string() - }; + let cg_tile = match state { + KernelState::OutBlock | KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), + }; + if !is_primitive && state == KernelState::OutBlock && is_block_parallel.is_some() && is_block_parallel.unwrap() { + panic!("GPU can't memset collection for multi-block grid"); + } + if !is_primitive && state != KernelState::OutBlock { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; *num_tabs += 1; } - self.codegen_constant( - define_variable, - *cons_id, - true, - Some(extra_dim_collects), - dynamic_shared_offset, - w, - *num_tabs, - )?; - if !is_primitive { + if is_primitive || state != KernelState::OutBlock { + self.codegen_constant( + define_variable.clone(), + *cons_id, + true, + Some(extra_dim_collects), + dynamic_shared_offset, + w, + *num_tabs, + )?; + } + if !is_primitive && state != KernelState::OutBlock { write!(w, "{}}}\n", tabs)?; *num_tabs -= 1; } + if !is_primitive { + let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects)); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; + write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; + write!(w, "{}}}\n", tabs)?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + } } // Dynamic constants emitted at top Node::DynamicConstant { id: _ } => {} @@ -1212,7 +1217,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; - if KernelState::OutBlock == state && is_block_parallel.unwrap() { + if KernelState::OutBlock == state && is_block_parallel.is_some() && is_block_parallel.unwrap() { panic!("GPU can't guarantee correctness for multi-block collection writes"); } let cg_tile = match state { diff --git a/hercules_samples/call/src/gpu.sch b/hercules_samples/call/src/gpu.sch index 1e654e22..6c10c2ce 100644 --- a/hercules_samples/call/src/gpu.sch +++ b/hercules_samples/call/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.add); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/ccp/src/gpu.sch b/hercules_samples/ccp/src/gpu.sch index d8f6a2d0..2852b7a4 100644 --- a/hercules_samples/ccp/src/gpu.sch +++ b/hercules_samples/ccp/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.tricky); ip-sroa(*); sroa(*); dce(*); 
-float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch index 4adbf530..4ec3aaef 100644 --- a/hercules_samples/dot/src/gpu.sch +++ b/hercules_samples/dot/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.dot); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/fac/src/gpu.sch b/hercules_samples/fac/src/gpu.sch index 1885854c..6eea1273 100644 --- a/hercules_samples/fac/src/gpu.sch +++ b/hercules_samples/fac/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.fac); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index 9a714789..ca6cdbb9 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.matmul); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch index 9c2c44a8..7e6be7ee 100644 --- a/juno_samples/antideps/src/cpu.sch +++ b/juno_samples/antideps/src/cpu.sch @@ -14,7 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch index 25dba2e7..e166515d 100644 --- a/juno_samples/antideps/src/gpu.sch +++ b/juno_samples/antideps/src/gpu.sch @@ -18,4 +18,3 @@ gcm(*); float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch index 9c2c44a8..7e6be7ee 100644 --- a/juno_samples/casts_and_intrinsics/src/cpu.sch +++ b/juno_samples/casts_and_intrinsics/src/cpu.sch @@ -14,7 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch index f051ed8c..64d063be 100644 --- a/juno_samples/casts_and_intrinsics/src/gpu.sch +++ b/juno_samples/casts_and_intrinsics/src/gpu.sch @@ -15,7 +15,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index bb91af72..ace9082c 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -13,10 +13,9 @@ gvn(*); phi-elim(*); dce(*); +// forkify(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*) - diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch index 7b87070a..8ec730d7 100644 --- a/juno_samples/concat/src/cpu.sch +++ b/juno_samples/concat/src/cpu.sch @@ -15,7 +15,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch index 71bed4b4..7bfc6dbe 100644 --- a/juno_samples/concat/src/gpu.sch +++ b/juno_samples/concat/src/gpu.sch @@ -16,7 +16,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch index ebf9d8fe..7e6be7ee 100644 --- a/juno_samples/implicit_clone/src/cpu.sch +++ 
b/juno_samples/implicit_clone/src/cpu.sch @@ -14,6 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index dd2dc14c..3d3f919c 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -18,4 +18,3 @@ gcm(*); float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch index 69e18343..4f36ddd8 100644 --- a/juno_samples/nested_ccp/src/gpu.sch +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -15,6 +15,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch index d27e5831..93e85c48 100644 --- a/juno_samples/simple3/src/gpu.sch +++ b/juno_samples/simple3/src/gpu.sch @@ -16,5 +16,4 @@ infer-schedules(*); gcm(*); dce(*); -float-collections(*); gcm(*); -- GitLab From 7e19759527a10f601e08f862991e40e8e25392c6 Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Fri, 31 Jan 2025 11:44:30 -0600 Subject: [PATCH 104/109] before dc merge --- hercules_cg/src/gpu.rs | 14 ++++++++--- hercules_samples/call/build.rs | 25 +++++++++++++------ hercules_samples/call/src/cpu.sch | 19 -------------- hercules_samples/call/src/gpu.sch | 2 -- hercules_samples/ccp/build.rs | 25 +++++++++++++------ hercules_samples/ccp/src/cpu.sch | 19 -------------- hercules_samples/ccp/src/gpu.sch | 2 -- hercules_samples/dot/src/cpu.sch | 7 ------ hercules_samples/dot/src/gpu.sch | 2 -- hercules_samples/fac/build.rs | 25 +++++++++++++------ hercules_samples/fac/src/cpu.sch | 19 -------------- hercules_samples/fac/src/gpu.sch | 2 -- hercules_samples/matmul/src/gpu.sch | 1 + juno_samples/antideps/build.rs | 25 +++++++++++++------ juno_samples/antideps/src/cpu.sch | 18 ------------- juno_samples/casts_and_intrinsics/build.rs | 25 +++++++++++++------ juno_samples/casts_and_intrinsics/src/cpu.sch | 18 ------------- juno_samples/casts_and_intrinsics/src/gpu.sch | 2 -- juno_samples/concat/build.rs | 25 +++++++++++++------ juno_samples/concat/src/cpu.sch | 19 -------------- juno_samples/concat/src/gpu.sch | 1 + juno_samples/implicit_clone/build.rs | 25 +++++++++++++------ juno_samples/implicit_clone/src/cpu.sch | 18 ------------- juno_samples/matmul/build.rs | 25 +++++++++++++------ juno_samples/matmul/src/cpu.sch | 5 ++-- juno_samples/matmul/src/gpu.sch | 1 + juno_samples/matmul/src/matmul.jn | 2 +- juno_samples/nested_ccp/src/cpu.sch | 1 - juno_samples/simple3/src/cpu.sch | 1 - 29 files changed, 160 insertions(+), 213 deletions(-) delete mode 100644 hercules_samples/call/src/cpu.sch delete mode 100644 hercules_samples/ccp/src/cpu.sch delete mode 100644 hercules_samples/fac/src/cpu.sch delete mode 100644 juno_samples/antideps/src/cpu.sch delete mode 100644 juno_samples/casts_and_intrinsics/src/cpu.sch delete mode 100644 juno_samples/concat/src/cpu.sch delete mode 100644 juno_samples/implicit_clone/src/cpu.sch diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 55f8f83c..341e143e 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -321,7 +321,9 @@ impl GPUContext<'_> { Ok(()) } - // Emit kernel headers, signature, arguments, and dynamic shared memory declaration + /* + * Emit kernel headers, signature, arguments, and dynamic shared memory declaration + */ fn codegen_kernel_begin(&self, has_ret_var: bool, w: &mut String) -> Result<(), Error> { write!(w, " #include <assert.h> @@ -397,7 
+399,9 @@ namespace cg = cooperative_groups; Ok(()) } - // Emit calculation of all dynamic constants + /* + * Emit calculation of all dynamic constants + */ fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> { for dc in dynamic_constants_bottom_up(self.dynamic_constants) { let dc_val = format!("unsigned long long dc{}", dc.idx()); @@ -436,8 +440,10 @@ namespace cg = cooperative_groups; Ok(()) } - // To abide by c++ reassignment restrictions, we declare all data values - // upfront. + /* + * To abide by c++ reassignment restrictions, we declare all data values + * upfront. + */ fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> { for id in (0..self.function.nodes.len()).map(NodeID::new) { if !self.function.nodes[id.idx()].is_control() && diff --git a/hercules_samples/call/build.rs b/hercules_samples/call/build.rs index 7f5816ce..e7b6dee9 100644 --- a/hercules_samples/call/build.rs +++ b/hercules_samples/call/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .ir_in_src("call.hir") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .ir_in_src("call.hir") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .ir_in_src("call.hir") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/hercules_samples/call/src/cpu.sch b/hercules_samples/call/src/cpu.sch deleted file mode 100644 index 4c684da2..00000000 --- a/hercules_samples/call/src/cpu.sch +++ /dev/null @@ -1,19 +0,0 @@ -gvn(*); -phi-elim(*); -dce(*); - -auto-outline(*); - -ip-sroa(*); -sroa(*); -fork-split(*); -unforkify(*); -dce(*); -float-collections(*); -gvn(*); -phi-elim(*); -dce(*); - -infer-schedules(*); - -gcm(*); diff --git a/hercules_samples/call/src/gpu.sch b/hercules_samples/call/src/gpu.sch index 6c10c2ce..cc4ef88f 100644 --- a/hercules_samples/call/src/gpu.sch +++ b/hercules_samples/call/src/gpu.sch @@ -15,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs index c98d0551..d74547ad 100644 --- a/hercules_samples/ccp/build.rs +++ b/hercules_samples/ccp/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .ir_in_src("ccp.hir") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .ir_in_src("ccp.hir") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .ir_in_src("ccp.hir") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/hercules_samples/ccp/src/cpu.sch b/hercules_samples/ccp/src/cpu.sch deleted file mode 100644 index 4c684da2..00000000 --- a/hercules_samples/ccp/src/cpu.sch +++ /dev/null @@ -1,19 +0,0 @@ -gvn(*); -phi-elim(*); -dce(*); - -auto-outline(*); - -ip-sroa(*); -sroa(*); -fork-split(*); -unforkify(*); -dce(*); -float-collections(*); -gvn(*); -phi-elim(*); -dce(*); - -infer-schedules(*); - -gcm(*); diff --git a/hercules_samples/ccp/src/gpu.sch b/hercules_samples/ccp/src/gpu.sch index 2852b7a4..d49af8f5 100644 --- a/hercules_samples/ccp/src/gpu.sch +++ b/hercules_samples/ccp/src/gpu.sch @@ -15,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); diff --git 
a/hercules_samples/dot/src/cpu.sch b/hercules_samples/dot/src/cpu.sch index 4c684da2..58a7266d 100644 --- a/hercules_samples/dot/src/cpu.sch +++ b/hercules_samples/dot/src/cpu.sch @@ -6,14 +6,7 @@ auto-outline(*); ip-sroa(*); sroa(*); -fork-split(*); unforkify(*); dce(*); -float-collections(*); -gvn(*); -phi-elim(*); -dce(*); - -infer-schedules(*); gcm(*); diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch index 4ec3aaef..c65827fd 100644 --- a/hercules_samples/dot/src/gpu.sch +++ b/hercules_samples/dot/src/gpu.sch @@ -15,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs index 1986a746..aa1f73a9 100644 --- a/hercules_samples/fac/build.rs +++ b/hercules_samples/fac/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .ir_in_src("fac.hir") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .ir_in_src("fac.hir") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .ir_in_src("fac.hir") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/hercules_samples/fac/src/cpu.sch b/hercules_samples/fac/src/cpu.sch deleted file mode 100644 index 4c684da2..00000000 --- a/hercules_samples/fac/src/cpu.sch +++ /dev/null @@ -1,19 +0,0 @@ -gvn(*); -phi-elim(*); -dce(*); - -auto-outline(*); - -ip-sroa(*); -sroa(*); -fork-split(*); -unforkify(*); -dce(*); -float-collections(*); -gvn(*); -phi-elim(*); -dce(*); - -infer-schedules(*); - -gcm(*); diff --git a/hercules_samples/fac/src/gpu.sch b/hercules_samples/fac/src/gpu.sch index 6eea1273..ac1f6026 100644 --- a/hercules_samples/fac/src/gpu.sch +++ b/hercules_samples/fac/src/gpu.sch @@ -15,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index ca6cdbb9..a4eb3240 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -15,5 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); dce(*); gcm(*); diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs index 8e261270..d74d1a49 100644 --- a/juno_samples/antideps/build.rs +++ b/juno_samples/antideps/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("antideps.jn") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("antideps.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("antideps.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch deleted file mode 100644 index 7e6be7ee..00000000 --- a/juno_samples/antideps/src/cpu.sch +++ /dev/null @@ -1,18 +0,0 @@ -gvn(*); -phi-elim(*); -dce(*); - -auto-outline(*); - -ip-sroa(*); -sroa(*); -dce(*); -gvn(*); -phi-elim(*); -dce(*); - -infer-schedules(*); - -gcm(*); -dce(*); -gcm(*); diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs index 5d25fbba..342c4a05 100644 --- 
a/juno_samples/casts_and_intrinsics/build.rs +++ b/juno_samples/casts_and_intrinsics/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("casts_and_intrinsics.jn") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("casts_and_intrinsics.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("casts_and_intrinsics.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch deleted file mode 100644 index 7e6be7ee..00000000 --- a/juno_samples/casts_and_intrinsics/src/cpu.sch +++ /dev/null @@ -1,18 +0,0 @@ -gvn(*); -phi-elim(*); -dce(*); - -auto-outline(*); - -ip-sroa(*); -sroa(*); -dce(*); -gvn(*); -phi-elim(*); -dce(*); - -infer-schedules(*); - -gcm(*); -dce(*); -gcm(*); diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch index 64d063be..c997cca9 100644 --- a/juno_samples/casts_and_intrinsics/src/gpu.sch +++ b/juno_samples/casts_and_intrinsics/src/gpu.sch @@ -15,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); diff --git a/juno_samples/concat/build.rs b/juno_samples/concat/build.rs index c91df94e..c9ef720d 100644 --- a/juno_samples/concat/build.rs +++ b/juno_samples/concat/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("concat.jn") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("concat.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("concat.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch deleted file mode 100644 index 8ec730d7..00000000 --- a/juno_samples/concat/src/cpu.sch +++ /dev/null @@ -1,19 +0,0 @@ -gvn(*); -phi-elim(*); -dce(*); - -inline(*); -auto-outline(*); - -ip-sroa(*); -sroa(*); -dce(*); -gvn(*); -phi-elim(*); -dce(*); - -infer-schedules(*); - -gcm(*); -dce(*); -gcm(*); diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch index 7bfc6dbe..084f020c 100644 --- a/juno_samples/concat/src/gpu.sch +++ b/juno_samples/concat/src/gpu.sch @@ -16,5 +16,6 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); dce(*); gcm(*); diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs index a464568d..8e465874 100644 --- a/juno_samples/implicit_clone/build.rs +++ b/juno_samples/implicit_clone/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("implicit_clone.jn") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("implicit_clone.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("implicit_clone.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git 
a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch deleted file mode 100644 index 7e6be7ee..00000000 --- a/juno_samples/implicit_clone/src/cpu.sch +++ /dev/null @@ -1,18 +0,0 @@ -gvn(*); -phi-elim(*); -dce(*); - -auto-outline(*); - -ip-sroa(*); -sroa(*); -dce(*); -gvn(*); -phi-elim(*); -dce(*); - -infer-schedules(*); - -gcm(*); -dce(*); -gcm(*); diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index c7f18a99..0be838c6 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("matmul.jn") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("matmul.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("matmul.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch index 412e8cbb..b256d73b 100644 --- a/juno_samples/matmul/src/cpu.sch +++ b/juno_samples/matmul/src/cpu.sch @@ -2,8 +2,7 @@ gvn(*); phi-elim(*); dce(*); -let out = auto-outline(*); -cpu(out.matmul, out.tiled_64_matmul); +auto-outline(*); ip-sroa(*); sroa(*); @@ -12,10 +11,10 @@ gvn(*); phi-elim(*); dce(*); +forkify(*); infer-schedules(*); gcm(*); float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index 3d3f919c..205dee9c 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -12,6 +12,7 @@ gvn(*); phi-elim(*); dce(*); +forkify(*); infer-schedules(*); gcm(*); diff --git a/juno_samples/matmul/src/matmul.jn b/juno_samples/matmul/src/matmul.jn index ca9be73a..fb6de5bd 100644 --- a/juno_samples/matmul/src/matmul.jn +++ b/juno_samples/matmul/src/matmul.jn @@ -20,7 +20,7 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l] let atile : i32[64, 64]; let btile : i32[64, 64]; let ctile : i32[64, 64]; - + for bi = 0 to n / 64 { for bk = 0 to l / 64 { for ti = 0 to 64 { diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/nested_ccp/src/cpu.sch index ebf9d8fe..7e6be7ee 100644 --- a/juno_samples/nested_ccp/src/cpu.sch +++ b/juno_samples/nested_ccp/src/cpu.sch @@ -14,6 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); diff --git a/juno_samples/simple3/src/cpu.sch b/juno_samples/simple3/src/cpu.sch index d933f69c..7e6be7ee 100644 --- a/juno_samples/simple3/src/cpu.sch +++ b/juno_samples/simple3/src/cpu.sch @@ -15,5 +15,4 @@ infer-schedules(*); gcm(*); dce(*); -float-collections(*); gcm(*); -- GitLab From 2fed8e6b88b726494ed568bc2b6c85c7360991a5 Mon Sep 17 00:00:00 2001 From: Russel Arbore <prathi3@illinois.edu> Date: Fri, 31 Jan 2025 16:03:35 -0600 Subject: [PATCH 105/109] gpu non-tile works --- hercules_cg/src/gpu.rs | 84 +++++++++++++------ hercules_samples/matmul/src/gpu.sch | 1 + juno_samples/cava/src/gpu.sch | 2 - juno_samples/concat/src/main.rs | 37 +++++++- juno_samples/implicit_clone/src/main.rs | 8 -- juno_samples/matmul/src/cpu.sch | 20 ----- juno_samples/matmul/src/gpu.sch | 1 - juno_samples/nested_ccp/build.rs | 25 ++++-- juno_samples/nested_ccp/src/gpu.sch | 2 - juno_samples/nested_ccp/src/main.rs | 8 +- juno_samples/patterns/Cargo.toml | 3 + juno_samples/patterns/build.rs | 23 
+++-- .../src/cpu.sch => patterns/src/gpu.sch} | 5 +- juno_samples/simple3/build.rs | 25 ++++-- juno_samples/simple3/src/gpu.sch | 2 - 15 files changed, 155 insertions(+), 91 deletions(-) delete mode 100644 juno_samples/matmul/src/cpu.sch rename juno_samples/{nested_ccp/src/cpu.sch => patterns/src/gpu.sch} (73%) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 341e143e..6c62ed76 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -23,6 +23,7 @@ pub fn gpu_codegen<W: Write>( typing: &Vec<TypeID>, control_subgraph: &Subgraph, bbs: &BasicBlocks, + backing_allocation: &FunctionBackingAllocation, collection_objects: &FunctionCollectionObjects, def_use_map: &ImmutableDefUseMap, fork_join_map: &HashMap<NodeID, NodeID>, @@ -168,6 +169,7 @@ pub fn gpu_codegen<W: Write>( typing, control_subgraph, bbs, + backing_allocation, collection_objects, def_use_map, fork_join_map, @@ -196,6 +198,7 @@ struct GPUContext<'a> { typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, bbs: &'a BasicBlocks, + backing_allocation: &'a FunctionBackingAllocation, collection_objects: &'a FunctionCollectionObjects, def_use_map: &'a ImmutableDefUseMap, fork_join_map: &'a HashMap<NodeID, NodeID>, @@ -352,8 +355,14 @@ namespace cg = cooperative_groups; "__global__ void __launch_bounds__({}) {}_gpu(", self.kernel_params.max_num_threads, self.function.name )?; - // The first set of parameters are dynamic constants. let mut first_param = true; + // The first parameter is a pointer to GPU backing memory, if it's + // needed. + if self.backing_allocation.contains_key(&Device::CUDA) { + first_param = false; + write!(w, "char* backing")?; + } + // The second set of parameters are dynamic constants. for idx in 0..self.function.num_dynamic_constants { if first_param { first_param = false; @@ -362,7 +371,7 @@ namespace cg = cooperative_groups; } write!(w, "unsigned long long dc_p{}", idx)?; } - // The second set of parameters are normal arguments. + // The third set of parameters are normal arguments. for (idx, ty) in self.function.param_types.iter().enumerate() { if first_param { first_param = false; @@ -403,38 +412,46 @@ namespace cg = cooperative_groups; * Emit calculation of all dynamic constants */ fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> { - for dc in dynamic_constants_bottom_up(self.dynamic_constants) { + for dc in dynamic_constants_bottom_up(&self.dynamic_constants) { let dc_val = format!("unsigned long long dc{}", dc.idx()); - match self.dynamic_constants[dc.idx()] { + match &self.dynamic_constants[dc.idx()] { DynamicConstant::Constant(val) => write!(w, "\t{} = {}ull;\n", dc_val, val)?, DynamicConstant::Parameter(idx) => { - if idx < self.function.num_dynamic_constants as usize { + if *idx < self.function.num_dynamic_constants as usize { write!(w, "\t{} = dc_p{};\n", dc_val, idx)? } else { write!(w, "\t{} = 0;\n", dc_val)? } } - DynamicConstant::Add(left, right) => { - write!(w, "\t{} = dc{} + dc{};\n", dc_val, left.idx(), right.idx())? + DynamicConstant::Add(args) => { + let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" + "); + write!(w, "\t{} = {};\n", dc_val, rhs)? + } + DynamicConstant::Mul(args) => { + let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" * "); + write!(w, "\t{} = {};\n", dc_val, rhs)? 
+ } + DynamicConstant::Min(args) => { + let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("min(dc{}, ", arg.idx())).collect(); + let rhs_last = format!("dc{}", args.last().unwrap().idx()); + let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect(); + write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)? + } + DynamicConstant::Max(args) => { + let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("max(dc{}, ", arg.idx())).collect(); + let rhs_last = format!("dc{}", args.last().unwrap().idx()); + let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect(); + write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)? } DynamicConstant::Sub(left, right) => { write!(w, "\t{} = dc{} - dc{};\n", dc_val, left.idx(), right.idx())? } - DynamicConstant::Mul(left, right) => { - write!(w, "\t{} = dc{} * dc{};\n", dc_val, left.idx(), right.idx())? - } DynamicConstant::Div(left, right) => { write!(w, "\t{} = dc{} / dc{};\n", dc_val, left.idx(), right.idx())? } DynamicConstant::Rem(left, right) => { write!(w, "\t{} = dc{} % dc{};\n", dc_val, left.idx(), right.idx())? } - DynamicConstant::Min(left, right) => { - write!(w, "\t{} = min(dc{}, dc{});\n", dc_val, left.idx(), right.idx())? - } - DynamicConstant::Max(left, right) => { - write!(w, "\t{} = max(dc{}, dc{});\n", dc_val, left.idx(), right.idx())? - } } } Ok(()) @@ -502,8 +519,15 @@ namespace cg = cooperative_groups; let mut pass_args = String::new(); write!(w, " extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; - // The first set of parameters are dynamic constants. let mut first_param = true; + // The first parameter is a pointer to GPU backing memory, if it's + // needed. + if self.backing_allocation.contains_key(&Device::CUDA) { + first_param = false; + write!(w, "char* backing")?; + write!(pass_args, "backing")?; + } + // The second set of parameters are dynamic constants. for idx in 0..self.function.num_dynamic_constants { if first_param { first_param = false; @@ -514,7 +538,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(w, "unsigned long long dc_p{}", idx)?; write!(pass_args, "dc_p{}", idx)?; } - // The second set of parameters are normal arguments. + // The third set of parameters are normal arguments. for (idx, ty) in self.function.param_types.iter().enumerate() { if first_param { first_param = false; @@ -540,8 +564,13 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(pass_args, "ret")?; write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; } + write!(w, "\tcudaError_t err;\n"); write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; + write!(w, "\terr = cudaGetLastError();\n"); + write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n"); write!(w, "\tcudaDeviceSynchronize();\n")?; + write!(w, "\terr = cudaGetLastError();\n"); + write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n"); if has_ret_var { // Copy return from device to host, whether it's primitive value or collection pointer write!(w, "\t{} host_ret;\n", ret_type)?; @@ -979,17 +1008,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; Node::Parameter { index: _ } => {} // If the constant is primitive, it's stored in register so we repeat // for all threads. Otherwise, it can be inside or outside block fork. 
- // If inside, it's stored in shared memory so we only want to "allocate" - // and initialize it once. In either case, we then parallelize memset to 0. + // If inside, it's stored in shared memory so we "allocate" it once + // and parallelize memset to 0. If outside, we initialize as offset + // to backing, but if multi-block grid, don't memset to avoid grid-level sync. Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); let cg_tile = match state { KernelState::OutBlock | KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), }; - if !is_primitive && state == KernelState::OutBlock && is_block_parallel.is_some() && is_block_parallel.unwrap() { - panic!("GPU can't memset collection for multi-block grid"); - } if !is_primitive && state != KernelState::OutBlock { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; *num_tabs += 1; @@ -1007,9 +1034,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } if !is_primitive && state != KernelState::OutBlock { write!(w, "{}}}\n", tabs)?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; *num_tabs -= 1; } - if !is_primitive { + if !is_primitive && state == KernelState::OutBlock { + let (_, offsets) = &self.backing_allocation[&Device::CUDA]; + let offset = offsets[&id]; + write!(w, "{}{} = backing + dc{};\n", tabs, define_variable, offset.idx())?; + } + if !is_primitive && (state != KernelState::OutBlock || is_block_parallel.is_none() || !is_block_parallel.unwrap()) { let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects)); write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; @@ -1223,9 +1256,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; - if KernelState::OutBlock == state && is_block_parallel.is_some() && is_block_parallel.unwrap() { - panic!("GPU can't guarantee correctness for multi-block collection writes"); - } let cg_tile = match state { KernelState::OutBlock | KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index a4eb3240..c0a1a5ce 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -12,6 +12,7 @@ gvn(*); phi-elim(*); dce(*); +forkify(*); infer-schedules(*); gcm(*); diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index ace9082c..a5570b8d 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -17,5 +17,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*) diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs index 83534c9d..9674c2c5 100644 --- a/juno_samples/concat/src/main.rs +++ b/juno_samples/concat/src/main.rs @@ -2,16 +2,47 @@ use hercules_rt::runner; use hercules_rt::HerculesCPURef; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; juno_build::juno!("concat"); fn main() { async_std::task::block_on(async { let mut r = runner!(concat_entry); - let output = r.run(7).await; - println!("{}", output); - 
assert_eq!(output, 42); + let mut a_data = [7, 7, 0]; + let mut b_data = [7, 7, 0, 0, 7, 7]; + #[cfg(not(feature = "cuda"))] + { + let a = HerculesCPURef::from_slice(&mut a_data); + let b = HerculesCPURef::from_slice(&mut b_data); + let output = r.run(3, 6, a, b).await; + assert_eq!(output, 42); + const N: usize = 3; + let arr : Box<[i32]> = (2..=4).collect(); + let arr = HerculesCPURef::from_slice(&arr); + + let mut r = runner!(concat_switch); + let output = r.run(N as u64, 50, arr.clone()).await; + let result = output.as_slice::<i32>(); + println!("{:?}", result); + assert_eq!(result, [0, 1, 2, 3, 4]); + + let output = r.run(N as u64, 30, arr).await; + let result = output.as_slice::<i32>(); + println!("{:?}", result); + assert_eq!(result, [2, 3, 4, 0, 1]); + } + #[cfg(feature = "cuda")] + { + let mut a_data = [7, 7, 0]; + let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data)); + let mut b_data = [7, 7, 0, 0, 7, 7]; + let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data)); + let output = r.run(3, 6, a.get_ref(), b.get_ref()).await; + assert_eq!(output, 42); + } }); } diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs index 1e94ff89..c1f82528 100644 --- a/juno_samples/implicit_clone/src/main.rs +++ b/juno_samples/implicit_clone/src/main.rs @@ -8,42 +8,34 @@ fn main() { async_std::task::block_on(async { let mut r = runner!(simple_implicit_clone); let output = r.run(3).await; - println!("{}", output); assert_eq!(output, 11); let mut r = runner!(loop_implicit_clone); let output = r.run(100).await; - println!("{}", output); assert_eq!(output, 7); let mut r = runner!(double_loop_implicit_clone); let output = r.run(3).await; - println!("{}", output); assert_eq!(output, 42); let mut r = runner!(tricky_loop_implicit_clone); let output = r.run(2, 2).await; - println!("{}", output); assert_eq!(output, 130); let mut r = runner!(tricky2_loop_implicit_clone); let output = r.run(2, 3).await; - println!("{}", output); assert_eq!(output, 39); let mut r = runner!(tricky3_loop_implicit_clone); let output = r.run(5, 7).await; - println!("{}", output); assert_eq!(output, 7); let mut r = runner!(no_implicit_clone); let output = r.run(4).await; - println!("{}", output); assert_eq!(output, 13); let mut r = runner!(mirage_implicit_clone); let output = r.run(73).await; - println!("{}", output); assert_eq!(output, 843); }); } diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch deleted file mode 100644 index b256d73b..00000000 --- a/juno_samples/matmul/src/cpu.sch +++ /dev/null @@ -1,20 +0,0 @@ -gvn(*); -phi-elim(*); -dce(*); - -auto-outline(*); - -ip-sroa(*); -sroa(*); -dce(*); -gvn(*); -phi-elim(*); -dce(*); - -forkify(*); -infer-schedules(*); - -gcm(*); -float-collections(*); -dce(*); -gcm(*); diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index 205dee9c..3d3f919c 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -12,7 +12,6 @@ gvn(*); phi-elim(*); dce(*); -forkify(*); infer-schedules(*); gcm(*); diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs index ec111bc1..074937e7 100644 --- a/juno_samples/nested_ccp/build.rs +++ b/juno_samples/nested_ccp/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("nested_ccp.jn") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = 
"cuda"))] + { + JunoCompiler::new() + .file_in_src("nested_ccp.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("nested_ccp.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch index 4f36ddd8..c56d046a 100644 --- a/juno_samples/nested_ccp/src/gpu.sch +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -15,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs index 99ef150d..bc99a4bd 100644 --- a/juno_samples/nested_ccp/src/main.rs +++ b/juno_samples/nested_ccp/src/main.rs @@ -8,11 +8,11 @@ juno_build::juno!("nested_ccp"); fn main() { async_std::task::block_on(async { - let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); + let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]); #[cfg(not(feature = "cuda"))] { - let a = HerculesCPURefMut::from_slice(&mut a); + let a = HerculesCPURef::from_slice(&a); let b = HerculesCPURefMut::from_slice(&mut b); let mut r = runner!(ccp_example); let output_example = r.run(a).await; @@ -23,8 +23,8 @@ fn main() { } #[cfg(feature = "cuda")] { - let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); - let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); + let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b)); let mut r = runner!(ccp_example); let output_example = r.run(a.get_ref_mut()).await; let mut r = runner!(median_array); diff --git a/juno_samples/patterns/Cargo.toml b/juno_samples/patterns/Cargo.toml index a8dda157..bedaf7ca 100644 --- a/juno_samples/patterns/Cargo.toml +++ b/juno_samples/patterns/Cargo.toml @@ -8,6 +8,9 @@ edition = "2021" name = "juno_patterns" path = "src/main.rs" +[features] +cuda = ["juno_build/cuda", "hercules_rt/cuda"] + [build-dependencies] juno_build = { path = "../../juno_build" } diff --git a/juno_samples/patterns/build.rs b/juno_samples/patterns/build.rs index 8ac92f00..625da0a5 100644 --- a/juno_samples/patterns/build.rs +++ b/juno_samples/patterns/build.rs @@ -1,9 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - .file_in_src("patterns.jn") - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("patterns.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("patterns.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/patterns/src/gpu.sch similarity index 73% rename from juno_samples/nested_ccp/src/cpu.sch rename to juno_samples/patterns/src/gpu.sch index 7e6be7ee..3d9c8c9e 100644 --- a/juno_samples/nested_ccp/src/cpu.sch +++ b/juno_samples/patterns/src/gpu.sch @@ -2,7 +2,8 @@ gvn(*); phi-elim(*); dce(*); -auto-outline(*); +let out = auto-outline(*); +gpu(out.entry); ip-sroa(*); sroa(*); @@ -14,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs index bfd37cb5..58c2c5aa 100644 --- a/juno_samples/simple3/build.rs +++ b/juno_samples/simple3/build.rs @@ -1,11 +1,22 @@ use juno_build::JunoCompiler; fn main() { - JunoCompiler::new() - 
.file_in_src("simple3.jn") - .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - .unwrap() - .build() - .unwrap(); + #[cfg(not(feature = "cuda"))] + { + JunoCompiler::new() + .file_in_src("simple3.jn") + .unwrap() + .build() + .unwrap(); + } + #[cfg(feature = "cuda")] + { + JunoCompiler::new() + .file_in_src("simple3.jn") + .unwrap() + .schedule_in_src("gpu.sch") + .unwrap() + .build() + .unwrap(); + } } diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch index 93e85c48..d6c2a9d6 100644 --- a/juno_samples/simple3/src/gpu.sch +++ b/juno_samples/simple3/src/gpu.sch @@ -15,5 +15,3 @@ dce(*); infer-schedules(*); gcm(*); -dce(*); -gcm(*); -- GitLab From 7849656e49a926a695b447fcc741431b66085265 Mon Sep 17 00:00:00 2001 From: Russel Arbore <prathi3@illinois.edu> Date: Fri, 31 Jan 2025 16:06:12 -0600 Subject: [PATCH 106/109] oops --- juno_scheduler/src/pm.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 9d6d00d4..2371e0f2 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -673,6 +673,7 @@ impl PassManager { &typing[idx], &control_subgraphs[idx], &bbs[idx], + &backing_allocations[&FunctionID::new(idx)], &collection_objects[&FunctionID::new(idx)], &def_uses[idx], &fork_join_maps[idx], -- GitLab From 12b5e54fecdf071b70f11669dabd300a3a437c49 Mon Sep 17 00:00:00 2001 From: Russel Arbore <prathi3@illinois.edu> Date: Fri, 31 Jan 2025 16:59:53 -0600 Subject: [PATCH 107/109] clean extra dim --- hercules_cg/src/gpu.rs | 54 +++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 6c62ed76..d7a6d258 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -288,7 +288,8 @@ impl GPUContext<'_> { self.get_root_forks_and_num_blocks(self.fork_tree); let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, self.fork_tree, is_block_parallel); let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(self.fork_tree, thread_root_root_fork); - // TODO: Uncomment and adjust once we know logic of extra dim + // TODO: Uncomment and adjust once we know logic of extra dim. This will affect constant + // collections, reads, and writes. 
// let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map); let extra_dim_collects = HashSet::new(); @@ -749,30 +750,9 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; fork_control_map: &HashMap<NodeID, HashSet<NodeID>>, fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, ) -> HashSet<TypeID> { - // Get all constant collection creations - let collect_consts: HashSet<NodeID> = (0..self.function.nodes.len()) - .filter(|idx| self.function.nodes[*idx].is_constant() && !self.types[self.typing[*idx].idx()].is_primitive()) - .map(|idx| NodeID::new(idx)) - .collect(); - // Reverse fork_control_map - let control_fork_map: HashMap<NodeID, NodeID> = fork_control_map.iter() - .flat_map(|(fork, controls)| { - controls.iter().map(move |control| (*control, *fork)) - }) - .collect(); - // Get all uses of each collection, map each use to basic block, then map each basic block to fork - let collect_fork_users: HashMap<NodeID, HashSet<NodeID>> = collect_consts.iter() - .map(|collect_const| { - (*collect_const, self.def_use_map.get_users(*collect_const)) - }) - .map(|(collect_const, users)| { - (collect_const, users.iter().map(|user| control_fork_map[&self.bbs.0[user.idx()]]).collect()) - }) - .collect(); - collect_fork_users.iter() - .filter(|(_, fork_users)| !fork_thread_quota_map.contains_key(fork_users.iter().next().unwrap())) - .map(|(collect_const, _)| self.typing[collect_const.idx()]) - .collect() + // Determine which fork each collection is used in, and check if it's + // parallelized via the fork_thread_quota_map. + todo!() } fn codegen_data_control_no_forks( @@ -1237,7 +1217,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } // Read of primitive requires load after pointer math. Node::Read { collect, indices } => { - let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); + let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects); let data_type_id = self.typing[id.idx()]; if self.types[data_type_id.idx()].is_primitive() { let type_name = self.get_type(data_type_id, true); @@ -1253,7 +1233,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; data, indices, } => { - let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); + let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; let cg_tile = match state { @@ -1452,27 +1432,31 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; * This function emits collection name + pointer math for the provided indices. * All collection types use char pointers. 
*/ - fn codegen_collect(&self, collect: NodeID, indices: &[Index], has_extra_dim: bool) -> String { + fn codegen_collect(&self, collect: NodeID, indices: &[Index], extra_dim_collects: &HashSet<TypeID>) -> String { let mut index_ptr = "0".to_string(); let type_id = self.typing[collect.idx()]; for index in indices { match index { Index::Field(field) => { - self.get_size(type_id, Some(*field), None); + self.get_size(type_id, Some(*field), Some(extra_dim_collects)); } // Variants of summations have zero offset Index::Variant(_) => {} // Convert multi-d array index to 1-d index, and optionally // convert to single-byte index by multiplying by element size Index::Position(array_indices) => { + let has_extra_dim = extra_dim_collects.contains(&self.typing[collect.idx()]); + if has_extra_dim { + continue; + } let Type::Array(element_type, extents) = &self.types[self.typing[collect.idx()].idx()] else { panic!("Expected array type") }; let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]); - let max_left_array_index = array_indices.len() - 1 - if has_extra_dim { 1 } else { 0 }; - for (i, index) in array_indices.iter().skip(if has_extra_dim { 1 } else { 0 }).rev().enumerate() { + let max_left_array_index = array_indices.len() - 1; + for (i, index) in array_indices.iter().rev().enumerate() { cumulative_offset = format!( "{} * ({}{}", cumulative_offset, @@ -1487,9 +1471,9 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; index_ptr.push_str(&format!( " + {}{}", cumulative_offset, - ")".repeat(array_indices.len() - if has_extra_dim { 1 } else { 0 }) + ")".repeat(array_indices.len()) )); - let element_size = self.get_size(*element_type, None, None); + let element_size = self.get_size(*element_type, None, Some(extra_dim_collects)); index_ptr.push_str(&format!(" * {}", element_size)); } } @@ -1556,7 +1540,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; format!("*reinterpret_cast<{}>({}+{})", field_type, name, offset), constant_fields[i], false, - extra_dim_collects, + None, dynamic_shared_offset, w, num_tabs, @@ -1619,7 +1603,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { - let array_size = multiply_dcs(if extra_dim_collects.is_some() && extra_dim_collects.unwrap().contains(&type_id) { &extents[1..] 
} else { extents }); + let array_size = if extra_dim_collects.is_some() && extra_dim_collects.unwrap().contains(&type_id) { "1".to_string() } else { multiply_dcs(extents) }; format!("{} * {}", self.get_alignment(*element_type), array_size) } Type::Product(fields) => { -- GitLab From 008fd95d73bb500c3744ed8721a4b256eb0dac5c Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 31 Jan 2025 17:47:57 -0600 Subject: [PATCH 108/109] fix lifetime --- hercules_rt/src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index c5ef5f6a..ed5dca1d 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -152,12 +152,12 @@ impl<'a> HerculesCPURefMut<'a> { #[cfg(feature = "cuda")] impl<'a> HerculesCUDARef<'a> { - pub fn to_cpu_ref<T>(self, dst: &mut [T]) -> HerculesCPURef<'a> { + pub fn to_cpu_ref<'b, T>(self, dst: &'b mut [T]) -> HerculesCPURefMut<'b> { unsafe { let size = self.size; let ptr = NonNull::new(dst.as_ptr() as *mut u8).unwrap(); __copy_cuda_to_cpu(ptr.as_ptr(), self.ptr.as_ptr(), size); - HerculesCPURef { + HerculesCPURefMut { ptr, size, _phantom: PhantomData, @@ -192,12 +192,12 @@ impl<'a> HerculesCUDARefMut<'a> { } } - pub fn to_cpu_ref<T>(self, dst: &mut [T]) -> HerculesCPURef<'a> { + pub fn to_cpu_ref<'b, T>(self, dst: &mut [T]) -> HerculesCPURefMut<'b> { unsafe { let size = self.size; let ptr = NonNull::new(dst.as_ptr() as *mut u8).unwrap(); __copy_cuda_to_cpu(ptr.as_ptr(), self.ptr.as_ptr(), size); - HerculesCPURef { + HerculesCPURefMut { ptr, size, _phantom: PhantomData, -- GitLab From 386d6159645015819ce0f06de5cbcb7828c39fd7 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 31 Jan 2025 17:52:14 -0600 Subject: [PATCH 109/109] make cava call medianMatrix again --- juno_samples/cava/src/cava.jn | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn index ab4fbe59..359a83ed 100644 --- a/juno_samples/cava/src/cava.jn +++ b/juno_samples/cava/src/cava.jn @@ -117,24 +117,7 @@ fn denoise<row : usize, col : usize>(input : f32[CHAN, row, col]) -> f32[CHAN, r } } - let tmp : f32[9]; - for r = 0 to 3 { - for c = 0 to 3 { - tmp[r * 3 + c] = filter[r, c]; - } - } - - for i = 0 to 9 - 1 { - for j = 0 to 9 - i - 1 { - if tmp[j] > tmp[j+1] { - let t : f32 = tmp[j]; - tmp[j] = tmp[j+1]; - tmp[j+1] = t; - } - } - } - - res[chan, r, c] = tmp[9 / 2]; + res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter); } else { res[chan, r, c] = input[chan, r, c]; } -- GitLab
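
A usage sketch for the `to_cpu_ref` change in PATCH 108/109: the device-to-host copy now hands back a `HerculesCPURefMut<'b>` whose lifetime is the borrow of the destination slice, rather than a `HerculesCPURef` tied to the lifetime of the CUDA reference, so the host-side handle cannot outlive the buffer it points into and is mutable, matching the fact that the copy writes `dst`. The fragment below is a hand-written illustration assembled from the `CUDABox`, `HerculesCPURef`, and `to_cpu_ref` calls visible in the patches above, not code from the repository; the function name, data values, and buffer length are made up, and it assumes the `cuda` feature of `hercules_rt` is enabled.

    #[cfg(feature = "cuda")]
    fn roundtrip_sketch() {
        use hercules_rt::{CUDABox, HerculesCPURef};

        // Arbitrary host data; values and length are only for illustration.
        let src: Box<[i32]> = Box::new([1, 2, 3, 4]);
        // Copy the host slice into a device-side allocation.
        let dev = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&src));
        // Host buffer that will receive the copy; it must outlive `back`.
        let mut dst = vec![0i32; 4];
        // After PATCH 108 this returns HerculesCPURefMut<'b>, where 'b is the
        // mutable borrow of `dst`, instead of a ref tied to the device handle.
        let back = dev.get_ref().to_cpu_ref(&mut dst);
        // End the mutable borrow of `dst` before reading it directly.
        drop(back);
        // `dst` now holds the contents copied back from the device.
        assert_eq!(&dst[..], &[1, 2, 3, 4]);
    }

The design point the lifetime fix captures is that the copied bytes live in the caller-supplied host buffer, so the returned reference should borrow that buffer; the original signature let the result claim the device reference's lifetime, which could outlive `dst`.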