diff --git a/.gitignore b/.gitignore index 16e4eda72be12bf1449508941f676c9453459632..f8a684ce5223884e46f65b7fa67b6328eadf3100 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,6 @@ *.c *.o *.a -*.hman +*.hrt .*.swp .vscode diff --git a/Cargo.lock b/Cargo.lock index 23c5f4c79b036d0099f4a73d7a6c858cdd703b11..e9e4f311440fbafe440d4734b35fbcc54365bd3e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -267,6 +267,26 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbc26382d871df4b7442e3df10a9402bf3cf5e55cbd66f12be38861425f0564" +[[package]] +name = "call" +version = "0.1.0" +dependencies = [ + "async-std", + "juno_build", + "rand", + "with_builtin_macros", +] + +[[package]] +name = "ccp" +version = "0.1.0" +dependencies = [ + "async-std", + "juno_build", + "rand", + "with_builtin_macros", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -375,7 +395,6 @@ version = "0.1.0" dependencies = [ "async-std", "clap", - "hercules_rt", "juno_build", "rand", "with_builtin_macros", @@ -448,7 +467,6 @@ version = "0.1.0" dependencies = [ "async-std", "clap", - "hercules_rt", "juno_build", "rand", "with_builtin_macros", @@ -633,29 +651,6 @@ dependencies = [ "take_mut", ] -[[package]] -name = "hercules_rt" -version = "0.1.0" -dependencies = [ - "hercules_rt_proc", - "libc", - "postcard", - "serde", -] - -[[package]] -name = "hercules_rt_proc" -version = "0.1.0" -dependencies = [ - "anyhow", - "hercules_cg", - "hercules_ir", - "hercules_opt", - "postcard", - "serde", - "uuid", -] - [[package]] name = "hermit-abi" version = "0.4.0" @@ -707,7 +702,6 @@ name = "juno_build" version = "0.1.0" dependencies = [ "hercules_ir", - "hercules_rt", "juno_frontend", "with_builtin_macros", ] @@ -717,7 +711,6 @@ name = "juno_casts_and_intrinsics" version = "0.1.0" dependencies = [ "async-std", - "hercules_rt", "juno_build", "with_builtin_macros", ] @@ -744,7 +737,6 @@ name = "juno_matmul" version = "0.1.0" dependencies = [ "async-std", - "hercules_rt", "juno_build", "with_builtin_macros", ] @@ -754,7 +746,6 @@ name = "juno_nested_ccp" version = "0.1.0" dependencies = [ "async-std", - "hercules_rt", "juno_build", "with_builtin_macros", ] @@ -774,7 +765,6 @@ name = "juno_simple3" version = "0.1.0" dependencies = [ "async-std", - "hercules_rt", "juno_build", "with_builtin_macros", ] @@ -896,7 +886,6 @@ version = "0.1.0" dependencies = [ "async-std", "clap", - "hercules_rt", "juno_build", "rand", "with_builtin_macros", @@ -1440,28 +1429,6 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" -[[package]] -name = "uuid" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" -dependencies = [ - "getrandom", - "rand", - "uuid-macro-internal", -] - -[[package]] -name = "uuid-macro-internal" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b91f57fe13a38d0ce9e28a03463d8d3c2468ed03d75375110ec71d93b449a08" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.79", -] - [[package]] name = "value-bag" version = "1.9.0" diff --git a/Cargo.toml b/Cargo.toml index bffe036473679129eaf32885918360c9782fc6a8..0b9262c841d1871c580677854625180e70a9309a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,10 +4,12 @@ members = [ "hercules_cg", "hercules_ir", "hercules_opt", - "hercules_rt", - "hercules_rt_proc", 
"hercules_tools/hercules_driver", + + "juno_frontend", + "juno_scheduler", + "juno_build", #"hercules_test/hercules_interpreter", #"hercules_test/hercules_tests", @@ -15,10 +17,8 @@ members = [ "hercules_samples/dot", "hercules_samples/matmul", "hercules_samples/fac", - - "juno_frontend", - "juno_scheduler", - "juno_build", + "hercules_samples/call", + "hercules_samples/ccp", "juno_samples/simple3", "juno_samples/matmul", diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs index 8437e9e44b2c7792b54f563f86f59ee5fa6bd23a..d9bf505c0e80350328d34ccd3a710bc2f0dbd267 100644 --- a/hercules_cg/src/cpu.rs +++ b/hercules_cg/src/cpu.rs @@ -1,10 +1,10 @@ extern crate bitvec; extern crate hercules_ir; -use std::cell::{Cell, RefCell}; -use std::collections::{HashMap, HashSet, VecDeque}; +use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::fmt::{Error, Write}; -use std::iter::once; +use std::iter::{zip, FromIterator}; +use std::sync::atomic::{AtomicUsize, Ordering}; use self::bitvec::prelude::*; @@ -12,1310 +12,866 @@ use self::hercules_ir::*; use crate::*; -/* - * List of big TODOs that aren't urgent: - * - * 1. Return `undef` when a PartitionExit data output isn't dominated by that - * datum's definition. PartitionExit always returns the complete set of data - * outputs that could ever be needed from a partition - this is because we - * don't want to introduce sum types into partition signatures or do funky - * pointer tricks. So, we could run into the following situation: - * - * Block 1 - * / \ - * Block 2 Block 3 - * | | - * Define v1 Define v2 - * / \ - * PartitionExit(v1,v2) PartitionExit(v1, v2) - * - * Let's assume that the PartitionExits branch to different partitions where - * one of them needs v1 and the other needs v2. Notice that both - * PartitionExits need to return both v1 and v2, since their signatures must - * be identical, even though for both, one of v1 or v2 doesn't dominate the - * PartitionExit. What *should* happen here is that each PartitionExit gets - * lowered to an LLVM `ret`, where the non-dominating output is set to - * `undef`. This works since in the original, un-partitioned, Hercules IR, - * defs must dominate uses, so we won't run into a situation where a returned - * `undef` value is actually read. What happens currently is that the - * generated LLVM will `ret` `%v1` and `%v2`, which LLVM won't compile (since - * the code wouldn't be in SSA form). This should get fixed when we start - * compiling more complicated codes. - * - * 2. Handle >= 3D fork-joins and array accesses. This isn't conceptually - * difficult, but generating the LLVM code to implement these is annoying. - * - * 3. Handle ABI properly when taking in / returning structs taking more than 16 - * bytes. When a passed / returned struct takes more than 16 bytes, it needs - * to be passed around via pointers. This is one of many platform specific C - * ABI rules we need to handle to be properly called from Rust (that 16 byte - * rule is actually x86-64 specific). I'm honestly not sure how to handle - * this well. We avoid running into the manifestation of this problem for - * some samples by removing unneeded parameters / return values from - * partitions at the schedule IR level, which we should do anyway, but this - * isn't a complete solution. 
- */ +static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0); /* - * The top level function to compile a schedule IR function into LLVM IR, for + * The top level function to compile a Hercules IR function into LLVM IR for * execution on the CPU. We generate LLVM IR textually, since there are no good * LLVM bindings for Rust, and we are *not* writing any C++. */ -pub fn cpu_compile<W: Write>( - function: &SFunction, - manifest: &PartitionManifest, +pub fn cpu_codegen<W: Write>( + function: &Function, + types: &Vec<Type>, + constants: &Vec<Constant>, + dynamic_constants: &Vec<DynamicConstant>, + reverse_postorder: &Vec<NodeID>, + typing: &Vec<TypeID>, + control_subgraph: &Subgraph, + bbs: &Vec<NodeID>, w: &mut W, ) -> Result<(), Error> { - // Calculate basic analyses over schedule IR. - let virt_reg_to_inst_id = sched_virt_reg_to_inst_id(function); - let dep_graph = sched_dependence_graph(function, &virt_reg_to_inst_id); - let svalue_types = sched_svalue_types(function); - let parallel_reduce_infos = sched_parallel_reduce_sections(function); - - // Calculate the names of each block. For blocks that are the top or bottom - // blocks of sequential fork-joins, references outside the fork-join - // actually need to refer to the header block. This is a bit complicated to - // handle, and we use these names in several places, so pre-calculate the - // block names. Intuitively, if we are "inside" a sequential fork-join, - // references to the top or bottom blocks actually refer to those blocks, - // while if we are "outside" the sequential fork-join, references to both - // the top or bottom blocks actually refer to the loop header block. - let mut block_names = HashMap::new(); - for (block_idx, block) in function.blocks.iter().enumerate() { - for fork_join_id in parallel_reduce_infos - .keys() - .map(|id| Some(*id)) - .chain(once(None)) - { - let block_id = BlockID::new(block_idx); - let possible_parent = block.kind.try_fork_join_id(); - let mut walk = fork_join_id; - - // Check if the location in the key is "inside" the location of the - // block. - let is_inside = if let Some(parent) = possible_parent { - loop { - if let Some(step) = walk { - if step == parent { - // If we see the block's location, then the key - // location is "inside". - break true; - } else { - // Walk the parent until we find something - // interesting. - walk = parallel_reduce_infos[&step].parent_fork_join_id; - } - } else { - // If we don't find the block, then the key location is - // "outside" the block's parallel-reduce. - break false; - } - } - } else { - // Every location is "inside" the top level sequential section. - true - }; - - // Check if the parent is a vectorized fork-join. - let is_parent_vectorized = possible_parent - // Check if the parent fork-join has a vector width. - .map(|parent| parallel_reduce_infos[&parent].vector_width.is_some()) - // Sequential blocks are not vectorized. - .unwrap_or(false); - - // If we are inside the block's fork-join or the block's fork-join - // is vectorized, then refer to the blocks directly. Vectorized - // fork-joins have the same LLVM IR control flow as the schedule IR - // control flow. - if is_inside || is_parent_vectorized { - block_names.insert((block_id, fork_join_id), format!("bb_{}", block_idx)); - } else { - block_names.insert( - (block_id, fork_join_id), - format!("fork_join_seq_header_{}", possible_parent.unwrap().idx()), - ); - } - } - } - - // Create context for emitting LLVM IR. 
let ctx = CPUContext { function, - manifest, - block: Cell::new((0, &function.blocks[0])), - - virt_reg_to_inst_id, - dep_graph, - svalue_types, - parallel_reduce_infos, - - block_names, - - vector_width: Cell::new(None), - outside_def_used_in_vector: RefCell::new(HashSet::new()), - vectors_from_parallel: RefCell::new(HashSet::new()), - vector_reduce_associative_vars: RefCell::new(HashSet::new()), - vector_reduce_cycle: Cell::new(false), + types, + constants, + dynamic_constants, + reverse_postorder, + typing, + control_subgraph, + bbs, }; - ctx.emit_function(w)?; - - Ok(()) + ctx.codegen_function(w) } -/* - * Top level structure to hold analysis info and cell-ed state. - */ struct CPUContext<'a> { - function: &'a SFunction, - manifest: &'a PartitionManifest, - block: Cell<(usize, &'a SBlock)>, - - // Basic analyses over schedule IR. - virt_reg_to_inst_id: HashMap<usize, InstID>, - dep_graph: HashMap<InstID, Vec<InstID>>, - svalue_types: HashMap<SValue, SType>, - parallel_reduce_infos: HashMap<ForkJoinID, ParallelReduceInfo>, - - // Calculate the names of each block up front. For blocks that are the top - // or bottom blocks of sequential fork-joins, references outside the fork- - // join actually need to refer to the header block. This is a bit - // complicated to handle, and we use these names in several places, so pre- - // calculate the block names. Intuitively, if we are "inside" a sequential - // fork-join, references to the top or bottom blocks actually refer to those - // blocks, while if we are "outside" the sequential fork-join, references to - // both the top or bottom blocks actually refer to the loop header block. - // Fully vectorized fork-joins are not considered sequential. - block_names: HashMap<(BlockID, Option<ForkJoinID>), String>, + function: &'a Function, + types: &'a Vec<Type>, + constants: &'a Vec<Constant>, + dynamic_constants: &'a Vec<DynamicConstant>, + reverse_postorder: &'a Vec<NodeID>, + typing: &'a Vec<TypeID>, + control_subgraph: &'a Subgraph, + bbs: &'a Vec<NodeID>, +} - // Track whether we are currently in a vectorized parallel section - this - // affects how we lower types, for example. - vector_width: Cell<Option<usize>>, - // Track which virtual registers are defined outside the vectorized parallel - // section and used within it. - outside_def_used_in_vector: RefCell<HashSet<usize>>, - // Track which virtual registers are defined in the vectorized parallel - // section and used in the vectorized reduce section. - vectors_from_parallel: RefCell<HashSet<usize>>, - // Track which reduction variables (store their virtual register and - // variable number) are associative in the vectorized reduce section. - vector_reduce_associative_vars: RefCell<HashSet<(usize, usize)>>, - // track whether there are any non-associative reduction variables in a - // vectorized reduce section (which corresponds to whether we need to - // generate explicit control flow or not). - vector_reduce_cycle: Cell<bool>, +#[derive(Default, Debug)] +struct LLVMBlock { + phis: String, + body: String, + term: String, } impl<'a> CPUContext<'a> { - fn emit_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { - // Emit the partition function signature. - write!(w, "define ")?; - if self.function.return_types.len() == 1 { - self.emit_type(&self.function.return_types[0], w)?; - } else { - // Functions with multiple return values return said values in a - // struct. 
- self.emit_type( - &SType::Product(self.function.return_types.clone().into()), - w, - )?; - } - write!(w, " @{}(", self.manifest.name)?; - (0..self.function.param_types.len()) - .map(|param_idx| Some(SValue::VirtualRegister(param_idx))) - .intersperse(None) - .map(|token| -> Result<(), Error> { - match token { - Some(param_svalue) => { - self.emit_svalue(¶m_svalue, true, w)?; - } - None => write!(w, ", ")?, - } - Ok(()) - }) - .collect::<Result<(), Error>>()?; - // Technically, this may fail if for some reason there's a parallel - // launch partition with no parameters. Blame LLVM for being - // unnecessarily strict about commas of all things... - for parallel_launch_dim in 0..self.manifest.device.num_parallel_launch_dims() { - write!(w, ", i64 %parallel_launch_{}_low", parallel_launch_dim)?; - write!(w, ", i64 %parallel_launch_{}_len", parallel_launch_dim)?; - } - write!(w, ") {{\n",)?; - - // Emit the function body. Emit each block, one at a time. - for (block_idx, block) in self.function.blocks.iter().enumerate() { - self.block.set((block_idx, block)); - - // For "tops" of sequential fork-joins, we emit a special top block - // to be the loop header for the fork-join loop. - if let Some(fork_join_id) = block.kind.try_parallel() - && self.parallel_reduce_infos[&fork_join_id] - .top_parallel_block - .idx() - == block_idx - && self.parallel_reduce_infos[&fork_join_id] - .vector_width - .is_none() - { - self.emit_fork_join_seq_header(fork_join_id, block_idx, w)?; - } - - // Emit the header for the block. - write!( - w, - "{}:\n", - &self.block_names[&(BlockID::new(block_idx), block.kind.try_fork_join_id())] - )?; - - // If this block is in a vectorized parallel section, set up the - // context for vector code generation. - if let Some(fork_join_id) = block.kind.try_parallel() - && let Some(width) = self.parallel_reduce_infos[&fork_join_id].vector_width - { - self.setup_vectorized_parallel_block(width, w)?; - } - - // If this block is in a vectorized reduce section, set up either a - // post-parallel reduction loop or a vector reduction, depending on - // whether there's an associative schedule on each reduction - // variable. - if let Some(fork_join_id) = block.kind.try_reduce() - && let Some(width) = self.parallel_reduce_infos[&fork_join_id].vector_width - { - self.setup_vectorized_reduce_block(fork_join_id, width, w)?; + fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { + // Dump the function signature. + write!( + w, + "define {} @{}(", + self.get_type(self.function.return_type), + self.function.name + )?; + let mut first_param = true; + // The first set of parameters are dynamic constants. + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; } - - // For each basic block, emit instructions in that block. Emit using - // a worklist over the dependency graph. - let mut emitted = bitvec![u8, Lsb0; 0; block.insts.len()]; - let mut worklist = VecDeque::from((0..block.insts.len()).collect::<Vec<_>>()); - while let Some(inst_idx) = worklist.pop_front() { - let inst_id = InstID::new(block_idx, inst_idx); - let dependencies = &self.dep_graph[&inst_id]; - let all_uses_emitted = dependencies - .into_iter() - // Check that all used instructions in this block... - .filter(|inst_id| inst_id.idx_0() == block_idx) - // were already emitted. - .all(|inst_id| emitted[inst_id.idx_1()]); - // Phis don't need to wait for all of their uses to be emitted. 
- if block.insts[inst_idx].is_phi() || all_uses_emitted { - self.emit_inst( - block.virt_regs[inst_id.idx_1()].0, - &block.insts[inst_idx], - block.kind.try_fork_join_id(), - w, - )?; - emitted.set(inst_id.idx_1(), true); - } else { - worklist.push_back(inst_idx); - } + write!(w, "i64 %dc_p{}", idx)?; + } + // The second set of parameters are normal parameters. + for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; } - - self.reset_cells(); + write!(w, "{} %p{}", self.get_type(*ty), idx)?; } - write!(w, "}}\n",)?; + write!(w, ") {{\n")?; - Ok(()) - } + let mut blocks: BTreeMap<_, _> = (0..self.function.nodes.len()) + .filter(|idx| self.function.nodes[*idx].is_control()) + .map(|idx| (NodeID::new(idx), LLVMBlock::default())) + .collect(); - fn emit_type<W: Write>(&self, stype: &SType, w: &mut W) -> Result<(), Error> { - if let Some(width) = self.vector_width.get() { - write!(w, "<{} x ", width)?; - } + // Emit calculation of dynamic constants into the start block. Just + // calculate every dynamic constant, and let LLVM clean them up. + self.codegen_dynamic_constants( + blocks.get_mut(&NodeID::new(0)).unwrap(), + self.function.num_dynamic_constants, + )?; - match stype { - SType::Boolean => write!(w, "i1")?, - SType::Integer8 | SType::UnsignedInteger8 => write!(w, "i8")?, - SType::Integer16 | SType::UnsignedInteger16 => write!(w, "i16")?, - SType::Integer32 | SType::UnsignedInteger32 => write!(w, "i32")?, - SType::Integer64 | SType::UnsignedInteger64 => write!(w, "i64")?, - SType::Float32 => write!(w, "float")?, - SType::Float64 => write!(w, "double")?, - SType::Product(fields) => { - write!(w, "{{")?; - fields + // Emit data flow into basic blocks. + let mut worklist = VecDeque::from_iter( + self.reverse_postorder + .into_iter() + .filter(|id| !self.function.nodes[id.idx()].is_control()), + ); + let mut visited = bitvec![u8, Lsb0; 0; self.function.nodes.len()]; + while let Some(id) = worklist.pop_front() { + let node = &self.function.nodes[id.idx()]; + if node.is_phi() + || node.is_reduce() + || get_uses(node) + .as_ref() .into_iter() - .map(Some) - .intersperse(None) - .map(|token| -> Result<(), Error> { - match token { - Some(field_ty) => self.emit_type(field_ty, w)?, - None => write!(w, ", ")?, - } - Ok(()) - }) - .collect::<Result<(), Error>>()?; - write!(w, "}}")?; + .all(|u| self.function.nodes[u.idx()].is_control() || visited[u.idx()]) + { + self.codegen_data_node(*id, &mut blocks)?; + visited.set(id.idx(), true); + } else { + worklist.push_back(id); } - SType::ArrayRef(_) => write!(w, "ptr")?, } - if self.vector_width.get().is_some() { - write!(w, ">")?; + // Emit control flow into basic blocks. 
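+        // Each control node only contributes its own block's terminator; for
+        // example, an if node becomes a conditional `br` and a return node
+        // becomes a `ret` (see codegen_control_node below).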
+ for id in (0..self.function.nodes.len()).map(NodeID::new) { + if !self.function.nodes[id.idx()].is_control() { + continue; + } + self.codegen_control_node(id, &mut blocks)?; } - Ok(()) - } - - fn emit_constant<W: Write>(&self, sconstant: &SConstant, w: &mut W) -> Result<(), Error> { - match sconstant { - SConstant::Boolean(val) => write!(w, "{}", val)?, - SConstant::Integer8(val) => write!(w, "{}", val)?, - SConstant::Integer16(val) => write!(w, "{}", val)?, - SConstant::Integer32(val) => write!(w, "{}", val)?, - SConstant::Integer64(val) => write!(w, "{}", val)?, - SConstant::UnsignedInteger8(val) => write!(w, "{}", val)?, - SConstant::UnsignedInteger16(val) => write!(w, "{}", val)?, - SConstant::UnsignedInteger32(val) => write!(w, "{}", val)?, - SConstant::UnsignedInteger64(val) => write!(w, "{}", val)?, - SConstant::Float32(val) => { - if val.fract() == 0.0 { - write!(w, "{}.0", val)? - } else { - write!(w, "{}", val)? - } - } - SConstant::Float64(val) => { - if val.fract() == 0.0 { - write!(w, "{}.0", val)? - } else { - write!(w, "{}", val)? - } - } - SConstant::Product(fields) => { - write!(w, "{{")?; - fields - .into_iter() - .map(Some) - .intersperse(None) - .map(|token| -> Result<(), Error> { - match token { - Some(field_cons) => { - self.emit_type(&field_cons.get_type(), w)?; - write!(w, " ")?; - self.emit_constant(field_cons, w)?; - } - None => write!(w, ", ")?, - } - Ok(()) - }) - .collect::<Result<(), Error>>()?; - write!(w, "}}")?; - } + // Dump the emitted basic blocks. + for (id, block) in blocks { + write!( + w, + "{}:\n{}{}{}", + self.get_block_name(id), + block.phis, + block.body, + block.term + )?; } + write!(w, "}}\n")?; Ok(()) } - fn emit_svalue<W: Write>(&self, svalue: &SValue, add_ty: bool, w: &mut W) -> Result<(), Error> { - if add_ty { - self.emit_type(&self.svalue_types[svalue], w)?; - write!(w, " ")?; - } - if self.vector_width.get().is_some() - && svalue - .try_virt_reg() - .map(|virt_reg| self.outside_def_used_in_vector.borrow().contains(&virt_reg)) - .unwrap_or(false) - { - match svalue { - SValue::VirtualRegister(virt_reg) => { - write!(w, "%vec_{}_v{}", self.block.get().0, virt_reg)? - } - SValue::Constant(_) => todo!(), - } - } else if svalue - .try_virt_reg() - .map(|virt_reg| self.vectors_from_parallel.borrow().contains(&virt_reg)) - .unwrap_or(false) - { - match svalue { - SValue::VirtualRegister(virt_reg) => write!(w, "%extract_v{}", virt_reg)?, - SValue::Constant(_) => todo!(), - } - } else { - match svalue { - SValue::VirtualRegister(virt_reg) => write!(w, "%v{}", virt_reg)?, - SValue::Constant(cons) => self.emit_constant(cons, w)?, - } + /* + * While control nodes in Hercules IR are predecessor-centric (each take a + * control input that defines the predecessor relationship), LLVM IR basic + * blocks are successor centric (each branch to successor blocks with a + * branch instruction). This difference requires explicit translation. + */ + fn codegen_control_node( + &self, + id: NodeID, + blocks: &mut BTreeMap<NodeID, LLVMBlock>, + ) -> Result<(), Error> { + match self.function.nodes[id.idx()] { + // Start, region, and projection control nodes all have exactly one + // successor and are otherwise simple. + Node::Start + | Node::Region { preds: _ } + | Node::Projection { + control: _, + selection: _, + } => { + let term = &mut blocks.get_mut(&id).unwrap().term; + let succ = self.control_subgraph.succs(id).next().unwrap(); + write!(term, " br label %{}\n", self.get_block_name(succ))? 
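+                // For illustration, a region node whose lone successor is node 7
+                // emits the terminator `br label %bb_7` (numbering is hypothetical).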
+ } + // If nodes have two successors - examine the projections to + // determine which branch is which, and branch between them. + Node::If { control: _, cond } => { + let term = &mut blocks.get_mut(&id).unwrap().term; + let mut succs = self.control_subgraph.succs(id); + let succ1 = succs.next().unwrap(); + let succ2 = succs.next().unwrap(); + let succ1_is_true = self.function.nodes[succ1.idx()].try_projection(1).is_some(); + write!( + term, + " br {}, label %{}, label %{}\n", + self.get_value(cond, true), + self.get_block_name(if succ1_is_true { succ1 } else { succ2 }), + self.get_block_name(if succ1_is_true { succ2 } else { succ1 }), + )? + } + Node::Return { control: _, data } => { + let term = &mut blocks.get_mut(&id).unwrap().term; + write!(term, " ret {}\n", self.get_value(data, true))? + } + _ => panic!("PANIC: Can't lower {:?}.", self.function.nodes[id.idx()]), } Ok(()) } - fn emit_inst<W: Write>( + /* + * Lower data nodes in Hercules IR into LLVM instructions. + */ + fn codegen_data_node( &self, - virt_reg: usize, - inst: &SInst, - location: Option<ForkJoinID>, - w: &mut W, + id: NodeID, + blocks: &mut BTreeMap<NodeID, LLVMBlock>, ) -> Result<(), Error> { - // Helper to emit the initial assignment to the destination virtual - // register, when applicable. - let self_svalue = SValue::VirtualRegister(virt_reg); - let emit_assign = |w: &mut W| -> Result<(), Error> { write!(w, "%v{} = ", virt_reg) }; - - write!(w, " ")?; - match inst { - SInst::Phi { inputs } => { - emit_assign(w)?; - write!(w, "phi ")?; - self.emit_type(&self.svalue_types[&self_svalue], w)?; - write!(w, " ")?; - inputs - .into_iter() - .map(Some) - .intersperse(None) - .map(|token| match token { - Some((pred_block_id, svalue)) => { - write!(w, "[ ")?; - self.emit_svalue(svalue, false, w)?; - write!(w, ", %{} ]", &self.block_names[&(*pred_block_id, location)])?; - Ok(()) - } - None => write!(w, ", "), - }) - .collect::<Result<(), Error>>()?; - } - SInst::ThreadID { - dimension, - fork_join, - } => { - emit_assign(w)?; - if let Some(width) = self.vector_width.get() { - write!(w, "add <{} x i64> <", width)?; - for idx in 0..width { - if idx != 0 { - write!(w, ", ")?; - } - write!(w, "i64 {}", idx)?; + match self.function.nodes[id.idx()] { + Node::Phi { control, ref data } => { + let phis = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().phis; + let preds = self.function.nodes[control.idx()].try_region().unwrap(); + write!( + phis, + " {} = phi {} ", + self.get_value(id, false), + self.get_type(self.typing[id.idx()]) + )?; + for idx in 0..preds.len() { + if idx != 0 { + write!(phis, ", ")?; } - write!(w, ">, zeroinitializer")?; - } else { - write!(w, "add i64 0, %thread_id_{}_{}", fork_join.idx(), dimension)?; - } - } - SInst::ReductionVariable { number } => { - write!(w, "; Already emitted reduction variable #{number}.")?; - } - SInst::Jump { - target, - parallel_entry: _, - reduce_exit, - } => { - if reduce_exit.is_some() && self.vector_reduce_cycle.get() { - // If we're closing a non-vectorized reduction for a - // vectorized parallel, jump back to the beginning of the - // reduction, not the beginning of the parallel section. 
- let self_block_idx = self.block.get().0; write!( - w, - "br label %{}", - &self.block_names[&(BlockID::new(self_block_idx), location)], + phis, + "[ {}, %{} ]", + self.get_value(data[idx], false), + self.get_block_name(preds[idx]) )?; - } else if reduce_exit.is_some() && self.vectors_from_parallel.borrow().is_empty() { - // If we're closing a reduction and the parallel-reduce is - // not vectorized, we need to branch back to the beginning - // of the parallel-reduce. - write!( - w, - "br label %fork_join_seq_header_{}", - location.unwrap().idx(), - )?; - } else { - // If this is a normal jump (or is closing a reduction and - // is vectorized, along with the parallel section), then - // branch to the successor as expected. - write!(w, "br label %{}", &self.block_names[&(*target, location)])?; } + write!(phis, "\n")?; } - SInst::Branch { - cond, - false_target, - true_target, - } => { - // Branches aren't involved in any parallel-reduce shenanigans, - // so lowering them is straightforward. - write!(w, "br ")?; - self.emit_svalue(cond, true, w)?; + Node::Parameter { index } => { + let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body; + let ty = self.get_type(self.typing[id.idx()]); write!( - w, - ", label %{}, label %{}", - &self.block_names[&(*true_target, location)], - &self.block_names[&(*false_target, location)], + body, + " {} = bitcast {} %p{} to {}\n", + self.get_value(id, false), + ty, + index, + ty )?; } - SInst::PartitionExit { data_outputs } => { - if data_outputs.len() == 0 { - write!(w, "ret {{}} zeroinitializer")?; - } else if data_outputs.len() == 1 { - write!(w, "ret ")?; - self.emit_svalue(&data_outputs[0], true, w)?; - } else { - let ret_ty = SType::Product( - data_outputs - .iter() - .map(|svalue| self.svalue_types[svalue].clone()) - .collect(), - ); - write!(w, "%v{}_0 = insertvalue ", virt_reg)?; - self.emit_type(&ret_ty, w)?; - write!(w, " undef, ")?; - self.emit_svalue(&data_outputs[0], true, w)?; - write!(w, ", 0\n")?; - for idx in 1..data_outputs.len() { - write!(w, " %v{}_{} = insertvalue ", virt_reg, idx)?; - self.emit_type(&ret_ty, w)?; - write!(w, " %v{}_{}, ", virt_reg, idx - 1)?; - self.emit_svalue(&data_outputs[idx], true, w)?; - write!(w, ", {}\n", idx)?; + Node::Constant { id: cons_id } => { + let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body; + write!(body, " {} = bitcast ", self.get_value(id, false))?; + match self.constants[cons_id.idx()] { + Constant::Boolean(val) => write!(body, "i1 {} to i1\n", val)?, + Constant::Integer8(val) => write!(body, "i8 {} to i8\n", val)?, + Constant::Integer16(val) => write!(body, "i16 {} to i16\n", val)?, + Constant::Integer32(val) => write!(body, "i32 {} to i32\n", val)?, + Constant::Integer64(val) => write!(body, "i64 {} to i64\n", val)?, + Constant::UnsignedInteger8(val) => write!(body, "i8 {} to i8\n", val)?, + Constant::UnsignedInteger16(val) => write!(body, "i16 {} to i16\n", val)?, + Constant::UnsignedInteger32(val) => write!(body, "i32 {} to i32\n", val)?, + Constant::UnsignedInteger64(val) => write!(body, "i64 {} to i64\n", val)?, + Constant::Float32(val) => { + if val.fract() == 0.0 { + write!(body, "float {}.0 to float\n", val)? + } else { + write!(body, "float {} to float\n", val)? + } + } + Constant::Float64(val) => { + if val.fract() == 0.0 { + write!(body, "double {}.0 to double", val)? + } else { + write!(body, "double {} to double", val)? 
+ } } - write!(w, " ret ")?; - self.emit_type(&ret_ty, w)?; - write!(w, " %v{}_{}", virt_reg, data_outputs.len() - 1)?; + _ => panic!("PANIC: Can't dynamically allocate memory for an aggregate type within a CPU function."), } } - SInst::Return { value } => { - write!(w, "ret ")?; - self.emit_svalue(value, true, w)?; - } - SInst::Unary { input, op } => { - emit_assign(w)?; + Node::DynamicConstant { id: dc_id } => { + let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body; + // Dynamic constants are all pre-calculated at the top of the + // function. + write!( + body, + " {} = bitcast i64 %dc{} to i64\n", + self.get_value(id, false), + dc_id.idx() + )? + } + Node::Unary { op, input } => { + let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body; match op { - SUnaryOperator::Not => { - write!(w, "xor ")?; - self.emit_svalue(input, true, w)?; - write!(w, ", -1")?; - } - SUnaryOperator::Neg => { - if self.svalue_types[input].is_float() { - write!(w, "fneg ")?; - self.emit_svalue(input, true, w)?; + UnaryOperator::Not => write!( + body, + " {} = xor {}, -1\n", + self.get_value(id, false), + self.get_value(input, true) + )?, + UnaryOperator::Neg => { + if self.types[self.typing[input.idx()].idx()].is_float() { + write!( + body, + " {} = fneg {}", + self.get_value(id, false), + self.get_value(input, true) + )? } else { - write!(w, "mul ")?; - self.emit_svalue(input, true, w)?; - write!(w, ", -1")?; + write!( + body, + " {} = mul {}, -1", + self.get_value(id, false), + self.get_value(input, true) + )? } } - SUnaryOperator::Cast(dst_ty) => { - let src_ty = &self.svalue_types[input]; - if src_ty.is_integer() + UnaryOperator::Cast(dst_ty_id) => { + let src_ty_id = self.typing[input.idx()]; + let dst_ty = &self.types[dst_ty_id.idx()]; + let src_ty = &self.types[src_ty_id.idx()]; + let opcode = if src_ty.is_integer() && dst_ty.is_integer() && src_ty.num_bits() > dst_ty.num_bits() { - write!(w, "trunc ")?; + "trunc" } else if src_ty.is_signed() && dst_ty.is_integer() && src_ty.num_bits() < dst_ty.num_bits() { - write!(w, "sext ")?; + "sext" } else if src_ty.is_integer() && dst_ty.is_integer() && src_ty.num_bits() < dst_ty.num_bits() { - write!(w, "zext ")?; + "zext" } else if src_ty.is_integer() && dst_ty.is_integer() { // A no-op. - write!(w, "bitcast ")?; + "bitcast" } else if src_ty.is_float() && dst_ty.is_float() && src_ty.num_bits() > dst_ty.num_bits() { - write!(w, "fptrunc ")?; + "fptrunc" } else if src_ty.is_float() && dst_ty.is_float() && src_ty.num_bits() < dst_ty.num_bits() { - write!(w, "fpext ")?; + "fpext" } else if src_ty.is_float() && dst_ty.is_float() { // A no-op. - write!(w, "bitcast ")?; + "bitcast" } else if src_ty.is_float() && dst_ty.is_signed() { - write!(w, "fptosi ")?; + "fptosi" } else if src_ty.is_float() && dst_ty.is_integer() { - write!(w, "fptoui ")?; + "fptoui" } else if src_ty.is_signed() && dst_ty.is_float() { - write!(w, "sitofp ")?; + "sitofp" } else if src_ty.is_integer() && dst_ty.is_float() { - write!(w, "uitofp ")?; + "uitofp" } else { panic!("PANIC: Invalid cast type combination."); - } - self.emit_svalue(input, true, w)?; - write!(w, " to ")?; - self.emit_type(dst_ty, w)?; + }; + write!( + body, + " {} = {} {} to {}\n", + self.get_value(id, false), + opcode, + self.get_value(input, true), + self.get_type(dst_ty_id), + )? 
} } } - SInst::Binary { left, right, op } => { - // If we're in a vectorized reduce block and this binary - // operation is reducing over an associative reduction variable, - // then we need to emit a LLVM vector reduce intrinsic. - // Otherwise lower into a normal LLVM binary op. - let try_associative_reduction = |sval: &SValue| { - sval.try_virt_reg() - .map(|virt_reg| { - self.vector_reduce_associative_vars - .borrow() - .iter() - .filter(|(red_virt_reg, _)| *red_virt_reg == virt_reg) - .map(|(red_virt_reg, red_num)| (*red_virt_reg, *red_num)) - .next() - }) - .flatten() - }; - if let Some((red_virt_reg, red_num)) = - try_associative_reduction(left).or(try_associative_reduction(right)) - { - let left_virt_reg = left - .try_virt_reg() - .expect("PANIC: Associative reduction can't involve constants."); - let right_virt_reg = right - .try_virt_reg() - .expect("PANIC: Associative reduction can't involve constants."); - let vector_virt_reg = if left_virt_reg != red_virt_reg { - left_virt_reg - } else if right_virt_reg != red_virt_reg { - right_virt_reg - } else { - panic!("PANIC: Associative reduction can't use the reduction variable more than once."); - }; - let info = &self.parallel_reduce_infos[&location.unwrap()]; - write!(w, "%v{} = call reassoc ", red_virt_reg)?; - self.emit_type(&self.svalue_types[&self_svalue], w)?; - let op = op.get_llvm_op(&self.svalue_types[left]); - write!(w, " @llvm.vector.reduce.{}", op)?; - let width = info.vector_width.unwrap(); - self.emit_reduce_suffix(width, &self.svalue_types[&self_svalue], w)?; - write!(w, "(")?; - self.emit_svalue(&info.reduce_inits[red_num], true, w)?; - write!(w, ", ")?; - self.vector_width.set(Some(width)); - let old_vectors_from_parallel = self.vectors_from_parallel.take(); - self.emit_svalue(&SValue::VirtualRegister(vector_virt_reg), true, w)?; - self.vector_width.set(None); - self.vectors_from_parallel - .replace(old_vectors_from_parallel); - write!(w, ")")?; - } else { - emit_assign(w)?; - let op = op.get_llvm_op(&self.svalue_types[left]); - write!(w, "{} ", op)?; - self.emit_svalue(left, true, w)?; - write!(w, ", ")?; - self.emit_svalue(right, false, w)?; + Node::Binary { op, left, right } => { + enum OpTy { + Float, + Unsigned, + Signed, } - } - SInst::Ternary { + + let left_ty = &self.types[self.typing[left.idx()].idx()]; + let op_ty = if left_ty.is_float() { + OpTy::Float + } else if left_ty.is_unsigned() { + OpTy::Unsigned + } else { + OpTy::Signed + }; + + let opcode = match (op, op_ty) { + (BinaryOperator::Add, OpTy::Float) => "fadd", + (BinaryOperator::Add, _) => "add", + (BinaryOperator::Sub, OpTy::Float) => "fsub", + (BinaryOperator::Sub, _) => "sub", + (BinaryOperator::Mul, OpTy::Float) => "fmul", + (BinaryOperator::Mul, _) => "mul", + (BinaryOperator::Div, OpTy::Float) => "fdiv", + (BinaryOperator::Div, OpTy::Unsigned) => "udiv", + (BinaryOperator::Div, OpTy::Signed) => "sdiv", + (BinaryOperator::Rem, OpTy::Float) => "frem", + (BinaryOperator::Rem, OpTy::Unsigned) => "urem", + (BinaryOperator::Rem, OpTy::Signed) => "srem", + (BinaryOperator::LT, OpTy::Float) => "fcmp olt", + (BinaryOperator::LT, OpTy::Unsigned) => "icmp ult", + (BinaryOperator::LT, OpTy::Signed) => "icmp slt", + (BinaryOperator::LTE, OpTy::Float) => "fcmp ole", + (BinaryOperator::LTE, OpTy::Unsigned) => "icmp ule", + (BinaryOperator::LTE, OpTy::Signed) => "icmp sle", + (BinaryOperator::GT, OpTy::Float) => "fcmp ogt", + (BinaryOperator::GT, OpTy::Unsigned) => "icmp ugt", + (BinaryOperator::GT, OpTy::Signed) => "icmp sgt", + (BinaryOperator::GTE, 
OpTy::Float) => "fcmp oge", + (BinaryOperator::GTE, OpTy::Unsigned) => "icmp uge", + (BinaryOperator::GTE, OpTy::Signed) => "icmp sge", + (BinaryOperator::EQ, OpTy::Float) => "fcmp oeq", + (BinaryOperator::EQ, _) => "icmp eq", + (BinaryOperator::NE, OpTy::Float) => "fcmp one", + (BinaryOperator::NE, _) => "icmp ne", + (BinaryOperator::Or, _) => "or", + (BinaryOperator::And, _) => "and", + (BinaryOperator::Xor, _) => "xor", + (BinaryOperator::LSh, _) => "lsh", + (BinaryOperator::RSh, OpTy::Unsigned) => "lshr", + (BinaryOperator::RSh, _) => "ashr", + }; + + let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body; + write!( + body, + " {} = {} {}, {}\n", + self.get_value(id, false), + opcode, + self.get_value(left, true), + self.get_value(right, false), + )? + } + Node::Ternary { + op, first, second, third, - op, } => { - emit_assign(w)?; + let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body; match op { - STernaryOperator::Select => { - write!(w, "select ")?; - self.emit_svalue(first, true, w)?; - write!(w, ", ")?; - self.emit_svalue(second, true, w)?; - write!(w, ", ")?; - self.emit_svalue(third, true, w)?; - } + TernaryOperator::Select => write!( + body, + " {} = select {}, {}, {}\n", + self.get_value(id, false), + self.get_value(first, true), + self.get_value(second, true), + self.get_value(third, true) + )?, } } - SInst::IntrinsicCall { intrinsic, args } => { - emit_assign(w)?; - write!(w, "call ")?; - self.emit_type(&self.svalue_types[&self_svalue], w)?; + Node::IntrinsicCall { + intrinsic, + ref args, + } => { + let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body; write!( - w, - " @llvm.{}.{}(", - // TODO: make lower case names conform to LLVM expectations. - intrinsic.lower_case_name(), - Self::intrinsic_type_str(&self.svalue_types[&self_svalue]) + body, + " {} = call {} {}(", + self.get_value(id, false), + self.get_type(self.typing[id.idx()]), + convert_intrinsic(&intrinsic, &self.types[self.typing[id.idx()].idx()]), )?; - self.emit_svalue(&args[0], true, w)?; - for idx in 1..args.len() { - write!(w, ", ")?; - self.emit_svalue(&args[idx], true, w)?; - } - write!(w, ")")?; - } - SInst::ProductExtract { product, indices } => { - emit_assign(w)?; - write!(w, "extractvalue ")?; - self.emit_svalue(product, true, w)?; - for index in indices { - write!(w, ", {}", index)?; - } - } - SInst::ProductInsert { - product, - data, - indices, - } => { - emit_assign(w)?; - write!(w, "insertvalue ")?; - self.emit_svalue(product, true, w)?; - write!(w, ", ")?; - self.emit_svalue(data, true, w)?; - for index in indices { - write!(w, ", {}", index)?; + for idx in 0..args.len() { + if idx != 0 { + write!(body, ", ")?; + } + write!(body, "{}", self.get_value(args[idx], true))?; } + write!(body, ")\n")? 
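+                // As an illustrative (assumed) example: if convert_intrinsic maps a
+                // sqrt call on f32 to "@llvm.sqrt.f32", the emitted line looks like
+                // `%vN = call float @llvm.sqrt.f32(float %vM)`, with names hypothetical.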
} - SInst::ArrayLoad { - array, - position, - bounds, + Node::Read { + collect, + ref indices, } => { - self.emit_linear_index_calc(virt_reg, position, bounds, w)?; - write!(w, "%load_ptr_{} = getelementptr ", virt_reg)?; - let old_width = self.vector_width.take(); - self.emit_type(&self.svalue_types[&self_svalue], w)?; - self.vector_width.set(old_width); - write!(w, ", ")?; - self.emit_svalue(array, true, w)?; - write!(w, ", ")?; - self.emit_type(&self.svalue_types[&position[0]], w)?; - write!(w, " %calc_linear_idx_{}\n ", virt_reg)?; - emit_assign(w)?; - if let Some(width) = self.vector_width.get() { - write!(w, "call ")?; - self.emit_type(&self.svalue_types[&self_svalue], w)?; - write!(w, " @llvm.masked.gather")?; - self.emit_gather_scatter_suffix(width, &self.svalue_types[&self_svalue], w)?; - write!(w, "(")?; - self.emit_type(&self.svalue_types[array], w)?; - write!(w, " %load_ptr_{}, i32 8, <{} x i1> <", virt_reg, width)?; - for idx in 0..width { - if idx != 0 { - write!(w, ", ")?; - } - write!(w, "i1 true")?; - } - write!(w, ">, ")?; - self.emit_type(&self.svalue_types[&self_svalue], w)?; - write!(w, " undef)")?; + let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body; + let collect_name = self.get_value(collect, false); + let collect_ty = self.typing[collect.idx()]; + let index_ptr_name = + self.codegen_index_math(&collect_name, collect_ty, indices, body)?; + let self_ty = self.typing[id.idx()]; + if self.types[self_ty.idx()].is_primitive() { + // If this read reaches a primitive type, actually perform a + // load. + write!( + body, + " {} = load {}, ptr {}\n", + self.get_value(id, false), + self.get_type(self_ty), + index_ptr_name + )?; } else { - write!(w, "load ")?; - self.emit_type(&self.svalue_types[&self_svalue], w)?; - write!(w, ", ptr %load_ptr_{}", virt_reg)?; + // If this read doesn't reach a primitive type, just return + // the calculated offset pointer for the sub-collection. 
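+                    // No copy happens here: the returned pointer simply aliases
+                    // into the original collection.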
+ write!( + body, + " {} = bitcast ptr {} to ptr\n", + self.get_value(id, false), + index_ptr_name + )?; } } - SInst::ArrayStore { - array, - value, - position, - bounds, + Node::Write { + collect, + data, + ref indices, } => { - self.emit_linear_index_calc(virt_reg, position, bounds, w)?; - write!(w, "%store_ptr_{} = getelementptr ", virt_reg)?; - let old_width = self.vector_width.take(); - self.emit_type(&self.svalue_types[value], w)?; - self.vector_width.set(old_width); - write!(w, ", ")?; - self.emit_svalue(array, true, w)?; - write!(w, ", ")?; - self.emit_type(&self.svalue_types[&position[0]], w)?; - write!(w, " %calc_linear_idx_{}\n ", virt_reg)?; - if let Some(width) = self.vector_width.get() { - write!(w, "call ")?; - self.emit_type(&self.svalue_types[&self_svalue], w)?; - write!(w, " @llvm.masked.scatter")?; - self.emit_gather_scatter_suffix(width, &self.svalue_types[&self_svalue], w)?; - write!(w, "(")?; - self.emit_svalue(array, true, w)?; - write!(w, ", ")?; - self.emit_type(&self.svalue_types[array], w)?; - write!(w, " %store_ptr_{}, i32 8, <{} x i1> <", virt_reg, width)?; - for idx in 0..width { - if idx != 0 { - write!(w, ", ")?; - } - write!(w, "i1 true")?; - } - write!(w, ">)")?; + let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body; + let collect_name = self.get_value(collect, false); + let collect_ty = self.typing[collect.idx()]; + let index_ptr_name = + self.codegen_index_math(&collect_name, collect_ty, indices, body)?; + let data_ty = self.typing[data.idx()]; + if self.types[data_ty.idx()].is_primitive() { + // If the data item being written is a primitive type, + // perform a single store of the data value. + write!( + body, + " store {}, ptr {}\n", + self.get_value(data, true), + index_ptr_name + )?; + write!( + body, + " {} = bitcast {} to ptr\n", + self.get_value(id, false), + self.get_value(collect, true) + )?; } else { - write!(w, "store ")?; - self.emit_svalue(value, true, w)?; - write!(w, ", ptr %store_ptr_{}", virt_reg)?; + // If the data item being written is not a primitive type, + // then perform a memcpy from the data collection to the + // destination collection. + let data_size = self.codegen_type_size(data_ty, body)?; + write!( + body, + " call void @llvm.memcpy.p0.p0.i64(ptr {}, {}, i64 {}, i1 false)\n", + index_ptr_name, + self.get_value(data, true), + data_size + )?; } } + _ => panic!("PANIC: Can't lower {:?}.", self.function.nodes[id.idx()]), } - write!(w, "\n")?; - Ok(()) } /* - * Implement the index math to convert a multi-dimensional position to a - * linear position inside an array. + * Calculate all of the dynamic constants upfront. 
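+     * Each one is materialized as a named i64, e.g. `%dc0 = bitcast i64 %dc_p0 to i64`
+     * for a dynamic constant bound to the first dynamic constant parameter
+     * (the concrete numbering here is illustrative).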
*/ - fn emit_linear_index_calc<W: Write>( + fn codegen_dynamic_constants( &self, - virt_reg: usize, - position: &[SValue], - bounds: &[SValue], - w: &mut W, + block: &mut LLVMBlock, + num_dc_params: u32, ) -> Result<(), Error> { - assert_eq!(position.len(), bounds.len()); - - if position.len() == 1 { - write!(w, "%calc_linear_idx_{} = add ", virt_reg)?; - self.emit_svalue(&position[0], true, w)?; - write!(w, ", zeroinitializer\n ")?; - } else if position.len() == 2 { - write!(w, "%calc_linear_idx_{}_0 = mul ", virt_reg)?; - self.emit_svalue(&position[0], true, w)?; - write!(w, ", ")?; - self.emit_svalue(&bounds[1], false, w)?; - write!(w, "\n %calc_linear_idx_{} = add ", virt_reg)?; - self.emit_svalue(&position[1], true, w)?; - write!(w, ", %calc_linear_idx_{}_0", virt_reg)?; - write!(w, "\n ")?; - } else { - todo!("TODO: Handle the 3 or more dimensional array case.") + let body = &mut block.body; + for dc in dynamic_constants_bottom_up(&self.dynamic_constants) { + match self.dynamic_constants[dc.idx()] { + DynamicConstant::Constant(val) => { + write!(body, " %dc{} = bitcast i64 {} to i64\n", dc.idx(), val)? + } + DynamicConstant::Parameter(idx) => { + if idx < num_dc_params as usize { + write!( + body, + " %dc{} = bitcast i64 %dc_p{} to i64\n", + dc.idx(), + idx + )? + } else { + write!(body, " %dc{} = bitcast i64 0 to i64\n", dc.idx())? + } + } + DynamicConstant::Add(left, right) => write!( + body, + " %dc{} = add i64%dc{},%dc{}\n", + dc.idx(), + left.idx(), + right.idx() + )?, + DynamicConstant::Sub(left, right) => write!( + body, + " %dc{} = sub i64%dc{},%dc{}\n", + dc.idx(), + left.idx(), + right.idx() + )?, + DynamicConstant::Mul(left, right) => write!( + body, + " %dc{} = mul i64%dc{},%dc{}\n", + dc.idx(), + left.idx(), + right.idx() + )?, + DynamicConstant::Div(left, right) => write!( + body, + " %dc{} = udiv i64%dc{},%dc{}\n", + dc.idx(), + left.idx(), + right.idx() + )?, + DynamicConstant::Rem(left, right) => write!( + body, + " %dc{} = urem i64%dc{},%dc{}\n", + dc.idx(), + left.idx(), + right.idx() + )?, + } } - Ok(()) } /* - * LLVM intrinsics are a pain to emit textually... + * Emit logic to index into an collection. */ - fn intrinsic_type_str(elem_ty: &SType) -> &'static str { - // We can't just use our previous routines for emitting types, because - // only inside intrinsics does LLVM use "f32" and "f64" properly! 
- match elem_ty { - SType::Boolean => "i1", - SType::Integer8 | SType::UnsignedInteger8 => "i8", - SType::Integer16 | SType::UnsignedInteger16 => "i16", - SType::Integer32 | SType::UnsignedInteger32 => "i32", - SType::Integer64 | SType::UnsignedInteger64 => "i64", - SType::Float32 => "f32", - SType::Float64 => "f64", - _ => panic!(), - } - } - - fn emit_reduce_suffix<W: Write>( + fn codegen_index_math( &self, - width: usize, - elem_ty: &SType, - w: &mut W, - ) -> Result<(), Error> { - write!(w, ".v{}{}", width, Self::intrinsic_type_str(elem_ty))?; - Ok(()) - } + collect_name: &str, + collect_ty: TypeID, + indices: &[Index], + body: &mut String, + ) -> Result<String, Error> { + let mut acc_ptr = collect_name.to_string(); + for index in indices { + match index { + Index::Field(idx) => { + let Type::Product(ref fields) = self.types[collect_ty.idx()] else { + panic!() + }; - fn emit_gather_scatter_suffix<W: Write>( - &self, - width: usize, - elem_ty: &SType, - w: &mut W, - ) -> Result<(), Error> { - write!( - w, - ".v{}{}.v{}p0", - width, - Self::intrinsic_type_str(elem_ty), - width - )?; - Ok(()) + // Get the offset of the field at index `idx` by calculating + // the product's size up to field `idx`, then offseting the + // base pointer by that amount. + let mut acc_offset = "0".to_string(); + for field in &fields[..*idx] { + let field_align = get_type_alignment(&self.types, *field); + let field = self.codegen_type_size(*field, body)?; + acc_offset = Self::round_up_to(&acc_offset, field_align, body)?; + acc_offset = Self::append(&acc_offset, &field, body)?; + } + acc_offset = Self::round_up_to( + &acc_offset, + get_type_alignment(&self.types, fields[*idx]), + body, + )?; + acc_ptr = Self::gep(collect_name, &acc_offset, body)?; + } + Index::Variant(_) => { + // The tag of a summation is at the end of the summation, so + // the variant pointer is just the base pointer. Do nothing. + } + Index::Position(ref pos) => { + let Type::Array(elem, ref dims) = self.types[collect_ty.idx()] else { + panic!() + }; + + // The offset of the position into an array is: + // + // ((0 * s1 + p1) * s2 + p2) * s3 + p3 ... + let elem_size = self.codegen_type_size(elem, body)?; + let mut acc_offset = "0".to_string(); + for (p, s) in zip(pos, dims) { + let p = self.get_value(*p, false); + let s = format!("%dc{}", s.idx()); + acc_offset = Self::multiply(&acc_offset, &s, body)?; + acc_offset = Self::append(&acc_offset, &p, body)?; + } + + // Convert offset in # elements -> # bytes. + acc_offset = Self::multiply(&acc_offset, &elem_size, body)?; + acc_ptr = Self::gep(collect_name, &acc_offset, body)?; + } + } + } + Ok(acc_ptr) } /* - * Emit the loop header implementing a sequential fork-join. For historical - * reasons, "sequential" fork-joins are just fork-joins that are lowered to - * LLVM level loops. This includes fork-joins that end up getting - * parallelized across threads via low/high bounds. + * Emit logic to calculate the size of a type. This needs to be emitted as + * IR since the size of an array may depend on array constants. */ - fn emit_fork_join_seq_header<W: Write>( - &self, - fork_join_id: ForkJoinID, - block_idx: usize, - w: &mut W, - ) -> Result<(), Error> { - let info = &self.parallel_reduce_infos[&fork_join_id]; - let entry_name = &self.block_names[&(info.predecessor, Some(fork_join_id))]; - let loop_name = &self.block_names[&(info.reduce_block, Some(fork_join_id))]; - let parallel_launch = self.manifest.device.num_parallel_launch_dims() > 0 && info.top_level; - - // Start the header of the loop. 
- write!(w, "fork_join_seq_header_{}:\n", fork_join_id.idx())?; - - // Emit the phis for the linear loop index variable and the reduction - // variables. - write!( - w, - " %linear_{} = phi i64 [ 0, %{} ], [ %linear_{}_inc, %{} ]\n", - block_idx, entry_name, block_idx, loop_name, - )?; - for (var_num, virt_reg) in info.reduction_variables.iter() { - write!(w, " %v{} = phi ", virt_reg)?; - self.emit_type(&self.svalue_types[&SValue::VirtualRegister(*virt_reg)], w)?; - write!(w, " [ ")?; - self.emit_svalue(&info.reduce_inits[*var_num], false, w)?; - write!(w, ", %{} ], [ ", entry_name)?; - self.emit_svalue(&info.reduce_reducts[*var_num], false, w)?; - write!(w, ", %{} ]\n", loop_name)?; - } + fn codegen_type_size(&self, ty: TypeID, body: &mut String) -> Result<String, Error> { + match self.types[ty.idx()] { + Type::Control => panic!(), + Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => Ok("1".to_string()), + Type::Integer16 | Type::UnsignedInteger16 => Ok("2".to_string()), + Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => Ok("4".to_string()), + Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => Ok("8".to_string()), + Type::Product(ref fields) => { + let fields_align = fields + .into_iter() + .map(|id| get_type_alignment(&self.types, *id)); + let fields: Result<Vec<String>, Error> = fields + .into_iter() + .map(|id| self.codegen_type_size(*id, body)) + .collect(); + + // Emit LLVM IR to round up to the alignment of the next field, + // and then add the size of that field. At the end, round up to + // the alignment of the whole struct. + let mut acc_size = "0".to_string(); + for (field_align, field) in zip(fields_align, fields?) { + acc_size = Self::round_up_to(&acc_size, field_align, body)?; + acc_size = Self::append(&acc_size, &field, body)?; + } + Self::round_up_to(&acc_size, get_type_alignment(&self.types, ty), body) + } + Type::Summation(ref variants) => { + let variants: Result<Vec<String>, Error> = variants + .into_iter() + .map(|id| self.codegen_type_size(*id, body)) + .collect(); + + // The size of a summation is the size of the largest field, + // plus 1 byte and alignment for the discriminant. + let mut acc_size = "0".to_string(); + for variant in variants? { + acc_size = Self::max(&acc_size, &variant, body)?; + } - // Calculate the loop bounds. - if info.thread_counts.len() == 1 { - write!(w, " %bound_{} = add i64 0, ", block_idx)?; - if parallel_launch { - write!(w, "%parallel_launch_0_len")?; - } else { - self.emit_svalue(&info.thread_counts[0], false, w)?; + // No alignment is necessary for the 1 byte discriminant. + acc_size = Self::append(&acc_size, "1", body)?; + Self::round_up_to(&acc_size, get_type_alignment(&self.types, ty), body) } - write!(w, "\n")?; - } else if info.thread_counts.len() == 2 { - write!(w, " %bound_{} = mul ", block_idx)?; - if parallel_launch { - write!(w, "i64 %parallel_launch_0_len, %parallel_launch_1_len")?; - } else { - self.emit_svalue(&info.thread_counts[0], true, w)?; - write!(w, ", ")?; - self.emit_svalue(&info.thread_counts[1], false, w)?; + Type::Array(elem, ref bounds) => { + // The size of an array is the size of the element multipled by + // the dynamic constant bounds. + let mut acc_size = self.codegen_type_size(elem, body)?; + for dc in bounds { + acc_size = Self::multiply(&acc_size, &format!("dc{}", dc.idx()), body)?; + } + Ok(acc_size) } - write!(w, "\n")?; - } else { - todo!("TODO: Handle the 3 or more dimensional fork-join case.") } + } - // Calculate the multi-dimensional thread indices. 
- if info.thread_counts.len() == 1 && parallel_launch { - write!( - w, - " %thread_id_{}_0 = add i64 %parallel_launch_0_low, %linear_{}\n", - fork_join_id.idx(), - block_idx - )?; - } else if info.thread_counts.len() == 1 { - write!( - w, - " %thread_id_{}_0 = add i64 0, %linear_{}\n", - fork_join_id.idx(), - block_idx - )?; - } else if info.thread_counts.len() == 2 && parallel_launch { - write!( - w, - " %unshifted_id_{}_0 = udiv i64 %linear_{}, %parallel_launch_1_len\n", - fork_join_id.idx(), - block_idx - )?; - write!( - w, - " %unshifted_id_{}_1 = urem i64 %linear_{}, %parallel_launch_1_len\n", - fork_join_id.idx(), - block_idx - )?; - write!( - w, - " %thread_id_{}_0 = add i64 %unshifted_id_{}_0, %parallel_launch_0_low\n", - fork_join_id.idx(), - fork_join_id.idx(), - )?; - write!( - w, - " %thread_id_{}_1 = add i64 %unshifted_id_{}_1, %parallel_launch_1_low\n", - fork_join_id.idx(), - fork_join_id.idx(), - )?; - } else if info.thread_counts.len() == 2 { - write!( - w, - " %thread_id_{}_0 = udiv i64 %linear_{}, ", - fork_join_id.idx(), - block_idx - )?; - self.emit_svalue(&info.thread_counts[1], false, w)?; - write!(w, "\n")?; - write!( - w, - " %thread_id_{}_1 = urem i64 %linear_{}, ", - fork_join_id.idx(), - block_idx - )?; - self.emit_svalue(&info.thread_counts[1], false, w)?; - write!(w, "\n")?; + fn get_value(&self, id: NodeID, ty: bool) -> String { + if ty { + format!("{} %v{}", self.get_type(self.typing[id.idx()]), id.idx()) } else { - todo!("TODO: Handle the 3 or more dimensional fork-join case.") + format!("%v{}", id.idx()) } + } - // Increment the linear index. - write!( - w, - " %linear_{}_inc = add i64 %linear_{}, 1\n", - block_idx, block_idx - )?; + fn get_block_name(&self, id: NodeID) -> String { + format!("bb_{}", id.idx()) + } + + fn get_type(&self, id: TypeID) -> &'static str { + convert_type(&self.types[id.idx()]) + } - // Emit the branch. + // Use the trick that given a number `x` and a power of two `m`, we can + // compute rounding up `x` to the next multiple of + // `m` via: + // + // (x + m - 1) & -m + // + // Which is equivalent to the following LLVM IR (`m` is a constant): + // + // %1 = add i64 %x, (m-1) + // %2 = and i64 %1, -m + fn round_up_to(x: &str, m: usize, body: &mut String) -> Result<String, Error> { + let init_body_len = Self::gen_filler_id(); write!( - w, - " %cond_{} = icmp ult i64 %linear_{}, %bound_{}\n", - block_idx, block_idx, block_idx + body, + " %round_up_to.{} = add i64 {}, {}\n", + init_body_len, + x, + m - 1 )?; - let top_name = &self.block_names[&(BlockID::new(block_idx), Some(fork_join_id))]; - let succ_name = &self.block_names[&(info.successor, Some(fork_join_id))]; + let name = format!("%round_up_to.{}", Self::gen_filler_id()); write!( - w, - " br i1 %cond_{}, label %{}, label %{}\n", - block_idx, top_name, succ_name + body, + " {} = and i64 %round_up_to.{}, -{}\n", + name, init_body_len, m )?; - - Ok(()) + Ok(name) } - /* - * Calculate and emit block-level info for vectorized parallel blocks. - */ - fn setup_vectorized_parallel_block<W: Write>( - &self, - width: usize, - w: &mut W, - ) -> Result<(), Error> { - let (block_idx, block) = self.block.get(); - - // Get the uses of virtual registers defined outside the - // vectorized region. 
- let mut outside_def_used_in_vector = HashSet::new(); - for inst in block.insts.iter() { - for virt_reg in sched_get_uses(inst).filter_map(|svalue| svalue.try_virt_reg()) { - let outside = match self.virt_reg_to_inst_id.get(&virt_reg) { - Some(use_inst_id) => { - block.kind != self.function.blocks[use_inst_id.idx_0()].kind - } - // Parameters are always defined outside the vectorized - // region as scalars. - None => true, - }; - if outside { - outside_def_used_in_vector.insert(virt_reg); - } - } - } - - // Broadcast scalar values into vector values. The vector - // register produced needs to be indexed in name by the block - // index. This is because we may end up using the same value in - // multiple vectorized blocks, and we can't have those - // vectorized scalars have the same name. - for outside_virt_reg in outside_def_used_in_vector.iter() { - write!( - w, - " %vec1_{}_v{} = insertelement <1 x ", - block_idx, outside_virt_reg - )?; - let elem_ty = &self.svalue_types[&SValue::VirtualRegister(*outside_virt_reg)]; - self.emit_type(elem_ty, w)?; - write!(w, "> undef, ")?; - self.emit_type(elem_ty, w)?; - write!(w, " %v{}, i32 0\n", outside_virt_reg)?; - write!( - w, - " %vec_{}_v{} = shufflevector <1 x ", - block_idx, outside_virt_reg - )?; - self.emit_type(elem_ty, w)?; - write!(w, "> %vec1_{}_v{}, <1 x ", block_idx, outside_virt_reg)?; - self.emit_type(elem_ty, w)?; - write!(w, "> undef, <{} x i32> zeroinitializer\n", width)?; - } - - // Set the cell values in the context. - self.vector_width.set(Some(width)); - self.outside_def_used_in_vector - .replace(outside_def_used_in_vector); - - Ok(()) + // Emit LLVM IR to add the size of the next field. + fn append(x: &str, f: &str, body: &mut String) -> Result<String, Error> { + let name = format!("%append.{}", Self::gen_filler_id()); + write!(body, " {} = add i64 {}, {}\n", name, x, f)?; + Ok(name) } - /* - * Calculate and emit block-level info for vectorized reduce blocks. - */ - fn setup_vectorized_reduce_block<W: Write>( - &self, - fork_join_id: ForkJoinID, - width: usize, - w: &mut W, - ) -> Result<(), Error> { - let (block_idx, block) = self.block.get(); - - // Get uses of vector values defined in the parallel region. - let mut vectors_from_parallel = HashSet::new(); - for inst in block.insts.iter() { - for virt_reg in sched_get_uses(inst).filter_map(|svalue| svalue.try_virt_reg()) { - if let Some(inst_id) = self.virt_reg_to_inst_id.get(&virt_reg) - && self.function.blocks[inst_id.idx_0()].kind - == SBlockKind::Parallel(fork_join_id) - { - vectors_from_parallel.insert(virt_reg); - } - } - } - - // Each reduction may be representable by an LLVM reduction intrinsic. - // If every reduction in this reduce block is, then we don't need to - // generate an explicit loop. If any one reduction isn't representable - // as a single intrinsic, then we need to generate an explicit loop. The - // explicit loop calculates the reduction for all reductions that can't - // be represented by intrinsics, while intrinsics are still used to - // calculate reductions that can be represented by them. Currently, the - // "associative" schedule captures this info per reduction variable. 
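`round_up_to` and `append` above, along with the `max`, `multiply`, and `gep` helpers below, all share one pattern: write a single LLVM instruction into the body being built, mint a fresh virtual-register name from the `NUM_FILLER_REGS` counter, and hand that name back so calls compose like ordinary expressions. A hypothetical condensed version of the pattern with a local counter instead of the crate's atomic (the `Emitter` type here is illustrative only):

    use std::fmt::Write;

    // Each helper appends one instruction to the body under construction and
    // returns the fresh register name, so nested size computations read left
    // to right.
    struct Emitter {
        body: String,
        next: usize,
    }

    impl Emitter {
        fn fresh(&mut self, prefix: &str) -> String {
            let name = format!("%{}.{}", prefix, self.next);
            self.next += 1;
            name
        }

        fn add(&mut self, a: &str, b: &str) -> String {
            let name = self.fresh("add");
            writeln!(self.body, "  {} = add i64 {}, {}", name, a, b).unwrap();
            name
        }

        fn mul(&mut self, a: &str, b: &str) -> String {
            let name = self.fresh("mul");
            writeln!(self.body, "  {} = mul i64 {}, {}", name, a, b).unwrap();
            name
        }
    }

    fn main() {
        let mut e = Emitter { body: String::new(), next: 0 };
        // Build (%x + 7) * %dc0; the intermediate names thread through the calls.
        let t = e.add("%x", "7");
        let _size = e.mul(&t, "%dc0");
        print!("{}", e.body);
    }

The real code presumably uses a process-global atomic so the helpers can stay free functions without threading a counter through every call.
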
- let all_intrinsic_representable = block - .insts - .iter() - .enumerate() - .filter(|(_, inst)| inst.is_reduction_variable()) - .all(|(inst_idx, _)| block.schedules[&inst_idx].contains(&SSchedule::Associative)); - if !all_intrinsic_representable { - let info = &self.parallel_reduce_infos[&fork_join_id]; - let entry_name = &self.block_names[&(info.bottom_parallel_block, Some(fork_join_id))]; - let self_name = &self.block_names[&(info.reduce_block, Some(fork_join_id))]; - let succ_name = &self.block_names[&(info.successor, Some(fork_join_id))]; - - // Emit a loop header for the reduce. - write!( - w, - " %linear_{} = phi i64 [ 0, %{} ], [ %linear_{}_inc, %{}_reduce_body ]\n", - block_idx, entry_name, block_idx, self_name, - )?; - // Emit phis for reduction variables here, since they need to be - // above everything emitted below. - for (var_num, virt_reg) in info.reduction_variables.iter() { - // Only emit phis for reduction variables that aren't - // implemented in intrinsics. - if !block.schedules[&self.virt_reg_to_inst_id[virt_reg].idx_1()] - .contains(&SSchedule::Associative) - { - write!(w, " %v{} = phi ", virt_reg)?; - self.emit_type(&self.svalue_types[&SValue::VirtualRegister(*virt_reg)], w)?; - write!(w, " [ ")?; - self.emit_svalue(&info.reduce_inits[*var_num], false, w)?; - write!(w, ", %{} ], [ ", entry_name)?; - self.emit_svalue(&info.reduce_reducts[*var_num], false, w)?; - write!(w, ", %{}_reduce_body ]\n", self_name)?; - } - } - write!( - w, - " %linear_{}_inc = add i64 %linear_{}, 1\n", - block_idx, block_idx - )?; - // The loop bound is the constant vector width. - write!( - w, - " %cond_{} = icmp ult i64 %linear_{}, {}\n", - block_idx, block_idx, width - )?; - // Branch to the reduce loop body. - write!( - w, - " br i1 %cond_{}, label %{}_reduce_body, label %{}\n", - block_idx, self_name, succ_name - )?; - // The rest of the reduce block gets put into a "body" block. - write!(w, "{}_reduce_body:\n", self_name)?; - // Extract the needed element from the used parallel vectors. - self.vector_width.set(Some(width)); - for virt_reg in vectors_from_parallel.iter() { - write!(w, " %extract_v{} = extractelement ", virt_reg)?; - self.emit_svalue(&SValue::VirtualRegister(*virt_reg), true, w)?; - write!(w, ", i64 %linear_{}\n", block_idx)?; - } - self.vector_width.set(None); - - // Signal that the terminator needs to be a conditional branch to - // close the loop. - self.vector_reduce_cycle.set(true); - } + // Emit LLVM IR to get the maximum of two sizes. + fn max(a: &str, b: &str, body: &mut String) -> Result<String, Error> { + let name = format!("%max.{}", Self::gen_filler_id()); + write!( + body, + " {} = call i64 @llvm.umax.i64(i64 {}, i64 {})\n", + name, a, b + )?; + Ok(name) + } - let vector_reduce_associative_vars = block - .insts - .iter() - .enumerate() - .filter_map(|(inst_idx, inst)| { - inst.try_reduction_variable() - .map(|num| (inst_idx, block.virt_regs[inst_idx].0, num)) - }) - .filter(|(inst_idx, _, _)| block.schedules[&inst_idx].contains(&SSchedule::Associative)) - .map(|(_, virt_reg, num)| (virt_reg, num)) - .collect(); + // Emit LLVM IR to multiply two sizes. + fn multiply(a: &str, b: &str, body: &mut String) -> Result<String, Error> { + let name = format!("%multiply.{}", Self::gen_filler_id()); + write!(body, " {} = mul i64 {}, {}\n", name, a, b)?; + Ok(name) + } - self.vectors_from_parallel.replace(vectors_from_parallel); - self.vector_reduce_associative_vars - .replace(vector_reduce_associative_vars); + // Emit LLVM IR to gep a pointer from a byte size. 
+ fn gep(ptr: &str, size: &str, body: &mut String) -> Result<String, Error> { + let name = format!("%gep.{}", Self::gen_filler_id()); + write!( + body, + " {} = getelementptr i8, ptr {}, i64 {}\n", + name, ptr, size + )?; + Ok(name) + } - Ok(()) + fn gen_filler_id() -> usize { + NUM_FILLER_REGS.fetch_add(1, Ordering::Relaxed) } +} - /* - * Reset the cells storing block specific context configuration. - */ - pub fn reset_cells(&self) { - self.vector_width.take(); - self.outside_def_used_in_vector.take(); - self.vectors_from_parallel.take(); - self.vector_reduce_associative_vars.take(); - self.vector_reduce_cycle.take(); +fn convert_type(ty: &Type) -> &'static str { + match ty { + Type::Boolean => "i1", + Type::Integer8 | Type::UnsignedInteger8 => "i8", + Type::Integer16 | Type::UnsignedInteger16 => "i16", + Type::Integer32 | Type::UnsignedInteger32 => "i32", + Type::Integer64 | Type::UnsignedInteger64 => "i64", + Type::Float32 => "float", + Type::Float64 => "double", + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "ptr", + _ => panic!(), } } -impl SBinaryOperator { - fn get_llvm_op(&self, left_ty: &SType) -> &'static str { - enum OpTy { - Float, - Unsigned, - Signed, - } +fn convert_intrinsic(intrinsic: &Intrinsic, ty: &Type) -> String { + let intrinsic = match intrinsic { + Intrinsic::Abs => "abs", + Intrinsic::ACos => "acos", + Intrinsic::ASin => "asin", + Intrinsic::ATan => "atan", + Intrinsic::ATan2 => "atan2", + Intrinsic::Ceil => "ceil", + Intrinsic::Cos => "cos", + Intrinsic::Cosh => "cosh", + Intrinsic::Exp => "exp", + Intrinsic::Exp2 => "exp2", + Intrinsic::Floor => "floor", + Intrinsic::Ln => "log", + Intrinsic::Log10 => "log10", + Intrinsic::Log2 => "log2", + Intrinsic::Pow => "pow", + Intrinsic::Powf => "pow", + Intrinsic::Powi => "powi", + Intrinsic::Round => "round", + Intrinsic::Sin => "sin", + Intrinsic::Sinh => "sinh", + Intrinsic::Sqrt => "sqrt", + Intrinsic::Tan => "tan", + Intrinsic::Tanh => "tanh", + _ => panic!(), + }; - let op_ty = if left_ty.is_float() { - OpTy::Float - } else if left_ty.is_unsigned() { - OpTy::Unsigned - } else { - OpTy::Signed - }; + // We can't just use our previous routines for emitting types, because only + // inside intrinsics does LLVM use "f32" and "f64" properly! 
+ let ty = match ty { + Type::Boolean => "i1", + Type::Integer8 | Type::UnsignedInteger8 => "i8", + Type::Integer16 | Type::UnsignedInteger16 => "i16", + Type::Integer32 | Type::UnsignedInteger32 => "i32", + Type::Integer64 | Type::UnsignedInteger64 => "i64", + Type::Float32 => "f32", + Type::Float64 => "f64", + _ => panic!(), + }; - match (self, op_ty) { - (SBinaryOperator::Add, OpTy::Float) => "fadd", - (SBinaryOperator::Add, _) => "add", - (SBinaryOperator::Sub, OpTy::Float) => "fsub", - (SBinaryOperator::Sub, _) => "sub", - (SBinaryOperator::Mul, OpTy::Float) => "fmul", - (SBinaryOperator::Mul, _) => "mul", - (SBinaryOperator::Div, OpTy::Float) => "fdiv", - (SBinaryOperator::Div, OpTy::Unsigned) => "udiv", - (SBinaryOperator::Div, OpTy::Signed) => "sdiv", - (SBinaryOperator::Rem, OpTy::Float) => "frem", - (SBinaryOperator::Rem, OpTy::Unsigned) => "urem", - (SBinaryOperator::Rem, OpTy::Signed) => "srem", - (SBinaryOperator::LT, OpTy::Float) => "fcmp olt", - (SBinaryOperator::LT, OpTy::Unsigned) => "icmp ult", - (SBinaryOperator::LT, OpTy::Signed) => "icmp slt", - (SBinaryOperator::LTE, OpTy::Float) => "fcmp ole", - (SBinaryOperator::LTE, OpTy::Unsigned) => "icmp ule", - (SBinaryOperator::LTE, OpTy::Signed) => "icmp sle", - (SBinaryOperator::GT, OpTy::Float) => "fcmp ogt", - (SBinaryOperator::GT, OpTy::Unsigned) => "icmp ugt", - (SBinaryOperator::GT, OpTy::Signed) => "icmp sgt", - (SBinaryOperator::GTE, OpTy::Float) => "fcmp oge", - (SBinaryOperator::GTE, OpTy::Unsigned) => "icmp uge", - (SBinaryOperator::GTE, OpTy::Signed) => "icmp sge", - (SBinaryOperator::EQ, OpTy::Float) => "fcmp oeq", - (SBinaryOperator::EQ, _) => "icmp eq", - (SBinaryOperator::NE, OpTy::Float) => "fcmp one", - (SBinaryOperator::NE, _) => "icmp ne", - (SBinaryOperator::Or, _) => "or", - (SBinaryOperator::And, _) => "and", - (SBinaryOperator::Xor, _) => "xor", - (SBinaryOperator::LSh, _) => "lsh", - (SBinaryOperator::RSh, OpTy::Unsigned) => "lshr", - (SBinaryOperator::RSh, _) => "ashr", - } - } + format!("@llvm.{}.{}", intrinsic, ty) } diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs index 88171d33923fc773bd8ba8377e6eba80986612be..9013eff7fd2c310388d9869050b38d235a4e205c 100644 --- a/hercules_cg/src/lib.rs +++ b/hercules_cg/src/lib.rs @@ -1,15 +1,9 @@ -#![feature(let_chains, iter_intersperse, map_try_insert)] +#![feature(if_let_guard, let_chains)] pub mod cpu; -pub mod manifest; -pub mod sched_dot; -pub mod sched_gen; -pub mod sched_ir; -pub mod sched_schedule; +pub mod mem; +pub mod rt; pub use crate::cpu::*; -pub use crate::manifest::*; -pub use crate::sched_dot::*; -pub use crate::sched_gen::*; -pub use crate::sched_ir::*; -pub use crate::sched_schedule::*; +pub use crate::mem::*; +pub use crate::rt::*; diff --git a/hercules_cg/src/manifest.rs b/hercules_cg/src/manifest.rs deleted file mode 100644 index d9224c05b93435ec33c67ac261f70c47e9f3622b..0000000000000000000000000000000000000000 --- a/hercules_cg/src/manifest.rs +++ /dev/null @@ -1,180 +0,0 @@ -extern crate serde; - -extern crate hercules_ir; - -use std::collections::BTreeSet; -use std::iter::once; - -use self::serde::Deserialize; -use self::serde::Serialize; - -use self::hercules_ir::*; - -use crate::*; - -/* - * A manifest stores metadata about a Hercules function. This metadata is used - * by the runtime to actually call a Hercules function. 
- */ -#[derive(Debug, Clone, Hash, Serialize, Deserialize)] -pub struct Manifest { - // The signature of each Hercules function is represented in terms of - // STypes, since this is the lowest level type representation that Hercules - // constructs before reaching target-specific backends. - pub param_types: Vec<(SType, ParameterKind)>, - pub return_type: SType, - - // The dynamic constants (potentially) used in this Hercules function. - pub dynamic_constants: Vec<DynamicConstant>, - // The dimensions for array constants defined and used in this Hercules - // function. - pub array_constants: Vec<Box<[DynamicConstantID]>>, - - // The partitions that make up this Hercules function. - pub partitions: Vec<PartitionManifest>, -} - -#[derive(Debug, Clone, Hash, Serialize, Deserialize)] -pub struct PartitionManifest { - // Each partition has one corresponding SFunction. - pub name: SFunctionName, - // Record the type and kind of each parameter. - pub parameters: Vec<(SType, ParameterKind)>, - // Record the type and kind of each return value. - pub returns: Vec<(SType, ReturnKind)>, - // Record the list of possible successors from this partition. - pub successors: Vec<PartitionID>, - // Device specific parts of the manifest. Represents details of calling - // partition functions not present in the schedule IR type information - // (since schedule IR is target independent). - pub device: DeviceManifest, -} - -#[derive(Debug, Clone, Hash, Serialize, Deserialize, PartialEq, Eq)] -pub enum ParameterKind { - // A parameter corresponding to a parameter of the Hercules function. - HerculesParameter(usize), - // A parameter corresponding to some data defined in some other partition. - DataInput(NodeID), - // A parameter corresponding to a dynamic constant input to the Hercules - // function. - DynamicConstant(usize), - // A parameter corresponding to an array constant used in the partition. - ArrayConstant(ArrayID), -} - -#[derive(Debug, Clone, Hash, Serialize, Deserialize)] -pub enum ReturnKind { - // A return value corresponding to the return value of the Hercules - // function. - HerculesReturn, - // A return value corresponding to some data used in some other partition. - DataOutput(NodeID), - // An integer specifying which partition should be executed next, if this - // partition has multiple successors. - NextPartition, -} - -#[derive(Debug, Clone, Hash, Serialize, Deserialize)] -pub enum DeviceManifest { - CPU { - // If there's a top level fork-join that we parallel launch, specify, - // for each thread dimension, how many tiles we want to spawn, and - // the thread count. The thread count is a dynamic constant. - parallel_launch: Box<[(usize, DynamicConstantID)]>, - }, - GPU, - Call { - // This is a Hercules function name, not a schedule IR function name. - callee: String, - }, -} - -impl Manifest { - pub fn all_visible_types(&self) -> impl Iterator<Item = SType> + '_ { - self.param_types - // Include the Hercules function parameter types. - .iter() - .map(|(ty, _)| ty.clone()) - // Include the Hercules function return type. - .chain(once(self.return_type.clone())) - // Include the partition parameter types. - .chain( - self.partitions - .iter() - .map(|partition| partition.parameters.iter().map(|(ty, _)| ty.clone())) - .flatten(), - ) - // Include the partition return types. 
- .chain( - self.partitions - .iter() - .map(|partition| partition.returns.iter().map(|(ty, _)| ty.clone())) - .flatten(), - ) - // Include the product types formed by the partition return types, - // since multiple return values are returned inside a struct. - .chain(self.partitions.iter().map(|partition| { - SType::Product(partition.returns.iter().map(|(ty, _)| ty.clone()).collect()) - })) - } - - pub fn transitive_closure_type_set(type_set: BTreeSet<SType>) -> BTreeSet<SType> { - let mut closure = BTreeSet::new(); - let mut workset: BTreeSet<&SType> = type_set.iter().collect(); - - while let Some(ty) = workset.pop_last() { - match ty { - SType::Product(fields) => workset.extend(fields), - SType::ArrayRef(elem) => { - workset.insert(elem); - } - _ => {} - } - closure.insert(ty.clone()); - } - - closure - } -} - -impl PartitionManifest { - pub fn data_inputs(&self) -> impl Iterator<Item = (NodeID, &SType)> + '_ { - self.parameters.iter().filter_map(|(stype, param_kind)| { - if let ParameterKind::DataInput(id) = param_kind { - Some((*id, stype)) - } else { - None - } - }) - } - - pub fn data_outputs(&self) -> impl Iterator<Item = (NodeID, &SType)> + '_ { - self.returns.iter().filter_map(|(stype, return_kind)| { - if let ReturnKind::DataOutput(id) = return_kind { - Some((*id, stype)) - } else { - None - } - }) - } -} - -impl DeviceManifest { - pub fn cpu() -> Self { - DeviceManifest::CPU { - parallel_launch: Box::new([]), - } - } - - pub fn gpu() -> Self { - DeviceManifest::GPU - } - - pub fn num_parallel_launch_dims(&self) -> usize { - match self { - DeviceManifest::CPU { parallel_launch } => parallel_launch.len(), - _ => panic!(), - } - } -} diff --git a/hercules_cg/src/mem.rs b/hercules_cg/src/mem.rs new file mode 100644 index 0000000000000000000000000000000000000000..0c053455953937c9ff4955cc94b6482e0bb59373 --- /dev/null +++ b/hercules_cg/src/mem.rs @@ -0,0 +1,291 @@ +extern crate bitvec; +extern crate hercules_ir; + +use std::collections::{BTreeMap, BTreeSet}; + +use self::bitvec::prelude::*; + +use self::hercules_ir::*; + +#[derive(Debug)] +pub struct MemoryObjects { + node_id_to_memory_objects: Vec<Vec<usize>>, + memory_object_to_origin: Vec<NodeID>, + parameter_index_to_memory_object: Vec<Option<usize>>, + possibly_returned_memory_objects: Vec<usize>, +} + +impl MemoryObjects { + pub fn memory_objects(&self, id: NodeID) -> &Vec<usize> { + &self.node_id_to_memory_objects[id.idx()] + } + + pub fn origin(&self, memory_object: usize) -> NodeID { + self.memory_object_to_origin[memory_object] + } + + pub fn memory_object_of_parameter(&self, parameter: usize) -> Option<usize> { + self.parameter_index_to_memory_object[parameter] + } + + pub fn returned_memory_objects(&self) -> &Vec<usize> { + &self.possibly_returned_memory_objects + } + + pub fn num_memory_objects(&self) -> usize { + self.memory_object_to_origin.len() + } +} + +#[derive(Debug)] +pub struct MemoryObjectsMutability { + func_to_memory_object_to_mutable: Vec<BitVec<u8, Lsb0>>, +} + +impl MemoryObjectsMutability { + pub fn is_mutable(&self, id: FunctionID, memory_object: usize) -> bool { + self.func_to_memory_object_to_mutable[id.idx()][memory_object] + } +} + +/* + * Each node is assigned a set of memory objects output-ed from the node. This + * is just a set of memory object IDs (usize). 
+ */
+#[derive(PartialEq, Eq, Clone, Debug)]
+struct MemoryObjectLattice {
+    objs: BTreeSet<usize>,
+}
+
+impl Semilattice for MemoryObjectLattice {
+    fn meet(a: &Self, b: &Self) -> Self {
+        MemoryObjectLattice {
+            objs: a.objs.union(&b.objs).map(|x| *x).collect(),
+        }
+    }
+
+    fn top() -> Self {
+        MemoryObjectLattice {
+            objs: BTreeSet::new(),
+        }
+    }
+
+    fn bottom() -> Self {
+        // Technically, this lattice is unbounded - technically technically, the
+        // lattice is bounded by the number of memory objects in a given
+        // instance, but incorporating this information is not possible in our
+        // Semilattice interface. Luckily bottom() isn't necessary if we never
+        // call it, which we don't here.
+        panic!()
+    }
+}
+
+/*
+ * Top level function to analyze memory objects in a Hercules function. These
+ * are distinct collections (products, summations, arrays) that are used in a
+ * function, where we try to disambiguate the string of values produced by the
+ * immutable value semantics of Hercules IR into a smaller number of distinct
+ * memory objects that can be modified in-place.
+ */
+pub fn memory_objects(
+    function: &Function,
+    types: &Vec<Type>,
+    reverse_postorder: &Vec<NodeID>,
+    typing: &Vec<TypeID>,
+) -> MemoryObjects {
+    // Find memory objects originating at parameters, constants, calls, or
+    // undefs.
+    let memory_object_to_origin: Vec<_> = function
+        .nodes
+        .iter()
+        .enumerate()
+        .filter(|(idx, node)| {
+            (node.is_parameter() || node.is_constant() || node.is_call() || node.is_undef())
+                && !types[typing[*idx].idx()].is_primitive()
+        })
+        .map(|(idx, _)| NodeID::new(idx))
+        .collect();
+    let node_id_to_originating_memory_obj: BTreeMap<_, _> = memory_object_to_origin
+        .iter()
+        .enumerate()
+        .map(|(idx, id)| (*id, idx))
+        .collect();
+
+    // Map parameter index to memory object, if applicable. Panic if two
+    // parameter nodes with the same index are found - those really should get
+    // removed by GVN!
+    let mut parameter_index_to_memory_object = vec![None; function.param_types.len()];
+    for (memory_object, origin) in memory_object_to_origin.iter().enumerate() {
+        if let Some(param) = function.nodes[origin.idx()].try_parameter() {
+            assert!(
+                parameter_index_to_memory_object[param].is_none(),
+                "PANIC: Found multiple parameter nodes with the same index."
+            );
+            parameter_index_to_memory_object[param] = Some(memory_object);
+        }
+    }
+
+    // Run dataflow analysis to figure out which memory objects each data node
+    // may be. Note that only a strict subset of data nodes can be assigned
+    // memory objects:
+    //
+    // - Phi: selects between memory objects in SSA form, may be assigned
+    //   multiple possible memory objects.
+    // - Reduce: reduces over a memory object, similar to phis.
+    // - Parameter: may originate a memory object.
+    // - Constant: may originate a memory object.
+    // - Call: may originate a memory object - if it doesn't originate a memory
+    //   object, it doesn't become one based on arguments, as arguments are
+    //   passed to the callee.
+    // - Read: may extract a smaller memory object from its input - this is
+    //   considered to be the same memory object as the input, as no copy takes
+    //   place.
+    // - Write: updates a memory object.
+    // - Undef: may originate a dummy memory object.
+    //
+    // Some notable omissions are:
+    //
+    // - Return: doesn't technically "output" a memory object, but may consume
+    //   one. As in the logic with calls not returning a memory object, returns
+    //   are not assigned memory objects.
+ // - Ternary (select): selecting over memory objects is a gray area + // currently. Bail if we see a select over memory objects. + assert!(!function.nodes.iter().enumerate().any(|(idx, node)| node + .try_ternary(TernaryOperator::Select) + .is_some() + && !types[typing[idx].idx()].is_primitive())); + let lattice = forward_dataflow(function, reverse_postorder, |inputs, id| { + match function.nodes[id.idx()] { + Node::Phi { + control: _, + data: _, + } + | Node::Reduce { + control: _, + init: _, + reduct: _, + } => inputs + .into_iter() + .fold(MemoryObjectLattice::top(), |acc, input| { + MemoryObjectLattice::meet(&acc, input) + }), + Node::Parameter { index: _ } + | Node::Constant { id: _ } + | Node::Call { + control: _, + function: _, + dynamic_constants: _, + args: _, + } + | Node::Undef { ty: _ } + if let Some(obj) = node_id_to_originating_memory_obj.get(&id) => + { + MemoryObjectLattice { + objs: [*obj].iter().map(|x| *x).collect(), + } + } + Node::Read { + collect: _, + indices: _, + } + | Node::Write { + collect: _, + data: _, + indices: _, + } => inputs[0].clone(), + _ => MemoryObjectLattice::top(), + } + }); + + // Look at the memory objects the data input to each return could be. + let mut possibly_returned_memory_objects = BTreeSet::new(); + for node in function.nodes.iter() { + if let Node::Return { control: _, data } = node { + possibly_returned_memory_objects = possibly_returned_memory_objects + .union(&lattice[data.idx()].objs) + .map(|x| *x) + .collect(); + } + } + let possibly_returned_memory_objects = possibly_returned_memory_objects.into_iter().collect(); + + let node_id_to_memory_objects = lattice + .into_iter() + .map(|lattice| lattice.objs.into_iter().collect()) + .collect(); + MemoryObjects { + node_id_to_memory_objects, + memory_object_to_origin, + parameter_index_to_memory_object, + possibly_returned_memory_objects, + } +} + +/* + * Determine if each memory object in each function is mutated or not. + */ +pub fn memory_objects_mutability( + module: &Module, + callgraph: &CallGraph, + memory_objects: &Vec<MemoryObjects>, +) -> MemoryObjectsMutability { + let mut mutated: Vec<_> = memory_objects + .iter() + .map(|memory_objects| bitvec![u8, Lsb0; 0; memory_objects.num_memory_objects()]) + .collect(); + let topo = callgraph.topo(); + + for func_id in topo { + // A memory object is mutated when: + // 1. The object is the subject of a write node. + // 2. The object is passed as argument to a function that mutates it. + for (idx, node) in module.functions[func_id.idx()].nodes.iter().enumerate() { + if node.is_write() { + // Every memory object that the write itself corresponds to it + // mutable in this function. + for memory_object in memory_objects[func_id.idx()].memory_objects(NodeID::new(idx)) + { + mutated[func_id.idx()].set(*memory_object, true); + } + } else if let Some((_, callee_id, _, args)) = node.try_call() { + for (param_idx, arg) in args.into_iter().enumerate() { + // If this parameter corresponds to a memory object and it's + // mutable in the callee... + if let Some(param_callee_memory_object) = + memory_objects[callee_id.idx()].memory_object_of_parameter(param_idx) + && mutated[callee_id.idx()][param_callee_memory_object] + { + // Then every memory object corresponding to the + // argument node in this function is mutable. 
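The node-level analysis above boils down to a forward dataflow whose meet is set union: a node's set of possible memory objects is either the singleton of its own origin or the union of the sets flowing into it. A toy standalone version of that fixed point (the graph encoding below is hypothetical; the real analysis runs over Hercules IR through `forward_dataflow`):

    use std::collections::BTreeSet;

    // Toy meet-is-union fixed point: origins[v] marks nodes that originate a
    // memory object, and every other node unions the sets of its predecessors.
    fn union_dataflow(preds: &[Vec<usize>], origins: &[Option<usize>]) -> Vec<BTreeSet<usize>> {
        let mut out: Vec<BTreeSet<usize>> = vec![BTreeSet::new(); preds.len()];
        let mut changed = true;
        while changed {
            changed = false;
            for v in 0..preds.len() {
                let new: BTreeSet<usize> = match origins[v] {
                    Some(obj) => [obj].into_iter().collect(),
                    None => preds[v]
                        .iter()
                        .flat_map(|&p| out[p].iter().copied())
                        .collect(),
                };
                if new != out[v] {
                    out[v] = new;
                    changed = true;
                }
            }
        }
        out
    }

    fn main() {
        // Nodes 0 and 1 originate objects 0 and 1; node 2 is a phi-like merge
        // of both; node 3 reads from node 2 and so may be either object.
        let preds = vec![vec![], vec![], vec![0, 1], vec![2]];
        let origins = vec![Some(0), Some(1), None, None];
        let sets = union_dataflow(&preds, &origins);
        let expected: BTreeSet<usize> = [0, 1].into_iter().collect();
        assert_eq!(sets[3], expected);
    }
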
+ for memory_object in memory_objects[func_id.idx()].memory_objects(*arg) { + mutated[func_id.idx()].set(*memory_object, true); + } + } + } + } + } + } + + MemoryObjectsMutability { + func_to_memory_object_to_mutable: mutated, + } +} + +/* + * The alignment of a type does not depend on dynamic constants. + */ +pub fn get_type_alignment(types: &Vec<Type>, ty: TypeID) -> usize { + match types[ty.idx()] { + Type::Control => panic!(), + Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1, + Type::Integer16 | Type::UnsignedInteger16 => 2, + Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, + Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8, + Type::Product(ref members) | Type::Summation(ref members) => members + .into_iter() + .map(|id| get_type_alignment(types, *id)) + .max() + .unwrap_or(1), + Type::Array(elem, _) => get_type_alignment(types, elem), + } +} diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs new file mode 100644 index 0000000000000000000000000000000000000000..ddbc8f539e711d51c9d1005995068c6c2ee70eb8 --- /dev/null +++ b/hercules_cg/src/rt.rs @@ -0,0 +1,629 @@ +extern crate bitvec; +extern crate hercules_ir; + +use std::collections::{BTreeMap, VecDeque}; +use std::fmt::{Error, Write}; +use std::iter::{zip, FromIterator}; + +use self::bitvec::prelude::*; + +use self::hercules_ir::*; + +use crate::*; + +/* + * Entry Hercules functions are lowered to async Rust code to achieve easy task + * level parallelism. This Rust is generated textually, and is included via a + * procedural macro in the user's Rust code. + */ +pub fn rt_codegen<W: Write>( + func_id: FunctionID, + module: &Module, + reverse_postorder: &Vec<NodeID>, + typing: &Vec<TypeID>, + control_subgraph: &Subgraph, + bbs: &Vec<NodeID>, + callgraph: &CallGraph, + memory_objects: &Vec<MemoryObjects>, + memory_objects_mutability: &MemoryObjectsMutability, + w: &mut W, +) -> Result<(), Error> { + let ctx = RTContext { + func_id, + module, + reverse_postorder, + typing, + control_subgraph, + bbs, + callgraph, + memory_objects, + _memory_objects_mutability: memory_objects_mutability, + }; + ctx.codegen_function(w) +} + +struct RTContext<'a> { + func_id: FunctionID, + module: &'a Module, + reverse_postorder: &'a Vec<NodeID>, + typing: &'a Vec<TypeID>, + control_subgraph: &'a Subgraph, + bbs: &'a Vec<NodeID>, + callgraph: &'a CallGraph, + memory_objects: &'a Vec<MemoryObjects>, + // TODO: use once memory objects are passed in a custom type where this + // actually matters. + _memory_objects_mutability: &'a MemoryObjectsMutability, +} + +impl<'a> RTContext<'a> { + fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { + let func = &self.get_func(); + + // Dump the function signature. + write!( + w, + "#[allow(unused_variables,unused_mut)]async fn {}(", + func.name + )?; + let mut first_param = true; + // The first set of parameters are dynamic constants. + for idx in 0..func.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + } + write!(w, "dc_p{}: u64", idx)?; + } + // The second set of parameters are normal parameters. 
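Putting the signature logic above and the parameter-emission loops just below together: dynamic constants come first as plain u64s named dc_p{idx}, collection parameters and the return value cross the boundary as owned byte buffers (Box<[u8]>, per convert_type_interface later in the file) named p_i{idx}, and the body immediately drops down to raw pointers. For a hypothetical entry function with two dynamic constants and two array arguments, the emitted header looks roughly like the sketch below; the name matmul and the slice-level as_mut_ptr call are illustrative, and the generator spells the pointer conversion slightly differently:

    // Illustrative shape of a generated wrapper, not verbatim generator output.
    #[allow(unused_variables, unused_mut)]
    async fn matmul(dc_p0: u64, dc_p1: u64, mut p_i0: Box<[u8]>, mut p_i1: Box<[u8]>) -> Box<[u8]> {
        // Collections are reduced to raw pointers right away; the boxes stay
        // around (as Option slots in the real output) so ownership can be
        // handed back at the return.
        let p0: *mut u8 = p_i0.as_mut_ptr();
        let p1: *mut u8 = p_i1.as_mut_ptr();
        // ... control-token executor loop runs here ...
        unimplemented!()
    }

Keeping the boundary type an owned Box<[u8]> means the caller never sees raw pointers, while the generated body still gets the flat byte view the CPU partitions expect.
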
+ for idx in 0..func.param_types.len() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + } + if !self.module.types[func.param_types[idx].idx()].is_primitive() { + write!(w, "mut ")?; + } + write!( + w, + "p_i{}: {}", + idx, + self.get_type_interface(func.param_types[idx]) + )?; + } + write!(w, ") -> {} {{\n", self.get_type_interface(func.return_type))?; + + // Copy the "interface" parameters to "non-interface" parameters. + // The purpose of this is to convert memory objects from a Box<[u8]> + // type to a *mut u8 type. This name copying is done so that we can + // easily construct memory objects just after this by moving the + // "interface" parameters. + for (idx, ty) in func.param_types.iter().enumerate() { + if self.module.types[ty.idx()].is_primitive() { + write!(w, " let p{} = p_i{};\n", idx, idx)?; + } else { + write!( + w, + " let p{} = ::std::boxed::Box::as_mut_ptr(&mut p_i{}) as *mut u8;\n", + idx, idx + )?; + } + } + + // Collect the boxes representing ownership over memory objects for this + // function. The actual emitted computation is done entirely using + // pointers, so these get emitted to hold onto ownership over the + // underlying memory and to automatically clean them up when this + // function returns. Memory objects are inside Options, since their + // ownership may get passed to other called RT functions. If this + // function returns a memory object, then at the very end, right before + // the return, the to-be-returned pointer is compared against the owned + // memory objects - it should match exactly one of those objects, and + // that box is what's actually returned. + let mem_obj_ty = "::core::option::Option<::std::boxed::Box<[u8]>>"; + for memory_object in 0..self.memory_objects[self.func_id.idx()].num_memory_objects() { + let origin = self.memory_objects[self.func_id.idx()].origin(memory_object); + match func.nodes[origin.idx()] { + Node::Parameter { index } => write!( + w, + " let mut mem_obj{}: {} = Some(p_i{});\n", + memory_object, mem_obj_ty, index + )?, + Node::Constant { id: _ } => { + let size = self.codegen_type_size(self.typing[origin.idx()]); + write!( + w, + " let mut mem_obj{}: {} = Some((0..{}).map(|_| 0u8).collect());\n", + memory_object, mem_obj_ty, size + )? + } + Node::Call { + control: _, + function: _, + dynamic_constants: _, + args: _, + } + | Node::Undef { ty: _ } => write!( + w, + " let mut mem_obj{}: {} = None;\n", + memory_object, mem_obj_ty, + )?, + _ => panic!(), + } + } + + // Dump signatures for called CPU functions. + write!(w, " extern \"C\" {{\n")?; + for callee in self.callgraph.get_callees(self.func_id) { + let callee = &self.module.functions[callee.idx()]; + write!(w, " fn {}(", callee.name)?; + let mut first_param = true; + for idx in 0..callee.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + } + write!(w, "dc{}: u64", idx)?; + } + for (idx, ty) in callee.param_types.iter().enumerate() { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + } + write!(w, "p{}: {}", idx, self.get_type(*ty))?; + } + write!(w, ") -> {};\n", self.get_type(callee.return_type))?; + } + write!(w, " }}\n")?; + + // Declare intermediary variables for every value. 
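The Option<Box<[u8]>> slots above exist so that pointer-level computation and ownership of the underlying buffers stay separate: moving a Box does not move its heap allocation, so a raw pointer taken before the move still identifies the buffer afterwards, and the return path can match the pointer back to the box that owns it. A small standalone illustration of that trick (return_matching_box is hypothetical; the generated code inlines the same comparison at each Return node):

    // Park each buffer's Box in an Option, compute with raw pointers, and at
    // the end give back the box whose allocation the result pointer lives in.
    fn return_matching_box(result_ptr: *mut u8, owned: &mut [Option<Box<[u8]>>]) -> Box<[u8]> {
        for slot in owned.iter_mut() {
            let matches = slot
                .as_mut()
                .map(|buf| buf.as_mut_ptr() == result_ptr)
                .unwrap_or(false);
            if matches {
                return slot.take().unwrap();
            }
        }
        panic!("pointer to be returned doesn't match any known memory object");
    }

    fn main() {
        let mut buf: Box<[u8]> = vec![0u8; 16].into_boxed_slice();
        let ptr = buf.as_mut_ptr();
        // Moving the box into the ownership table doesn't move the heap data,
        // so `ptr` still identifies the same allocation.
        let mut owned = vec![Some(buf), None];
        let ret = return_matching_box(ptr, &mut owned);
        assert_eq!(ret.len(), 16);
    }
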
+ for idx in 0..func.nodes.len() { + if func.nodes[idx].is_control() { + continue; + } + write!( + w, + " let mut node_{}: {} = {};\n", + idx, + self.get_type(self.typing[idx]), + if self.module.types[self.typing[idx].idx()].is_integer() { + "0" + } else if self.module.types[self.typing[idx].idx()].is_float() { + "0.0" + } else { + "::core::ptr::null::<u8>() as _" + } + )?; + } + + // The core executor is a Rust loop. We literally run a "control token" + // as described in the original sea of nodes paper through the basic + // blocks to drive execution. + write!( + w, + " let mut control_token: i8 = 0;\n loop {{\n match control_token {{\n", + )?; + + let mut blocks: BTreeMap<_, _> = (0..func.nodes.len()) + .filter(|idx| func.nodes[*idx].is_control()) + .map(|idx| (NodeID::new(idx), String::new())) + .collect(); + + // Emit data flow into basic blocks. + let mut worklist = VecDeque::from_iter( + self.reverse_postorder + .into_iter() + .filter(|id| !func.nodes[id.idx()].is_control()), + ); + let mut visited = bitvec![u8, Lsb0; 0; func.nodes.len()]; + while let Some(id) = worklist.pop_front() { + let node = &func.nodes[id.idx()]; + if node.is_phi() + || node.is_reduce() + || get_uses(node) + .as_ref() + .into_iter() + .all(|u| func.nodes[u.idx()].is_control() || visited[u.idx()]) + { + self.codegen_data_node(*id, &mut blocks)?; + visited.set(id.idx(), true); + } else { + worklist.push_back(id); + } + } + + // Emit control flow into basic blocks. + for id in (0..func.nodes.len()).map(NodeID::new) { + if !func.nodes[id.idx()].is_control() { + continue; + } + self.codegen_control_node(id, &mut blocks)?; + } + + // Dump the emitted basic blocks. + for (id, block) in blocks { + write!( + w, + " {} => {{\n{} }}\n", + id.idx(), + block + )?; + } + + // Close the match, loop, and function. + write!(w, " _ => panic!()\n }}\n }}\n}}\n")?; + Ok(()) + } + + /* + * While control nodes in Hercules IR are predecessor-centric (each take a + * control input that defines the predecessor relationship), the Rust loop + * we generate is successor centric. This difference requires explicit + * translation. + */ + fn codegen_control_node( + &self, + id: NodeID, + blocks: &mut BTreeMap<NodeID, String>, + ) -> Result<(), Error> { + let func = &self.get_func(); + match func.nodes[id.idx()] { + // Start, region, and projection control nodes all have exactly one + // successor and are otherwise simple. + Node::Start + | Node::Region { preds: _ } + | Node::Projection { + control: _, + selection: _, + } => { + let block = &mut blocks.get_mut(&id).unwrap(); + let succ = self.control_subgraph.succs(id).next().unwrap(); + write!(block, " control_token = {};\n", succ.idx())? + } + // If nodes have two successors - examine the projections to + // determine which branch is which, and branch between them. + Node::If { control: _, cond } => { + let block = &mut blocks.get_mut(&id).unwrap(); + let mut succs = self.control_subgraph.succs(id); + let succ1 = succs.next().unwrap(); + let succ2 = succs.next().unwrap(); + let succ1_is_true = func.nodes[succ1.idx()].try_projection(1).is_some(); + write!( + block, + " control_token = if {} {{ {} }} else {{ {} }};\n", + self.get_value(cond), + if succ1_is_true { succ1 } else { succ2 }.idx(), + if succ1_is_true { succ2 } else { succ1 }.idx(), + )? 
+ } + Node::Return { control: _, data } => { + let block = &mut blocks.get_mut(&id).unwrap(); + let memory_objects = self.memory_objects[self.func_id.idx()].memory_objects(data); + if memory_objects.is_empty() { + write!(block, " return {};\n", self.get_value(data))? + } else { + // If the value to return is a memory object, figure out + // which memory object it actually is at runtime and return + // that box. + for memory_object in memory_objects { + write!(block, " if let Some(mut mem_obj) = mem_obj{} && ::std::boxed::Box::as_mut_ptr(&mut mem_obj) as *mut u8 == {} {{\n", memory_object, self.get_value(data))?; + write!(block, " return mem_obj;\n")?; + write!(block, " }}\n")?; + } + write!(block, " panic!(\"HERCULES PANIC: Pointer to be returned doesn't match any known memory objects.\");\n")? + } + } + _ => panic!("PANIC: Can't lower {:?}.", func.nodes[id.idx()]), + } + Ok(()) + } + + /* + * Lower data nodes in Hercules IR into Rust statements. + */ + fn codegen_data_node( + &self, + id: NodeID, + blocks: &mut BTreeMap<NodeID, String>, + ) -> Result<(), Error> { + let func = &self.get_func(); + match func.nodes[id.idx()] { + Node::Parameter { index } => { + let block = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap(); + write!( + block, + " {} = p{};\n", + self.get_value(id), + index + )? + } + Node::Constant { id: cons_id } => { + let block = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap(); + write!(block, " {} = ", self.get_value(id))?; + match self.module.constants[cons_id.idx()] { + Constant::Boolean(val) => write!(block, "{}bool", val)?, + Constant::Integer8(val) => write!(block, "{}i8", val)?, + Constant::Integer16(val) => write!(block, "{}i16", val)?, + Constant::Integer32(val) => write!(block, "{}i32", val)?, + Constant::Integer64(val) => write!(block, "{}i64", val)?, + Constant::UnsignedInteger8(val) => write!(block, "{}u8", val)?, + Constant::UnsignedInteger16(val) => write!(block, "{}u16", val)?, + Constant::UnsignedInteger32(val) => write!(block, "{}u32", val)?, + Constant::UnsignedInteger64(val) => write!(block, "{}u64", val)?, + Constant::Float32(val) => write!(block, "{}f32", val)?, + Constant::Float64(val) => write!(block, "{}f64", val)?, + Constant::Product(_, _) | Constant::Summation(_, _, _) | Constant::Array(_) => { + let memory_objects = + self.memory_objects[self.func_id.idx()].memory_objects(id); + assert_eq!(memory_objects.len(), 1); + let memory_object = memory_objects[0]; + write!( + block, + "::std::boxed::Box::as_mut_ptr(mem_obj{}.as_mut().unwrap()) as *mut u8", + memory_object + )? + } + } + write!(block, ";\n")? + } + Node::Call { + control: _, + function: callee_id, + ref dynamic_constants, + ref args, + } => { + let block = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap(); + write!( + block, + " {} = unsafe {{ {}(", + self.get_value(id), + self.module.functions[callee_id.idx()].name + )?; + for dc in dynamic_constants { + self.codegen_dynamic_constant(*dc, block)?; + write!(block, ", ")?; + } + for arg in args { + write!(block, "{}, ", self.get_value(*arg))?; + } + write!(block, ") }};\n")?; + + // When a CPU function is called that returns a memory object, + // that memory object must have come from one of its parameters. + // Dynamically figure out which one it came from, so that we can + // move it to the slot of the output memory object. 
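The executor structure that the control-node arms above target is easy to see in isolation: every basic block becomes an arm of a match on an integer control token, a branch is just an assignment of the successor's block number, and a Return arm leaves the loop. A hand-written toy with the same shape (purely illustrative, not generator output):

    // A loop header, a body block, and an exit block, written in the
    // control-token style the generated executor uses.
    fn sum_to(mut n: i64) -> i64 {
        let mut acc: i64 = 0;
        let mut control_token: i8 = 0;
        loop {
            match control_token {
                // Block 0: the If node; pick a successor based on the condition.
                0 => control_token = if n > 0 { 1 } else { 2 },
                // Block 1: the loop body; branch back to the header.
                1 => {
                    acc += n;
                    n -= 1;
                    control_token = 0;
                }
                // Block 2: the Return node.
                2 => return acc,
                _ => panic!(),
            }
        }
    }

    fn main() {
        assert_eq!(sum_to(4), 10);
    }
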
+ let call_memory_objects = + self.memory_objects[self.func_id.idx()].memory_objects(id); + if !call_memory_objects.is_empty() { + assert_eq!(call_memory_objects.len(), 1); + let call_memory_object = call_memory_objects[0]; + + let callee_returned_memory_objects = + self.memory_objects[callee_id.idx()].returned_memory_objects(); + let possible_params: Vec<_> = (0..self.module.functions[callee_id.idx()] + .param_types + .len()) + .filter(|idx| { + let memory_object_of_param = self.memory_objects[callee_id.idx()] + .memory_object_of_parameter(*idx); + // Look at parameters that could be the source of + // the memory object returned by the function. + memory_object_of_param + .map(|memory_object_of_param| { + callee_returned_memory_objects.contains(&memory_object_of_param) + }) + .unwrap_or(false) + }) + .collect(); + let arg_memory_objects = args + .into_iter() + .enumerate() + .filter(|(idx, _)| possible_params.contains(idx)) + .map(|(_, arg)| { + self.memory_objects[self.func_id.idx()] + .memory_objects(*arg) + .into_iter() + }) + .flatten(); + + // Dynamically check which of the memory objects + // corresponding to arguments to the call was returned by + // the call. Move that memory object into the memory object + // of the call. + let mut first_obj = true; + for arg_memory_object in arg_memory_objects { + write!(block, " ")?; + if first_obj { + first_obj = false; + } else { + write!(block, "else ")?; + } + write!(block, "if let Some(mem_obj) = mem_obj{}.as_mut() && ::std::boxed::Box::as_mut_ptr(mem_obj) as *mut u8 == {} {{\n", arg_memory_object, self.get_value(id))?; + write!( + block, + " mem_obj{} = mem_obj{}.take();\n", + call_memory_object, arg_memory_object + )?; + write!(block, " }}\n")?; + } + write!(block, " else {{\n")?; + write!(block, " panic!(\"HERCULES PANIC: Pointer returned from called function doesn't match any known memory objects.\");\n")?; + write!(block, " }}\n")?; + } + } + _ => panic!("PANIC: Can't lower {:?}.", func.nodes[id.idx()]), + } + Ok(()) + } + + /* + * Lower dynamic constant in Hercules IR into a Rust expression. + */ + fn codegen_dynamic_constant<W: Write>( + &self, + id: DynamicConstantID, + w: &mut W, + ) -> Result<(), Error> { + match self.module.dynamic_constants[id.idx()] { + DynamicConstant::Constant(val) => write!(w, "{}", val)?, + DynamicConstant::Parameter(idx) => write!(w, "dc_p{}", idx)?, + DynamicConstant::Add(left, right) => { + write!(w, "(")?; + self.codegen_dynamic_constant(left, w)?; + write!(w, "+")?; + self.codegen_dynamic_constant(right, w)?; + write!(w, ")")?; + } + DynamicConstant::Sub(left, right) => { + write!(w, "(")?; + self.codegen_dynamic_constant(left, w)?; + write!(w, "-")?; + self.codegen_dynamic_constant(right, w)?; + write!(w, ")")?; + } + DynamicConstant::Mul(left, right) => { + write!(w, "(")?; + self.codegen_dynamic_constant(left, w)?; + write!(w, "*")?; + self.codegen_dynamic_constant(right, w)?; + write!(w, ")")?; + } + DynamicConstant::Div(left, right) => { + write!(w, "(")?; + self.codegen_dynamic_constant(left, w)?; + write!(w, "/")?; + self.codegen_dynamic_constant(right, w)?; + write!(w, ")")?; + } + DynamicConstant::Rem(left, right) => { + write!(w, "(")?; + self.codegen_dynamic_constant(left, w)?; + write!(w, "%")?; + self.codegen_dynamic_constant(right, w)?; + write!(w, ")")?; + } + } + Ok(()) + } + + /* + * Lower the size of a type into a Rust expression. 
+ */ + fn codegen_type_size(&self, ty: TypeID) -> String { + match self.module.types[ty.idx()] { + Type::Control => panic!(), + Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => "1".to_string(), + Type::Integer16 | Type::UnsignedInteger16 => "2".to_string(), + Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => "4".to_string(), + Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => "8".to_string(), + Type::Product(ref fields) => { + let fields_align = fields + .into_iter() + .map(|id| get_type_alignment(&self.module.types, *id)); + let fields: Vec<String> = fields + .into_iter() + .map(|id| self.codegen_type_size(*id)) + .collect(); + + // Emit LLVM IR to round up to the alignment of the next field, + // and then add the size of that field. At the end, round up to + // the alignment of the whole struct. + let mut acc_size = "0".to_string(); + for (field_align, field) in zip(fields_align, fields) { + acc_size = format!( + "(({} + {}) & !{})", + acc_size, + field_align - 1, + field_align - 1 + ); + acc_size = format!("({} + {})", acc_size, field); + } + let total_align = get_type_alignment(&self.module.types, ty); + format!( + "(({} + {}) & !{})", + acc_size, + total_align - 1, + total_align - 1 + ) + } + Type::Summation(ref variants) => { + let variants = variants.into_iter().map(|id| self.codegen_type_size(*id)); + + // The size of a summation is the size of the largest field, + // plus 1 byte and alignment for the discriminant. + let mut acc_size = "0".to_string(); + for variant in variants { + acc_size = format!("::core::cmp::max({}, {})", acc_size, variant); + } + + // No alignment is necessary for the 1 byte discriminant. + let total_align = get_type_alignment(&self.module.types, ty); + format!( + "(({} + 1 + {}) & !{})", + acc_size, + total_align - 1, + total_align - 1 + ) + } + Type::Array(elem, ref bounds) => { + // The size of an array is the size of the element multipled by + // the dynamic constant bounds. + let mut acc_size = self.codegen_type_size(elem); + for dc in bounds { + acc_size = format!("{} * ", acc_size); + self.codegen_dynamic_constant(*dc, &mut acc_size).unwrap(); + } + format!("({})", acc_size) + } + } + } + + fn get_func(&self) -> &Function { + &self.module.functions[self.func_id.idx()] + } + + fn get_value(&self, id: NodeID) -> String { + format!("node_{}", id.idx()) + } + + fn get_type(&self, id: TypeID) -> &'static str { + convert_type(&self.module.types[id.idx()]) + } + + fn get_type_interface(&self, id: TypeID) -> &'static str { + convert_type_interface(&self.module.types[id.idx()]) + } +} + +fn convert_type(ty: &Type) -> &'static str { + match ty { + Type::Boolean => "bool", + Type::Integer8 => "i8", + Type::Integer16 => "i16", + Type::Integer32 => "i32", + Type::Integer64 => "i64", + Type::UnsignedInteger8 => "u8", + Type::UnsignedInteger16 => "u16", + Type::UnsignedInteger32 => "u32", + Type::UnsignedInteger64 => "u64", + Type::Float32 => "f32", + Type::Float64 => "f64", + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "*mut u8", + _ => panic!(), + } +} + +/* + * Collection types are passed to / returned from runtime functions through a + * wrapper type for ownership tracking reasons. 
+ */ +fn convert_type_interface(ty: &Type) -> &'static str { + match ty { + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "Box<[u8]>", + _ => convert_type(ty), + } +} diff --git a/hercules_cg/src/sched_dot.rs b/hercules_cg/src/sched_dot.rs deleted file mode 100644 index f044618931f0ffe45aeb6da1ef2798f4bc7bdfe6..0000000000000000000000000000000000000000 --- a/hercules_cg/src/sched_dot.rs +++ /dev/null @@ -1,174 +0,0 @@ -extern crate bitvec; -extern crate hercules_ir; -extern crate rand; - -use std::collections::{HashMap, VecDeque}; -use std::env::temp_dir; -use std::fmt::Write; -use std::fs::File; -use std::io::Write as _; -use std::process::Command; - -use self::bitvec::prelude::*; -use self::rand::Rng; - -use self::hercules_ir::*; - -use crate::*; - -/* - * Top level function to compute a dot graph for a schedule IR module, and - * immediately render it using xdot. - */ -pub fn xdot_sched_module(module: &SModule) { - let mut tmp_path = temp_dir(); - let mut rng = rand::thread_rng(); - let num: u64 = rng.gen(); - tmp_path.push(format!("sched_dot_{}.dot", num)); - let mut file = File::create(tmp_path.clone()).expect("PANIC: Unable to open output file."); - let mut contents = String::new(); - write_dot(&module, &mut contents).expect("PANIC: Unable to generate output file contents."); - file.write_all(contents.as_bytes()) - .expect("PANIC: Unable to write output file contents."); - Command::new("xdot") - .args([tmp_path]) - .output() - .expect("PANIC: Couldn't execute xdot. Is xdot installed?"); -} - -/* - * Top level function to write a schedule IR module out as a dot graph. - */ -pub fn write_dot<W: Write>(module: &SModule, w: &mut W) -> std::fmt::Result { - write_digraph_header(w)?; - - for (function_name, function) in module.functions.iter() { - // Schedule the SFunction to form a linear ordering of instructions. - let virt_reg_to_inst_id = sched_virt_reg_to_inst_id(function); - let dep_graph = sched_dependence_graph(function, &virt_reg_to_inst_id); - let mut block_to_inst_list = (0..function.blocks.len()) - .map(|block_idx| (block_idx, vec![])) - .collect::<HashMap<usize, Vec<(&SInst, usize, Option<&Vec<SSchedule>>)>>>(); - for (block_idx, block) in function.blocks.iter().enumerate() { - let mut emitted = bitvec![u8, Lsb0; 0; block.insts.len()]; - let mut worklist = VecDeque::from((0..block.insts.len()).collect::<Vec<_>>()); - while let Some(inst_idx) = worklist.pop_front() { - let inst_id = InstID::new(block_idx, inst_idx); - let dependencies = &dep_graph[&inst_id]; - let all_uses_emitted = dependencies - .into_iter() - // Check that all used instructions in this block... - .filter(|inst_id| inst_id.idx_0() == block_idx) - // were already emitted. - .all(|inst_id| emitted[inst_id.idx_1()]); - // Phis don't need to wait for all of their uses to be added. - if block.insts[inst_idx].is_phi() || all_uses_emitted { - block_to_inst_list.get_mut(&block_idx).unwrap().push(( - &block.insts[inst_idx], - block.virt_regs[inst_idx].0, - block.schedules.get(&inst_idx), - )); - emitted.set(inst_id.idx_1(), true); - } else { - worklist.push_back(inst_idx); - } - } - } - - // A SFunction is a subgraph. - write_subgraph_header(function_name, w)?; - - // Each SBlock is a record node. - for (block_idx, block) in function.blocks.iter().enumerate() { - // Emit the instructions in scheduled order. - write_block(function_name, block_idx, &block_to_inst_list[&block_idx], w)?; - - // Add control edges. 
- for succ in block.successors().as_ref() { - write_control_edge(function_name, block_idx, succ.idx(), w)?; - } - } - - write_graph_footer(w)?; - } - - write_graph_footer(w)?; - Ok(()) -} - -fn write_digraph_header<W: Write>(w: &mut W) -> std::fmt::Result { - write!(w, "digraph \"Module\" {{\n")?; - write!(w, "compound=true\n")?; - Ok(()) -} - -fn write_subgraph_header<W: Write>(function_name: &SFunctionName, w: &mut W) -> std::fmt::Result { - write!(w, "subgraph {} {{\n", function_name)?; - write!(w, "label=\"{}\"\n", function_name)?; - write!(w, "bgcolor=ivory4\n")?; - write!(w, "cluster=true\n")?; - Ok(()) -} - -fn write_graph_footer<W: Write>(w: &mut W) -> std::fmt::Result { - write!(w, "}}\n")?; - Ok(()) -} - -fn write_block<W: Write>( - function_name: &SFunctionName, - block_idx: usize, - insts: &[(&SInst, usize, Option<&Vec<SSchedule>>)], - w: &mut W, -) -> std::fmt::Result { - write!(w, "{}_{} [label=\"{{", function_name, block_idx,)?; - for token in insts.into_iter().map(|token| Some(token)).intersperse(None) { - match token { - Some((inst, virt_reg, schedules)) => { - write!(w, "%{} = {}(", virt_reg, inst.upper_case_name())?; - for token in sched_get_uses(inst).map(|u| Some(u)).intersperse(None) { - match token { - Some(SValue::VirtualRegister(use_virt_reg)) => { - write!(w, "%{}", use_virt_reg)? - } - Some(SValue::Constant(scons)) => write!(w, "{:?}", scons)?, - None => write!(w, ", ")?, - } - } - write!(w, ")")?; - if let Some(schedules) = schedules - && !schedules.is_empty() - { - write!(w, " [")?; - for token in schedules.into_iter().map(|s| Some(s)).intersperse(None) { - match token { - Some(schedule) => write!(w, "{:?}", schedule)?, - None => write!(w, ", ")?, - } - } - write!(w, "]")?; - } - } - None => write!(w, " | ")?, - } - } - write!( - w, - "}}\", shape = \"Mrecord\", style = \"filled\", fillcolor = \"lightblue\"];\n" - )?; - Ok(()) -} - -fn write_control_edge<W: Write>( - function_name: &SFunctionName, - src: usize, - dst: usize, - w: &mut W, -) -> std::fmt::Result { - write!( - w, - "{}_{} -> {}_{} [color=\"black\"];\n", - function_name, src, function_name, dst - )?; - Ok(()) -} diff --git a/hercules_cg/src/sched_gen.rs b/hercules_cg/src/sched_gen.rs deleted file mode 100644 index 70a898b6c259b8bf7543b5ff4d1202132c71cf9b..0000000000000000000000000000000000000000 --- a/hercules_cg/src/sched_gen.rs +++ /dev/null @@ -1,1461 +0,0 @@ -extern crate bitvec; - -extern crate hercules_ir; - -use std::cell::Cell; -use std::collections::{HashMap, VecDeque}; -use std::iter::zip; -use std::mem::{swap, take}; - -use self::bitvec::prelude::*; - -use self::hercules_ir::*; - -use crate::*; - -pub fn sched_compile( - module: &Module, - def_uses: &Vec<ImmutableDefUseMap>, - typing: &ModuleTyping, - control_subgraphs: &Vec<Subgraph>, - fork_join_maps: &Vec<HashMap<NodeID, NodeID>>, - fork_join_nests: &Vec<HashMap<NodeID, Vec<NodeID>>>, - antideps: &Vec<Vec<(NodeID, NodeID)>>, - bbs: &Vec<Vec<NodeID>>, - plans: &Vec<Plan>, -) -> SModule { - let stypes = convert_to_sched_ir_types(&module.types); - let sconstants = convert_to_sched_ir_constants(&module.constants); - let function_names: HashMap<FunctionID, String> = module - .functions - .iter() - .enumerate() - .map(|(idx, function)| (FunctionID::new(idx), function.name.clone())) - .collect(); - - let mut functions = HashMap::new(); - let mut manifests = HashMap::new(); - for idx in 0..module.functions.len() { - let (sfunctions, manifest) = FunctionContext::new( - &module.functions[idx], - &module.types, - &module.constants, - 
&module.dynamic_constants, - &def_uses[idx], - &typing[idx], - &control_subgraphs[idx], - &fork_join_maps[idx], - &fork_join_nests[idx], - &antideps[idx], - &bbs[idx], - &plans[idx], - &stypes, - &sconstants, - &function_names, - ) - .compile_function(); - - functions.extend(sfunctions.into_iter()); - manifests.insert(module.functions[idx].name.clone(), manifest); - } - - SModule { - functions, - manifests, - } -} - -fn convert_to_sched_ir_types(types: &Vec<Type>) -> Vec<Option<SType>> { - let mut stypes = vec![None; types.len()]; - - for id in types_bottom_up(types) { - stypes[id.idx()] = match &types[id.idx()] { - Type::Control => None, - Type::Boolean => Some(SType::Boolean), - Type::Integer8 => Some(SType::Integer8), - Type::Integer16 => Some(SType::Integer16), - Type::Integer32 => Some(SType::Integer32), - Type::Integer64 => Some(SType::Integer64), - Type::UnsignedInteger8 => Some(SType::UnsignedInteger8), - Type::UnsignedInteger16 => Some(SType::UnsignedInteger16), - Type::UnsignedInteger32 => Some(SType::UnsignedInteger32), - Type::UnsignedInteger64 => Some(SType::UnsignedInteger64), - Type::Float32 => Some(SType::Float32), - Type::Float64 => Some(SType::Float64), - Type::Product(fields) => { - let mut typs = vec![]; - let mut res_none = false; - for id in fields { - if types[id.idx()].is_array() { - res_none = true; - break; - } else { - match &stypes[id.idx()] { - None => { - res_none = true; - break; - } - Some(t) => typs.push(t.clone()), - } - } - } - if res_none { - None - } else { - Some(SType::Product(typs.into())) - } - } - Type::Summation(_) => todo!(), - Type::Array(elem_ty, _) => match &stypes[elem_ty.idx()] { - None => None, - Some(t) => Some(SType::ArrayRef(Box::new(t.clone()))), - }, - }; - } - - stypes -} - -fn convert_to_sched_ir_constants(constants: &Vec<Constant>) -> Vec<Option<SConstant>> { - let mut sconstants = vec![None; constants.len()]; - - for id in constants_bottom_up(constants) { - sconstants[id.idx()] = match &constants[id.idx()] { - Constant::Boolean(val) => Some(SConstant::Boolean(*val)), - Constant::Integer8(val) => Some(SConstant::Integer8(*val)), - Constant::Integer16(val) => Some(SConstant::Integer16(*val)), - Constant::Integer32(val) => Some(SConstant::Integer32(*val)), - Constant::Integer64(val) => Some(SConstant::Integer64(*val)), - Constant::UnsignedInteger8(val) => Some(SConstant::UnsignedInteger8(*val)), - Constant::UnsignedInteger16(val) => Some(SConstant::UnsignedInteger16(*val)), - Constant::UnsignedInteger32(val) => Some(SConstant::UnsignedInteger32(*val)), - Constant::UnsignedInteger64(val) => Some(SConstant::UnsignedInteger64(*val)), - Constant::Float32(val) => Some(SConstant::Float32(*val)), - Constant::Float64(val) => Some(SConstant::Float64(*val)), - Constant::Product(_, fields) => { - let mut consts = vec![]; - let mut res_none = false; - for id in fields { - if constants[id.idx()].is_array() { - res_none = true; - break; - } else { - match &sconstants[id.idx()] { - None => { - res_none = true; - break; - } - Some(c) => consts.push(c.clone()), - } - } - } - if res_none { - None - } else { - Some(SConstant::Product(consts.into())) - } - } - Constant::Summation(_, _, _) => todo!(), - // Array constants are never generated inline schedule IR. - Constant::Array(_) => None, - }; - } - - sconstants -} - -/* - * Converts one Hercules function to N schedule IR functions, where N is the - * number of partitions in the Hercules function. 
- */ -struct FunctionContext<'a> { - function: &'a Function, - types: &'a Vec<Type>, - constants: &'a Vec<Constant>, - dynamic_constants: &'a Vec<DynamicConstant>, - def_use: &'a ImmutableDefUseMap, - typing: &'a Vec<TypeID>, - control_subgraph: &'a Subgraph, - fork_join_map: &'a HashMap<NodeID, NodeID>, - fork_join_nest: &'a HashMap<NodeID, Vec<NodeID>>, - antideps: &'a Vec<(NodeID, NodeID)>, - bbs: &'a Vec<NodeID>, - plan: &'a Plan, - stypes: &'a Vec<Option<SType>>, - sconstants: &'a Vec<Option<SConstant>>, - function_names: &'a HashMap<FunctionID, String>, - - top_nodes: Vec<NodeID>, - partition_graph: Subgraph, - inverted_partition_map: Vec<Vec<NodeID>>, - data_inputs: Vec<Vec<NodeID>>, - data_outputs: Vec<Vec<NodeID>>, - - num_virtual_registers: Vec<Cell<usize>>, -} - -impl<'a> FunctionContext<'a> { - fn new( - function: &'a Function, - types: &'a Vec<Type>, - constants: &'a Vec<Constant>, - dynamic_constants: &'a Vec<DynamicConstant>, - def_use: &'a ImmutableDefUseMap, - typing: &'a Vec<TypeID>, - control_subgraph: &'a Subgraph, - fork_join_map: &'a HashMap<NodeID, NodeID>, - fork_join_nest: &'a HashMap<NodeID, Vec<NodeID>>, - antideps: &'a Vec<(NodeID, NodeID)>, - bbs: &'a Vec<NodeID>, - plan: &'a Plan, - stypes: &'a Vec<Option<SType>>, - sconstants: &'a Vec<Option<SConstant>>, - function_names: &'a HashMap<FunctionID, String>, - ) -> Self { - let inverted_partition_map = plan.invert_partition_map(); - let top_nodes = plan.compute_top_nodes(function, control_subgraph, &inverted_partition_map); - let partition_graph = partition_graph(function, def_use, plan); - let data_inputs = plan.compute_data_inputs(function); - let data_outputs = plan.compute_data_outputs(function, def_use); - - let num_virtual_registers = vec![Cell::new(0); plan.num_partitions]; - - FunctionContext { - function, - types, - constants, - dynamic_constants, - def_use, - typing, - control_subgraph, - fork_join_map, - fork_join_nest, - antideps, - bbs, - plan, - stypes, - sconstants, - function_names, - - top_nodes, - partition_graph, - inverted_partition_map, - data_inputs, - data_outputs, - - num_virtual_registers, - } - } - - /* - * Top level function to compile a Hercules IR function into simple IR - * functions. - */ - fn compile_function(&self) -> (HashMap<SFunctionName, SFunction>, Manifest) { - let (mut manifest, array_node_to_array_id) = self.compute_manifest(); - - manifest - .partitions - .iter() - .enumerate() - .for_each(|(idx, partition_manifest)| { - self.num_virtual_registers[idx].set(partition_manifest.parameters.len()) - }); - - let partition_functions = (0..self.plan.num_partitions) - .map(|partition_idx| { - let name = self.get_sfunction_name(partition_idx); - let sfunction = - self.compile_partition(partition_idx, &manifest, &array_node_to_array_id); - self.update_manifest(&mut manifest.partitions[partition_idx], &sfunction); - (name, sfunction) - }) - .collect(); - - (partition_functions, manifest) - } - - /* - * Compute the manifest for a Hercules function. This includes all of the - * partition signature information. - */ - fn compute_manifest(&self) -> (Manifest, HashMap<NodeID, ArrayID>) { - // The manifest needs to contain metadata for allocating arrays. 
- let dynamic_constants = self.dynamic_constants.clone(); - let array_constants = self - .function - .nodes - .iter() - .filter_map(|node| { - if let Some(cons) = node.try_constant() - && let Some(ty) = self.constants[cons.idx()].try_array_type() - { - let extents = self.types[ty.idx()] - .try_extents() - .expect("PANIC: Type of array constant is not an array type."); - Some(extents.into_iter().map(|id| *id).collect()) - } else { - None - } - }) - .collect(); - - // Assign each array constant a unique ID for noting which ones to pass - // to what partition functions. - let array_node_to_array_id = (0..self.function.nodes.len()) - .filter(|node_idx| { - if let Some(cons) = self.function.nodes[*node_idx].try_constant() { - self.constants[cons.idx()].is_array() - } else { - false - } - }) - .enumerate() - .map(|(idx, node_idx)| (NodeID::new(node_idx), ArrayID::new(idx))) - .collect::<HashMap<NodeID, ArrayID>>(); - - let partitions = (0..self.plan.num_partitions) - .map(|partition_idx| { - let partition = &self.inverted_partition_map[partition_idx]; - let name = self.get_sfunction_name(partition_idx); - let mut parameters = vec![]; - let mut returns = vec![]; - - // Compute the signature of each partitions' schedule IR - // function, which has the following structure: - // 1. If the partition is the entry partition, the first - // parameters are the parameters to the Hercules function. If - // not, then the first parameters are all of the data inputs - // to the partition. Note that parameter nodes are always in - // the partition of the start node (the entry partition), so - // function parameters used in other partitions are treated - // as an inter-partition data dependence. - if partition_idx == 0 { - parameters.extend(self.function.param_types.iter().enumerate().map( - |(param_idx, ty_id)| { - ( - self.stypes[ty_id.idx()].clone().unwrap(), - ParameterKind::HerculesParameter(param_idx), - ) - }, - )); - } else { - parameters.extend(self.data_inputs[partition_idx].iter().map(|node_id| { - ( - self.stypes[self.typing[node_id.idx()].idx()] - .clone() - .unwrap(), - ParameterKind::DataInput(*node_id), - ) - })) - } - - // 2. The second set of parameters are references to zero-ed - // memories for implementing array constants. Implicit array - // cloning is, for now, forbidden. Array constants are - // rematerialized into each partition that uses the constant, - // so look over all of the uses of all the nodes in the - // partition, not all of the nodes in the partition. - parameters.extend( - partition - .iter() - .map(|node_id| { - get_uses(&self.function.nodes[node_id.idx()]) - .as_ref() - .iter() - .filter_map(|use_id| { - if let Some(array_id) = array_node_to_array_id.get(use_id) { - Some(( - self.stypes[self.typing[use_id.idx()].idx()] - .clone() - .unwrap(), - ParameterKind::ArrayConstant(*array_id), - )) - } else { - None - } - }) - .collect::<Vec<_>>() - }) - .flatten(), - ); - - // 3. The third set of parameters are the dynamic constants - // passed to the overall function. - parameters.extend((0..self.function.num_dynamic_constants).map(|idx| { - ( - SType::UnsignedInteger64, - ParameterKind::DynamicConstant(idx as usize), - ) - })); - - // Note that many partitions will be given unused parameters - // (mainly dynamic constants). These will be removed during the - // small amount of optimization done on simple IR. - - // Simple IR functions may return multiple values (this is to - // avoid needing to pack / un-pack product types). 
The return - // value of an exit partition is the return value of the - // Hercules function. The return values of non-exit partitions - // are the data outputs of the partition, possibly plus an - // integer specifying what partition should be executed next, if - // there are multiple successor partitions. A valid partitioning - // will only contain partitions with either a branch to another - // partition xor a return node. - let successors = self - .partition_graph - .succs(NodeID::new(partition_idx)) - .map(|node_id| PartitionID::new(node_id.idx())) - .collect::<Vec<PartitionID>>(); - if partition - .iter() - .any(|node_id| self.function.nodes[node_id.idx()].is_return()) - { - assert_eq!(successors.len(), 0); - returns.push(( - self.stypes[self.function.return_type.idx()] - .clone() - .unwrap(), - ReturnKind::HerculesReturn, - )); - } else { - assert!(successors.len() > 0); - returns.extend(self.data_outputs[partition_idx].iter().map(|node_id| { - ( - self.stypes[self.typing[node_id.idx()].idx()] - .clone() - .unwrap(), - ReturnKind::DataOutput(*node_id), - ) - })); - if successors.len() > 1 { - returns.push((SType::Integer8, ReturnKind::NextPartition)); - } - } - - let device = match self.plan.partition_devices[partition_idx] { - Device::CPU => DeviceManifest::cpu(), - Device::GPU => DeviceManifest::gpu(), - Device::AsyncRust => todo!(), - }; - - PartitionManifest { - name, - parameters, - returns, - successors, - device, - } - }) - .collect(); - - // The parameters for the overall Hercules function is computed in a - // similar fashion as for the individual partition functions. - let mut param_types = vec![]; - param_types.extend(self.function.param_types.iter().enumerate().map( - |(param_idx, ty_id)| { - ( - self.stypes[ty_id.idx()].clone().unwrap(), - ParameterKind::HerculesParameter(param_idx), - ) - }, - )); - param_types.extend(array_node_to_array_id.iter().map(|(node_id, array_id)| { - ( - self.stypes[self.typing[node_id.idx()].idx()] - .clone() - .unwrap(), - ParameterKind::ArrayConstant(*array_id), - ) - })); - param_types.extend((0..self.function.num_dynamic_constants).map(|idx| { - ( - SType::UnsignedInteger64, - ParameterKind::DynamicConstant(idx as usize), - ) - })); - - // The return type is just the schedule IR type corresponding to the - // Hercules function's return type. - let return_type = self.stypes[self.function.return_type.idx()] - .clone() - .unwrap(); - - let manifest = Manifest { - param_types, - return_type, - dynamic_constants, - array_constants, - partitions, - }; - (manifest, array_node_to_array_id) - } - - /* - * Compile a partition into an SFunction. - */ - fn compile_partition( - &self, - partition_idx: usize, - manifest: &Manifest, - array_node_to_array_id: &HashMap<NodeID, ArrayID>, - ) -> SFunction { - let partition = &self.inverted_partition_map[partition_idx]; - let mut blocks = vec![]; - - // First, create basic blocks inside the SFunction corresponding to the - // control nodes in the partition. If this isn't the entry partition - // (partition #0), add an entry block, since the first basic block in a - // partition may have a predecessor inside the partition. - let mut control_id_to_block_id = HashMap::new(); - let mut fork_node_id_to_fork_join_id = HashMap::new(); - if partition_idx != 0 { - // Create an explicit entry block, if one is already created via the - // Start node. 
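// The return-kind rule computed in the manifest above can be sketched with toy
// types (`ReturnKind`, `partition_returns` are illustrative stand-ins, not the
// real manifest structures): an exit partition returns the Hercules value and
// has no successors, while a non-exit partition returns its data outputs plus
// a NextPartition selector only when it has more than one successor.
#[derive(Debug, PartialEq)]
enum ReturnKind {
    HerculesReturn,
    DataOutput(usize), // node id of the data output
    NextPartition,     // lowered as an i8 selector
}

fn partition_returns(
    has_return_node: bool,
    data_outputs: &[usize],
    num_successors: usize,
) -> Vec<ReturnKind> {
    if has_return_node {
        // A valid partitioning never mixes returns and inter-partition jumps.
        assert_eq!(num_successors, 0);
        vec![ReturnKind::HerculesReturn]
    } else {
        assert!(num_successors > 0);
        let mut returns: Vec<ReturnKind> =
            data_outputs.iter().map(|id| ReturnKind::DataOutput(*id)).collect();
        if num_successors > 1 {
            returns.push(ReturnKind::NextPartition);
        }
        returns
    }
}

fn main() {
    // A non-exit partition with two data outputs and two successor partitions.
    let rets = partition_returns(false, &[7, 9], 2);
    assert_eq!(
        rets,
        vec![
            ReturnKind::DataOutput(7),
            ReturnKind::DataOutput(9),
            ReturnKind::NextPartition
        ]
    );
    // An exit partition just returns the Hercules function's value.
    assert_eq!(partition_returns(true, &[], 0), vec![ReturnKind::HerculesReturn]);
}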
- blocks.push(SBlock::default()); - } - for node in partition { - if self.function.nodes[node.idx()].is_control() { - control_id_to_block_id.insert(*node, BlockID::new(blocks.len())); - let mut block = SBlock::default(); - if let Some(imm_fork) = self.fork_join_nest[node].get(0) { - let new_id = ForkJoinID::new(fork_node_id_to_fork_join_id.len()); - let fork_join_id = *fork_node_id_to_fork_join_id - .entry(*imm_fork) - .or_insert(new_id); - block.kind = if self.function.nodes[node.idx()].is_join() { - SBlockKind::Reduce(fork_join_id) - } else { - SBlockKind::Parallel(fork_join_id) - }; - } - blocks.push(block); - } - } - - // Second, assign every data node a SValue. This map incorporates info - // from the manifest to make using SFunction parameters easy. - let mut data_id_to_svalue = manifest.partitions[partition_idx] - .parameters - .iter() - .enumerate() - .filter_map(|(idx, (_, kind))| match kind { - // Assign SValues to nodes defined outside the partition and - // passed in via SFunction parameters. - ParameterKind::DataInput(node_id) => Some((*node_id, SValue::VirtualRegister(idx))), - _ => None, - }) - .chain( - // Assign SValues for nodes inside the partition. - partition - .iter() - .filter(|node_id| !self.function.nodes[node_id.idx()].is_control()) - .filter_map(|data_id| { - let value = match self.function.nodes[data_id.idx()] { - // Phis in a block with no predecessors inside the - // current partition don't get lowered to phis in - // schedule IR - they get lowered to partition - // parameters. Phis with some predecessors in the - // same partition and some in a different partition - // get lowered to a combination of a SFunction - // parameter and a phi instruction, and uses of the - // phi node should become uses of the phi - // instruction. - Node::Phi { control, data: _ } - if self.control_subgraph.preds(control).all(|pred| { - self.plan.partitions[pred.idx()] - != self.plan.partitions[control.idx()] - }) => - { - // If the phi just gets lowered to a parameter, - // it got added above when adding the virtual - // registers for the SFunction parameters. - return None; - } - // Figure out which virtual constant in the - // signature of the current SFunction corresponds to - // a particular Hercules parameter. - Node::Parameter { index } => SValue::VirtualRegister( - manifest.partitions[partition_idx] - .parameters - .iter() - .position(|(_, kind)| { - *kind == ParameterKind::HerculesParameter(index) - }) - .unwrap(), - ), - // Wait to assign SValues to constants. We assign - // SValues to constants in user partitions, not in - // the partition the constant node happens to be in. - Node::Constant { id: _ } => { - return None; - } - // Dynamic constant nodes get generated upfront, - // since they may or may not need a virtual register - // freshly allocated for them. The math necessary - // for them gets put in the block corresponding to - // the control node the DynamicConstant node was - // scheduled to. - Node::DynamicConstant { id } => { - let block_id = control_id_to_block_id[&self.bbs[data_id.idx()]]; - self.compile_dynamic_constant( - id, - &mut blocks[block_id.idx()], - partition_idx, - manifest, - ) - } - // Wait to assign SValues to array writes. - Node::Write { - collect: _, - data: _, - indices: _, - } if self.types[self.typing[data_id.idx()].idx()].is_array() => { - return None - } - _ => SValue::VirtualRegister(self.make_virt_reg(partition_idx)), - }; - Some((*data_id, value)) - }), - ) - .chain( - // Assign SValues for constants used by nodes in the partition. 
- partition - .iter() - .map(|node_id| { - get_uses(&self.function.nodes[node_id.idx()]) - .as_ref() - .iter() - .filter_map(|use_id| { - if let Node::Constant { id } = self.function.nodes[use_id.idx()] { - // Array constants map to the parameter the - // array memory is passed in through - all - // other constants are represented inline in - // an SValue. - let svalue = if let Some(array_id) = - array_node_to_array_id.get(use_id) - { - SValue::VirtualRegister( - manifest.partitions[partition_idx] - .parameters - .iter() - .position(|(_, kind)| { - *kind == ParameterKind::ArrayConstant(*array_id) - }) - .unwrap(), - ) - } else { - SValue::Constant(self.sconstants[id.idx()].clone().unwrap()) - }; - Some((*use_id, svalue)) - } else { - None - } - }) - .collect::<Vec<_>>() - }) - .flatten(), - ) - .collect::<HashMap<_, _>>(); - - // Next, assign all the array write nodes. Array write nodes are - // recursively assigned the SValue of their `collect` input. - let mut worklist = partition - .iter() - .filter(|id| { - self.function.nodes[id.idx()].is_write() && !data_id_to_svalue.contains_key(id) - }) - .map(|id| *id) - .collect::<VecDeque<_>>(); - while let Some(id) = worklist.pop_front() { - let pred = match self.function.nodes[id.idx()] { - Node::Write { - data: _, - indices: _, - collect, - } => collect, - _ => panic!("PANIC: Filtered out write nodes, but found a different node kind."), - }; - if let Some(svalue) = data_id_to_svalue.get(&pred) { - data_id_to_svalue.insert(id, svalue.clone()); - } else { - worklist.push_front(id); - } - } - - // Third, generate code for every node in the partition. Iterates - // through a worklist of nodes in the partition. For non-phi and non- - // reduce nodes, only emit once all data uses are emitted. In addition, - // consider additional anti-dependence edges from read to write nodes. - // Def-use and anti-dependence edges are the only ordering we guarantee - // in schedule IR basic blocks, and it's up to device-specific backends - // to perform instruction scheduling. - let mut visited = bitvec![u8, Lsb0; 0; self.function.nodes.len()]; - let mut worklist = partition.iter().map(|id| *id).collect::<VecDeque<_>>(); - while let Some(id) = worklist.pop_front() { - if self.function.nodes[id.idx()].is_phi() - || self.function.nodes[id.idx()].is_reduce() - || get_uses(&self.function.nodes[id.idx()]) - .as_ref() - .into_iter() - // If this node isn't a phi or reduce, we need to check that - // all uses, as well as all reads we anti-depend with, have - // been emitted. - .chain(self.antideps.iter().filter_map(|(read, write)| { - if id == *write { - Some(read) - } else { - None - } - })) - // Only data dependencies within this partition need to have - // already been visited. - .all(|use_id| { - self.plan.partitions[use_id.idx()] != PartitionID::new(partition_idx) - || self.function.nodes[use_id.idx()].is_control() - || visited[use_id.idx()] - }) - { - // Once all of the data dependencies for this node are emitted, - // this node can be emitted. - self.compile_node( - id, - &control_id_to_block_id, - &data_id_to_svalue, - &fork_node_id_to_fork_join_id, - &mut blocks, - partition_idx, - manifest, - ); - visited.set(id.idx(), true); - } else { - // Skip emitting node if it's not a phi or reduce node and if - // its data uses are not emitted yet. - worklist.push_back(id); - } - } - - // Fourth, add the jump from the explicit entry block to the top node's - // block in the partition. 
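// The deferred-emission worklist above (the third step) boils down to the
// following minimal sketch, using a toy dependence map in place of real
// def-use and anti-dependence edges: a node is emitted only once everything
// it depends on inside the partition has been emitted, and is otherwise
// pushed to the back of the queue and retried.
use std::collections::{HashMap, VecDeque};

fn emission_order(nodes: &[u32], deps: &HashMap<u32, Vec<u32>>) -> Vec<u32> {
    let mut emitted: Vec<u32> = vec![];
    let mut worklist: VecDeque<u32> = nodes.iter().copied().collect();
    while let Some(node) = worklist.pop_front() {
        let ready = deps
            .get(&node)
            .map(|ds| ds.iter().all(|d| emitted.contains(d)))
            .unwrap_or(true);
        if ready {
            emitted.push(node);
        } else {
            // Not all dependencies emitted yet - try again later.
            worklist.push_back(node);
        }
    }
    emitted
}

fn main() {
    // Node 3 depends on 1 and 2, node 2 depends on 1; the queue starts reversed.
    let deps = HashMap::from([(2, vec![1]), (3, vec![1, 2])]);
    assert_eq!(emission_order(&[3, 2, 1], &deps), vec![1, 2, 3]);
}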
- if partition_idx != 0 { - // Explicitly jump to the block corresponding to the top of the - // partition. That block may be a parallel block, but it's not a - // reduce block. - let top_node = self.top_nodes[partition_idx]; - let top_block = control_id_to_block_id[&top_node]; - let parallel_entry = if self.function.nodes[top_node.idx()].is_fork() { - self.copy_schedules(top_node, &mut blocks[0]); - Some(self.compile_parallel_entry( - top_node, - &data_id_to_svalue, - &mut blocks[0], - partition_idx, - manifest, - )) - } else { - None - }; - blocks[0].insts.push(SInst::Jump { - target: top_block, - parallel_entry, - reduce_exit: None, - }); - blocks[0] - .virt_regs - .push((self.make_virt_reg(partition_idx), SType::Boolean)); - } - - // Fifth, make sure every block's schedules map is "filled". - for block in blocks.iter_mut() { - for inst_idx in 0..block.insts.len() { - let _ = block.schedules.try_insert(inst_idx, vec![]); - } - } - - SFunction { - blocks, - param_types: manifest.partitions[partition_idx] - .parameters - .iter() - .map(|(sty, _)| sty.clone()) - .collect(), - return_types: manifest.partitions[partition_idx] - .returns - .iter() - .map(|(sty, _)| sty.clone()) - .collect(), - } - } - - fn compile_node( - &self, - id: NodeID, - control_id_to_block_id: &HashMap<NodeID, BlockID>, - data_id_to_svalue: &HashMap<NodeID, SValue>, - fork_node_id_to_fork_join_id: &HashMap<NodeID, ForkJoinID>, - blocks: &mut Vec<SBlock>, - partition_idx: usize, - manifest: &Manifest, - ) { - let bb = self.bbs[id.idx()]; - let block_id = Cell::new(control_id_to_block_id[&bb]); - let mut block = take(&mut blocks[block_id.get().idx()]); - - // Uses of reduce nodes inside their corresponding reduce block need to - // refer to the reduction variable instruction, not the output of the - // reduce block. - let get_svalue = |id: NodeID| data_id_to_svalue[&id].clone(); - let self_virt_reg = || get_svalue(id).try_virt_reg().unwrap(); - - // Helper function to lower a jump to a particular control node. - let lower_jmp = |dst: NodeID, block: &mut SBlock| { - if let Some(block_id) = control_id_to_block_id.get(&dst) { - // The successor block is in this partition. Add extra info to - // the jump if we're jumping into a parallel section or out of a - // reduce section. Note that both of those may be true at once. - let parallel_entry = if self.function.nodes[dst.idx()].is_fork() { - self.copy_schedules(dst, block); - Some(self.compile_parallel_entry( - dst, - data_id_to_svalue, - block, - partition_idx, - manifest, - )) - } else { - None - }; - let reduce_exit = if self.function.nodes[id.idx()].is_join() { - Some(self.compile_reduce_exit(id, data_id_to_svalue)) - } else { - None - }; - block.insts.push(SInst::Jump { - target: *block_id, - parallel_entry, - reduce_exit, - }); - } else { - assert_ne!( - self.plan.partitions[id.idx()], - self.plan.partitions[dst.idx()] - ); - - // The successor block is in a different partition. 
- let next_partition = self.plan.partitions[dst.idx()]; - let data_outputs = manifest.partitions[partition_idx] - .returns - .iter() - .map(|(_, kind)| match kind { - ReturnKind::DataOutput(id) => get_svalue(*id).clone(), - ReturnKind::HerculesReturn => panic!("PANIC: Partition can't contain a HerculesReturn kind of return value when it jumps to another partition."), - ReturnKind::NextPartition => SValue::Constant(SConstant::Integer8(next_partition.idx() as i8)), - }) - .collect(); - - block.insts.push(SInst::PartitionExit { data_outputs }); - } - block - .virt_regs - .push((self.make_virt_reg(partition_idx), SType::Boolean)); - }; - - // Helper function to generate the dynamic constant math to compute the - // bounds of an array type of a node. - let lower_extents = |id: NodeID, block: &mut SBlock| { - self.types[self.typing[id.idx()].idx()] - .try_extents() - .unwrap() - .iter() - .map(|dc| self.compile_dynamic_constant(*dc, block, partition_idx, manifest)) - .collect() - }; - - // Emit schedule IR instructions corresponding to this Hercules IR node. - match self.function.nodes[id.idx()] { - // Forks are super simple to lower here. Since what's sequential / - // parallel / reducing is encoded in basic block kinds, and entry / - // exits are handled in `lower_jmp`, we just need to add a jump like - // any other control block with one successor. - Node::Start - | Node::Region { preds: _ } - | Node::Projection { - control: _, - selection: _, - } - | Node::Fork { - control: _, - factors: _, - } => { - let mut succs = self.control_subgraph.succs(id); - assert_eq!(succs.len(), 1); - let succ = succs.next().unwrap(); - lower_jmp(succ, &mut block); - } - Node::Join { control: _ } => { - let mut succs = self.control_subgraph.succs(id); - assert_eq!(succs.len(), 1); - let succ = succs.next().unwrap(); - if self.plan.partitions[id.idx()] != self.plan.partitions[succ.idx()] { - // If the successor is in another partition, we need to add - // a sequential block to hold the PartitionExit. Add a jump, - // with reduce exit metadata, to the reduce block. - let exit_block_id = BlockID::new(blocks.len()); - let reduce_exit = self.compile_reduce_exit(id, data_id_to_svalue); - block.insts.push(SInst::Jump { - target: exit_block_id, - parallel_entry: None, - reduce_exit: Some(reduce_exit), - }); - block - .virt_regs - .push((self.make_virt_reg(partition_idx), SType::Boolean)); - // The exit block contains just a PartitionExit instruction. - let mut exit_block = SBlock::default(); - // `lower_jmp` depends on `block_id`, so temporarily update. - let old_block_id = block_id.get(); - block_id.set(exit_block_id); - lower_jmp(succ, &mut exit_block); - block_id.set(old_block_id); - blocks.push(exit_block); - } else { - // Otherwise, lower the jump as normal. 
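// The cross-partition case handled above can be sketched on its own: the
// PartitionExit's operand list follows the partition's return kinds, reading
// each data output's SValue and materializing the next-partition selector as
// an i8 constant. `SVal` and `RetKind` are simplified stand-ins for the real
// schedule IR types.
#[derive(Debug, Clone, PartialEq)]
enum SVal {
    VirtReg(usize),
    ConstI8(i8),
}

enum RetKind {
    DataOutput(usize), // node id whose SValue is returned
    NextPartition,
}

fn lower_partition_exit(
    returns: &[RetKind],
    svalue_of: impl Fn(usize) -> SVal,
    next_partition: usize,
) -> Vec<SVal> {
    returns
        .iter()
        .map(|kind| match kind {
            RetKind::DataOutput(id) => svalue_of(*id),
            RetKind::NextPartition => SVal::ConstI8(next_partition as i8),
        })
        .collect()
}

fn main() {
    // Data outputs of nodes 4 and 6 live in virtual registers 10 and 11.
    let svalue_of = |id: usize| SVal::VirtReg(if id == 4 { 10 } else { 11 });
    let rets = [RetKind::DataOutput(4), RetKind::DataOutput(6), RetKind::NextPartition];
    let exit = lower_partition_exit(&rets, svalue_of, 2);
    assert_eq!(
        exit,
        vec![SVal::VirtReg(10), SVal::VirtReg(11), SVal::ConstI8(2)]
    );
}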
- lower_jmp(succ, &mut block); - } - } - Node::If { control: _, cond } => { - let mut succs = self.control_subgraph.succs(id); - let mut proj1 = succs.next().unwrap(); - let mut proj2 = succs.next().unwrap(); - assert_eq!(succs.next(), None); - if self.function.nodes[proj1.idx()].try_proj().unwrap().1 == 1 { - swap(&mut proj1, &mut proj2); - } - block.insts.push(SInst::Branch { - cond: get_svalue(cond).clone(), - false_target: control_id_to_block_id[&self.bbs[proj1.idx()]], - true_target: control_id_to_block_id[&self.bbs[proj2.idx()]], - }); - block - .virt_regs - .push((self.make_virt_reg(partition_idx), SType::Boolean)); - } - Node::Return { control: _, data } => { - block.insts.push(SInst::Return { - value: get_svalue(data).clone(), - }); - block - .virt_regs - .push((self.make_virt_reg(partition_idx), SType::Boolean)); - } - - Node::Phi { control, ref data } => { - let control_uses = get_uses(&self.function.nodes[control.idx()]); - let mut found_in_partition_predecessor = false; - let mut found_out_of_partition_predecessor = false; - let inputs = zip(control_uses.as_ref().iter(), data.iter()) - .filter_map(|(control_use, data_id)| { - if let Some(block_id) = control_id_to_block_id.get(control_use) { - // If any of the predecessors are in this partition, - // we actually generate a phi instruction. - // Otherwise, we just need to refer to the parameter - // of the SFunction corresponding to this phi. - found_in_partition_predecessor = true; - Some((*block_id, get_svalue(*data_id).clone())) - } else if let Some(param_idx) = manifest.partitions[partition_idx] - .parameters - .iter() - .position(|(_, kind)| *kind == ParameterKind::DataInput(id)) - { - // This input to the phi is corresponds to all of - // the inputs from control locations outside this - // partition. This does *not* include constant nodes - // in other partitions - those get propagated (see - // below). Don't add multiple inputs for block #0. - if found_out_of_partition_predecessor { - return None; - } - // This predecessor for the phi gets passed in - // via a parameter set up for this phi. - found_out_of_partition_predecessor = true; - Some((BlockID::new(0), SValue::VirtualRegister(param_idx))) - } else { - // This input to the phi is a constant located - // outside this partition these get propagated in - // schedule IR. - found_in_partition_predecessor = true; - let svalue = get_svalue(*data_id).clone(); - Some((BlockID::new(0), svalue)) - } - }) - .collect(); - - // If there's at least one predecessor inside this partition, we - // need to generate an actual phi instruction. - if found_in_partition_predecessor { - block.insts.push(SInst::Phi { inputs }); - block.virt_regs.push(( - self_virt_reg(), - self.stypes[self.typing[id.idx()].idx()].clone().unwrap(), - )); - } - } - - Node::ThreadID { control, dimension } => { - let fork_join = fork_node_id_to_fork_join_id[&control]; - block.insts.push(SInst::ThreadID { - dimension, - fork_join, - }); - block - .virt_regs - .push((self_virt_reg(), SType::UnsignedInteger64)); - } - Node::Reduce { - control, - init: _, - reduct: _, - } => { - // Determine the reduction variable number based on the users of - // the join node. 
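// The phi lowering above can be pictured with a toy classifier: predecessors
// inside the current partition keep their own value as a phi input, while all
// out-of-partition predecessors collapse into a single input read from the
// partition parameter set up for the phi (constants defined in other
// partitions are instead propagated inline, which this sketch does not
// model). The types and names here are simplified stand-ins, not the real
// lowering.
#[derive(Debug, PartialEq)]
enum PhiInput {
    InPartition { pred_block: usize, value: usize },
    FromParameter { param_idx: usize },
}

fn classify_phi_inputs(
    preds: &[(usize, bool, usize)], // (pred id, is pred in this partition, incoming value)
    phi_param_idx: usize,
) -> Vec<PhiInput> {
    let mut inputs = vec![];
    let mut used_param = false;
    for (pred, in_partition, value) in preds {
        if *in_partition {
            inputs.push(PhiInput::InPartition { pred_block: *pred, value: *value });
        } else if !used_param {
            // All external predecessors share one parameter-backed input.
            used_param = true;
            inputs.push(PhiInput::FromParameter { param_idx: phi_param_idx });
        }
    }
    inputs
}

fn main() {
    // Two in-partition predecessors and two external ones.
    let preds = [(1, true, 40), (2, false, 41), (3, false, 42), (4, true, 43)];
    let inputs = classify_phi_inputs(&preds, 0);
    assert_eq!(
        inputs,
        vec![
            PhiInput::InPartition { pred_block: 1, value: 40 },
            PhiInput::FromParameter { param_idx: 0 },
            PhiInput::InPartition { pred_block: 4, value: 43 },
        ]
    );
}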
- let number = self - .def_use - .get_users(control) - .iter() - .filter(|user| self.function.nodes[user.idx()].is_reduce()) - .position(|user| *user == id) - .unwrap(); - self.copy_schedules(id, &mut block); - block.insts.push(SInst::ReductionVariable { number }); - block.virt_regs.push(( - self_virt_reg(), - self.stypes[self.typing[id.idx()].idx()].clone().unwrap(), - )); - } - - Node::Unary { input, op } => { - block.insts.push(SInst::Unary { - input: get_svalue(input).clone(), - op: convert_unary_op(op, &self.stypes), - }); - block.virt_regs.push(( - self_virt_reg(), - self.stypes[self.typing[id.idx()].idx()].clone().unwrap(), - )); - } - Node::Binary { left, right, op } => { - block.insts.push(SInst::Binary { - left: get_svalue(left).clone(), - right: get_svalue(right).clone(), - op: convert_binary_op(op), - }); - block.virt_regs.push(( - self_virt_reg(), - self.stypes[self.typing[id.idx()].idx()].clone().unwrap(), - )); - } - Node::Ternary { - first, - second, - third, - op, - } => { - block.insts.push(SInst::Ternary { - first: get_svalue(first).clone(), - second: get_svalue(second).clone(), - third: get_svalue(third).clone(), - op: convert_ternary_op(op), - }); - block.virt_regs.push(( - self_virt_reg(), - self.stypes[self.typing[id.idx()].idx()].clone().unwrap(), - )); - } - Node::IntrinsicCall { - intrinsic, - ref args, - } => { - let args = args.iter().map(|id| get_svalue(*id).clone()).collect(); - block.insts.push(SInst::IntrinsicCall { intrinsic, args }); - block.virt_regs.push(( - self_virt_reg(), - self.stypes[self.typing[id.idx()].idx()].clone().unwrap(), - )); - } - - Node::Read { - collect, - ref indices, - } => { - let mut collect_svalue = get_svalue(collect); - let mut prod_indices = &indices[..]; - - // We currently only support top-level arrays and products. The - // array and product portions become separate instructions. - // Since arrays are always root types, handle them first. - if let Some(position) = indices[0].try_position() { - // If there's both an array load and a product extract, we - // need to allocate an intermediary virtual register. - let dst_virt_reg = if indices.len() > 1 { - self.make_virt_reg(partition_idx) - } else { - self_virt_reg() - }; - - let position = position.iter().map(|id| get_svalue(*id)).collect(); - // Array loads need the dynamic constant bounds for indexing - // math. - let bounds = lower_extents(collect, &mut block); - let load_ty = if let SType::ArrayRef(elem_ty) = self.stypes - [self.typing[collect.idx()].idx()] - .clone() - .unwrap() - { - *elem_ty - } else { - panic!("PANIC: Type of collection isn't an array when an ArrayLoad use is generated.") - }; - block.insts.push(SInst::ArrayLoad { - array: collect_svalue, - position, - bounds, - }); - block.virt_regs.push((dst_virt_reg, load_ty)); - - // The product extract needs to extract from the product - // loaded from the array. - collect_svalue = SValue::VirtualRegister(dst_virt_reg); - prod_indices = &indices[1..]; - } - - // Handle the product indices. - if prod_indices.len() > 0 { - let indices = prod_indices - .iter() - .map(|index| index.try_field().unwrap()) - .collect(); - block.insts.push(SInst::ProductExtract { - product: collect_svalue, - indices, - }); - block.virt_regs.push(( - self_virt_reg(), - self.stypes[self.typing[id.idx()].idx()].clone().unwrap(), - )); - } - } - Node::Write { - collect, - data, - ref indices, - } => { - // We currently only support top-level arrays and products. - // There are three cases that we handle separately: - // 1. Writing to an array. 
This just lowers to an ArrayStore. - // 2. Writing to a product inside an array. This lowers to an - // ArrayLoad to get the initial product value, a - // ProductInsert to update the product value, and an - // ArrayStore to write the new product value into the array. - // 3. Writing to a product. This just lowers to a ProductInsert. - - if let Some(position) = indices[0].try_position() - && indices.len() == 1 - { - // Handle case #1. - let position = position.iter().map(|id| get_svalue(*id)).collect(); - // Array stores need the dynamic constant bounds for - // indexing math. - let bounds = lower_extents(collect, &mut block); - block.insts.push(SInst::ArrayStore { - array: get_svalue(collect), - value: get_svalue(data), - position, - bounds, - }); - // Array stores don't produce a meaningful virtual register. - block - .virt_regs - .push((self.make_virt_reg(partition_idx), SType::Boolean)); - } else if let Some(position) = indices[0].try_position() { - // Handle case #2. - let position = position - .iter() - .map(|id| get_svalue(*id)) - .collect::<Box<[_]>>(); - let bounds = lower_extents(collect, &mut block); - - // Load the product. - let load_virt_reg = self.make_virt_reg(partition_idx); - let load_ty = if let SType::ArrayRef(elem_ty) = self.stypes - [self.typing[collect.idx()].idx()] - .clone() - .unwrap() - { - *elem_ty - } else { - panic!("PANIC: Type of collection isn't an array when an ArrayLoad use is generated.") - }; - block.insts.push(SInst::ArrayLoad { - array: get_svalue(collect), - position: position.clone(), - bounds: bounds.clone(), - }); - block.virt_regs.push((load_virt_reg, load_ty.clone())); - - // Update the product. - let update_virt_reg = self.make_virt_reg(partition_idx); - let indices = indices[1..] - .iter() - .map(|index| index.try_field().unwrap()) - .collect(); - block.insts.push(SInst::ProductInsert { - product: SValue::VirtualRegister(load_virt_reg), - data: get_svalue(data), - indices, - }); - block.virt_regs.push((update_virt_reg, load_ty)); - - // Store the product. - block.insts.push(SInst::ArrayStore { - array: get_svalue(collect), - value: SValue::VirtualRegister(update_virt_reg), - position, - bounds, - }); - block - .virt_regs - .push((self.make_virt_reg(partition_idx), SType::Boolean)); - } else { - // Handle case #3. - let indices = indices - .iter() - .map(|index| index.try_field().unwrap()) - .collect(); - block.insts.push(SInst::ProductInsert { - product: get_svalue(collect), - data: get_svalue(data), - indices, - }); - // Product insertions do produce a virtual register, since - // they create a new product value. - block.virt_regs.push(( - self_virt_reg(), - self.stypes[self.typing[id.idx()].idx()].clone().unwrap(), - )); - } - } - - // There are a few nodes for which no code needs to get emitted. - _ => {} - } - - blocks[block_id.get().idx()] = block; - } - - /* - * Helper to copy over schedules. - */ - fn copy_schedules(&self, src: NodeID, block: &mut SBlock) { - block.schedules.insert( - block.insts.len(), - self.plan.schedules[src.idx()] - .iter() - .map(|schedule| sched_make_schedule(schedule)) - .collect(), - ); - } - - /* - * Compiles a reference to a dynamic constant into math to compute that - * dynamic constant. We need a mutable reference to some basic block, since - * we may need to generate math inline to compute the dynamic constant. 
- */ - fn compile_dynamic_constant( - &self, - dc: DynamicConstantID, - block: &mut SBlock, - partition_idx: usize, - manifest: &Manifest, - ) -> SValue { - match self.dynamic_constants[dc.idx()] { - DynamicConstant::Constant(cons) => { - SValue::Constant(SConstant::UnsignedInteger64(cons as u64)) - } - DynamicConstant::Parameter(idx) => SValue::VirtualRegister( - manifest.partitions[partition_idx] - .parameters - .iter() - .position(|(_, kind)| *kind == ParameterKind::DynamicConstant(idx)) - .unwrap(), - ), - - DynamicConstant::Add(left, right) - | DynamicConstant::Sub(left, right) - | DynamicConstant::Mul(left, right) - | DynamicConstant::Div(left, right) - | DynamicConstant::Rem(left, right) => { - let left = self.compile_dynamic_constant(left, block, partition_idx, manifest); - let right = self.compile_dynamic_constant(right, block, partition_idx, manifest); - let output_virt_reg = self.make_virt_reg(partition_idx); - block.insts.push(SInst::Binary { - left, - right, - op: match self.dynamic_constants[dc.idx()] { - DynamicConstant::Add(_, _) => SBinaryOperator::Add, - DynamicConstant::Sub(_, _) => SBinaryOperator::Sub, - DynamicConstant::Mul(_, _) => SBinaryOperator::Mul, - DynamicConstant::Div(_, _) => SBinaryOperator::Div, - DynamicConstant::Rem(_, _) => SBinaryOperator::Rem, - _ => panic!(), - }, - }); - block - .virt_regs - .push((output_virt_reg, SType::UnsignedInteger64)); - SValue::VirtualRegister(output_virt_reg) - } - } - } - - /* - * Makes a parallel entry for a jump to a fork. - */ - fn compile_parallel_entry( - &self, - fork: NodeID, - data_id_to_svalue: &HashMap<NodeID, SValue>, - block: &mut SBlock, - partition_idx: usize, - manifest: &Manifest, - ) -> ParallelEntry { - let (_, factors) = self.function.nodes[fork.idx()].try_fork().unwrap(); - let thread_counts = factors - .iter() - .map(|dc_id| self.compile_dynamic_constant(*dc_id, block, partition_idx, manifest)) - .collect(); - let reduce_inits = self - .def_use - .get_users(self.fork_join_map[&fork]) - .iter() - .filter_map(|user| self.function.nodes[user.idx()].try_reduce()) - .map(|(_, init, _)| data_id_to_svalue[&init].clone()) - .collect(); - ParallelEntry { - thread_counts, - reduce_inits, - } - } - - /* - * Makes a reduce exit for a jump from a join. - */ - fn compile_reduce_exit( - &self, - join: NodeID, - data_id_to_svalue: &HashMap<NodeID, SValue>, - ) -> ReduceExit { - let reduce_reducts = self - .def_use - .get_users(join) - .iter() - .filter(|user| self.function.nodes[user.idx()].is_reduce()) - .map(|reduce| { - // The SValues that get passed to the reduce exit are the - // `reduct` input to the reduce node. - data_id_to_svalue[&get_uses(&self.function.nodes[reduce.idx()]).as_ref()[2]].clone() - }) - .collect(); - ReduceExit { reduce_reducts } - } - - fn make_virt_reg(&self, partition_idx: usize) -> usize { - let virt_reg = self.num_virtual_registers[partition_idx].get(); - self.num_virtual_registers[partition_idx].set(virt_reg + 1); - virt_reg - } - - fn get_sfunction_name(&self, partition_idx: usize) -> SFunctionName { - format!("{}_{}", self.function.name, partition_idx) - } - - /* - * There is some information we can only add to the manifest once we've - * computed the schedule IR. - */ - fn update_manifest(&self, manifest: &mut PartitionManifest, function: &SFunction) { - let parallel_reduce_infos = sched_parallel_reduce_sections(function); - - // Add parallel launch info for CPU partitions. This relies on checking - // schedules inside the generated schedule IR. 
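// The recursive dynamic-constant lowering above can be condensed into this
// compact sketch: leaves become u64 constants or parameter registers, and
// every arithmetic node emits one Binary instruction into the current block
// and returns the fresh virtual register holding its result. `Dc`, `Val`, and
// `BinOp` are simplified stand-ins for the real schedule IR types, and only
// Add and Mul are modelled.
#[derive(Clone, Copy, Debug, PartialEq)]
enum BinOp { Add, Mul }

#[derive(Clone, Debug, PartialEq)]
enum Val { Const(u64), VirtReg(usize) }

enum Dc {
    Constant(u64),
    Parameter(usize), // already passed in as the given virtual register
    Add(Box<Dc>, Box<Dc>),
    Mul(Box<Dc>, Box<Dc>),
}

fn compile_dc(dc: &Dc, insts: &mut Vec<(usize, BinOp, Val, Val)>, next_reg: &mut usize) -> Val {
    match dc {
        Dc::Constant(c) => Val::Const(*c),
        Dc::Parameter(reg) => Val::VirtReg(*reg),
        Dc::Add(l, r) | Dc::Mul(l, r) => {
            let left = compile_dc(l, insts, next_reg);
            let right = compile_dc(r, insts, next_reg);
            let out = *next_reg;
            *next_reg += 1;
            let op = if matches!(dc, Dc::Add(_, _)) { BinOp::Add } else { BinOp::Mul };
            insts.push((out, op, left, right));
            Val::VirtReg(out)
        }
    }
}

fn main() {
    // (param0 * 4) + 1, with parameter 0 already living in virtual register 0.
    let dc = Dc::Add(
        Box::new(Dc::Mul(Box::new(Dc::Parameter(0)), Box::new(Dc::Constant(4)))),
        Box::new(Dc::Constant(1)),
    );
    let mut insts = vec![];
    let mut next_reg = 1;
    let result = compile_dc(&dc, &mut insts, &mut next_reg);
    assert_eq!(result, Val::VirtReg(2));
    assert_eq!(insts.len(), 2); // one Binary per arithmetic node
}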
- let partition_name = manifest.name.clone(); - if let Some(tiles) = function.blocks[0].schedules[&0] - .iter() - .filter_map(|schedule| schedule.try_parallel_launch()) - .next() - && parallel_reduce_infos - .into_iter() - .any(|(_, info)| info.top_level) - && let DeviceManifest::CPU { parallel_launch } = &mut manifest.device - { - let parallel_entry = function.blocks[0].insts[0].try_jump().unwrap().1.unwrap(); - assert_eq!(tiles.len(), parallel_entry.thread_counts.len()); - let top_level_fork_id = self - .fork_join_nest - .iter() - // Find control nodes in the fork join nesting whose only nest - // it itself (is a top level fork-join). - .filter(|(id, nest)| nest.len() == 1 && nest[0] == **id) - // Only consider forks in this partition. - .filter(|(id, _)| { - self.get_sfunction_name(self.plan.partitions[id.idx()].idx()) == partition_name - }) - .next() - .unwrap() - .0; - *parallel_launch = zip( - tiles.into_iter(), - self.function.nodes[top_level_fork_id.idx()] - .try_fork() - .unwrap() - .1, - ) - .map(|(num_chunks, count_dc_id)| (*num_chunks, *count_dc_id)) - .collect(); - } - } -} - -fn convert_unary_op(op: UnaryOperator, simple_ir_types: &[Option<SType>]) -> SUnaryOperator { - match op { - UnaryOperator::Not => SUnaryOperator::Not, - UnaryOperator::Neg => SUnaryOperator::Neg, - UnaryOperator::Cast(ty) => SUnaryOperator::Cast(simple_ir_types[ty.idx()].clone().unwrap()), - } -} - -fn convert_binary_op(op: BinaryOperator) -> SBinaryOperator { - match op { - BinaryOperator::Add => SBinaryOperator::Add, - BinaryOperator::Sub => SBinaryOperator::Sub, - BinaryOperator::Mul => SBinaryOperator::Mul, - BinaryOperator::Div => SBinaryOperator::Div, - BinaryOperator::Rem => SBinaryOperator::Rem, - BinaryOperator::LT => SBinaryOperator::LT, - BinaryOperator::LTE => SBinaryOperator::LTE, - BinaryOperator::GT => SBinaryOperator::GT, - BinaryOperator::GTE => SBinaryOperator::GTE, - BinaryOperator::EQ => SBinaryOperator::EQ, - BinaryOperator::NE => SBinaryOperator::NE, - BinaryOperator::Or => SBinaryOperator::Or, - BinaryOperator::And => SBinaryOperator::And, - BinaryOperator::Xor => SBinaryOperator::Xor, - BinaryOperator::LSh => SBinaryOperator::LSh, - BinaryOperator::RSh => SBinaryOperator::RSh, - } -} - -fn convert_ternary_op(op: TernaryOperator) -> STernaryOperator { - match op { - TernaryOperator::Select => STernaryOperator::Select, - } -} diff --git a/hercules_cg/src/sched_ir.rs b/hercules_cg/src/sched_ir.rs deleted file mode 100644 index acbc74d2e3101eec164f90c5632099c8f1fe8935..0000000000000000000000000000000000000000 --- a/hercules_cg/src/sched_ir.rs +++ /dev/null @@ -1,673 +0,0 @@ -extern crate ordered_float; -extern crate serde; - -extern crate hercules_ir; - -use std::collections::HashMap; - -use self::serde::Deserialize; -use self::serde::Serialize; - -use self::hercules_ir::*; - -use crate::*; - -/* - * A schedule IR module is a list of functions and a description of each - * Hercules function in terms of schedule IR functions (called the manifest). - */ -#[derive(Debug, Default, Clone)] -pub struct SModule { - // Refer to schedule IR functions by their name. - pub functions: HashMap<SFunctionName, SFunction>, - // Each Hercules function maps to a manifest. - pub manifests: HashMap<String, Manifest>, -} - -/* - * A schedule IR function consists of a CFG of basic blocks, each containing - * instructions. Instructions can produce virtual register outputs, and SSA form - * is guaranteed. 
SFunctions can have multiple parameters and return values - - * many values may cross partition boundaries at once. - * - * Since SFunctions represent partitions, many SFunctions don't "return". - * Instead, conceptually they "jump" to the next partition to run. SFunctions - * that jump to another partition contain the "PartitionExit" instruction, while - * SFunctions that return from the Hercules function contain the "Return" - * instruction. An SFunction must contain either PartitionExits xor Returns. - * - * There are two special kinds of basic blocks for representing fork-joins: - * parallel blocks and reduce blocks. Each parallel / reduce block is associated - * with a unique ID per fork-join. A parallel block can contain a "ThreadID" - * instruction, which gets the Nth thread ID. A reduce block can contain a - * "Reduce" instruction, which gets the last value of the Mth reduction - * variable. When jumping to a parallel block, a u64 must be provided per fork - * dimension, specifying how many threads should spawn, and an initial value per - * reduction variable must be provided. When jumping out of a reduce block, a - * "new" value for each reduction variable must be provided. - */ -#[derive(Debug, Default, Clone)] -pub struct SFunction { - pub blocks: Vec<SBlock>, - pub param_types: Vec<SType>, - pub return_types: Vec<SType>, -} - -impl SFunction { - pub fn get_inst(&self, id: InstID) -> &SInst { - &self.blocks[id.idx_0()].insts[id.idx_1()] - } - - pub fn get_inst_mut(&mut self, id: InstID) -> &mut SInst { - &mut self.blocks[id.idx_0()].insts[id.idx_1()] - } -} - -/* - * Use a very simple representation for blocks, since modification is not a - * priority. Unlike many IRs (say LLVM), the instructions in schedule IR blocks - * aren't necessarily ordered, as different backends may have different - * scheduling considerations. This means that, for example, each SBlock must - * contain exactly one terminating instruction, but the position of that - * instruction may not be at the end of the block. All that's required is that - * defs precede uses, and that loads and stores to array references are ordered. - */ -#[derive(Debug, Default, Clone)] -pub struct SBlock { - pub insts: Vec<SInst>, - // The virtual registers created by each instruction. Technically, this will - // assign instructions like ArrayStores and Regions a virtual register, - // which doesn't make sense. These virtual registers are just ignored. Each - // virtual register has a certain schedule IR type. The type of virtual - // registers produced by certain instructions, like Jump or ArrayStore, is - // set to SType::Boolean, but it's not meaningful. - pub virt_regs: Vec<(usize, SType)>, - // Map from instruction index in the block to a list of schedules attached - // to that instruction. 
- pub schedules: HashMap<usize, Vec<SSchedule>>, - pub kind: SBlockKind, -} - -impl SBlock { - pub fn successors(&self) -> BlockSuccessors { - self.insts - .iter() - .map(|inst| inst.block_successors()) - .filter(|successors| *successors != BlockSuccessors::Zero) - .next() - .unwrap_or(BlockSuccessors::Zero) - } -} - -#[derive(Debug, Default, Clone, PartialEq, Eq)] -pub enum SBlockKind { - #[default] - Sequential, - Parallel(ForkJoinID), - Reduce(ForkJoinID), -} - -impl SBlockKind { - pub fn try_parallel(&self) -> Option<ForkJoinID> { - if let SBlockKind::Parallel(id) = self { - Some(*id) - } else { - None - } - } - - pub fn try_reduce(&self) -> Option<ForkJoinID> { - if let SBlockKind::Reduce(id) = self { - Some(*id) - } else { - None - } - } - - pub fn try_fork_join_id(&self) -> Option<ForkJoinID> { - match self { - SBlockKind::Sequential => None, - SBlockKind::Parallel(id) | SBlockKind::Reduce(id) => Some(*id), - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum SSchedule { - ParallelLaunch(Box<[usize]>), - ParallelReduce, - Vectorizable(usize), - Associative, -} - -impl SSchedule { - pub fn try_parallel_launch(&self) -> Option<&[usize]> { - if let SSchedule::ParallelLaunch(tiles) = self { - Some(tiles) - } else { - None - } - } - - pub fn try_vectorizable(&self) -> Option<usize> { - if let SSchedule::Vectorizable(width) = self { - Some(*width) - } else { - None - } - } -} - -pub fn sched_make_schedule(schedule: &Schedule) -> SSchedule { - match schedule { - Schedule::ParallelFork(tiles) => SSchedule::ParallelLaunch(tiles.clone()), - Schedule::ParallelReduce => SSchedule::ParallelReduce, - Schedule::Vectorizable(width) => SSchedule::Vectorizable(*width), - Schedule::Associative => SSchedule::Associative, - } -} - -/* - * Unlike Hercules IR, we can represent a reference to an array (so that we - * don't need to use an array value in this IR). This is fine, since we're not - * doing much analysis / optimization at this stage, and most platforms we want - * to target have a similar model for working with arrays anyway. We still need - * value product types, since the layout of these types may be platform - * dependent. - */ -#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] -pub enum SType { - Boolean, - Integer8, - Integer16, - Integer32, - Integer64, - UnsignedInteger8, - UnsignedInteger16, - UnsignedInteger32, - UnsignedInteger64, - Float32, - Float64, - // Don't intern STypes to make developing the code generator easier. - Product(Box<[SType]>), - // Array types don't include their bounds, since dynamic constants are not - // an IR-level concept in schedule IR. 
- ArrayRef(Box<SType>), -} - -impl SType { - pub fn is_float(&self) -> bool { - match self { - SType::Float32 | SType::Float64 => true, - _ => false, - } - } - - pub fn is_unsigned(&self) -> bool { - match self { - SType::UnsignedInteger8 - | SType::UnsignedInteger16 - | SType::UnsignedInteger32 - | SType::UnsignedInteger64 => true, - _ => false, - } - } - - pub fn is_signed(&self) -> bool { - match self { - SType::Integer8 | SType::Integer16 | SType::Integer32 | SType::Integer64 => true, - _ => false, - } - } - - pub fn is_integer(&self) -> bool { - self.is_unsigned() || self.is_signed() || *self == SType::Boolean - } - - pub fn num_bits(&self) -> u8 { - match self { - SType::Boolean => 1, - SType::Integer8 | SType::UnsignedInteger8 => 8, - SType::Integer16 | SType::UnsignedInteger16 => 16, - SType::Integer32 | SType::UnsignedInteger32 | SType::Float32 => 32, - SType::Integer64 | SType::UnsignedInteger64 | SType::Float64 => 64, - _ => panic!(), - } - } - - pub fn try_product(&self) -> Option<&[SType]> { - if let SType::Product(fields) = self { - Some(fields) - } else { - None - } - } -} - -/* - * Represents constants, except for array constants. - */ -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum SConstant { - Boolean(bool), - Integer8(i8), - Integer16(i16), - Integer32(i32), - Integer64(i64), - UnsignedInteger8(u8), - UnsignedInteger16(u16), - UnsignedInteger32(u32), - UnsignedInteger64(u64), - Float32(ordered_float::OrderedFloat<f32>), - Float64(ordered_float::OrderedFloat<f64>), - // Don't intern SConstants to make developing the code generator easier. - Product(Box<[SConstant]>), -} - -impl SConstant { - pub fn get_type(&self) -> SType { - match self { - SConstant::Boolean(_) => SType::Boolean, - SConstant::Integer8(_) => SType::Integer8, - SConstant::Integer16(_) => SType::Integer16, - SConstant::Integer32(_) => SType::Integer32, - SConstant::Integer64(_) => SType::Integer64, - SConstant::UnsignedInteger8(_) => SType::UnsignedInteger8, - SConstant::UnsignedInteger16(_) => SType::UnsignedInteger16, - SConstant::UnsignedInteger32(_) => SType::UnsignedInteger32, - SConstant::UnsignedInteger64(_) => SType::UnsignedInteger64, - SConstant::Float32(_) => SType::Float32, - SConstant::Float64(_) => SType::Float64, - SConstant::Product(fields) => { - SType::Product(fields.into_iter().map(|field| field.get_type()).collect()) - } - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum SValue { - Constant(SConstant), - VirtualRegister(usize), -} - -impl SValue { - pub fn try_virt_reg(&self) -> Option<usize> { - if let SValue::VirtualRegister(vr) = self { - Some(*vr) - } else { - None - } - } - - pub fn try_constant(&self) -> Option<&SConstant> { - if let SValue::Constant(cons) = self { - Some(cons) - } else { - None - } - } -} - -/* - * Typical instructions of a CFG + SSA IR, plus some instructions for - * representing particular Hercules IR quirks. 
- */ -#[derive(Debug, Clone)] -pub enum SInst { - Phi { - inputs: Box<[(BlockID, SValue)]>, - }, - ThreadID { - dimension: usize, - fork_join: ForkJoinID, - }, - ReductionVariable { - number: usize, - }, - Jump { - target: BlockID, - parallel_entry: Option<ParallelEntry>, - reduce_exit: Option<ReduceExit>, - }, - Branch { - cond: SValue, - false_target: BlockID, - true_target: BlockID, - }, - PartitionExit { - data_outputs: Box<[SValue]>, - }, - Return { - value: SValue, - }, - Unary { - input: SValue, - op: SUnaryOperator, - }, - Binary { - left: SValue, - right: SValue, - op: SBinaryOperator, - }, - Ternary { - first: SValue, - second: SValue, - third: SValue, - op: STernaryOperator, - }, - IntrinsicCall { - intrinsic: Intrinsic, - args: Box<[SValue]>, - }, - ProductExtract { - product: SValue, - indices: Box<[usize]>, - }, - ProductInsert { - product: SValue, - data: SValue, - indices: Box<[usize]>, - }, - ArrayLoad { - array: SValue, - position: Box<[SValue]>, - bounds: Box<[SValue]>, - }, - ArrayStore { - array: SValue, - value: SValue, - position: Box<[SValue]>, - bounds: Box<[SValue]>, - }, -} - -impl SInst { - pub fn is_reduction_variable(&self) -> bool { - if let SInst::ReductionVariable { number: _ } = self { - true - } else { - false - } - } - - pub fn is_phi(&self) -> bool { - if let SInst::Phi { inputs: _ } = self { - true - } else { - false - } - } - - pub fn is_jump(&self) -> bool { - if let SInst::Jump { - target: _, - parallel_entry: _, - reduce_exit: _, - } = self - { - true - } else { - false - } - } - - pub fn is_branch(&self) -> bool { - if let SInst::Branch { - cond: _, - false_target: _, - true_target: _, - } = self - { - true - } else { - false - } - } - - pub fn is_partition_exit(&self) -> bool { - if let SInst::PartitionExit { data_outputs: _ } = self { - true - } else { - false - } - } - - pub fn is_return(&self) -> bool { - if let SInst::Return { value: _ } = self { - true - } else { - false - } - } - - pub fn is_terminator(&self) -> bool { - self.is_jump() || self.is_branch() || self.is_partition_exit() || self.is_return() - } - - pub fn try_thread_id(&self) -> Option<(usize, ForkJoinID)> { - if let SInst::ThreadID { - dimension, - fork_join, - } = self - { - Some((*dimension, *fork_join)) - } else { - None - } - } - - pub fn try_reduction_variable(&self) -> Option<usize> { - if let SInst::ReductionVariable { number } = self { - Some(*number) - } else { - None - } - } - - pub fn try_jump(&self) -> Option<(BlockID, Option<&ParallelEntry>, Option<&ReduceExit>)> { - if let SInst::Jump { - target, - parallel_entry, - reduce_exit, - } = self - { - Some((*target, parallel_entry.as_ref(), reduce_exit.as_ref())) - } else { - None - } - } - - pub fn block_successors(&self) -> BlockSuccessors { - match self { - SInst::Jump { - target, - parallel_entry: _, - reduce_exit: _, - } => BlockSuccessors::One([*target]), - SInst::Branch { - cond: _, - false_target, - true_target, - } => BlockSuccessors::Two([*false_target, *true_target]), - _ => BlockSuccessors::Zero, - } - } - - pub fn upper_case_name(&self) -> &'static str { - match self { - SInst::Phi { inputs: _ } => "Phi", - SInst::ThreadID { - dimension: _, - fork_join: _, - } => "ThreadID", - SInst::ReductionVariable { number: _ } => "ReductionVariable", - SInst::Jump { - target: _, - parallel_entry: _, - reduce_exit: _, - } => "Jump", - SInst::Branch { - cond: _, - false_target: _, - true_target: _, - } => "Branch", - SInst::PartitionExit { data_outputs: _ } => "PartitionExit", - SInst::Return { value: _ } => "Return", - 
SInst::Unary { input: _, op } => op.upper_case_name(), - SInst::Binary { - left: _, - right: _, - op, - } => op.upper_case_name(), - SInst::Ternary { - first: _, - second: _, - third: _, - op, - } => op.upper_case_name(), - SInst::IntrinsicCall { intrinsic, args: _ } => intrinsic.upper_case_name(), - SInst::ProductExtract { - product: _, - indices: _, - } => "ProductExtract", - SInst::ProductInsert { - product: _, - data: _, - indices: _, - } => "ProductInsert", - SInst::ArrayLoad { - array: _, - position: _, - bounds: _, - } => "ArrayLoad", - SInst::ArrayStore { - array: _, - value: _, - position: _, - bounds: _, - } => "ArrayStore", - } - } -} - -#[derive(Debug, PartialEq, Eq)] -pub enum BlockSuccessors { - Zero, - One([BlockID; 1]), - Two([BlockID; 2]), -} - -impl AsRef<[BlockID]> for BlockSuccessors { - fn as_ref(&self) -> &[BlockID] { - match self { - BlockSuccessors::Zero => &[], - BlockSuccessors::One(x) => x, - BlockSuccessors::Two(x) => x, - } - } -} - -/* - * On entering a parallel section, we need to specify how many threads to spawn - * and what the initial values of the reduction variables are. - */ -#[derive(Debug, Clone)] -pub struct ParallelEntry { - pub thread_counts: Box<[SValue]>, - pub reduce_inits: Box<[SValue]>, -} - -/* - * On exiting a reduce section, we need to specify which instructions in the - * reduce block correspond to what reduction variables. This also specifies - * which values defined inside the reduce block can be used outside the block. - */ -#[derive(Debug, Clone)] -pub struct ReduceExit { - pub reduce_reducts: Box<[SValue]>, -} - -/* - * The operator types are mostly the same. - */ -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum SUnaryOperator { - Not, - Neg, - Cast(SType), -} - -impl SUnaryOperator { - pub fn upper_case_name(&self) -> &'static str { - match self { - SUnaryOperator::Not => "Not", - SUnaryOperator::Neg => "Neg", - SUnaryOperator::Cast(_) => "Cast", - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum SBinaryOperator { - Add, - Sub, - Mul, - Div, - Rem, - LT, - LTE, - GT, - GTE, - EQ, - NE, - Or, - And, - Xor, - LSh, - RSh, -} - -impl SBinaryOperator { - pub fn upper_case_name(&self) -> &'static str { - match self { - SBinaryOperator::Add => "Add", - SBinaryOperator::Sub => "Sub", - SBinaryOperator::Mul => "Mul", - SBinaryOperator::Div => "Div", - SBinaryOperator::Rem => "Rem", - SBinaryOperator::LT => "LT", - SBinaryOperator::LTE => "LTE", - SBinaryOperator::GT => "GT", - SBinaryOperator::GTE => "GTE", - SBinaryOperator::EQ => "EQ", - SBinaryOperator::NE => "NE", - SBinaryOperator::Or => "Or", - SBinaryOperator::And => "And", - SBinaryOperator::Xor => "Xor", - SBinaryOperator::LSh => "LSh", - SBinaryOperator::RSh => "RSh", - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum STernaryOperator { - Select, -} - -impl STernaryOperator { - pub fn upper_case_name(&self) -> &'static str { - match self { - STernaryOperator::Select => "Select", - } - } -} - -pub type SFunctionName = String; - -define_id_type!(ArrayID); -define_id_type!(BlockID); -define_id_type!(ForkJoinID); - -define_dual_id_type!(InstID); diff --git a/hercules_cg/src/sched_schedule.rs b/hercules_cg/src/sched_schedule.rs deleted file mode 100644 index 4ba407f0e316e6611e311ff095bff6121a922c5a..0000000000000000000000000000000000000000 --- a/hercules_cg/src/sched_schedule.rs +++ /dev/null @@ -1,465 +0,0 @@ -extern crate hercules_ir; - -use std::collections::{HashMap, HashSet, VecDeque}; -use std::iter::{empty, once, zip}; - -use 
self::hercules_ir::*; - -use crate::*; - -/* - * Iterate over the uses of a instruction. - */ -pub fn sched_get_uses(inst: &SInst) -> Box<dyn Iterator<Item = &SValue> + '_> { - match inst { - SInst::Phi { inputs } => Box::new(inputs.iter().map(|(_, svalue)| svalue)), - SInst::ThreadID { - dimension: _, - fork_join: _, - } => Box::new(empty()), - SInst::ReductionVariable { number: _ } => Box::new(empty()), - SInst::Jump { - target: _, - parallel_entry, - reduce_exit, - } => { - let first = parallel_entry.as_ref().map(|parallel_entry| { - parallel_entry - .thread_counts - .iter() - .chain(parallel_entry.reduce_inits.iter()) - }); - let second = reduce_exit - .as_ref() - .map(|reduce_exit| reduce_exit.reduce_reducts.iter()); - match (first, second) { - (Some(first), Some(second)) => Box::new(first.chain(second)), - (Some(first), None) => Box::new(first), - (None, Some(second)) => Box::new(second), - (None, None) => Box::new(empty()), - } - } - SInst::Branch { - cond, - false_target: _, - true_target: _, - } => Box::new(once(cond)), - SInst::PartitionExit { data_outputs } => Box::new(data_outputs.iter()), - SInst::Return { value } => Box::new(once(value)), - SInst::Unary { input, op: _ } => Box::new(once(input)), - SInst::Binary { left, right, op: _ } => Box::new(once(left).chain(once(right))), - SInst::Ternary { - first, - second, - third, - op: _, - } => Box::new(once(first).chain(once(second)).chain(once(third))), - SInst::IntrinsicCall { intrinsic: _, args } => Box::new(args.iter()), - SInst::ProductExtract { - product, - indices: _, - } => Box::new(once(product)), - SInst::ProductInsert { - product, - data, - indices: _, - } => Box::new(once(product).chain(once(data))), - SInst::ArrayLoad { - array, - position, - bounds, - } => Box::new(once(array).chain(position.iter()).chain(bounds.iter())), - SInst::ArrayStore { - array, - value, - position, - bounds, - } => Box::new( - once(array) - .chain(once(value)) - .chain(position.iter()) - .chain(bounds.iter()), - ), - } -} - -/* - * Map virtual registers to corresponding instruction IDs. - */ -pub fn sched_virt_reg_to_inst_id(function: &SFunction) -> HashMap<usize, InstID> { - let mut virt_reg_to_inst_id = HashMap::new(); - for block_idx in 0..function.blocks.len() { - let block = &function.blocks[block_idx]; - for inst_idx in 0..block.insts.len() { - let virt_reg = block.virt_regs[inst_idx].0; - let inst_id = InstID::new(block_idx, inst_idx); - virt_reg_to_inst_id.insert(virt_reg, inst_id); - } - } - virt_reg_to_inst_id -} - -/* - * Build a dependency graph of instructions in an SFunction. - */ -pub fn sched_dependence_graph( - function: &SFunction, - virt_reg_to_inst_id: &HashMap<usize, InstID>, -) -> HashMap<InstID, Vec<InstID>> { - let mut dep_graph = HashMap::new(); - for inst_id in virt_reg_to_inst_id.values() { - dep_graph.insert(*inst_id, vec![]); - } - - // Process the dependencies in each block. This includes inter-block - // dependencies for normal def-use edges. - for block_idx in 0..function.blocks.len() { - let block = &function.blocks[block_idx]; - - // Add normal dependencies. - for inst_idx in 0..block.insts.len() { - let inst_id = InstID::new(block_idx, inst_idx); - let inst = &block.insts[inst_idx]; - for use_sval in sched_get_uses(inst) { - if let SValue::VirtualRegister(virt_reg) = use_sval { - // Uses of parameters don't correspond to any instruction we - // need to depend on. 
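// A minimal sketch of the def-use dependence edges built here: every
// instruction's virtual-register uses are looked up in a map from virtual
// register to defining instruction, and uses with no entry (function
// parameters) simply contribute no edge. Instruction ids and uses are plain
// numbers in this sketch rather than the real InstID / SValue types.
use std::collections::HashMap;

fn dependence_edges(
    defs: &[(usize, usize)],      // (instruction id, virtual register it defines)
    uses: &[(usize, Vec<usize>)], // (instruction id, virtual registers it reads)
) -> HashMap<usize, Vec<usize>> {
    let virt_reg_to_inst: HashMap<usize, usize> =
        defs.iter().map(|(inst, vr)| (*vr, *inst)).collect();
    let mut dep_graph: HashMap<usize, Vec<usize>> = HashMap::new();
    for (inst, read_regs) in uses {
        let deps = dep_graph.entry(*inst).or_default();
        for vr in read_regs {
            // Parameters have no defining instruction and add no dependency.
            if let Some(def_inst) = virt_reg_to_inst.get(vr) {
                if !deps.contains(def_inst) {
                    deps.push(*def_inst);
                }
            }
        }
    }
    dep_graph
}

fn main() {
    // Virtual registers 0 and 1 are parameters; instruction 10 defines vr 2.
    let defs = [(10, 2)];
    let uses = [(10, vec![0, 1]), (11, vec![2, 0])];
    let graph = dependence_edges(&defs, &uses);
    assert_eq!(graph[&10], Vec::<usize>::new());
    assert_eq!(graph[&11], vec![10]);
}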
- if let Some(use_id) = virt_reg_to_inst_id.get(virt_reg) { - let deps = dep_graph.get_mut(&inst_id).unwrap(); - if !deps.contains(use_id) { - deps.push(*use_id); - } - } - } - } - } - - // Phis should appear at the top of linear basic blocks. - - // Add dependencies between the phis. - let mut last_phi = None; - for inst_idx in 0..block.insts.len() { - let inst_id = InstID::new(block_idx, inst_idx); - let inst = &block.insts[inst_idx]; - if inst.is_phi() { - if let Some(last_phi) = last_phi { - let deps = dep_graph.get_mut(&inst_id).unwrap(); - if !deps.contains(&last_phi) { - deps.push(last_phi); - } - } - last_phi = Some(inst_id); - } - } - - // If there is at least one phi, add a dependency between the "last" phi - // and every non-phi instruction with no dependencies yet. - if let Some(last_phi) = last_phi { - for inst_idx in 0..block.insts.len() { - let inst_id = InstID::new(block_idx, inst_idx); - let inst = &block.insts[inst_idx]; - if !inst.is_phi() { - let deps = dep_graph.get_mut(&inst_id).unwrap(); - if deps.is_empty() { - deps.push(last_phi); - } - } - } - } - - // Terminator instructions appear at the bottom of linear basic blocks. - - // Find every non-terminator instruction with no users. - let mut no_user_insts = (0..block.insts.len()) - .filter(|inst_idx| !block.insts[*inst_idx].is_terminator()) - .map(|inst_idx| InstID::new(block_idx, inst_idx)) - .collect::<HashSet<_>>(); - for inst_idx in 0..block.insts.len() { - let inst_id = InstID::new(block_idx, inst_idx); - for dep in dep_graph[&inst_id].iter() { - no_user_insts.remove(dep); - } - } - - // Add a dependency between each instruction with no users (previously) - // and each terminator instruction. - for inst_idx in 0..block.insts.len() { - let inst_id = InstID::new(block_idx, inst_idx); - let inst = &block.insts[inst_idx]; - if inst.is_terminator() { - let deps = dep_graph.get_mut(&inst_id).unwrap(); - for no_user_inst in no_user_insts.iter() { - if !deps.contains(no_user_inst) { - deps.push(*no_user_inst); - } - } - } - } - } - - dep_graph -} - -/* - * Assemble a map from SValue to SType. - */ -pub fn sched_svalue_types(function: &SFunction) -> HashMap<SValue, SType> { - let mut result = HashMap::new(); - - // Add types of parameters. - for (param_idx, param_ty) in function.param_types.iter().enumerate() { - result.insert(SValue::VirtualRegister(param_idx), param_ty.clone()); - } - - // Add types of instructions and constants. - for block in function.blocks.iter() { - for (inst, (virt_reg, sty)) in zip(block.insts.iter(), block.virt_regs.iter()) { - // Add the type of the output of the instruction. - result.insert(SValue::VirtualRegister(*virt_reg), sty.clone()); - - // Find constants inthe uses of instructions. - for u in sched_get_uses(inst) { - if let SValue::Constant(cons) = u { - result.insert(u.clone(), cons.get_type()); - } - } - } - } - - result -} - -/* - * Analysis information for one fork-join. - */ -#[derive(Debug)] -pub struct ParallelReduceInfo { - // The block that jumps into the parallel section. - pub predecessor: BlockID, - // The block that is jumped into after the reduce section. - pub successor: BlockID, - - // The first parallel block in the parallel section. - pub top_parallel_block: BlockID, - // The parallel block that jumps to the reduce section. - pub bottom_parallel_block: BlockID, - // The single block in the reduce section. - pub reduce_block: BlockID, - - // The thread count SValues used for this fork-join. 
- pub thread_counts: Box<[SValue]>, - // The initial SValues for the reduction variables. - pub reduce_inits: Box<[SValue]>, - // The reduct SValues for the reduction variables. - pub reduce_reducts: Box<[SValue]>, - - // Map from thread ID dimension to virtual registers of corresponding thread - // ID instructions. - pub thread_ids: HashMap<usize, Vec<usize>>, - // Map from reduction variable number to virtual register of the - // corresponding reduction variable instruction. - pub reduction_variables: HashMap<usize, usize>, - - // If this parallel-reduce section is inside another parallel-reduce, store - // the parent's ForkJoinID. Parallel-reduce sections in an SFunction form a - // forest. - pub parent_fork_join_id: Option<ForkJoinID>, - - // Information about how this fork-join should be scheduled. Collecting this - // info here just makes writing the backends more convenient. - pub vector_width: Option<usize>, - // For each reduction variable, track if its associative or parallel - // individually. - pub associative_reduce: HashMap<usize, bool>, - pub parallel_reduce: HashMap<usize, bool>, - // Track if this is a "top-level" parallel-reduce. That is, the parallel- - // reduce is the "only thing" inside this partition function. Only these - // parallel-reduces can be parallelized on the CPU, even if this parallel- - // reduce has a parallel schedule on the entry jump. - pub top_level: bool, -} - -/* - * Analyze parallel-reduce sections to make lowering them easier. Returns a map - * from ForkJoinID to information about that parallel-reduce section. - */ -pub fn sched_parallel_reduce_sections( - function: &SFunction, -) -> HashMap<ForkJoinID, ParallelReduceInfo> { - let mut result = HashMap::new(); - - for (block_idx, block) in function.blocks.iter().enumerate() { - // Start by identifying a jump into a parallel section. - for (inst_idx, inst) in block.insts.iter().enumerate() { - if let SInst::Jump { - target, - parallel_entry, - reduce_exit: _, - } = inst - && let Some(parallel_entry) = parallel_entry - { - let predecessor = BlockID::new(block_idx); - let ParallelEntry { - thread_counts, - reduce_inits, - } = parallel_entry.clone(); - let vector_width = block.schedules[&inst_idx] - .iter() - .filter_map(|schedule| schedule.try_vectorizable()) - .next(); - - // The jump target is the top of the parallel section. Get the - // fork-join ID from that block. - let top_parallel_block = *target; - let fork_join_id = function.blocks[top_parallel_block.idx()] - .kind - .try_parallel() - .unwrap(); - - // Traverse the blocks until finding a jump to the corresponding - // reduce block. - let mut queue = VecDeque::from(vec![top_parallel_block]); - let mut visited = HashSet::new(); - visited.insert(top_parallel_block); - let mut bfs_dest = None; - while let Some(bfs) = queue.pop_front() { - for succ in function.blocks[bfs.idx()].successors().as_ref() { - if let Some(reduce_fork_join_id) = - function.blocks[succ.idx()].kind.try_reduce() - && reduce_fork_join_id == fork_join_id - { - bfs_dest = Some((bfs, *succ)); - } else if !visited.contains(succ) { - queue.push_back(*succ); - visited.insert(*succ); - } - } - } - let (bottom_parallel_block, reduce_block) = bfs_dest.unwrap(); - - // Find the jump out of the reduce block. - let (successor, _, reduce_exit) = function.blocks[reduce_block.idx()] - .insts - .iter() - .filter_map(|inst| inst.try_jump()) - .next() - .unwrap(); - let reduce_reducts = reduce_exit.unwrap().reduce_reducts.clone(); - - // Find the thread ID instructions. 
- let mut thread_ids = (0..thread_counts.len()) - .map(|dim| (dim, vec![])) - .collect::<HashMap<usize, Vec<usize>>>(); - for parallel_block in visited { - for (inst, (virt_reg, _)) in zip( - function.blocks[parallel_block.idx()].insts.iter(), - function.blocks[parallel_block.idx()].virt_regs.iter(), - ) { - if let Some((dim, tid_fork_join)) = inst.try_thread_id() - && tid_fork_join == fork_join_id - { - thread_ids.get_mut(&dim).unwrap().push(*virt_reg); - } - } - } - - // Find the reduction variable instructions. - let mut associative_reduce = HashMap::new(); - let mut parallel_reduce = HashMap::new(); - let reduce_sblock = &function.blocks[reduce_block.idx()]; - let reduction_variables = zip( - reduce_sblock.insts.iter().enumerate(), - reduce_sblock.virt_regs.iter(), - ) - .filter_map(|((inst_idx, inst), (virt_reg, _))| { - inst.try_reduction_variable().map(|number| { - let schedules = &reduce_sblock.schedules[&inst_idx]; - associative_reduce - .insert(number, schedules.contains(&SSchedule::Associative)); - parallel_reduce - .insert(number, schedules.contains(&SSchedule::ParallelReduce)); - (number, *virt_reg) - }) - }) - .collect(); - - // Assemble all of the info and add it to the map. - let info = ParallelReduceInfo { - predecessor, - successor, - - top_parallel_block, - bottom_parallel_block, - reduce_block, - - thread_counts, - reduce_inits, - reduce_reducts, - - thread_ids, - reduction_variables, - - parent_fork_join_id: None, - vector_width, - associative_reduce, - parallel_reduce, - - top_level: false, - }; - result.insert(fork_join_id, info); - } - } - } - - // Figure out if any parallel-reduces are top level - that is, they are the - // "only thing" in the partition function. - for (_, parallel_reduce_info) in result.iter_mut() { - // A parallel-reduce is top-level if its predecessor is the entry block - // containing only a jump and its successor is an exit block containing - // just a function terminator. - let pred_block = &function.blocks[parallel_reduce_info.predecessor.idx()]; - let succ_block = &function.blocks[parallel_reduce_info.successor.idx()]; - if parallel_reduce_info.predecessor == BlockID::new(0) - && pred_block.insts.len() == 1 - && pred_block.insts[0].is_jump() - && succ_block.insts.len() == 1 - && (succ_block.insts[0].is_partition_exit() || succ_block.insts[0].is_return()) - { - parallel_reduce_info.top_level = true; - } - } - - // Compute the parallel-reduce forest last, since this requires some info we - // just computed above. - let mut parents = HashMap::new(); - for (fork_join_id, parallel_reduce_info) in result.iter() { - let mut pred_block = parallel_reduce_info.predecessor; - - // Keep looking at predecessors of adjacent parallel-reduce sections - // until one belongs to a parent parallel-reduce or is sequential, so - // this parallel-reduce is a root. - let parent = loop { - match function.blocks[pred_block.idx()].kind { - // If the predecessor is sequential, then this parallel-reduce - // is a root. - SBlockKind::Sequential => break None, - // If the predecessor is parallel, then this parallel-reduce is - // inside that parallel-reduce. - SBlockKind::Parallel(parent) => break Some(parent), - // If the predecessor is reduce, then that parallel-reduce is a - // child of the same parent. Iterate on its predecessor. - SBlockKind::Reduce(adjacent) => { - pred_block = result[&adjacent].predecessor; - } - } - }; - parents.insert(*fork_join_id, parent); - } - - // Insert the information into the parallel reduce info map. 
- for (fork_join_id, parallel_reduce_info) in result.iter_mut() { - parallel_reduce_info.parent_fork_join_id = parents[fork_join_id]; - } - - result -} diff --git a/hercules_ir/src/callgraph.rs b/hercules_ir/src/callgraph.rs index 84be922dea8a89732bf2f2ad0d9fe3f3865d5d90..3a8e6316f8213b2c665e2ad34034d070a495c0cb 100644 --- a/hercules_ir/src/callgraph.rs +++ b/hercules_ir/src/callgraph.rs @@ -43,6 +43,37 @@ impl CallGraph { pub fn num_functions(&self) -> usize { self.first_callees.len() } + + pub fn topo(&self) -> Vec<FunctionID> { + let mut num_calls: Vec<usize> = (0..self.num_functions()) + .map(|idx| self.num_callees(FunctionID::new(idx))) + .collect(); + let mut no_calls_stack: Vec<FunctionID> = num_calls + .iter() + .enumerate() + .filter(|(_, num)| **num == 0) + .map(|(idx, _)| FunctionID::new(idx)) + .collect(); + let mut topo = vec![]; + while let Some(no_call_func) = no_calls_stack.pop() { + topo.push(no_call_func); + for caller in self.get_callers(no_call_func) { + num_calls[caller.idx()] -= 1; + if num_calls[caller.idx()] == 0 { + no_calls_stack.push(*caller); + } + } + } + + // Mutual recursion is not currently supported, so assert that a + // topological sort exists. + assert_eq!( + topo.len(), + self.num_functions(), + "PANIC: Found mutual recursion in Hercules IR." + ); + topo + } } /* diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs index 8c36c8ad31ca1e196e8a13778239d6f2a83080d4..d23a49729e1b76c46dc85b6d850d56e8914dd213 100644 --- a/hercules_ir/src/dot.rs +++ b/hercules_ir/src/dot.rs @@ -42,9 +42,10 @@ pub fn xdot_module( file.write_all(contents.as_bytes()) .expect("PANIC: Unable to write output file contents."); Command::new("xdot") - .args([tmp_path]) + .args([&tmp_path]) .output() .expect("PANIC: Couldn't execute xdot. Is xdot installed?"); + println!("Graphviz written to: {}", tmp_path.display()); } /* diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 77a7ed3c971ecc68ebeb3393dc818d20013fb978..cef8e43ae9b6448a9c0239df12a9d94976b0b751 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -545,6 +545,58 @@ pub fn constants_bottom_up(constants: &Vec<Constant>) -> impl Iterator<Item = Co } } +/* + * Create an iterator that traverses all the dynamic constants in the module + * bottom up. This uses a coroutine to make iteratively traversing the dynamic + * constant DAGs easier. + */ +pub fn dynamic_constants_bottom_up( + dynamic_constants: &Vec<DynamicConstant>, +) -> impl Iterator<Item = DynamicConstantID> + '_ { + let mut visited = bitvec![u8, Lsb0; 0; dynamic_constants.len()]; + let mut stack = (0..dynamic_constants.len()) + .map(DynamicConstantID::new) + .collect::<Vec<DynamicConstantID>>(); + let coroutine = #[coroutine] + move || { + // Since this is a coroutine, handle recursion manually. + while let Some(id) = stack.pop() { + if visited[id.idx()] { + continue; + } + match dynamic_constants[id.idx()] { + DynamicConstant::Add(left, right) + | DynamicConstant::Sub(left, right) + | DynamicConstant::Mul(left, right) + | DynamicConstant::Div(left, right) + | DynamicConstant::Rem(left, right) => { + // We have to yield the children of this node before + // this node itself. We keep track of which nodes have + // yielded using visited. + let can_yield = visited[left.idx()] && visited[right.idx()]; + if can_yield { + visited.set(id.idx(), true); + yield id; + } else { + // Push ourselves, then children, so that children + // get popped first. 
+ stack.push(id); + stack.push(left); + stack.push(right); + } + } + _ => { + visited.set(id.idx(), true); + yield id; + } + } + } + }; + CoroutineIterator { + coroutine: Box::new(coroutine), + } +} + struct CoroutineIterator<G, I> where G: Coroutine<Yield = I, Return = ()> + Unpin, @@ -671,6 +723,16 @@ impl Type { } } + pub fn is_signed(&self) -> bool { + match self { + Type::Integer8 => true, + Type::Integer16 => true, + Type::Integer32 => true, + Type::Integer64 => true, + _ => false, + } + } + pub fn is_fixed(&self) -> bool { match self { Type::Integer8 => true, @@ -685,6 +747,10 @@ impl Type { } } + pub fn is_integer(&self) -> bool { + self.is_fixed() || self.is_bool() + } + pub fn is_float(&self) -> bool { match self { Type::Float32 => true, @@ -740,6 +806,17 @@ impl Type { None } } + + pub fn num_bits(&self) -> u8 { + match self { + Type::Boolean => 1, + Type::Integer8 | Type::UnsignedInteger8 => 8, + Type::Integer16 | Type::UnsignedInteger16 => 16, + Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 32, + Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 64, + _ => panic!(), + } + } } impl Constant { @@ -774,7 +851,7 @@ impl Constant { } } - pub fn is_strictly_scalar(&self) -> bool { + pub fn is_scalar(&self) -> bool { match self { Constant::Boolean(_) => true, Constant::Integer8(_) => true, @@ -1013,6 +1090,8 @@ impl Node { } ); + define_pattern_predicate!(is_undef, Node::Undef { ty: _ }); + pub fn try_region(&self) -> Option<&[NodeID]> { if let Node::Region { preds } = self { Some(preds) @@ -1090,6 +1169,14 @@ impl Node { } } + pub fn try_parameter(&self) -> Option<usize> { + if let Node::Parameter { index } = self { + Some(*index) + } else { + None + } + } + pub fn try_constant(&self) -> Option<ConstantID> { if let Node::Constant { id } = self { Some(*id) @@ -1137,6 +1224,21 @@ impl Node { } } + pub fn try_ternary(&self, bop: TernaryOperator) -> Option<(NodeID, NodeID, NodeID)> { + if let Node::Ternary { + first, + second, + third, + op, + } = self + && *op == bop + { + Some((*first, *second, *third)) + } else { + None + } + } + pub fn is_zero_constant(&self, constants: &Vec<Constant>) -> bool { if let Node::Constant { id } = self && constants[id.idx()].is_zero() diff --git a/hercules_ir/src/loops.rs b/hercules_ir/src/loops.rs index 5aa6bd19a65f842ab19ac855066ce894e0e568f8..7c9a0a85949efcc248439031601b2fed17f0acf6 100644 --- a/hercules_ir/src/loops.rs +++ b/hercules_ir/src/loops.rs @@ -1,8 +1,8 @@ extern crate bitvec; use std::collections::hash_map; -use std::collections::HashMap; use std::collections::VecDeque; +use std::collections::{HashMap, HashSet}; use self::bitvec::prelude::*; @@ -37,6 +37,14 @@ impl LoopTree { self.loops.iter() } + pub fn nodes_in_loop(&self, header: NodeID) -> impl Iterator<Item = NodeID> + '_ { + self.loops[&header].0.iter_ones().map(NodeID::new) + } + + pub fn is_in_loop(&self, header: NodeID, is_in: NodeID) -> bool { + header == self.root || self.loops[&header].0[is_in.idx()] + } + /* * Sometimes, we need to iterate the loop tree bottom-up. Just assemble the * order upfront. @@ -195,3 +203,98 @@ fn loop_reachability_helper( visited } } + +/* + * Top level function to calculate reduce cycles. Returns for each reduce node + * what other nodes form a cycle with that reduce node. 
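+ * A node belongs to a reduce's cycle when it is both reachable from the
+ * reduce's `reduct` input through uses and reachable from the reduce itself
+ * through users.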
+ */
+pub fn reduce_cycles(
+    function: &Function,
+    def_use: &ImmutableDefUseMap,
+) -> HashMap<NodeID, HashSet<NodeID>> {
+    let reduces = (0..function.nodes.len())
+        .filter(|idx| function.nodes[*idx].is_reduce())
+        .map(NodeID::new);
+    let mut result = HashMap::new();
+
+    for reduce in reduces {
+        let (_, _, reduct) = function.nodes[reduce.idx()].try_reduce().unwrap();
+
+        // First, find all data nodes that are used by the `reduct` input of the
+        // reduce, including the `reduct` itself.
+        let mut use_reachable = HashSet::new();
+        use_reachable.insert(reduct);
+        let mut worklist = vec![reduct];
+        while let Some(item) = worklist.pop() {
+            for u in get_uses(&function.nodes[item.idx()]).as_ref() {
+                if !function.nodes[u.idx()].is_control() && !use_reachable.contains(u) {
+                    use_reachable.insert(*u);
+                    worklist.push(*u);
+                }
+            }
+        }
+
+        // Second, find all data nodes that are users of the reduce node.
+        let mut user_reachable = HashSet::new();
+        let mut worklist = vec![reduce];
+        while let Some(item) = worklist.pop() {
+            for u in def_use.get_users(item) {
+                if !function.nodes[u.idx()].is_control() && !user_reachable.contains(u) {
+                    user_reachable.insert(*u);
+                    worklist.push(*u);
+                }
+            }
+        }
+
+        // Nodes that are both use-reachable and user-reachable by the reduce
+        // node are in the reduce node's cycle.
+        result.insert(
+            reduce,
+            use_reachable
+                .intersection(&user_reachable)
+                .map(|id| *id)
+                .collect(),
+        );
+    }
+
+    result
+}
+
+/*
+ * Top level function to calculate which data nodes are "inside" a fork-join,
+ * not including its reduces.
+ */
+pub fn data_nodes_in_fork_joins(
+    function: &Function,
+    def_use: &ImmutableDefUseMap,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+) -> HashMap<NodeID, HashSet<NodeID>> {
+    let mut result = HashMap::new();
+
+    for (fork, join) in fork_join_map {
+        let mut worklist = vec![*fork];
+        let mut set = HashSet::new();
+
+        while let Some(item) = worklist.pop() {
+            for u in def_use.get_users(item) {
+                if function.nodes[u.idx()].is_control()
+                    || function.nodes[u.idx()]
+                        .try_reduce()
+                        .map(|(control, _, _)| control == *join)
+                        .unwrap_or(false)
+                {
+                    // Ignore control users and reduces of the fork-join.
+                    continue;
+                }
+                if !set.contains(u) {
+                    set.insert(*u);
+                    worklist.push(*u);
+                }
+            }
+        }
+
+        result.insert(*fork, set);
+    }
+
+    result
+}
diff --git a/hercules_opt/src/ccp.rs b/hercules_opt/src/ccp.rs
index 1b28db47ac4c59a943ce58aed54e764c5c1ae085..aa3d0e680bfa5ea7b1553e5b31e34e049fcbb600 100644
--- a/hercules_opt/src/ccp.rs
+++ b/hercules_opt/src/ccp.rs
@@ -4,7 +4,6 @@ use std::collections::HashSet;
 use std::iter::zip;
 
 use self::hercules_ir::dataflow::*;
-use self::hercules_ir::def_use::*;
 use self::hercules_ir::ir::*;
 
 use crate::*;
@@ -442,12 +441,18 @@ fn ccp_flow_function(
                 if inputs[reduct.idx()].is_reachable() {
                     constant = ConstantLattice::meet(&constant, &inputs[reduct.idx()].constant);
                 }
-                CCPLattice { reachability, constant }
+                CCPLattice {
+                    reachability,
+                    constant,
+                }
             } else {
-                CCPLattice { reachability, constant: ConstantLattice::top() }
+                CCPLattice {
+                    reachability,
+                    constant: ConstantLattice::top(),
+                }
             }
-        },
-        Node::Return { control, data } => inputs[control.idx()].clone(),
+        }
+        Node::Return { control, data: _ } => inputs[control.idx()].clone(),
         Node::Parameter { index: _ } => CCPLattice::bottom(),

        // A constant node is the "source" of concrete constant lattice values.
Node::Constant { id } => CCPLattice { @@ -861,10 +866,7 @@ fn ccp_flow_function( constant: new_constant, } } - Node::Read { - collect, - indices, - } => { + Node::Read { collect, indices } => { let mut reachability = inputs[collect.idx()].reachability.clone(); for index in indices.iter() { if let Index::Position(positions) = index { diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs index 7f9c9ba2077a6fd343751df812f30da3ea55b2ee..0ff58822180575979430cb98018ca6dc128ae3d9 100644 --- a/hercules_opt/src/editor.rs +++ b/hercules_opt/src/editor.rs @@ -218,6 +218,10 @@ impl<'a: 'b, 'b> FunctionEditor<'a> { &self.function } + pub fn get_dynamic_constants(&self) -> Ref<'_, Vec<DynamicConstant>> { + self.dynamic_constants.borrow() + } + pub fn get_users(&self, id: NodeID) -> impl ExactSizeIterator<Item = NodeID> + '_ { self.mut_def_use[id.idx()].iter().map(|x| *x) } @@ -276,8 +280,12 @@ impl<'a, 'b> FunctionEdit<'a, 'b> { self.editor.dynamic_constants.borrow().len() + self.added_dynamic_constants.len() } + pub fn num_node_ids(&self) -> usize { + self.editor.function.nodes.len() + self.added_nodeids.len() + } + pub fn add_node(&mut self, node: Node) -> NodeID { - let id = NodeID::new(self.editor.function.nodes.len() + self.added_nodeids.len()); + let id = NodeID::new(self.num_node_ids()); // Added nodes need to have an entry in the def-use map. self.updated_def_use.insert(id, HashSet::new()); // Added nodes use other nodes, and we need to update their def-use diff --git a/hercules_opt/src/fork_concat_split.rs b/hercules_opt/src/fork_concat_split.rs new file mode 100644 index 0000000000000000000000000000000000000000..df3652dfe4be2454161fe75d80235f157ce27786 --- /dev/null +++ b/hercules_opt/src/fork_concat_split.rs @@ -0,0 +1,137 @@ +extern crate hercules_ir; + +use std::collections::{HashMap, HashSet}; +use std::iter::zip; + +use self::hercules_ir::ir::*; + +use crate::*; + +/* + * Split multi-dimensional fork-joins into separate one-dimensional fork-joins. + * Useful for code generation. + */ +pub fn fork_split( + editor: &mut FunctionEditor, + fork_join_map: &HashMap<NodeID, NodeID>, + reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>, +) { + // A single multi-dimensional fork becomes multiple forks, a join becomes + // multiple joins, a thread ID becomes a thread ID on the correct + // fork, and a reduce becomes multiple reduces to shuffle the reduction + // value through the fork-join nest. + for (fork, join) in fork_join_map { + let nodes = &editor.func().nodes; + let (fork_control, factors) = nodes[fork.idx()].try_fork().unwrap(); + if factors.len() < 2 { + continue; + } + let factors: Box<[DynamicConstantID]> = factors.into(); + let join_control = nodes[join.idx()].try_join().unwrap(); + let tids: Vec<_> = editor + .get_users(*fork) + .filter(|id| nodes[id.idx()].is_thread_id()) + .collect(); + let reduces: Vec<_> = editor + .get_users(*join) + .filter(|id| nodes[id.idx()].is_reduce()) + .collect(); + + let data_in_reduce_cycle: HashSet<(NodeID, NodeID)> = reduces + .iter() + .map(|reduce| editor.get_users(*reduce).map(move |user| (user, *reduce))) + .flatten() + .filter(|(user, reduce)| reduce_cycles[&reduce].contains(&user)) + .collect(); + + editor.edit(|mut edit| { + // Create the forks and a thread ID per fork. 
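+            // Each factor of the original multi-dimensional fork becomes its
+            // own one-dimensional fork, chained off the previous one.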
+ let mut acc_fork = fork_control; + let mut new_tids = vec![]; + for factor in factors { + acc_fork = edit.add_node(Node::Fork { + control: acc_fork, + factors: Box::new([factor]), + }); + new_tids.push(edit.add_node(Node::ThreadID { + control: acc_fork, + dimension: 0, + })); + } + + // Create the joins. + let mut acc_join = if join_control == *fork { + acc_fork + } else { + join_control + }; + let mut joins = vec![]; + for _ in new_tids.iter() { + acc_join = edit.add_node(Node::Join { control: acc_join }); + joins.push(acc_join); + } + + // Create the reduces. + let mut new_reduces = vec![]; + for reduce in reduces.iter() { + let (_, init, reduct) = edit.get_node(*reduce).try_reduce().unwrap(); + let num_nodes = edit.num_node_ids(); + let mut inner_reduce = NodeID::new(0); + let mut outer_reduce = NodeID::new(0); + for (join_idx, join) in joins.iter().enumerate() { + let init = if join_idx == joins.len() - 1 { + init + } else { + NodeID::new(num_nodes + join_idx + 1) + }; + let reduct = if join_idx == 0 { + reduct + } else { + NodeID::new(num_nodes + join_idx - 1) + }; + let reduce = edit.add_node(Node::Reduce { + control: *join, + init, + reduct, + }); + assert_eq!(reduce, NodeID::new(num_nodes + join_idx)); + if join_idx == 0 { + inner_reduce = reduce; + } + if join_idx == joins.len() - 1 { + outer_reduce = reduce; + } + } + new_reduces.push((inner_reduce, outer_reduce)); + } + + // Replace everything. + edit = edit.replace_all_uses(*fork, acc_fork)?; + edit = edit.replace_all_uses(*join, acc_join)?; + for tid in tids.iter() { + let dim = edit.get_node(*tid).try_thread_id().unwrap().1; + edit = edit.replace_all_uses(*tid, new_tids[dim])?; + } + for (reduce, (inner_reduce, outer_reduce)) in zip(reduces.iter(), new_reduces) { + edit = edit.replace_all_uses_where(*reduce, inner_reduce, |id| { + data_in_reduce_cycle.contains(&(*id, *reduce)) + })?; + edit = edit.replace_all_uses_where(*reduce, outer_reduce, |id| { + !data_in_reduce_cycle.contains(&(*id, *reduce)) + })?; + } + + // Delete all the old stuff. + edit = edit.delete_node(*fork)?; + edit = edit.delete_node(*join)?; + for tid in tids { + edit = edit.delete_node(tid)?; + } + for reduce in reduces { + edit = edit.delete_node(reduce)?; + } + + Ok(edit) + }); + } +} diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs index 425fe315fdc6dcb2de22c60db16fb42ec9a3f273..6b9e006d489863dea79381f69528ec5cfe4741d8 100644 --- a/hercules_opt/src/inline.rs +++ b/hercules_opt/src/inline.rs @@ -20,32 +20,8 @@ pub fn inline( mut plans: Option<&mut Vec<Plan>>, ) { // Step 1: run topological sort on the call graph to inline the "deepest" - // function first. Mutual recursion is not currently supported, so assert - // that a topological sort exists. - let mut num_calls: Vec<usize> = (0..editors.len()) - .map(|idx| callgraph.num_callees(FunctionID::new(idx))) - .collect(); - let mut no_calls_stack: Vec<FunctionID> = num_calls - .iter() - .enumerate() - .filter(|(_, num)| **num == 0) - .map(|(idx, _)| FunctionID::new(idx)) - .collect(); - let mut topo = vec![]; - while let Some(no_call_func) = no_calls_stack.pop() { - topo.push(no_call_func); - for caller in callgraph.get_callers(no_call_func) { - num_calls[caller.idx()] -= 1; - if num_calls[caller.idx()] == 0 { - no_calls_stack.push(*caller); - } - } - } - assert_eq!( - topo.len(), - editors.len(), - "PANIC: Found mutual recursion in Hercules IR." - ); + // function first. + let topo = callgraph.topo(); // Step 2: make sure each function has a single return node. 
If an edit // failed to make a function have a single return node, then we can't inline diff --git a/hercules_opt/src/lib.rs b/hercules_opt/src/lib.rs index a01f901b1b2b1aceaf05eacfe7fe7207a553a182..5a429e14c15ef842e5bcf475d2a78a139fb1850b 100644 --- a/hercules_opt/src/lib.rs +++ b/hercules_opt/src/lib.rs @@ -4,6 +4,7 @@ pub mod ccp; pub mod dce; pub mod delete_uncalled; pub mod editor; +pub mod fork_concat_split; pub mod fork_guard_elim; pub mod forkify; pub mod gvn; @@ -14,12 +15,14 @@ pub mod pass; pub mod phi_elim; pub mod pred; pub mod sroa; +pub mod unforkify; pub mod utils; pub use crate::ccp::*; pub use crate::dce::*; pub use crate::delete_uncalled::*; pub use crate::editor::*; +pub use crate::fork_concat_split::*; pub use crate::fork_guard_elim::*; pub use crate::forkify::*; pub use crate::gvn::*; @@ -30,4 +33,5 @@ pub use crate::pass::*; pub use crate::phi_elim::*; pub use crate::pred::*; pub use crate::sroa::*; +pub use crate::unforkify::*; pub use crate::utils::*; diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs index ee240846d9ea8df356e2ffe27dcf827636df9366..eb8d386c0706039a254c69810e249ef363d3060f 100644 --- a/hercules_opt/src/outline.rs +++ b/hercules_opt/src/outline.rs @@ -558,8 +558,9 @@ pub fn outline( } /* - * Just outlines all of a function except the entry and return. Minimum work - * needed to cause runtime Rust code to be generated as necessary. + * Just outlines all of a function except the entry, return, and aggregate + * constants. This is the minimum work needed to cause runtime Rust code to be + * generated as necessary. */ pub fn dumb_outline( editor: &mut FunctionEditor, @@ -575,7 +576,11 @@ pub fn dumb_outline( .node_ids() .filter(|id| { let node = &editor.func().nodes[id.idx()]; - !(node.is_start() || node.is_parameter() || node.is_return()) + if let Node::Constant { id } = editor.func().nodes[id.idx()] { + editor.get_constant(id).is_scalar() + } else { + !(node.is_start() || node.is_parameter() || node.is_return()) + } }) .collect(); outline( diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index ccba355aa936cfb5629187e039d480178b6ba005..006bd371a52c500dd750fa48e28f02f0c42a719c 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -5,7 +5,7 @@ extern crate serde; extern crate take_mut; use std::cell::RefCell; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::env::temp_dir; use std::fs::File; use std::io::Write; @@ -27,24 +27,25 @@ pub enum Pass { DCE, CCP, GVN, - Forkify, PhiElim, + Forkify, ForkGuardElim, Predication, SROA, Inline, Outline, + InterproceduralSROA, + DeleteUncalled, + ForkSplit, + Unforkify, Verify, // Parameterized over whether analyses that aid visualization are necessary. // Useful to set to false if displaying a potentially broken module. Xdot(bool), - SchedXdot, // Parameterized over output directory and module name. Codegen(String, String), // Parameterized over where to serialize module to. Serialize(String), - InterproceduralSROA, - DeleteUncalled, } /* @@ -68,15 +69,14 @@ pub struct PassManager { pub fork_join_maps: Option<Vec<HashMap<NodeID, NodeID>>>, pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>, pub loops: Option<Vec<LoopTree>>, + pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub antideps: Option<Vec<Vec<(NodeID, NodeID)>>>, + pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub bbs: Option<Vec<Vec<NodeID>>>, pub callgraph: Option<CallGraph>, // Current plan. 
pub plans: Option<Vec<Plan>>, - - // Store the manifest of a compiled object. - pub manifests: Option<HashMap<String, Manifest>>, } impl PassManager { @@ -93,11 +93,12 @@ impl PassManager { fork_join_maps: None, fork_join_nests: None, loops: None, + reduce_cycles: None, antideps: None, + data_nodes_in_fork_joins: None, bbs: None, callgraph: None, plans: None, - manifests: None, } } @@ -225,6 +226,18 @@ impl PassManager { } } + pub fn make_reduce_cycles(&mut self) { + if self.reduce_cycles.is_none() { + self.make_def_uses(); + let def_uses = self.def_uses.as_ref().unwrap().iter(); + self.reduce_cycles = Some( + zip(self.module.functions.iter(), def_uses) + .map(|(function, def_use)| reduce_cycles(function, def_use)) + .collect(), + ); + } + } + pub fn make_antideps(&mut self) { if self.antideps.is_none() { self.make_def_uses(); @@ -239,6 +252,26 @@ impl PassManager { } } + pub fn make_data_nodes_in_fork_joins(&mut self) { + if self.data_nodes_in_fork_joins.is_none() { + self.make_def_uses(); + self.make_fork_join_maps(); + self.data_nodes_in_fork_joins = Some( + zip( + self.module.functions.iter(), + zip( + self.def_uses.as_ref().unwrap().iter(), + self.fork_join_maps.as_ref().unwrap().iter(), + ), + ) + .map(|(function, (def_use, fork_join_map))| { + data_nodes_in_fork_joins(function, def_use, fork_join_map) + }) + .collect(), + ); + } + } + pub fn make_bbs(&mut self) { if self.bbs.is_none() { self.make_def_uses(); @@ -804,6 +837,78 @@ impl PassManager { assert!(self.module.functions.len() > 0, "PANIC: There are no entry functions in the Hercules module being compiled, and they all got deleted by DeleteUncalled. Please mark at least one function as an entry!"); } + Pass::ForkSplit => { + self.make_def_uses(); + self.make_fork_join_maps(); + self.make_reduce_cycles(); + let def_uses = self.def_uses.as_ref().unwrap(); + let fork_join_maps = self.fork_join_maps.as_ref().unwrap(); + let reduce_cycles = self.reduce_cycles.as_ref().unwrap(); + for idx in 0..self.module.functions.len() { + let constants_ref = + RefCell::new(std::mem::take(&mut self.module.constants)); + let dynamic_constants_ref = + RefCell::new(std::mem::take(&mut self.module.dynamic_constants)); + let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); + let mut editor = FunctionEditor::new( + &mut self.module.functions[idx], + &constants_ref, + &dynamic_constants_ref, + &types_ref, + &def_uses[idx], + ); + fork_split(&mut editor, &fork_join_maps[idx], &reduce_cycles[idx]); + + self.module.constants = constants_ref.take(); + self.module.dynamic_constants = dynamic_constants_ref.take(); + self.module.types = types_ref.take(); + + let edits = &editor.edits(); + if let Some(plans) = self.plans.as_mut() { + repair_plan(&mut plans[idx], &self.module.functions[idx], edits); + } + let grave_mapping = self.module.functions[idx].delete_gravestones(); + if let Some(plans) = self.plans.as_mut() { + plans[idx].fix_gravestones(&grave_mapping); + } + } + self.clear_analyses(); + } + Pass::Unforkify => { + self.make_def_uses(); + self.make_fork_join_maps(); + let def_uses = self.def_uses.as_ref().unwrap(); + let fork_join_maps = self.fork_join_maps.as_ref().unwrap(); + for idx in 0..self.module.functions.len() { + let constants_ref = + RefCell::new(std::mem::take(&mut self.module.constants)); + let dynamic_constants_ref = + RefCell::new(std::mem::take(&mut self.module.dynamic_constants)); + let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); + let mut editor = FunctionEditor::new( + &mut 
self.module.functions[idx], + &constants_ref, + &dynamic_constants_ref, + &types_ref, + &def_uses[idx], + ); + unforkify(&mut editor, &fork_join_maps[idx]); + + self.module.constants = constants_ref.take(); + self.module.dynamic_constants = dynamic_constants_ref.take(); + self.module.types = types_ref.take(); + + let edits = &editor.edits(); + if let Some(plans) = self.plans.as_mut() { + repair_plan(&mut plans[idx], &self.module.functions[idx], edits); + } + let grave_mapping = self.module.functions[idx].delete_gravestones(); + if let Some(plans) = self.plans.as_mut() { + plans[idx].fix_gravestones(&grave_mapping); + } + } + self.clear_analyses(); + } Pass::Verify => { let ( def_uses, @@ -853,65 +958,66 @@ impl PassManager { self.plans.as_ref(), ); } - Pass::SchedXdot => { - self.make_def_uses(); - self.make_typing(); - self.make_control_subgraphs(); - self.make_fork_join_maps(); - self.make_fork_join_nests(); - self.make_antideps(); - self.make_bbs(); - self.make_plans(); - - let smodule = sched_compile( - &self.module, - self.def_uses.as_ref().unwrap(), - self.typing.as_ref().unwrap(), - self.control_subgraphs.as_ref().unwrap(), - self.fork_join_maps.as_ref().unwrap(), - self.fork_join_nests.as_ref().unwrap(), - self.antideps.as_ref().unwrap(), - self.bbs.as_ref().unwrap(), - self.plans.as_ref().unwrap(), - ); - - xdot_sched_module(&smodule); - } Pass::Codegen(output_dir, module_name) => { - self.make_def_uses(); + self.make_reverse_postorders(); self.make_typing(); self.make_control_subgraphs(); - self.make_fork_join_maps(); - self.make_fork_join_nests(); - self.make_antideps(); self.make_bbs(); - self.make_plans(); + self.make_callgraph(); + let reverse_postorders = self.reverse_postorders.as_ref().unwrap(); + let typing = self.typing.as_ref().unwrap(); + let control_subgraphs = self.control_subgraphs.as_ref().unwrap(); + let bbs = self.bbs.as_ref().unwrap(); + let callgraph = self.callgraph.as_ref().unwrap(); - let smodule = sched_compile( - &self.module, - self.def_uses.as_ref().unwrap(), - self.typing.as_ref().unwrap(), - self.control_subgraphs.as_ref().unwrap(), - self.fork_join_maps.as_ref().unwrap(), - self.fork_join_nests.as_ref().unwrap(), - self.antideps.as_ref().unwrap(), - self.bbs.as_ref().unwrap(), - self.plans.as_ref().unwrap(), - ); + let memory_objects: Vec<_> = (0..self.module.functions.len()) + .map(|idx| { + memory_objects( + &self.module.functions[idx], + &self.module.types, + &reverse_postorders[idx], + &typing[idx], + ) + }) + .collect(); + let memory_objects_mutable = + memory_objects_mutability(&self.module, &callgraph, &memory_objects); + let mut rust_rt = String::new(); let mut llvm_ir = String::new(); - for manifest in smodule.manifests.values() { - for partition_manifest in manifest.partitions.iter() { - let function = &smodule.functions[&partition_manifest.name]; - match partition_manifest.device { - DeviceManifest::CPU { parallel_launch: _ } => { - cpu_compile(function, partition_manifest, &mut llvm_ir).unwrap() - } - _ => todo!(), - } + for idx in 0..self.module.functions.len() { + if self.module.functions[idx].entry { + rt_codegen( + FunctionID::new(idx), + &self.module, + &reverse_postorders[idx], + &typing[idx], + &control_subgraphs[idx], + &bbs[idx], + &callgraph, + &memory_objects, + &memory_objects_mutable, + &mut rust_rt, + ) + .unwrap(); + } else { + // TODO: determine which backend to use for function. 
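+                        // For now, every function that isn't an entry is
+                        // lowered through the CPU backend.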
+ cpu_codegen( + &self.module.functions[idx], + &self.module.types, + &self.module.constants, + &self.module.dynamic_constants, + &reverse_postorders[idx], + &typing[idx], + &control_subgraphs[idx], + &bbs[idx], + &mut llvm_ir, + ) + .unwrap(); } } println!("{}", llvm_ir); + println!("{}", rust_rt); // Write the LLVM IR into a temporary file. let mut tmp_path = temp_dir(); @@ -938,14 +1044,13 @@ impl PassManager { assert!(clang_process.wait().unwrap().success()); println!("{}", output_archive); - // Package manifest into a file. - let hman_contents: Vec<u8> = postcard::to_allocvec(&smodule.manifests).unwrap(); - let mut file = File::create(format!("{}/{}.hman", output_dir, module_name)) - .expect("PANIC: Unable to open output manifest file."); - file.write_all(&hman_contents) - .expect("PANIC: Unable to write output manifest file contents."); - self.manifests = Some(smodule.manifests); - println!("{:?}", self.manifests); + // Write the Rust runtime into a file. + let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name); + let mut file = File::create(&output_rt) + .expect("PANIC: Unable to open output Rust runtime file."); + file.write_all(rust_rt.as_bytes()) + .expect("PANIC: Unable to write output Rust runtime file contents."); + println!("{}", output_rt); } Pass::Serialize(output_file) => { let module_contents: Vec<u8> = postcard::to_allocvec(&self.module).unwrap(); @@ -995,10 +1100,6 @@ impl PassManager { self.module } - pub fn get_manifests(self) -> HashMap<String, Manifest> { - self.manifests.unwrap() - } - fn fix_deleted_functions(&mut self, id_mapping: &[Option<usize>]) { let mut idx = 0; diff --git a/hercules_opt/src/unforkify.rs b/hercules_opt/src/unforkify.rs new file mode 100644 index 0000000000000000000000000000000000000000..f31b740984c5b7d0ab6baaffd3edc807fe8cadbc --- /dev/null +++ b/hercules_opt/src/unforkify.rs @@ -0,0 +1,147 @@ +extern crate hercules_ir; + +use std::collections::HashMap; +use std::iter::zip; + +use self::hercules_ir::ir::*; + +use crate::*; + +/* + * Convert forks back into loops right before codegen when a backend is not + * lowering a fork-join to vector / parallel code. Lowering fork-joins into + * sequential loops in LLVM is actually not entirely trivial, so it's easier to + * just do this transformation within Hercules IR. + */ +pub fn unforkify(editor: &mut FunctionEditor, fork_join_map: &HashMap<NodeID, NodeID>) { + let mut zero_cons_id = ConstantID::new(0); + let mut one_cons_id = ConstantID::new(0); + assert!(editor.edit(|mut edit| { + zero_cons_id = edit.add_constant(Constant::UnsignedInteger64(0)); + one_cons_id = edit.add_constant(Constant::UnsignedInteger64(1)); + Ok(edit) + })); + + // Convert the fork to a region, thread IDs to a single phi, reduces to + // phis, and the join to a branch at the top of the loop. The previous + // control insides of the fork-join should become the successor of the true + // projection node, and what was the use of the join should become a use of + // the new region. + for (fork, join) in fork_join_map { + let nodes = &editor.func().nodes; + let (fork_control, factors) = nodes[fork.idx()].try_fork().unwrap(); + if factors.len() > 1 { + // For now, don't convert multi-dimensional fork-joins. Rely on pass + // that splits fork-joins. 
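+            // (That splitting is done by fork_split in fork_concat_split.rs.)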
+ continue; + } + let join_control = nodes[join.idx()].try_join().unwrap(); + let tids: Vec<_> = editor + .get_users(*fork) + .filter(|id| nodes[id.idx()].is_thread_id()) + .collect(); + let reduces: Vec<_> = editor + .get_users(*join) + .filter(|id| nodes[id.idx()].is_reduce()) + .collect(); + + let num_nodes = editor.node_ids().len(); + let region_id = NodeID::new(num_nodes); + let if_id = NodeID::new(num_nodes + 1); + let proj_back_id = NodeID::new(num_nodes + 2); + let proj_exit_id = NodeID::new(num_nodes + 3); + let zero_id = NodeID::new(num_nodes + 4); + let one_id = NodeID::new(num_nodes + 5); + let indvar_id = NodeID::new(num_nodes + 6); + let add_id = NodeID::new(num_nodes + 7); + let dc_id = NodeID::new(num_nodes + 8); + let neq_id = NodeID::new(num_nodes + 9); + let phi_ids = (num_nodes + 10..num_nodes + 10 + reduces.len()).map(NodeID::new); + + let region = Node::Region { + preds: Box::new([ + fork_control, + if join_control == *fork { + proj_back_id + } else { + join_control + }, + ]), + }; + let if_node = Node::If { + control: region_id, + cond: neq_id, + }; + let proj_back = Node::Projection { + control: if_id, + selection: 1, + }; + let proj_exit = Node::Projection { + control: if_id, + selection: 0, + }; + let zero = Node::Constant { id: zero_cons_id }; + let one = Node::Constant { id: one_cons_id }; + let indvar = Node::Phi { + control: region_id, + data: Box::new([zero_id, add_id]), + }; + let add = Node::Binary { + op: BinaryOperator::Add, + left: indvar_id, + right: one_id, + }; + let dc = Node::DynamicConstant { id: factors[0] }; + let neq = Node::Binary { + op: BinaryOperator::NE, + left: indvar_id, + right: dc_id, + }; + let phis: Vec<_> = reduces + .iter() + .map(|reduce_id| { + let (_, init, reduct) = nodes[reduce_id.idx()].try_reduce().unwrap(); + Node::Phi { + control: region_id, + data: Box::new([init, reduct]), + } + }) + .collect(); + + editor.edit(|mut edit| { + assert_eq!(edit.add_node(region), region_id); + assert_eq!(edit.add_node(if_node), if_id); + assert_eq!(edit.add_node(proj_back), proj_back_id); + assert_eq!(edit.add_node(proj_exit), proj_exit_id); + assert_eq!(edit.add_node(zero), zero_id); + assert_eq!(edit.add_node(one), one_id); + assert_eq!(edit.add_node(indvar), indvar_id); + assert_eq!(edit.add_node(add), add_id); + assert_eq!(edit.add_node(dc), dc_id); + assert_eq!(edit.add_node(neq), neq_id); + for (phi_id, phi) in zip(phi_ids.clone(), phis) { + assert_eq!(edit.add_node(phi), phi_id); + } + + edit = edit.replace_all_uses(*fork, proj_back_id)?; + edit = edit.replace_all_uses(*join, proj_exit_id)?; + for tid in tids.iter() { + edit = edit.replace_all_uses(*tid, indvar_id)?; + } + for (reduce, phi_id) in zip(reduces.iter(), phi_ids) { + edit = edit.replace_all_uses(*reduce, phi_id)?; + } + + edit = edit.delete_node(*fork)?; + edit = edit.delete_node(*join)?; + for tid in tids { + edit = edit.delete_node(tid)?; + } + for reduce in reduces { + edit = edit.delete_node(reduce)?; + } + + Ok(edit) + }); + } +} diff --git a/hercules_rt/Cargo.toml b/hercules_rt/Cargo.toml deleted file mode 100644 index 76c940b71979ee5bb7ab1d01abb77e3f7f3a0460..0000000000000000000000000000000000000000 --- a/hercules_rt/Cargo.toml +++ /dev/null @@ -1,11 +0,0 @@ -[package] -name = "hercules_rt" -version = "0.1.0" -authors = ["Russel Arbore <rarbore2@illinois.edu>"] -edition = "2021" - -[dependencies] -libc = "*" -postcard = { version = "*", features = ["alloc"] } -serde = { version = "*", features = ["derive"] } -hercules_rt_proc = { path = "../hercules_rt_proc" } diff 
--git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs deleted file mode 100644 index 95b93aa488d708b24733ed78489d0cdb08d33a19..0000000000000000000000000000000000000000 --- a/hercules_rt/src/lib.rs +++ /dev/null @@ -1,4 +0,0 @@ -extern crate hercules_rt_proc; - -pub use hercules_rt_proc::use_hman; -pub use hercules_rt_proc::use_hir; diff --git a/hercules_rt_proc/Cargo.toml b/hercules_rt_proc/Cargo.toml deleted file mode 100644 index 6d026135252836154102006f0a444f2a1f824313..0000000000000000000000000000000000000000 --- a/hercules_rt_proc/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -name = "hercules_rt_proc" -version = "0.1.0" -authors = ["Russel Arbore <rarbore2@illinois.edu>"] -edition = "2021" - -[lib] -proc-macro = true - -[dependencies] -postcard = { version = "*", features = ["alloc"] } -serde = { version = "*", features = ["derive"] } -hercules_cg = { path = "../hercules_cg" } -hercules_ir = { path = "../hercules_ir" } -hercules_opt = { path = "../hercules_opt" } -anyhow = "*" -uuid = { version = "*", features = ["v4", "fast-rng", "macro-diagnostics"] } diff --git a/hercules_rt_proc/src/lib.rs b/hercules_rt_proc/src/lib.rs deleted file mode 100644 index 0465a2584d3c266e77415afcdcc90be1276481f9..0000000000000000000000000000000000000000 --- a/hercules_rt_proc/src/lib.rs +++ /dev/null @@ -1,521 +0,0 @@ -#![feature(iter_intersperse)] - -extern crate anyhow; -extern crate hercules_cg; -extern crate hercules_ir; -extern crate hercules_opt; -extern crate postcard; -extern crate proc_macro; - -use std::collections::{BTreeSet, HashMap}; -use std::ffi::OsStr; -use std::fmt::Write; -use std::fs::File; -use std::io::prelude::*; -use std::path::Path; - -use proc_macro::*; - -use self::hercules_cg::*; -use self::hercules_ir::{DynamicConstant, DynamicConstantID, ID}; - -/* - * Convert schedule IR types to the Rust types generated in the interface. 
- */ -fn generate_type_string(ty: &SType) -> String { - match ty { - SType::Boolean => "bool".to_string(), - SType::Integer8 => "i8".to_string(), - SType::Integer16 => "i16".to_string(), - SType::Integer32 => "i32".to_string(), - SType::Integer64 => "i64".to_string(), - SType::UnsignedInteger8 => "u8".to_string(), - SType::UnsignedInteger16 => "u16".to_string(), - SType::UnsignedInteger32 => "u32".to_string(), - SType::UnsignedInteger64 => "u64".to_string(), - SType::Float32 => "f32".to_string(), - SType::Float64 => "f64".to_string(), - SType::Product(fields) => { - fields.iter().fold("__Prod".to_string(), |acc, field| { - format!("{}_{}", acc, generate_type_name(field)) - }) + "_" - } - SType::ArrayRef(elem) => format!("*mut {}", generate_type_string(elem)), - } -} - -fn generate_type_name(ty: &SType) -> String { - match ty { - SType::Boolean - | SType::Integer8 - | SType::Integer16 - | SType::Integer32 - | SType::Integer64 - | SType::UnsignedInteger8 - | SType::UnsignedInteger16 - | SType::UnsignedInteger32 - | SType::UnsignedInteger64 - | SType::Float32 - | SType::Float64 => generate_type_string(ty), - SType::Product(fields) => { - fields.iter().fold("__Prod".to_string(), |acc, field| { - format!("{}_{}", acc, generate_type_name(field)) - }) + "_" - } - SType::ArrayRef(elem) => format!("ArrayRef_{}", generate_type_name(elem)), - } -} - -fn compute_dynamic_constant<W: Write>( - dc: DynamicConstantID, - manifest: &Manifest, - rust_code: &mut W, -) -> Result<(), anyhow::Error> { - match manifest.dynamic_constants[dc.idx()] { - DynamicConstant::Constant(cons) => write!(rust_code, "{}", cons)?, - DynamicConstant::Parameter(idx) => write!(rust_code, "dc_{}", idx)?, - DynamicConstant::Add(left, right) => { - write!(rust_code, "(")?; - compute_dynamic_constant(left, manifest, rust_code)?; - write!(rust_code, " + ")?; - compute_dynamic_constant(right, manifest, rust_code)?; - write!(rust_code, ")")?; - } - DynamicConstant::Sub(left, right) => { - write!(rust_code, "(")?; - compute_dynamic_constant(left, manifest, rust_code)?; - write!(rust_code, " - ")?; - compute_dynamic_constant(right, manifest, rust_code)?; - write!(rust_code, ")")?; - } - DynamicConstant::Mul(left, right) => { - write!(rust_code, "(")?; - compute_dynamic_constant(left, manifest, rust_code)?; - write!(rust_code, " * ")?; - compute_dynamic_constant(right, manifest, rust_code)?; - write!(rust_code, ")")?; - } - DynamicConstant::Div(left, right) => { - write!(rust_code, "(")?; - compute_dynamic_constant(left, manifest, rust_code)?; - write!(rust_code, " / ")?; - compute_dynamic_constant(right, manifest, rust_code)?; - write!(rust_code, ")")?; - } - DynamicConstant::Rem(left, right) => { - write!(rust_code, "(")?; - compute_dynamic_constant(left, manifest, rust_code)?; - write!(rust_code, " % ")?; - compute_dynamic_constant(right, manifest, rust_code)?; - write!(rust_code, ")")?; - } - } - Ok(()) -} - -/* - * Generate async Rust code orchestrating partition execution. - */ -fn codegen( - manifests: &HashMap<String, Manifest>, - link_library: &Option<String>, -) -> Result<String, anyhow::Error> { - // Write to a String containing all of the Rust code. - let mut rust_code = "".to_string(); - - // Rust doesn't allow you to send pointers between threads. In order to send - // pointers between threads, we need to wrap them in a struct that unsafely - // implements Send and Sync. This passes the responsibility of - // synchronization onto us, which we do by being careful with how we lower - // parallel code. 
Make this type generic so that we actually wrap all - // arguments in it for ease of macro codegen. - write!( - rust_code, - "#[derive(Clone, Copy, Debug)]\nstruct SendSyncWrapper<T: Copy>(T);\nunsafe impl<T: Copy> Send for SendSyncWrapper<T> {{}}\nunsafe impl<T: Copy> Sync for SendSyncWrapper<T> {{}}\n" - )?; - - // Emit the product types used in this module. We can't just emit product - // types, since we need #[repr(C)] to interact with LLVM. - let visible_stypes = manifests - .into_iter() - .map(|(_, manifest)| manifest.all_visible_types()) - .flatten() - .collect::<BTreeSet<SType>>(); - let all_stypes = Manifest::transitive_closure_type_set(visible_stypes); - for stype in all_stypes.iter() { - if let Some(fields) = stype.try_product() { - write!( - rust_code, - "#[derive(Clone, Copy, Debug)]\n#[repr(C)]\nstruct {}({});\n", - generate_type_string(stype), - fields - .iter() - .map(|field| generate_type_string(field)) - .intersperse(", ".to_string()) - .fold("".to_string(), |acc, token| acc + &token) - )?; - } - } - - // Emit the async Rust functions implementing each Hercules function. - for (function_name, manifest) in manifests.into_iter() { - // Emit the function signature. - write!(rust_code, "async unsafe fn {}(", function_name)?; - for (param_ty, param_kind) in manifest.param_types.iter() { - match param_kind { - ParameterKind::HerculesParameter(idx) => write!(rust_code, "param_{}", idx)?, - ParameterKind::DataInput(_) => panic!( - "PANIC: Parameter kind for Hercules function parameter cannot be DataInput." - ), - ParameterKind::DynamicConstant(idx) => write!(rust_code, "dc_{}", idx)?, - ParameterKind::ArrayConstant(array_id) => { - write!(rust_code, "array_{}", array_id.idx())? - } - } - write!(rust_code, ": {}, ", generate_type_string(param_ty))? - } - write!( - rust_code, - ") -> {} {{\n", - generate_type_string(&manifest.return_type) - )?; - - // Compute the signature for each partition function and emit the extern - // function signatures. - if let Some(link_library_name) = link_library { - write!(rust_code, " #[link(name = \"{}\")]\n", link_library_name)?; - } - write!(rust_code, " extern \"C\" {{\n")?; - for partition in manifest.partitions.iter() { - write!(rust_code, " fn {}(", partition.name)?; - - // Add parameters for SFunction signature. - for (param_stype, kind) in partition.parameters.iter() { - match kind { - ParameterKind::HerculesParameter(idx) => write!(rust_code, "param_{}: ", idx)?, - ParameterKind::DataInput(id) => write!(rust_code, "data_{}: ", id.idx())?, - ParameterKind::DynamicConstant(idx) => write!(rust_code, "dc_{}: ", idx)?, - ParameterKind::ArrayConstant(id) => write!(rust_code, "array_{}: ", id.idx())?, - } - write!(rust_code, "{}, ", generate_type_string(param_stype))?; - } - - // Add parameters for device specific lowering details. - if let DeviceManifest::CPU { parallel_launch } = &partition.device { - for parallel_launch_dim in 0..parallel_launch.len() { - write!( - rust_code, - "parallel_launch_low_{}: u64, parallel_launch_len_{}: u64, ", - parallel_launch_dim, parallel_launch_dim - )?; - } - } - - // Add the return product of the SFunction signature. - let return_stype = if partition.returns.len() == 1 { - partition.returns[0].0.clone() - } else { - SType::Product( - partition - .returns - .iter() - .map(|(return_stype, _)| return_stype.clone()) - .collect(), - ) - }; - write!(rust_code, ") -> {};\n", generate_type_string(&return_stype),)?; - } - write!(rust_code, " }}\n")?; - - // Declare all of the intermediary data input / output variables. 
They - // are declared as MaybeUninit, since they get assigned after running a - // partition. MaybeUninits should always be defined before assume_init() - // is called on them, assuming a valid partitioning. - let mut data_inputs = BTreeSet::new(); - let mut data_outputs = BTreeSet::new(); - for partition in manifest.partitions.iter() { - data_inputs.extend(partition.data_inputs()); - data_outputs.extend(partition.data_outputs()); - } - assert_eq!(data_inputs, data_outputs); - for (node, stype) in data_inputs { - write!(rust_code, " let mut node_{}: ::core::mem::MaybeUninit<{}> = ::core::mem::MaybeUninit::uninit();\n", node.idx(), generate_type_string(stype))?; - } - - // The core executor is a Rust loop. We literally run a "control token" - // as described in the original sea of nodes paper through the - // partitions to drive execution. - write!( - rust_code, - " let mut control_token: i8 = 0;\n loop {{\n", - )?; - - // Match on the control token position to determine which partition to - // execute. - write!(rust_code, " match control_token {{\n")?; - - // Emit the match arm per partition. - for (idx, partition) in manifest.partitions.iter().enumerate() { - // Open the arm. - write!(rust_code, " {} => {{\n", idx)?; - - match partition.device { - DeviceManifest::CPU { - ref parallel_launch, - } => { - for (idx, (_, kind)) in partition.parameters.iter().enumerate() { - write!( - rust_code, - " let local_param_{} = SendSyncWrapper(", - idx - )?; - match kind { - ParameterKind::HerculesParameter(idx) => { - write!(rust_code, "param_{}", idx)? - } - ParameterKind::DataInput(id) => { - write!(rust_code, "node_{}.assume_init()", id.idx())? - } - ParameterKind::DynamicConstant(idx) => write!(rust_code, "dc_{}", idx)?, - ParameterKind::ArrayConstant(id) => { - write!(rust_code, "array_{}", id.idx())? - } - } - write!(rust_code, ");\n")?; - } - - if parallel_launch.is_empty() { - // Call the partition function. - write!( - rust_code, - " let output = {}(", - partition.name - )?; - for idx in 0..partition.parameters.len() { - write!(rust_code, "local_param_{}.0, ", idx)?; - } - write!(rust_code, ");\n")?; - } else { - // Compute the dynamic constant bounds. - for (dim, (_, dc)) in parallel_launch.into_iter().enumerate() { - write!(rust_code, " let bound_{} = ", dim)?; - compute_dynamic_constant(*dc, manifest, &mut rust_code)?; - write!(rust_code, ";\n let low_{} = 0;\n", dim)?; - } - - // Simultaneously calculate the tiles lows and lens and - // spawn the tiles. Emit the launches unrolled. - let mut tile = vec![0; parallel_launch.len()]; - let total_num_tiles = parallel_launch - .into_iter() - .fold(1, |acc, (num_tiles, _)| acc * num_tiles); - for tile_num in 0..total_num_tiles { - // Calculate the lows and lens for this tile. - for (dim, tile) in tile.iter().enumerate() { - let num_tiles = parallel_launch[dim].0; - write!( - rust_code, - " let len_{} = bound_{} / {} + ({} < bound_{} % {}) as u64;\n", - dim, dim, num_tiles, tile, dim, num_tiles - )?; - } - - // Spawn the tile. We need to explicitly copy the - // SendSyncWrappers, or else the path expression for - // the parameters get interpreted as what needs to - // be moved, when we want the wrapper itself to be - // what gets moved. Ugh. 
- write!( - rust_code, - " let tile_{} = async_std::task::spawn(async move {{ ", - tile_num, - )?; - for idx in 0..partition.parameters.len() { - write!( - rust_code, - "let local_param_{} = local_param_{}; ", - idx, idx - )?; - } - write!(rust_code, "SendSyncWrapper({}(", partition.name)?; - for idx in 0..partition.parameters.len() { - write!(rust_code, "local_param_{}.0, ", idx)?; - } - for dim in 0..parallel_launch.len() { - write!(rust_code, "low_{}, len_{}, ", dim, dim)?; - } - write!(rust_code, ")) }});\n")?; - - // Go to the next tile. - for dim in (0..parallel_launch.len()).rev() { - tile[dim] += 1; - let num_tiles = parallel_launch[dim].0; - if tile[dim] < num_tiles { - write!( - rust_code, - " let low_{} = low_{} + len_{};\n", - dim, dim, dim - )?; - break; - } else { - tile[dim] = 0; - write!(rust_code, " let low_{} = 0;\n", dim)?; - } - } - } - - // Join the JoinHandles, and get the output from one of - // them. - write!( - rust_code, - " let output = ::core::future::join!(", - )?; - for tile_num in 0..total_num_tiles { - write!(rust_code, "tile_{}, ", tile_num)?; - } - // join! unhelpfully returns either a tuple or a single - // value, but never a singleton tuple. - if total_num_tiles == 1 { - write!(rust_code, ").await.0;\n")?; - } else { - write!(rust_code, ").await.0.0;\n")?; - } - } - - // Assign the outputs. - for (output_idx, (_, kind)) in partition.returns.iter().enumerate() { - let output_ref = if partition.returns.len() == 1 { - "output".to_string() - } else { - format!("output.{}", output_idx) - }; - match kind { - ReturnKind::HerculesReturn => { - write!(rust_code, " return {};\n", output_ref)? - } - ReturnKind::DataOutput(id) => write!( - rust_code, - " node_{}.write({});\n", - id.idx(), - output_ref - )?, - ReturnKind::NextPartition => write!( - rust_code, - " control_token = {};\n", - output_ref - )?, - } - } - } - _ => todo!(), - } - - // If there's only one partition successor, then an explicit - // NextPartition isn't returned - emit the new control token here. - if partition.successors.len() == 1 { - write!( - rust_code, - " control_token = {};\n", - partition.successors[0].idx() - )?; - } - - // Close the arm. - write!(rust_code, " }}\n")?; - } - - // Close the match, and handle invalid control token values. - write!( - rust_code, - " _ => panic!(\"PANIC: Invalid control token value.\"),\n }}\n" - )?; - - // Close the loop. - write!(rust_code, " }}\n")?; - - // Close the function. - write!(rust_code, "}}\n")?; - } - - Ok(rust_code) -} - -/* - * Generate the async Rust runtime from the manifest of a Hercules module. - */ -#[proc_macro] -pub fn use_hman(path: TokenStream) -> TokenStream { - use TokenTree::Literal; - - // Get the path as a Rust path object, and make sure it's a .hman file. - let mut tokens_iter = path.into_iter(); - let token = tokens_iter - .next() - .expect("Please provide a path to a .hman file to the use_hman! macro."); - assert!(tokens_iter.next().is_none(), "Too many tokens provided to the use_hman! macro. Please provide only one path to a .hman file."); - let literal = if let Literal(literal) = token { - literal - } else { - panic!("Please provide a string literal containing the path to a .hman file to the use_hman! macro."); - }; - let literal_string = literal.to_string(); - let path = Path::new(&literal_string[1..(literal_string.len() - 1)]); - assert_eq!( - path.extension(), - Some(OsStr::new("hman")), - "Please provide only .hman files to the use_hman! macro." 
- ); - assert_eq!( - path.try_exists().ok(), - Some(true), - "Please provide a valid path to a .hman file to the use_hman! macro." - ); - - // Load manifest from path. - let mut f = File::open(path).unwrap(); - let mut buffer = vec![]; - f.read_to_end(&mut buffer).unwrap(); - let manifests = postcard::from_bytes(&buffer).unwrap(); - - // Generate Rust code. - let rust_code = codegen(&manifests, &None).unwrap(); - eprintln!("{}", rust_code); - rust_code.parse().unwrap() -} - -#[proc_macro] -pub fn use_hir(hir_tokens: TokenStream) -> TokenStream { - use std::env; - use TokenTree::Literal; - - let mut tokens_iter = hir_tokens.into_iter(); - let token = tokens_iter - .next() - .expect("Please provide Hercules IR to use the use_hir! macro."); - assert!( - tokens_iter.next().is_none(), - "Too many tokens provided to use the use_hir! macro. Please provide only Hercules IR." - ); - let literal = if let Literal(literal) = token { - literal - } else { - panic!("Please provide a string literal containing Hercules IR."); - }; - let literal_string = literal.to_string(); - - let module = hercules_ir::parse::parse(&literal_string[1..(literal_string.len() - 1)]) - .expect("PANIC: Failed to parse Hercules IR string."); - let out_dir = env::var("OUT_DIR").unwrap(); - let libname = format!("hir_generated_{}", uuid::Uuid::new_v4().simple()); - - let mut p = hercules_opt::pass::PassManager::new(module); - p.add_pass(hercules_opt::pass::Pass::Codegen(out_dir, libname.clone())); - - p.run_passes(); - - let manifests = p.get_manifests(); - let rust_code = codegen(&manifests, &Some(libname)).unwrap(); - eprintln!("{}", rust_code); - - rust_code.parse().unwrap() -} diff --git a/hercules_samples/call/Cargo.toml b/hercules_samples/call/Cargo.toml new file mode 100644 index 0000000000000000000000000000000000000000..4a2fbb862039ad0d268b85fc6e85463dc87841d7 --- /dev/null +++ b/hercules_samples/call/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "call" +version = "0.1.0" +authors = ["Russel Arbore <rarbore2@illinois.edu>"] +edition = "2021" + +[build-dependencies] +juno_build = { path = "../../juno_build" } + +[dependencies] +juno_build = { path = "../../juno_build" } +rand = "*" +async-std = "*" +with_builtin_macros = "0.1.0" diff --git a/hercules_samples/call/build.rs b/hercules_samples/call/build.rs new file mode 100644 index 0000000000000000000000000000000000000000..dbefe008a14e57785261e2757bb0e0dbbb5fa27c --- /dev/null +++ b/hercules_samples/call/build.rs @@ -0,0 +1,10 @@ +extern crate juno_build; +use juno_build::JunoCompiler; + +fn main() { + JunoCompiler::new() + .ir_in_src("call.hir") + .unwrap() + .build() + .unwrap(); +} diff --git a/hercules_samples/call.hir b/hercules_samples/call/src/call.hir similarity index 72% rename from hercules_samples/call.hir rename to hercules_samples/call/src/call.hir index 3c4f79111cd2ee4ee4c530dacba4ceb6b5e5e9e0..937ce1ef70eae9a569692a8d8c61b2cf26646d04 100644 --- a/hercules_samples/call.hir +++ b/hercules_samples/call/src/call.hir @@ -1,9 +1,9 @@ -fn myfunc(x: i32) -> i32 +fn myfunc(x: u64) -> u64 cr = region(start) y = call<16>(add, cr, x, x) r = return(cr, y) -fn add<1>(x: i32, y: i32) -> i32 +fn add<1>(x: u64, y: u64) -> u64 w = add(x, y) dc = dynamic_constant(#0) z = add(w, dc) diff --git a/hercules_samples/call/src/main.rs b/hercules_samples/call/src/main.rs new file mode 100644 index 0000000000000000000000000000000000000000..3bbb634c7405dd9aff81dd9c3a0068b54df45a26 --- /dev/null +++ b/hercules_samples/call/src/main.rs @@ -0,0 +1,19 @@ +#![feature(box_as_ptr, let_chains)] 
+ +extern crate async_std; +extern crate juno_build; + +juno_build::juno!("call"); + +fn main() { + async_std::task::block_on(async { + let x = myfunc(7).await; + let y = add(10, 2, 18).await; + assert_eq!(x, y); + }); +} + +#[test] +fn dot_test() { + main(); +} diff --git a/hercules_samples/ccp/Cargo.toml b/hercules_samples/ccp/Cargo.toml new file mode 100644 index 0000000000000000000000000000000000000000..3547aa52df766cb00445e088176c076fb5996b80 --- /dev/null +++ b/hercules_samples/ccp/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "ccp" +version = "0.1.0" +authors = ["Russel Arbore <rarbore2@illinois.edu>"] +edition = "2021" + +[build-dependencies] +juno_build = { path = "../../juno_build" } + +[dependencies] +juno_build = { path = "../../juno_build" } +rand = "*" +async-std = "*" +with_builtin_macros = "0.1.0" diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs new file mode 100644 index 0000000000000000000000000000000000000000..650b51b8b14715579de164f7fc65330e113613a1 --- /dev/null +++ b/hercules_samples/ccp/build.rs @@ -0,0 +1,10 @@ +extern crate juno_build; +use juno_build::JunoCompiler; + +fn main() { + JunoCompiler::new() + .ir_in_src("ccp.hir") + .unwrap() + .build() + .unwrap(); +} diff --git a/hercules_samples/ccp_example.hir b/hercules_samples/ccp/src/ccp.hir similarity index 100% rename from hercules_samples/ccp_example.hir rename to hercules_samples/ccp/src/ccp.hir diff --git a/hercules_samples/ccp/src/main.rs b/hercules_samples/ccp/src/main.rs new file mode 100644 index 0000000000000000000000000000000000000000..5fc78ab51227a12ef46636bd4479cd3321cc509b --- /dev/null +++ b/hercules_samples/ccp/src/main.rs @@ -0,0 +1,18 @@ +#![feature(box_as_ptr, let_chains)] + +extern crate async_std; +extern crate juno_build; + +juno_build::juno!("ccp"); + +fn main() { + async_std::task::block_on(async { + let x = tricky(7).await; + assert_eq!(x, 1); + }); +} + +#[test] +fn dot_test() { + main(); +} diff --git a/hercules_samples/dot/Cargo.toml b/hercules_samples/dot/Cargo.toml index 69cd39e388661b3f7f6dca53cf9210ab7050902c..f74ab1f6f4ed5de3b02ab45b1f6fca461fdbc192 100644 --- a/hercules_samples/dot/Cargo.toml +++ b/hercules_samples/dot/Cargo.toml @@ -10,7 +10,6 @@ juno_build = { path = "../../juno_build" } [dependencies] clap = { version = "*", features = ["derive"] } juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs index 6d4cf3800563cd17b48eb744d2e30ffe8c641bf5..0f5ee518506e6abe6cb815c244699ce12ccb45d1 100644 --- a/hercules_samples/dot/src/main.rs +++ b/hercules_samples/dot/src/main.rs @@ -1,14 +1,31 @@ +#![feature(box_as_ptr, let_chains)] + extern crate async_std; -extern crate clap; extern crate juno_build; +use core::ptr::copy_nonoverlapping; + juno_build::juno!("dot"); fn main() { async_std::task::block_on(async { - let mut a = vec![0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; - let mut b = vec![0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; - let c = unsafe { dot(a.as_mut_ptr(), b.as_mut_ptr(), 8).await }; + let a: Box<[f32]> = Box::new([0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]); + let b: Box<[f32]> = Box::new([0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]); + let mut a_bytes: Box<[u8]> = Box::new([0; 32]); + let mut b_bytes: Box<[u8]> = Box::new([0; 32]); + unsafe { + copy_nonoverlapping( + Box::as_ptr(&a) as *const u8, + Box::as_mut_ptr(&mut a_bytes) as *mut u8, + 32, + ); + copy_nonoverlapping( + 
Box::as_ptr(&b) as *const u8, + Box::as_mut_ptr(&mut b_bytes) as *mut u8, + 32, + ); + }; + let c = dot(8, a_bytes, b_bytes).await; println!("{}", c); assert_eq!(c, 70.0); }); diff --git a/hercules_samples/fac/Cargo.toml b/hercules_samples/fac/Cargo.toml index 9082a4fc4194ac3fa9c694a5a5973f605772a384..d4b9c5fe2ae7ac6907d3c1181dabb3cb247cff2e 100644 --- a/hercules_samples/fac/Cargo.toml +++ b/hercules_samples/fac/Cargo.toml @@ -10,7 +10,6 @@ juno_build = { path = "../../juno_build" } [dependencies] clap = { version = "*", features = ["derive"] } juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" diff --git a/hercules_samples/fac/src/fac.hir b/hercules_samples/fac/src/fac.hir index 0d85c5d095cef28bc88e3b9669c26ed42fde8b8c..e43dd8cae1a605bca7c3ceac4eb7c029665e86e6 100644 --- a/hercules_samples/fac/src/fac.hir +++ b/hercules_samples/fac/src/fac.hir @@ -1,4 +1,4 @@ -fn fac_inner(x: i32) -> i32 +fn fac(x: i32) -> i32 zero = constant(i32, 0) one = constant(i32, 1) loop = region(start, if_true) @@ -11,8 +11,3 @@ fn fac_inner(x: i32) -> i32 if_false = projection(if, 0) if_true = projection(if, 1) r = return(if_false, fac_acc) - -fn fac(x: i32) -> i32 - cr = region(start) - call = call(fac_inner, cr, x) - r = return(cr, call) diff --git a/hercules_samples/fac/src/main.rs b/hercules_samples/fac/src/main.rs index e3a307fcfd521217f2956e77f25332a54e6befee..7071fd2c115bba1d6ff60fae688b262354d4fc71 100644 --- a/hercules_samples/fac/src/main.rs +++ b/hercules_samples/fac/src/main.rs @@ -6,7 +6,7 @@ juno_build::juno!("fac"); fn main() { async_std::task::block_on(async { - let f = unsafe { fac(8).await }; + let f = fac(8).await; println!("{}", f); assert_eq!(f, 40320); }); diff --git a/hercules_samples/matmul/Cargo.toml b/hercules_samples/matmul/Cargo.toml index 9066c1535e2c40400bdb3b5ca20a3e38237ef597..d3975c5ca58b68cdb3fef0f6d8a3cf8e106408d6 100644 --- a/hercules_samples/matmul/Cargo.toml +++ b/hercules_samples/matmul/Cargo.toml @@ -10,7 +10,6 @@ juno_build = { path = "../../juno_build" } [dependencies] clap = { version = "*", features = ["derive"] } juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs index f3ceba93d88402443b503a232abd5af71eb9ca58..12c14249aa62502c766028cef3c0518cf0fb4633 100644 --- a/hercules_samples/matmul/src/main.rs +++ b/hercules_samples/matmul/src/main.rs @@ -1,19 +1,39 @@ -#![feature(future_join)] +#![feature(box_as_ptr, let_chains)] extern crate async_std; -extern crate clap; extern crate juno_build; +use core::ptr::copy_nonoverlapping; + juno_build::juno!("matmul"); fn main() { async_std::task::block_on(async { - let mut a = vec![1.0, 2.0, 3.0, 4.0]; - let mut b = vec![5.0, 6.0, 7.0, 8.0]; - let mut c = vec![0.0, 0.0, 0.0, 0.0]; + let a: Box<[f32]> = Box::new([1.0, 2.0, 3.0, 4.0]); + let b: Box<[f32]> = Box::new([5.0, 6.0, 7.0, 8.0]); + let mut a_bytes: Box<[u8]> = Box::new([0; 16]); + let mut b_bytes: Box<[u8]> = Box::new([0; 16]); + unsafe { + copy_nonoverlapping( + Box::as_ptr(&a) as *const u8, + Box::as_mut_ptr(&mut a_bytes) as *mut u8, + 16, + ); + copy_nonoverlapping( + Box::as_ptr(&b) as *const u8, + Box::as_mut_ptr(&mut b_bytes) as *mut u8, + 16, + ); + }; + let c_bytes = matmul(2, 2, 2, a_bytes, b_bytes).await; + let mut c: Box<[f32]> = Box::new([0.0; 4]); unsafe { - matmul(a.as_mut_ptr(), 
b.as_mut_ptr(), c.as_mut_ptr(), 2, 2, 2).await; - } + copy_nonoverlapping( + Box::as_ptr(&c_bytes) as *const u8, + Box::as_mut_ptr(&mut c) as *mut u8, + 16, + ); + }; println!("[[{}, {}], [{}, {}]]", c[0], c[1], c[2], c[3]); assert_eq!(c[0], 19.0); assert_eq!(c[1], 22.0); diff --git a/juno_build/Cargo.toml b/juno_build/Cargo.toml index 4f6234988f7e30ade972933dcec56b29412369ba..72faf4bd14da65b482f2e379c1b51ce3ede8dcf0 100644 --- a/juno_build/Cargo.toml +++ b/juno_build/Cargo.toml @@ -6,6 +6,5 @@ edition = "2021" [dependencies] juno_frontend = { path = "../juno_frontend" } -hercules_rt = { path = "../hercules_rt" } hercules_ir = { path = "../hercules_ir" } with_builtin_macros = "0.1.0" diff --git a/juno_build/src/lib.rs b/juno_build/src/lib.rs index e01a518722d8e074905593f04e14391e0d905a9e..fdaf4d27cbcb7b31738e6df42d69272369d5026f 100644 --- a/juno_build/src/lib.rs +++ b/juno_build/src/lib.rs @@ -1,5 +1,4 @@ extern crate hercules_ir; -extern crate hercules_rt; use juno_compiler::*; @@ -233,8 +232,8 @@ impl JunoCompiler { macro_rules! juno { ($path:expr) => { with_builtin_macros::with_builtin!( - let $hman = concat!(env!("OUT_DIR"), "/", $path, ".hman") in { - hercules_rt::use_hman!($hman); + let $hrt = concat!(env!("OUT_DIR"), "/rt_", $path, ".hrt") in { + include!($hrt); }); }; } diff --git a/juno_frontend/src/lib.rs b/juno_frontend/src/lib.rs index 4713cfeb92c13b46a7bcc413a765f9a4951cb5f5..c39faef9011b39960a74ecf4472df8c0780a8281 100644 --- a/juno_frontend/src/lib.rs +++ b/juno_frontend/src/lib.rs @@ -184,9 +184,15 @@ pub fn compile_ir( add_pass!(pm, verify, Forkify); add_pass!(pm, verify, ForkGuardElim); add_verified_pass!(pm, verify, DCE); + add_pass!(pm, verify, Outline); + add_pass!(pm, verify, InterproceduralSROA); + add_pass!(pm, verify, SROA); + add_pass!(pm, verify, ForkSplit); + add_pass!(pm, verify, Unforkify); + add_pass!(pm, verify, GVN); + add_verified_pass!(pm, verify, DCE); if x_dot { pm.add_pass(hercules_opt::pass::Pass::Xdot(true)); - pm.add_pass(hercules_opt::pass::Pass::SchedXdot); } pm.add_pass(hercules_opt::pass::Pass::Codegen(output_dir, module_name)); diff --git a/juno_samples/casts_and_intrinsics/Cargo.toml b/juno_samples/casts_and_intrinsics/Cargo.toml index f49797969012f5195a0338b7b14fa04414dddb03..af74c07acc3950b22b9b2c95e0d07090f99d7490 100644 --- a/juno_samples/casts_and_intrinsics/Cargo.toml +++ b/juno_samples/casts_and_intrinsics/Cargo.toml @@ -13,6 +13,5 @@ juno_build = { path = "../../juno_build" } [dependencies] juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs index 8422f7e60619b48aca2d5e783eaed45e70be71c3..fafa97bbc642751c37b2ef47a43954fa84f340d9 100644 --- a/juno_samples/casts_and_intrinsics/build.rs +++ b/juno_samples/casts_and_intrinsics/build.rs @@ -5,8 +5,6 @@ fn main() { JunoCompiler::new() .file_in_src("casts_and_intrinsics.jn") .unwrap() - .schedule_in_src("casts_and_intrinsics.sch") - .unwrap() .build() .unwrap(); } diff --git a/juno_samples/casts_and_intrinsics/src/casts_and_intrinsics.sch b/juno_samples/casts_and_intrinsics/src/casts_and_intrinsics.sch deleted file mode 100644 index 80ec2766eac8850ff736cd8a1f52dbe87bf24553..0000000000000000000000000000000000000000 --- a/juno_samples/casts_and_intrinsics/src/casts_and_intrinsics.sch +++ /dev/null @@ -1,2 +0,0 @@ -function casts_and_intrinsics { -} diff --git a/juno_samples/casts_and_intrinsics/src/main.rs 
b/juno_samples/casts_and_intrinsics/src/main.rs index 344168e0e1995becff3b4580f4d957002a9ee2a5..037d4c4025ca141887034353124436a2db8f84f3 100644 --- a/juno_samples/casts_and_intrinsics/src/main.rs +++ b/juno_samples/casts_and_intrinsics/src/main.rs @@ -2,15 +2,12 @@ extern crate async_std; extern crate juno_build; -extern crate hercules_rt; juno_build::juno!("casts_and_intrinsics"); fn main() { async_std::task::block_on(async { - let output = unsafe { - casts_and_intrinsics(16.0).await - }; + let output = casts_and_intrinsics(16.0).await; println!("{}", output); assert_eq!(output, 4); }); diff --git a/juno_samples/matmul/Cargo.toml b/juno_samples/matmul/Cargo.toml index dd40d2094d58bbb17da943f798c1565b12000245..c272fc443df485aaacd80fe5fdc882bd4d02225c 100644 --- a/juno_samples/matmul/Cargo.toml +++ b/juno_samples/matmul/Cargo.toml @@ -13,6 +13,5 @@ juno_build = { path = "../../juno_build" } [dependencies] juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index e68df99828cc47d8dba49fbd11e40156c670dc01..81f645e0666dfb22e075953c3d0a1a531909f1a0 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -5,8 +5,6 @@ fn main() { JunoCompiler::new() .file_in_src("matmul.jn") .unwrap() - .schedule_in_src("matmul.sch") - .unwrap() .build() .unwrap(); } diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs index 6d1867acd13be82b817e7363626acbbd52aa5343..6ec3dae763672075b5410f1b0350c56504f36068 100644 --- a/juno_samples/matmul/src/main.rs +++ b/juno_samples/matmul/src/main.rs @@ -1,19 +1,39 @@ -#![feature(future_join)] +#![feature(future_join, box_as_ptr, let_chains)] extern crate async_std; extern crate juno_build; -extern crate hercules_rt; + +use core::ptr::copy_nonoverlapping; juno_build::juno!("matmul"); fn main() { async_std::task::block_on(async { - let mut a = vec![1.0, 2.0, 3.0, 4.0]; - let mut b = vec![5.0, 6.0, 7.0, 8.0]; - let mut c = vec![0.0, 0.0, 0.0, 0.0]; + let a: Box<[f32]> = Box::new([1.0, 2.0, 3.0, 4.0]); + let b: Box<[f32]> = Box::new([5.0, 6.0, 7.0, 8.0]); + let mut a_bytes: Box<[u8]> = Box::new([0; 16]); + let mut b_bytes: Box<[u8]> = Box::new([0; 16]); + unsafe { + copy_nonoverlapping( + Box::as_ptr(&a) as *const u8, + Box::as_mut_ptr(&mut a_bytes) as *mut u8, + 16, + ); + copy_nonoverlapping( + Box::as_ptr(&b) as *const u8, + Box::as_mut_ptr(&mut b_bytes) as *mut u8, + 16, + ); + }; + let c_bytes = matmul(2, 2, 2, a_bytes, b_bytes).await; + let mut c: Box<[f32]> = Box::new([0.0; 4]); unsafe { - matmul(a.as_mut_ptr(), b.as_mut_ptr(), c.as_mut_ptr(), 2, 2, 2).await; - } + copy_nonoverlapping( + Box::as_ptr(&c_bytes) as *const u8, + Box::as_mut_ptr(&mut c) as *mut u8, + 16, + ); + }; println!("[[{}, {}], [{}, {}]]", c[0], c[1], c[2], c[3]); assert_eq!(c[0], 19.0); assert_eq!(c[1], 22.0); @@ -26,3 +46,4 @@ fn main() { fn matmul_test() { main(); } + diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch deleted file mode 100644 index 847a91214a88c8523771394506d25eb55c3f675d..0000000000000000000000000000000000000000 --- a/juno_samples/matmul/src/matmul.sch +++ /dev/null @@ -1,7 +0,0 @@ -function matmul { - partition { @outer, @middle, @inner } on cpu //gpu - partition @exit on cpu - - parallelize @outer - vectorize @inner -} diff --git a/juno_samples/nested_ccp/Cargo.toml b/juno_samples/nested_ccp/Cargo.toml index 
8c9b969d23019b8bbd3bf28b3506e2e497ae8ec7..7ffc13f21b155dbe6028d808be97aaf0e5ffb8d6 100644 --- a/juno_samples/nested_ccp/Cargo.toml +++ b/juno_samples/nested_ccp/Cargo.toml @@ -13,6 +13,5 @@ juno_build = { path = "../../juno_build" } [dependencies] juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs index f866112f34914add4f9598d217bf6589fd033dba..80f92c0b9f9e600a157fa22784843438a7445aae 100644 --- a/juno_samples/nested_ccp/src/main.rs +++ b/juno_samples/nested_ccp/src/main.rs @@ -1,17 +1,24 @@ -#![feature(future_join)] +#![feature(box_as_ptr, let_chains)] extern crate async_std; extern crate juno_build; -extern crate hercules_rt; + +use core::ptr::copy_nonoverlapping; juno_build::juno!("nested_ccp"); fn main() { async_std::task::block_on(async { - let mut a = vec![17.0, 18.0, 19.0]; - let output = unsafe { - ccp_example(a.as_mut_ptr()).await + let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); + let mut a_bytes: Box<[u8]> = Box::new([0; 12]); + unsafe { + copy_nonoverlapping( + Box::as_ptr(&a) as *const u8, + Box::as_mut_ptr(&mut a_bytes) as *mut u8, + 12, + ); }; + let output = ccp_example(a_bytes).await; println!("{}", output); assert_eq!(output, 1.0); }); diff --git a/juno_samples/simple3/Cargo.toml b/juno_samples/simple3/Cargo.toml index 8060c5b3472ad898cb48e011332a852cd7b6705e..201c8d3782d4b41d7bfef5b7df4b5b29758e6e00 100644 --- a/juno_samples/simple3/Cargo.toml +++ b/juno_samples/simple3/Cargo.toml @@ -13,6 +13,5 @@ juno_build = { path = "../../juno_build" } [dependencies] juno_build = { path = "../../juno_build" } -hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs index 38b198b0ef4a6ee8a3aac4a7da1ff32653981f22..0e476e8d41c7880741a3f474f341a0decf0bda4b 100644 --- a/juno_samples/simple3/build.rs +++ b/juno_samples/simple3/build.rs @@ -5,8 +5,6 @@ fn main() { JunoCompiler::new() .file_in_src("simple3.jn") .unwrap() - .schedule_in_src("simple3.sch") - .unwrap() .build() .unwrap(); } diff --git a/juno_samples/simple3/src/main.rs b/juno_samples/simple3/src/main.rs index 71e2766701130fd66e3c09a34493adc11ddb842f..89be5527a2e4d1e5842778818aa934db98fcdf09 100644 --- a/juno_samples/simple3/src/main.rs +++ b/juno_samples/simple3/src/main.rs @@ -1,16 +1,31 @@ -#![feature(future_join)] +#![feature(box_as_ptr, let_chains)] extern crate async_std; -extern crate hercules_rt; extern crate juno_build; +use core::ptr::copy_nonoverlapping; + juno_build::juno!("simple3"); fn main() { async_std::task::block_on(async { - let mut a = vec![1, 2, 3, 4, 5, 6, 7, 8]; - let mut b = vec![8, 7, 6, 5, 4, 3, 2, 1]; - let c = unsafe { simple3(a.as_mut_ptr(), b.as_mut_ptr(), 8).await }; + let a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]); + let b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]); + let mut a_bytes: Box<[u8]> = Box::new([0; 32]); + let mut b_bytes: Box<[u8]> = Box::new([0; 32]); + unsafe { + copy_nonoverlapping( + Box::as_ptr(&a) as *const u8, + Box::as_mut_ptr(&mut a_bytes) as *mut u8, + 32, + ); + copy_nonoverlapping( + Box::as_ptr(&b) as *const u8, + Box::as_mut_ptr(&mut b_bytes) as *mut u8, + 32, + ); + }; + let c = simple3(8, a_bytes, b_bytes).await; println!("{}", c); assert_eq!(c, 120); }); diff --git a/juno_samples/simple3/src/simple3.sch b/juno_samples/simple3/src/simple3.sch deleted file mode 100644 index 
b3842bee68ab6f5b0aee8ccc663b93cb57753b5a..0000000000000000000000000000000000000000 --- a/juno_samples/simple3/src/simple3.sch +++ /dev/null @@ -1,6 +0,0 @@ -function simple3 { - partition @loop on cpu - partition @exit on cpu - - vectorize @loop -}
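A note on the calling convention used by the updated samples above: with hercules_rt removed, array arguments are passed to the generated async functions as Box<[u8]> byte buffers, which the samples build by hand with copy_nonoverlapping. The helpers below are a minimal, hypothetical sketch of that pattern; the names to_byte_box and from_byte_box are not part of juno_build or the samples, they simply factor out the copies shown in the dot, matmul, nested_ccp, and simple3 main.rs files.

use core::ptr::copy_nonoverlapping;

/// Copy a slice of plain-old-data values into a freshly allocated byte buffer,
/// matching the Box<[u8]> arguments the generated functions take in the samples.
fn to_byte_box<T: Copy>(src: &[T]) -> Box<[u8]> {
    let num_bytes = src.len() * core::mem::size_of::<T>();
    let mut bytes = vec![0u8; num_bytes].into_boxed_slice();
    unsafe {
        copy_nonoverlapping(src.as_ptr() as *const u8, bytes.as_mut_ptr(), num_bytes);
    }
    bytes
}

/// Reinterpret a returned byte buffer as `len` values of T; assumes the buffer
/// holds exactly that many values in native layout, as in the matmul sample.
fn from_byte_box<T: Copy + Default>(bytes: &[u8], len: usize) -> Vec<T> {
    assert_eq!(bytes.len(), len * core::mem::size_of::<T>());
    let mut out = vec![T::default(); len];
    unsafe {
        copy_nonoverlapping(bytes.as_ptr(), out.as_mut_ptr() as *mut u8, bytes.len());
    }
    out
}

With such helpers, the dot sample's body would reduce to something like
`let c = dot(8, to_byte_box(&a), to_byte_box(&b)).await;`, assuming the generated
`dot` signature takes the dynamic constant followed by the two byte buffers, as the
diff above shows.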