diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d8af3a03dd006708a95167fc8dc9f7d49381a372..b80dd590ab42b4019daa285a83ec89d978884006 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,9 +1,5 @@ -build-job: - stage: build - script: - - cargo build - test-job: stage: test script: - cargo test + - cargo test --features=cuda diff --git a/Cargo.lock b/Cargo.lock index 13cadc9592c8e1d95e21c013f465919ffcf6c5ec..ea295fdffe60316165c2de9f8ef41d3536579039 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -395,6 +395,7 @@ version = "0.1.0" dependencies = [ "async-std", "clap", + "hercules_rt", "juno_build", "rand", "with_builtin_macros", @@ -649,8 +650,13 @@ dependencies = [ "postcard", "serde", "take_mut", + "tempfile", ] +[[package]] +name = "hercules_rt" +version = "0.1.0" + [[package]] name = "hermit-abi" version = "0.4.0" @@ -702,6 +708,7 @@ name = "juno_antideps" version = "0.1.0" dependencies = [ "async-std", + "hercules_rt", "juno_build", "with_builtin_macros", ] @@ -741,11 +748,22 @@ dependencies = [ "phf", ] +[[package]] +name = "juno_implicit_clone" +version = "0.1.0" +dependencies = [ + "async-std", + "hercules_rt", + "juno_build", + "with_builtin_macros", +] + [[package]] name = "juno_matmul" version = "0.1.0" dependencies = [ "async-std", + "hercules_rt", "juno_build", "rand", "with_builtin_macros", @@ -756,6 +774,7 @@ name = "juno_nested_ccp" version = "0.1.0" dependencies = [ "async-std", + "hercules_rt", "juno_build", "with_builtin_macros", ] @@ -775,6 +794,7 @@ name = "juno_simple3" version = "0.1.0" dependencies = [ "async-std", + "hercules_rt", "juno_build", "with_builtin_macros", ] @@ -896,6 +916,7 @@ version = "0.1.0" dependencies = [ "async-std", "clap", + "hercules_rt", "juno_build", "rand", "with_builtin_macros", @@ -1372,6 +1393,19 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "tempfile" +version = "3.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" +dependencies = [ + "cfg-if", + "fastrand", + "once_cell", + "rustix", + "windows-sys 0.59.0", +] + [[package]] name = "time" version = "0.3.36" diff --git a/Cargo.toml b/Cargo.toml index 1db806f41befe6b20d89b8484e069a4674276d6d..86307fd85b5fc707ce4bd9db7f645d5fea8c5dbb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "hercules_cg", "hercules_ir", "hercules_opt", + "hercules_rt", "hercules_tools/hercules_driver", @@ -25,5 +26,5 @@ members = [ "juno_samples/casts_and_intrinsics", "juno_samples/nested_ccp", "juno_samples/antideps", - #"juno_samples/implicit_clone", + "juno_samples/implicit_clone", ] diff --git a/hercules_cg/Cargo.toml b/hercules_cg/Cargo.toml index 8a60956b356465632225e5dfb477166149c44648..cf0767de04c81e35974a7aca89e9085b543e1a9d 100644 --- a/hercules_cg/Cargo.toml +++ b/hercules_cg/Cargo.toml @@ -2,6 +2,7 @@ name = "hercules_cg" version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] +edition = "2021" [dependencies] rand = "*" diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs index a09eacfa621876a570d37bd96e1d1f534e94fa4e..3f600fc0b6e729711be8632442e2b756b40e26a1 100644 --- a/hercules_cg/src/cpu.rs +++ b/hercules_cg/src/cpu.rs @@ -1,11 +1,9 @@ -extern crate hercules_ir; - use std::collections::BTreeMap; use std::fmt::{Error, Write}; use std::iter::zip; use std::sync::atomic::{AtomicUsize, Ordering}; -use self::hercules_ir::*; +use hercules_ir::*; use crate::*; @@ -21,10 +19,8 @@ pub fn cpu_codegen<W: Write>( types: &Vec<Type>, constants: &Vec<Constant>, dynamic_constants: &Vec<DynamicConstant>, - reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, - antideps: &Vec<(NodeID, NodeID)>, bbs: &BasicBlocks, w: &mut W, ) -> Result<(), Error> { @@ -33,10 +29,8 @@ pub fn cpu_codegen<W: Write>( types, constants, dynamic_constants, - reverse_postorder, typing, control_subgraph, - antideps, bbs, }; ctx.codegen_function(w) @@ -47,10 +41,8 @@ struct CPUContext<'a> { types: &'a Vec<Type>, constants: &'a Vec<Constant>, dynamic_constants: &'a Vec<DynamicConstant>, - reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, - antideps: &'a Vec<(NodeID, NodeID)>, bbs: &'a BasicBlocks, } @@ -275,7 +267,7 @@ impl<'a> CPUContext<'a> { write!(body, "double {} to double", val)? } } - _ => panic!("PANIC: Can't dynamically allocate memory for an aggregate type within a CPU function."), + _ => panic!("PANIC: Can't dynamically allocate memory for an aggregate type within a CPU function ({:?} in {}).", id, self.function.name), } } Node::DynamicConstant { id: dc_id } => { @@ -524,12 +516,6 @@ impl<'a> CPUContext<'a> { self.get_value(data, true), index_ptr_name )?; - write!( - body, - " {} = bitcast {} to ptr\n", - self.get_value(id, false), - self.get_value(collect, true) - )?; } else { // If the data item being written is not a primitive type, // then perform a memcpy from the data collection to the @@ -543,6 +529,12 @@ impl<'a> CPUContext<'a> { data_size )?; } + write!( + body, + " {} = bitcast {} to ptr\n", + self.get_value(id, false), + self.get_value(collect, true) + )?; } _ => panic!("PANIC: Can't lower {:?}.", self.function.nodes[id.idx()]), } @@ -732,7 +724,7 @@ impl<'a> CPUContext<'a> { // the dynamic constant bounds. let mut acc_size = self.codegen_type_size(elem, body)?; for dc in bounds { - acc_size = Self::multiply(&acc_size, &format!("dc{}", dc.idx()), body)?; + acc_size = Self::multiply(&acc_size, &format!("%dc{}", dc.idx()), body)?; } Ok(acc_size) } diff --git a/hercules_cg/src/device.rs b/hercules_cg/src/device.rs index 50b7dda48108c9b50774f54da23162cb9125c5e0..09a5bc2689ecf3ed1a7186f2ada52758c7b19fbb 100644 --- a/hercules_cg/src/device.rs +++ b/hercules_cg/src/device.rs @@ -1,6 +1,4 @@ -extern crate hercules_ir; - -use self::hercules_ir::*; +use hercules_ir::*; /* * Top level function to definitively place functions onto devices. A function diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs index 2c1d3fc06980b1b9af608452a89d31a8150430b4..e41f0205433b52fd974e449f55d4cd8ea9ca7bcf 100644 --- a/hercules_cg/src/lib.rs +++ b/hercules_cg/src/lib.rs @@ -10,9 +10,19 @@ pub use crate::gpu::*; pub use crate::device::*; pub use crate::rt::*; -extern crate hercules_ir; +use hercules_ir::*; -use self::hercules_ir::*; +/* + * Basic block info consists of two things: + * + * 1. A map from node to block (named by control nodes). + * 2. For each node, which nodes are in its own block. + * + * Note that for #2, the structure is Vec<NodeID>, meaning the nodes are ordered + * inside the block. This order corresponds to the traversal order of the nodes + * in the block needed by the backend code generators. + */ +pub type BasicBlocks = (Vec<NodeID>, Vec<Vec<NodeID>>); /* * The alignment of a type does not depend on dynamic constants. diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 890c898d75841c85756f3b6e2dd0011678e7789b..13370c4580a9365f0a9013b573e558133656d407 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -1,10 +1,8 @@ -extern crate hercules_ir; - use std::collections::BTreeMap; use std::fmt::{Error, Write}; use std::iter::zip; -use self::hercules_ir::*; +use hercules_ir::*; use crate::*; @@ -16,10 +14,8 @@ use crate::*; pub fn rt_codegen<W: Write>( func_id: FunctionID, module: &Module, - reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>, control_subgraph: &Subgraph, - antideps: &Vec<(NodeID, NodeID)>, bbs: &BasicBlocks, collection_objects: &CollectionObjects, callgraph: &CallGraph, @@ -29,10 +25,8 @@ pub fn rt_codegen<W: Write>( let ctx = RTContext { func_id, module, - reverse_postorder, typing, control_subgraph, - antideps, bbs, collection_objects, callgraph, @@ -44,10 +38,8 @@ pub fn rt_codegen<W: Write>( struct RTContext<'a> { func_id: FunctionID, module: &'a Module, - reverse_postorder: &'a Vec<NodeID>, typing: &'a Vec<TypeID>, control_subgraph: &'a Subgraph, - antideps: &'a Vec<(NodeID, NodeID)>, bbs: &'a BasicBlocks, collection_objects: &'a CollectionObjects, callgraph: &'a CallGraph, @@ -61,7 +53,7 @@ impl<'a> RTContext<'a> { // Dump the function signature. write!( w, - "#[allow(unused_variables,unused_mut)]\nasync fn {}(", + "#[allow(unused_variables,unused_mut,unused_parens)]\nasync fn {}<'a>(", func.name )?; let mut first_param = true; @@ -84,75 +76,29 @@ impl<'a> RTContext<'a> { if !self.module.types[func.param_types[idx].idx()].is_primitive() { write!(w, "mut ")?; } - write!( - w, - "p_i{}: {}", - idx, - self.get_type_interface(func.param_types[idx]) - )?; - } - write!(w, ") -> {} {{\n", self.get_type_interface(func.return_type))?; - - // Copy the "interface" parameters to "non-interface" parameters. - // The purpose of this is to convert collection objects from a Box<[u8]> - // type to a *mut u8 type. This name copying is done so that we can - // easily construct objects just after this by moving the "inferface" - // parameters. - for (idx, ty) in func.param_types.iter().enumerate() { - if self.module.types[ty.idx()].is_primitive() { - write!(w, " let p{} = p_i{};\n", idx, idx)?; - } else { - write!( - w, - " let p{} = ::std::boxed::Box::as_mut_ptr(&mut p_i{}) as *mut u8;\n", - idx, idx - )?; - } + write!(w, "p{}: {}", idx, self.get_type(func.param_types[idx]))?; } + write!(w, ") -> {} {{\n", self.get_type(func.return_type))?; - // Collect the boxes representing ownership over collection objects for - // this function. The actual emitted computation is done entirely using - // pointers, so these get emitted to hold onto ownership over the - // underlying memory and to automatically clean them up when this - // function returns. Collection objects are inside Options, since their - // ownership may get passed to other called RT functions. If this - // function returns a collection object, then at the very end, right - // before the return, the to-be-returned pointer is compared against the - // owned collection objects - it should match exactly one of those - // objects, and that box is what's actually returned. - let mem_obj_ty = "::core::option::Option<::std::boxed::Box<[u8]>>"; + // Allocate collection constants. for object in self.collection_objects[&self.func_id].iter_objects() { - match self.collection_objects[&self.func_id].origin(object) { - CollectionObjectOrigin::Parameter(index) => write!( - w, - " let mut obj{}: {} = Some(p_i{});\n", - object.idx(), - mem_obj_ty, - index - )?, - CollectionObjectOrigin::Constant(id) => { - let size = self.codegen_type_size(self.typing[id.idx()]); - write!( - w, - " let mut obj{}: {} = Some((0..{}).map(|_| 0u8).collect());\n", - object.idx(), - mem_obj_ty, - size - )? - } - CollectionObjectOrigin::Call(_) | CollectionObjectOrigin::Undef(_) => write!( + if let CollectionObjectOrigin::Constant(id) = + self.collection_objects[&self.func_id].origin(object) + { + let size = self.codegen_type_size(self.typing[id.idx()]); + write!( w, - " let mut obj{}: {} = None;\n", + " let mut obj{}: ::hercules_rt::HerculesBox = unsafe {{ ::hercules_rt::HerculesBox::__zeros({}) }};\n", object.idx(), - mem_obj_ty, - )?, + size + )? } } - // Dump signatures for called CPU functions. + // Dump signatures for called device functions. write!(w, " extern \"C\" {{\n")?; for callee in self.callgraph.get_callees(self.func_id) { - if self.devices[callee.idx()] != Device::LLVM { + if self.devices[callee.idx()] == Device::AsyncRust { continue; } let callee = &self.module.functions[callee.idx()]; @@ -172,9 +118,9 @@ impl<'a> RTContext<'a> { } else { write!(w, ", ")?; } - write!(w, "p{}: {}", idx, self.get_type(*ty))?; + write!(w, "p{}: {}", idx, self.device_get_type(*ty))?; } - write!(w, ") -> {};\n", self.get_type(callee.return_type))?; + write!(w, ") -> {};\n", self.device_get_type(callee.return_type))?; } write!(w, " }}\n")?; @@ -193,7 +139,7 @@ impl<'a> RTContext<'a> { } else if self.module.types[self.typing[idx].idx()].is_float() { "0.0" } else { - "::core::ptr::null::<u8>() as _" + "unsafe { ::hercules_rt::HerculesBox::__null() }" } )?; } @@ -284,20 +230,7 @@ impl<'a> RTContext<'a> { } Node::Return { control: _, data } => { let block = &mut blocks.get_mut(&id).unwrap(); - let objects = self.collection_objects[&self.func_id].objects(data); - if objects.is_empty() { - write!(block, " return {};\n", self.get_value(data))? - } else { - // If the value to return is a collection object, figure out - // which object it actually is at runtime and return that - // box. - for object in objects { - write!(block, " if let Some(mut obj) = obj{} && ::std::boxed::Box::as_mut_ptr(&mut obj) as *mut u8 == {} {{\n", object.idx(), self.get_value(data))?; - write!(block, " return obj;\n")?; - write!(block, " }}\n")?; - } - write!(block, " panic!(\"HERCULES PANIC: Pointer to be returned doesn't match any known collection objects.\");\n")? - } + write!(block, " return {};\n", self.get_value(data))? } _ => panic!("PANIC: Can't lower {:?}.", func.nodes[id.idx()]), } @@ -316,12 +249,21 @@ impl<'a> RTContext<'a> { match func.nodes[id.idx()] { Node::Parameter { index } => { let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap(); - write!( - block, - " {} = p{};\n", - self.get_value(id), - index - )? + if self.module.types[self.typing[id.idx()].idx()].is_primitive() { + write!( + block, + " {} = p{};\n", + self.get_value(id), + index + )? + } else { + write!( + block, + " {} = unsafe {{ p{}.__take() }};\n", + self.get_value(id), + index + )? + } } Node::Constant { id: cons_id } => { let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap(); @@ -342,11 +284,7 @@ impl<'a> RTContext<'a> { let objects = self.collection_objects[&self.func_id].objects(id); assert_eq!(objects.len(), 1); let object = objects[0]; - write!( - block, - "::std::boxed::Box::as_mut_ptr(obj{}.as_mut().unwrap()) as *mut u8", - object.idx() - )? + write!(block, "unsafe {{ obj{}.__take() }}", object.idx())? } } write!(block, ";\n")? @@ -357,86 +295,100 @@ impl<'a> RTContext<'a> { ref dynamic_constants, ref args, } => { - match self.devices[callee_id.idx()] { - Device::LLVM => { + let device = self.devices[callee_id.idx()]; + match device { + // The device backends ensure that device functions have the + // same C interface. + Device::LLVM | Device::CUDA => { let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap(); + + let device = match device { + Device::LLVM => "cpu", + Device::CUDA => "cuda", + _ => panic!(), + }; + + // First, get the raw pointers to collections that the + // device function takes as input. + let callee_objs = &self.collection_objects[&callee_id]; + for (idx, arg) in args.into_iter().enumerate() { + if let Some(obj) = callee_objs.param_to_object(idx) { + // Extract a raw pointer from the HerculesBox. + if callee_objs.is_mutated(obj) { + write!( + block, + " let arg_tmp{} = unsafe {{ {}.__{}_ptr_mut() }};\n", + idx, + self.get_value(*arg), + device + )?; + } else { + write!( + block, + " let arg_tmp{} = unsafe {{ {}.__{}_ptr() }};\n", + idx, + self.get_value(*arg), + device + )?; + } + } else { + write!( + block, + " let arg_tmp{} = {};\n", + idx, + self.get_value(*arg) + )?; + } + } + + // Emit the call. write!( block, - " {} = unsafe {{ {}(", - self.get_value(id), + " let call_tmp = unsafe {{ {}(", self.module.functions[callee_id.idx()].name )?; for dc in dynamic_constants { self.codegen_dynamic_constant(*dc, block)?; write!(block, ", ")?; } - for arg in args { - write!(block, "{}, ", self.get_value(*arg))?; + for idx in 0..args.len() { + write!(block, "arg_tmp{}, ", idx)?; } write!(block, ") }};\n")?; - // When a CPU function is called that returns a + // When a device function is called that returns a // collection object, that object must have come from // one of its parameters. Dynamically figure out which // one it came from, so that we can move it to the slot // of the output object. - let call_objects = self.collection_objects[&self.func_id].objects(id); - if !call_objects.is_empty() { - assert_eq!(call_objects.len(), 1); - let call_object = call_objects[0]; - - let callee_returned_objects = - self.collection_objects[&callee_id].returned_objects(); - let possible_params: Vec<_> = - (0..self.module.functions[callee_id.idx()].param_types.len()) - .filter(|idx| { - let object_of_param = self.collection_objects[&callee_id] - .param_to_object(*idx); - // Look at parameters that could be the - // source of the memory object returned - // by the function. - object_of_param - .map(|object_of_param| { - callee_returned_objects.contains(&object_of_param) - }) - .unwrap_or(false) - }) - .collect(); - let arg_objects = args - .into_iter() - .enumerate() - .filter(|(idx, _)| possible_params.contains(idx)) - .map(|(_, arg)| { - self.collection_objects[&self.func_id] - .objects(*arg) - .into_iter() - }) - .flatten(); - - // Dynamically check which of the memory objects - // corresponding to arguments to the call was - // returned by the call. Move that memory object - // into the memory object of the call. - let mut first_obj = true; - for arg_object in arg_objects { - write!(block, " ")?; - if first_obj { - first_obj = false; - } else { - write!(block, "else ")?; + let caller_objects = self.collection_objects[&self.func_id].objects(id); + if !caller_objects.is_empty() { + for (idx, arg) in args.into_iter().enumerate() { + if idx != 0 { + write!(block, " else\n")?; } - write!(block, "if let Some(obj) = obj{}.as_mut() && ::std::boxed::Box::as_mut_ptr(obj) as *mut u8 == {} {{\n", arg_object.idx(), self.get_value(id))?; write!( block, - " obj{} = obj{}.take();\n", - call_object.idx(), - arg_object.idx() + " if call_tmp == arg_tmp{} {{\n", + idx )?; - write!(block, " }}\n")?; + write!( + block, + " {} = unsafe {{ {}.__take() }};\n", + self.get_value(id), + self.get_value(*arg) + )?; + write!(block, " }}")?; } write!(block, " else {{\n")?; - write!(block, " panic!(\"HERCULES PANIC: Pointer returned from called function doesn't match any known collection objects.\");\n")?; + write!(block, " panic!(\"HERCULES PANIC: Pointer returned from device function doesn't match an argument pointer.\");\n")?; write!(block, " }}\n")?; + } else { + write!( + block, + " {} = call_tmp;\n", + self.get_value(id) + )?; } } Device::AsyncRust => { @@ -455,12 +407,11 @@ impl<'a> RTContext<'a> { if self.module.types[self.typing[arg.idx()].idx()].is_primitive() { write!(block, "{}, ", self.get_value(*arg))?; } else { - write!(block, "{}.take(), ", self.get_value(*arg))?; + write!(block, "unsafe {{ {}.__take() }}, ", self.get_value(*arg))?; } } write!(block, ").await;\n")?; } - _ => todo!(), } } _ => panic!( @@ -606,8 +557,8 @@ impl<'a> RTContext<'a> { convert_type(&self.module.types[id.idx()]) } - fn get_type_interface(&self, id: TypeID) -> &'static str { - convert_type_interface(&self.module.types[id.idx()]) + fn device_get_type(&self, id: TypeID) -> &'static str { + device_convert_type(&self.module.types[id.idx()]) } } @@ -624,18 +575,27 @@ fn convert_type(ty: &Type) -> &'static str { Type::UnsignedInteger64 => "u64", Type::Float32 => "f32", Type::Float64 => "f64", - Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "*mut u8", + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => { + "::hercules_rt::HerculesBox<'a>" + } _ => panic!(), } } -/* - * Collection types are passed to / returned from runtime functions through a - * wrapper type for ownership tracking reasons. - */ -fn convert_type_interface(ty: &Type) -> &'static str { +fn device_convert_type(ty: &Type) -> &'static str { match ty { - Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "Box<[u8]>", - _ => convert_type(ty), + Type::Boolean => "bool", + Type::Integer8 => "i8", + Type::Integer16 => "i16", + Type::Integer32 => "i32", + Type::Integer64 => "i64", + Type::UnsignedInteger8 => "u8", + Type::UnsignedInteger16 => "u16", + Type::UnsignedInteger32 => "u32", + Type::UnsignedInteger64 => "u64", + Type::Float32 => "f32", + Type::Float64 => "f64", + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "*mut u8", + _ => panic!(), } } diff --git a/hercules_ir/Cargo.toml b/hercules_ir/Cargo.toml index 44648c11baaa5539f6598270f4423b591f4422e2..deda9cc58758f6cc834aadcc8e4ec66625fefb4b 100644 --- a/hercules_ir/Cargo.toml +++ b/hercules_ir/Cargo.toml @@ -2,10 +2,11 @@ name = "hercules_ir" version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@illinois.edu>"] +edition = "2021" [dependencies] rand = "*" nom = "*" ordered-float = { version = "*", features = ["serde"] } bitvec = "*" -serde = { version = "*", features = ["derive"] } \ No newline at end of file +serde = { version = "*", features = ["derive"] } diff --git a/hercules_ir/src/antideps.rs b/hercules_ir/src/antideps.rs deleted file mode 100644 index af949708dd2bc278410ecedd632a04313fa30415..0000000000000000000000000000000000000000 --- a/hercules_ir/src/antideps.rs +++ /dev/null @@ -1,297 +0,0 @@ -use std::collections::{BTreeMap, BTreeSet}; -use std::iter::zip; - -use crate::*; - -/* - * In addition to collections, we need to figure out which "generation" of a - * collection a node may take as input. - */ -#[derive(PartialEq, Eq, Clone, Debug)] -struct GenerationLattice { - objs: BTreeSet<(CollectionObjectID, NodeID)>, -} - -impl Semilattice for GenerationLattice { - fn meet(a: &Self, b: &Self) -> Self { - GenerationLattice { - objs: a.objs.union(&b.objs).map(|x| *x).collect(), - } - } - - fn top() -> Self { - GenerationLattice { - objs: BTreeSet::new(), - } - } - - fn bottom() -> Self { - // Bottom is not representable for this lattice with our Semilattice - // interface, but we never need to construct it. - panic!() - } -} - -/* - * Function to assemble anti-dependence edges. Returns a list of pairs of nodes. - * The first item in the pair is the reading node, and the second item is the - * mutating node. - */ -pub fn antideps( - function: &Function, - reverse_postorder: &Vec<NodeID>, - objects: &FunctionCollectionObjects, -) -> Vec<(NodeID, NodeID)> { - // First, we analyze "generations" of collections as they are mutated. - // Originating, mutating, phi, and reduce nodes start a new generation of a - // collection. Generations are not ordered due to loops, but are rather just - // node IDs of node (parameter, constant, call, undef, write, phi, reduce). - // Other nodes operating on collections mean reads / writes can operate on - // potentially different generations of multiple collections (select). - let lattice = forward_dataflow(function, reverse_postorder, |inputs, id| { - match function.nodes[id.idx()] { - Node::Ternary { - op: TernaryOperator::Select, - first: _, - second: _, - third: _, - } => inputs - .into_iter() - .fold(GenerationLattice::top(), |acc, input| { - GenerationLattice::meet(&acc, input) - }), - Node::Parameter { index: _ } | Node::Constant { id: _ } | Node::Undef { ty: _ } => { - let objs = objects.objects(id); - GenerationLattice { - objs: objs.into_iter().map(|obj| (*obj, id)).collect(), - } - } - Node::Call { - control: _, - function: _, - dynamic_constants: _, - ref args, - } => { - let mut objs = BTreeSet::new(); - let call_objs = objects.objects(id); - - // If this call node might originate an object, add that to the - // lattice output - its generation is this call node. - for obj in call_objs { - if objects.origin(*obj) == CollectionObjectOrigin::Call(id) { - assert!(objs.len() <= 1); - objs.insert((*obj, id)); - } - } - - // For every argument... - for (arg, arg_gens) in zip(args, inputs.into_iter().skip(1)) { - // Look at its objects... - for arg_obj in objects.objects(*arg) { - // For each object that might be returned... - if call_objs.contains(&arg_obj) { - let mutable = objects.mutators(*arg_obj).contains(&id); - for (obj, gen) in arg_gens.objs.iter() { - // Add that object to the output lattice. - if obj == arg_obj && mutable { - // Set the generation to this node if the - // object might be mutated. - objs.insert((*obj, id)); - } else if obj == arg_obj { - // Otherwise, keep the old generation. - objs.insert((*obj, *gen)); - } - } - } - } - } - GenerationLattice { objs } - } - Node::Read { - collect: _, - indices: _, - } => inputs[0].clone(), - Node::Phi { - control: _, - data: _, - } - | Node::Reduce { - control: _, - init: _, - reduct: _, - } - | Node::Write { - collect: _, - data: _, - indices: _, - } => { - // Phis, reduces, and writes update the generation to the write. - let objs = inputs[0].objs.iter().map(|(obj, _)| (*obj, id)).collect(); - GenerationLattice { objs } - } - _ => GenerationLattice::top(), - } - }); - - // Second, we generate anti-dependence edges from the dataflow analysis. - // There are four cases where an anti-dependence edge is generated: - // - // 1. A read node and a write node share an object and generation pair on - // their `collect` input. - // 2. A read node and a call node share an object and generation pair, where - // the pair is on the read's `collect` input and the pair is on any input - // of the call node AND the call node is a mutator of the object. - // 3. A call node and a write node share an object and generation pair, - // where the pair is on any input of the call node and the pair is on the - // write's `collect` input. - // 4. A call node and another call node share an object and generation pair, - // where the pair is on any input of both call nodes AND the second call - // node is a mutator of the object. - let mut reads_writes_calls_mut_calls_per_pair: BTreeMap< - (CollectionObjectID, NodeID), - (Vec<NodeID>, Vec<NodeID>, Vec<NodeID>, Vec<NodeID>), - > = BTreeMap::new(); - for (idx, node) in function.nodes.iter().enumerate() { - let id = NodeID::new(idx); - match node { - Node::Read { - collect, - indices: _, - } => { - for pair in lattice[collect.idx()].objs.iter() { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .0 - .push(id); - } - } - Node::Write { - collect, - data, - indices: _, - } => { - for pair in lattice[collect.idx()].objs.iter() { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .1 - .push(id); - } - - // When a write takes a collection on its `data` input, it - // memcpys that collection into the mutated collection. This is - // a read. - if !objects.objects(*data).is_empty() { - for pair in lattice[collect.idx()].objs.iter() { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .0 - .push(id); - } - } - } - Node::Call { - control: _, - function: _, - dynamic_constants: _, - ref args, - } => { - for arg in args { - for pair in lattice[arg.idx()].objs.iter() { - if objects.mutators(pair.0).contains(&id) { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .3 - .push(id); - } else { - reads_writes_calls_mut_calls_per_pair - .entry(*pair) - .or_default() - .2 - .push(id); - } - } - } - } - _ => {} - } - } - - // Once we've grouped reads / writes / calls by pairs, we create pair-wise - // anti-dependence edges. Due to loops, a write may technically anti-depend - // on a read where the read depends on the write, but we don't want to - // generate that anti-dependence edge, since it'll create a cycle during - // backend code generation. Thus, if the mutator in an anti-dependence is - // the same as the generation of the current pair, don't generate the edge. - let mut antideps = vec![]; - for ((_, gen), (reads, writes, calls, mut_calls)) in reads_writes_calls_mut_calls_per_pair { - // Case 1: - for read in reads.iter() { - for write in writes.iter() { - if *write != gen && *read != *write { - antideps.push((*read, *write)); - } - } - } - - // Case 2: - for read in reads.iter() { - for mut_call in mut_calls.iter() { - if *mut_call != gen && *read != *mut_call { - antideps.push((*read, *mut_call)); - } - } - } - - // Case 3: - for call in calls.iter().chain(mut_calls.iter()) { - for write in writes.iter() { - if *write != gen && *call != *write { - antideps.push((*call, *write)); - } - } - } - - // Case 4: - for call in calls.iter().chain(mut_calls.iter()) { - for mut_call in mut_calls.iter() { - if *mut_call != gen && *call != *mut_call { - antideps.push((*call, *mut_call)); - } - } - } - } - - antideps -} - -/* - * Utility to make a map from node to anti-dependency uses (map mutator -> - * reads). - */ -pub fn flip_antideps(antideps: &Vec<(NodeID, NodeID)>) -> BTreeMap<NodeID, Vec<NodeID>> { - let mut result: BTreeMap<NodeID, Vec<NodeID>> = BTreeMap::new(); - - for (read, mutator) in antideps { - result.entry(*mutator).or_default().push(*read); - } - - result -} - -/* - * Utility to make a map from node to anti-dependency users (map reads -> - * mutators). - */ -pub fn map_antideps(antideps: &Vec<(NodeID, NodeID)>) -> BTreeMap<NodeID, Vec<NodeID>> { - let mut result: BTreeMap<NodeID, Vec<NodeID>> = BTreeMap::new(); - - for (read, mutator) in antideps { - result.entry(*read).or_default().push(*mutator); - } - - result -} diff --git a/hercules_ir/src/collections.rs b/hercules_ir/src/collections.rs index 23d84b1b6629f0a477217d0657085d234bf6cfe1..8bb1b359fdbf27c44baaec5ac129419abb066331 100644 --- a/hercules_ir/src/collections.rs +++ b/hercules_ir/src/collections.rs @@ -285,8 +285,8 @@ pub fn collection_objects( Node::Read { collect: _, indices: _, - } - | Node::Write { + } if !module.types[typing[id.idx()].idx()].is_primitive() => inputs[0].clone(), + Node::Write { collect: _, data: _, indices: _, diff --git a/hercules_ir/src/dataflow.rs b/hercules_ir/src/dataflow.rs index 6df19d1408e06cc9e18c2f3f52e4eb5f5f618315..ced77d0ba4b107ce740fd3bafb99645f17747f96 100644 --- a/hercules_ir/src/dataflow.rs +++ b/hercules_ir/src/dataflow.rs @@ -1,7 +1,5 @@ -extern crate bitvec; - -use self::bitvec::prelude::*; -use self::bitvec::slice::*; +use bitvec::prelude::*; +use bitvec::slice::*; use crate::*; diff --git a/hercules_ir/src/dom.rs b/hercules_ir/src/dom.rs index f9046b434dc707224f8fbbddb6fec766eaf0b1b9..2c0f085df7f3057380a8a9364ad1acc70a71f640 100644 --- a/hercules_ir/src/dom.rs +++ b/hercules_ir/src/dom.rs @@ -77,6 +77,26 @@ impl DomTree { .1 } + /* + * Find the node with the shallowest level in the dom tree amongst the nodes + * given. + */ + pub fn shallowest_amongst<I>(&self, x: I) -> NodeID + where + I: Iterator<Item = NodeID>, + { + x.map(|x| { + if x == self.root { + (0, x) + } else { + (self.idom[&x].0, x) + } + }) + .min_by(|x, y| x.0.cmp(&y.0)) + .unwrap() + .1 + } + /* * Find the least common ancestor in the tree of two nodes. This is an * ancestor of the two nodes that is as far down the tree as possible. diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs index 5ef16bb1c61846a36b78e308767838c03fb8ede0..4d526366808e8b2aea39fecd81f6c00269ffb154 100644 --- a/hercules_ir/src/dot.rs +++ b/hercules_ir/src/dot.rs @@ -1,5 +1,3 @@ -extern crate rand; - use std::collections::HashMap; use std::env::temp_dir; use std::fmt::Write; @@ -7,7 +5,7 @@ use std::fs::File; use std::io::Write as _; use std::process::Command; -use self::rand::Rng; +use rand::Rng; use crate::*; @@ -20,7 +18,6 @@ pub fn xdot_module( reverse_postorders: &Vec<Vec<NodeID>>, doms: Option<&Vec<DomTree>>, fork_join_maps: Option<&Vec<HashMap<NodeID, NodeID>>>, - bbs: Option<&Vec<BasicBlocks>>, ) { let mut tmp_path = temp_dir(); let mut rng = rand::thread_rng(); @@ -33,7 +30,6 @@ pub fn xdot_module( &reverse_postorders, doms, fork_join_maps, - bbs, &mut contents, ) .expect("PANIC: Unable to generate output file contents."); @@ -55,7 +51,6 @@ pub fn write_dot<W: Write>( reverse_postorders: &Vec<Vec<NodeID>>, doms: Option<&Vec<DomTree>>, fork_join_maps: Option<&Vec<HashMap<NodeID, NodeID>>>, - bbs: Option<&Vec<BasicBlocks>>, w: &mut W, ) -> std::fmt::Result { write_digraph_header(w)?; @@ -170,28 +165,6 @@ pub fn write_dot<W: Write>( } } - // Step 4: draw BB edges in olive. - if let Some(bbs) = bbs { - let bbs = &bbs[function_id.idx()]; - for node_idx in 0..bbs.0.len() { - let maybe_data = NodeID::new(node_idx); - let control = bbs.0[node_idx]; - if maybe_data != control { - write_edge( - maybe_data, - function_id, - control, - function_id, - true, - "olivedrab4, constraint=false", - "dotted", - &module, - w, - )?; - } - } - } - write_graph_footer(w)?; } diff --git a/hercules_ir/src/fork_join_analysis.rs b/hercules_ir/src/fork_join_analysis.rs new file mode 100644 index 0000000000000000000000000000000000000000..130bc2ed49c637d28cc27cfd88c564222c680c12 --- /dev/null +++ b/hercules_ir/src/fork_join_analysis.rs @@ -0,0 +1,129 @@ +use std::collections::{HashMap, HashSet}; + +use bitvec::prelude::*; + +use crate::*; + +/* + * Top level function for creating a fork-join map. Map is from fork node ID to + * join node ID, since a join can easily determine the fork it corresponds to + * (that's the mechanism used to implement this analysis). This analysis depends + * on type information. + */ +pub fn fork_join_map(function: &Function, control: &Subgraph) -> HashMap<NodeID, NodeID> { + let mut fork_join_map = HashMap::new(); + for idx in 0..function.nodes.len() { + // We only care about join nodes. + if function.nodes[idx].is_join() { + // Iterate the control predecessors until finding a fork. Maintain a + // counter of unmatched fork-join pairs seen on the way, since fork- + // joins may be nested. Every join is dominated by their fork, so + // just iterate the first unseen predecessor of each control node. + let join_id = NodeID::new(idx); + let mut unpaired = 0; + let mut cursor = join_id; + let mut seen = HashSet::<NodeID>::new(); + let fork_id = loop { + cursor = control + .preds(cursor) + .filter(|pred| !seen.contains(pred)) + .next() + .unwrap(); + seen.insert(cursor); + + if function.nodes[cursor.idx()].is_join() { + unpaired += 1; + } else if function.nodes[cursor.idx()].is_fork() && unpaired > 0 { + unpaired -= 1; + } else if function.nodes[cursor.idx()].is_fork() { + break cursor; + } + }; + fork_join_map.insert(fork_id, join_id); + } + } + fork_join_map +} + +/* + * Find fork/join nests that each control node is inside of. Result is a map + * from each control node to a list of fork nodes. The fork nodes are listed in + * ascending order of nesting. + */ +pub fn compute_fork_join_nesting( + function: &Function, + dom: &DomTree, + fork_join_map: &HashMap<NodeID, NodeID>, +) -> HashMap<NodeID, Vec<NodeID>> { + // For each control node, ascend dominator tree, looking for fork nodes. For + // each fork node, make sure each control node isn't strictly dominated by + // the corresponding join node. + (0..function.nodes.len()) + .map(NodeID::new) + .filter(|id| dom.contains(*id)) + .map(|id| { + ( + id, + dom.ascend(id) + // Filter for forks that dominate this control node, + .filter(|id| function.nodes[id.idx()].is_fork()) + // where its corresponding join doesn't dominate the control + // node (if so, then this control is after the fork-join). + .filter(|fork_id| !dom.does_prop_dom(fork_join_map[&fork_id], id)) + .collect(), + ) + }) + .collect() +} + +/* + * Check if a data node dominates a control node. This involves checking all + * immediate control uses to see if they dominate the queried control node. + */ +pub fn does_data_dom_control( + function: &Function, + data: NodeID, + control: NodeID, + dom: &DomTree, +) -> bool { + let mut stack = vec![data]; + let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()]; + visited.set(data.idx(), true); + + while let Some(pop) = stack.pop() { + let node = &function.nodes[pop.idx()]; + + let imm_control = match node { + Node::Phi { control, data: _ } + | Node::Reduce { + control, + init: _, + reduct: _, + } + | Node::Call { + control, + function: _, + dynamic_constants: _, + args: _, + } => Some(*control), + _ if node.is_control() => Some(pop), + _ => { + for u in get_uses(node).as_ref() { + if !visited[u.idx()] { + visited.set(u.idx(), true); + stack.push(*u); + } + } + None + } + }; + + if let Some(imm_control) = imm_control + && !dom.does_dom(imm_control, control) + { + return false; + } + } + + true +} diff --git a/hercules_ir/src/gcm.rs b/hercules_ir/src/gcm.rs deleted file mode 100644 index 3718df9b00d0e262572d4602a9c9555ea9f6bb98..0000000000000000000000000000000000000000 --- a/hercules_ir/src/gcm.rs +++ /dev/null @@ -1,391 +0,0 @@ -extern crate bitvec; - -use std::collections::{HashMap, HashSet, VecDeque}; -use std::iter::{zip, FromIterator}; - -use self::bitvec::prelude::*; - -use crate::*; - -/* - * Basic block info consists of two things: - * - * 1. A map from node to block (named by control nodes). - * 2. For each node, which nodes are in its own block. - * - * Note that for #2, the structure is Vec<NodeID>, meaning the nodes are ordered - * inside the block. This order corresponds to the traversal order of the nodes - * in the block needed by the backend code generators. - */ -pub type BasicBlocks = (Vec<NodeID>, Vec<Vec<NodeID>>); - -/* - * Top level global code motion function. Assigns each data node to one of its - * immediate control use / user nodes, forming (unordered) basic blocks. Returns - * the control node / basic block each node is in. Takes in a partial - * partitioning that must be respected. Based on the schedule-early-schedule- - * late method from Cliff Click's PhD thesis. - */ -pub fn gcm( - function: &Function, - def_use: &ImmutableDefUseMap, - reverse_postorder: &Vec<NodeID>, - control_subgraph: &Subgraph, - dom: &DomTree, - antideps: &Vec<(NodeID, NodeID)>, - loops: &LoopTree, - fork_join_map: &HashMap<NodeID, NodeID>, -) -> BasicBlocks { - let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()]; - let back_edges = control_subgraph.back_edges(NodeID::new(0)); - let no_loop_reachability = - control_subgraph.pairwise_reachability(|src, dst| !back_edges.contains(&(src, dst))); - let antideps_users = map_antideps(antideps); - let antideps_uses = flip_antideps(antideps); - - // Step 1: assign the basic block locations of all nodes that must be in a - // specific block. This includes control nodes as well as some special data - // nodes, such as phis. - for idx in 0..function.nodes.len() { - match function.nodes[idx] { - Node::Phi { control, data: _ } => bbs[idx] = Some(control), - Node::ThreadID { - control, - dimension: _, - } => bbs[idx] = Some(control), - Node::Reduce { - control, - init: _, - reduct: _, - } => bbs[idx] = Some(control), - Node::Call { - control, - function: _, - dynamic_constants: _, - args: _, - } => bbs[idx] = Some(control), - Node::Parameter { index: _ } => bbs[idx] = Some(NodeID::new(0)), - Node::Constant { id: _ } => bbs[idx] = Some(NodeID::new(0)), - Node::DynamicConstant { id: _ } => bbs[idx] = Some(NodeID::new(0)), - _ if function.nodes[idx].is_control() => bbs[idx] = Some(NodeID::new(idx)), - _ => {} - } - } - - // Step 2: schedule early. Place nodes in the earliest position they could - // go - use worklist to iterate nodes. - let mut schedule_early = bbs.clone(); - let mut worklist = VecDeque::from(reverse_postorder.clone()); - while let Some(id) = worklist.pop_front() { - if schedule_early[id.idx()].is_some() { - continue; - } - - // For every use, check what block is its "schedule early" block. This - // node goes in the lowest block amongst those blocks. - let use_places: Option<Vec<NodeID>> = get_uses(&function.nodes[id.idx()]) - .as_ref() - .into_iter() - .map(|id| *id) - .map(|id| schedule_early[id.idx()]) - .collect(); - if let Some(use_places) = use_places { - // If every use has been placed, we can place this node as the - // lowest place in the domtree that dominates all of the use places. - let lowest = dom.lowest_amongst(use_places.into_iter()); - schedule_early[id.idx()] = Some(lowest); - } else { - // If not, then just push this node back on the worklist. - worklist.push_back(id); - } - } - - // Step 3: schedule late and pick each nodes final position. Since the late - // schedule of each node depends on the final positions of its users, these - // two steps must be fused. Compute their latest position, then use the - // control dependent + shallow loop heuristic to actually place them. - let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map - .into_iter() - .map(|(fork, join)| (*join, *fork)) - .collect(); - let mut worklist = VecDeque::from_iter(reverse_postorder.into_iter().map(|id| *id).rev()); - 'worklist: while let Some(id) = worklist.pop_front() { - if bbs[id.idx()].is_some() { - continue; - } - - // Calculate the least common ancestor of user blocks, a.k.a. the "late" - // schedule. - let calculate_lca = || -> Option<_> { - let mut lca = None; - // Helper to incrementally update the LCA. - let mut update_lca = |a| { - if let Some(acc) = lca { - lca = Some(dom.least_common_ancestor(acc, a)); - } else { - lca = Some(a); - } - }; - - // For every user, consider where we need to be to directly dominate the - // user. - for user in def_use.get_users(id).as_ref().into_iter().map(|id| *id) { - if let Node::Phi { control, data } = &function.nodes[user.idx()] { - // For phis, we need to dominate the block jumping to the phi in - // the slot that corresponds to our use. - for (control, data) in - zip(get_uses(&function.nodes[control.idx()]).as_ref(), data) - { - if id == *data { - update_lca(*control); - } - } - } else if let Node::Reduce { - control, - init, - reduct, - } = &function.nodes[user.idx()] - { - // For reduces, we need to either dominate the block right - // before the fork if we're the init input, or we need to - // dominate the join if we're the reduct input. - if id == *init { - let before_fork = function.nodes[join_fork_map[control].idx()] - .try_fork() - .unwrap() - .0; - update_lca(before_fork); - } else { - assert_eq!(id, *reduct); - update_lca(*control); - } - } else { - // For everything else, we just need to dominate the user. - update_lca(bbs[user.idx()]?); - } - } - - Some(lca) - }; - - // Check if all users have been placed. If one of them hasn't, then add - // this node back on to the worklist. - let Some(lca) = calculate_lca() else { - worklist.push_back(id); - continue; - }; - - // Check if all anti-dependency users have been placed. If one of them - // hasn't, then add this node back on to the worklist. We need to know - // where the anti-dependency users are, so that we can place this - // read "above" mutators that anti-depend on it. The condition for a - // potential placement location is that in the CFG *without loop back- - // edges* the mutator cannot reach the read. Ask Russel about why this - // works, hopefully I'll have a convincing argument by then ;). - let mut antidep_user_locations = vec![]; - for antidep_user in antideps_users.get(&id).unwrap_or(&vec![]) { - if let Some(location) = bbs[antidep_user.idx()] { - antidep_user_locations.push(location); - } else { - worklist.push_back(id); - continue 'worklist; - } - } - - // Look between the LCA and the schedule early location to place the - // node. - let schedule_early = schedule_early[id.idx()].unwrap(); - let mut chain = dom - // If the node has no users, then it doesn't really matter where we - // place it - just place it at the early placement. - .chain(lca.unwrap_or(schedule_early), schedule_early) - // Only allow locations that don't violate the anti-depence property - // listed above. - .filter(|location| { - !antidep_user_locations.iter().any(|antidep_user_location| { - antidep_user_location != location - && no_loop_reachability[antidep_user_location.idx()][location.idx()] - }) - }); - let mut location = chain.next().unwrap(); - while let Some(control_node) = chain.next() { - // If the next node further up the dominator tree is in a shallower - // loop nest or if we can get out of a reduce loop when we don't - // need to be in one, place this data node in a higher-up location. - let shallower_nest = if let (Some(old_nest), Some(new_nest)) = - (loops.nesting(location), loops.nesting(control_node)) - { - old_nest > new_nest - } else { - false - }; - // This will move all nodes that don't need to be in reduce loops - // outside of reduce loops. Nodes that do need to be in a reduce - // loop use the reduce node forming the loop, so the dominator chain - // will consist of one block, and this loop won't ever iterate. - let currently_at_join = function.nodes[location.idx()].is_join(); - if shallower_nest || currently_at_join { - location = control_node; - } - } - - bbs[id.idx()] = Some(location); - } - let bbs: Vec<_> = bbs.into_iter().map(Option::unwrap).collect(); - - // Step 4: determine the order of nodes inside each block. Use worklist to - // add nodes to blocks in order that obeys dependencies. - let mut order: Vec<Vec<NodeID>> = vec![vec![]; function.nodes.len()]; - let mut worklist = VecDeque::from_iter( - reverse_postorder - .into_iter() - .filter(|id| !function.nodes[id.idx()].is_control()), - ); - let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()]; - while let Some(id) = worklist.pop_front() { - let node = &function.nodes[id.idx()]; - if node.is_phi() - || node.is_reduce() - || get_uses(node) - .as_ref() - .into_iter() - .chain(antideps_uses.get(&id).into_iter().flatten()) - .all(|u| { - function.nodes[u.idx()].is_control() - || bbs[u.idx()] != bbs[id.idx()] - || visited[u.idx()] - }) - { - order[bbs[id.idx()].idx()].push(*id); - visited.set(id.idx(), true); - } else { - worklist.push_back(id); - } - } - - (bbs, order) -} - -/* - * Top level function for creating a fork-join map. Map is from fork node ID to - * join node ID, since a join can easily determine the fork it corresponds to - * (that's the mechanism used to implement this analysis). This analysis depends - * on type information. - */ -pub fn fork_join_map(function: &Function, control: &Subgraph) -> HashMap<NodeID, NodeID> { - let mut fork_join_map = HashMap::new(); - for idx in 0..function.nodes.len() { - // We only care about join nodes. - if function.nodes[idx].is_join() { - // Iterate the control predecessors until finding a fork. Maintain a - // counter of unmatched fork-join pairs seen on the way, since fork- - // joins may be nested. Every join is dominated by their fork, so - // just iterate the first unseen predecessor of each control node. - let join_id = NodeID::new(idx); - let mut unpaired = 0; - let mut cursor = join_id; - let mut seen = HashSet::<NodeID>::new(); - let fork_id = loop { - cursor = control - .preds(cursor) - .filter(|pred| !seen.contains(pred)) - .next() - .unwrap(); - seen.insert(cursor); - - if function.nodes[cursor.idx()].is_join() { - unpaired += 1; - } else if function.nodes[cursor.idx()].is_fork() && unpaired > 0 { - unpaired -= 1; - } else if function.nodes[cursor.idx()].is_fork() { - break cursor; - } - }; - fork_join_map.insert(fork_id, join_id); - } - } - fork_join_map -} - -/* - * Find fork/join nests that each control node is inside of. Result is a map - * from each control node to a list of fork nodes. The fork nodes are listed in - * ascending order of nesting. - */ -pub fn compute_fork_join_nesting( - function: &Function, - dom: &DomTree, - fork_join_map: &HashMap<NodeID, NodeID>, -) -> HashMap<NodeID, Vec<NodeID>> { - // For each control node, ascend dominator tree, looking for fork nodes. For - // each fork node, make sure each control node isn't strictly dominated by - // the corresponding join node. - (0..function.nodes.len()) - .map(NodeID::new) - .filter(|id| dom.contains(*id)) - .map(|id| { - ( - id, - dom.ascend(id) - // Filter for forks that dominate this control node, - .filter(|id| function.nodes[id.idx()].is_fork()) - // where its corresponding join doesn't dominate the control - // node (if so, then this control is after the fork-join). - .filter(|fork_id| !dom.does_prop_dom(fork_join_map[&fork_id], id)) - .collect(), - ) - }) - .collect() -} - -/* - * Check if a data node dominates a control node. This involves checking all - * immediate control uses to see if they dominate the queried control node. - */ -pub fn does_data_dom_control( - function: &Function, - data: NodeID, - control: NodeID, - dom: &DomTree, -) -> bool { - let mut stack = vec![data]; - let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()]; - visited.set(data.idx(), true); - - while let Some(pop) = stack.pop() { - let node = &function.nodes[pop.idx()]; - - let imm_control = match node { - Node::Phi { control, data: _ } - | Node::Reduce { - control, - init: _, - reduct: _, - } - | Node::Call { - control, - function: _, - dynamic_constants: _, - args: _, - } => Some(*control), - _ if node.is_control() => Some(pop), - _ => { - for u in get_uses(node).as_ref() { - if !visited[u.idx()] { - visited.set(u.idx(), true); - stack.push(*u); - } - } - None - } - }; - - if let Some(imm_control) = imm_control - && !dom.does_dom(imm_control, control) - { - return false; - } - } - - true -} diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 2faf2bb6925888c49f0c9d85d929a8e5508baa37..ba01d8bf138c62696c48c3b1d82bb0508a5b74b7 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -1,19 +1,13 @@ -extern crate bitvec; -extern crate ordered_float; -extern crate serde; - -use self::bitvec::prelude::*; -use self::serde::Deserialize; -use self::serde::Serialize; -use std::cmp::Ordering; -use std::cmp::{max, min}; -use std::collections::HashMap; -use std::convert::TryInto; -use std::fmt::{Error, Write}; +use std::fmt::Write; use std::ops::Coroutine; use std::ops::CoroutineState; use std::pin::Pin; +use bitvec::prelude::*; +use ordered_float::OrderedFloat; +use serde::Deserialize; +use serde::Serialize; + use crate::*; /* @@ -96,8 +90,8 @@ pub enum Constant { UnsignedInteger16(u16), UnsignedInteger32(u32), UnsignedInteger64(u64), - Float32(ordered_float::OrderedFloat<f32>), - Float64(ordered_float::OrderedFloat<f64>), + Float32(OrderedFloat<f32>), + Float64(OrderedFloat<f64>), Product(TypeID, Box<[ConstantID]>), Summation(TypeID, u32, ConstantID), // Array constants are always zero. @@ -949,8 +943,8 @@ impl Constant { Constant::UnsignedInteger16(0) => true, Constant::UnsignedInteger32(0) => true, Constant::UnsignedInteger64(0) => true, - Constant::Float32(ord) => *ord == ordered_float::OrderedFloat::<f32>(0.0), - Constant::Float64(ord) => *ord == ordered_float::OrderedFloat::<f64>(0.0), + Constant::Float32(ord) => *ord == OrderedFloat::<f32>(0.0), + Constant::Float64(ord) => *ord == OrderedFloat::<f64>(0.0), _ => false, } } @@ -965,8 +959,8 @@ impl Constant { Constant::UnsignedInteger16(1) => true, Constant::UnsignedInteger32(1) => true, Constant::UnsignedInteger64(1) => true, - Constant::Float32(ord) => *ord == ordered_float::OrderedFloat::<f32>(1.0), - Constant::Float64(ord) => *ord == ordered_float::OrderedFloat::<f64>(1.0), + Constant::Float32(ord) => *ord == OrderedFloat::<f32>(1.0), + Constant::Float64(ord) => *ord == OrderedFloat::<f64>(1.0), _ => false, } } @@ -1141,8 +1135,8 @@ pub fn dynamic_constant_cmp( return Ok(ranges); } - // Scalar multiple requires both that all right terms have left - // term with same positive multiplier, and there are no + // Scalar multiple requires both that all right terms have left + // term with same positive multiplier, and there are no // outstanding left terms after matching. let mut is_scalar_multiple = true; let mut scalar_factor = 0; @@ -1518,6 +1512,19 @@ impl Node { } } + pub fn try_write(&self) -> Option<(NodeID, NodeID, &[Index])> { + if let Node::Write { + collect, + data, + indices, + } = self + { + Some((*collect, *data, indices)) + } else { + None + } + } + pub fn is_zero_constant(&self, constants: &Vec<Constant>) -> bool { if let Node::Constant { id } = self && constants[id.idx()].is_zero() diff --git a/hercules_ir/src/lib.rs b/hercules_ir/src/lib.rs index 05e5e2e860a122392a668a254d54a7a5917db3f4..32bbf6310ff7ea0415383dc3fd7176043de835ee 100644 --- a/hercules_ir/src/lib.rs +++ b/hercules_ir/src/lib.rs @@ -6,7 +6,6 @@ iter_intersperse )] -pub mod antideps; pub mod build; pub mod callgraph; pub mod collections; @@ -14,7 +13,7 @@ pub mod dataflow; pub mod def_use; pub mod dom; pub mod dot; -pub mod gcm; +pub mod fork_join_analysis; pub mod ir; pub mod loops; pub mod parse; @@ -22,7 +21,6 @@ pub mod subgraph; pub mod typecheck; pub mod verify; -pub use crate::antideps::*; pub use crate::build::*; pub use crate::callgraph::*; pub use crate::collections::*; @@ -30,7 +28,7 @@ pub use crate::dataflow::*; pub use crate::def_use::*; pub use crate::dom::*; pub use crate::dot::*; -pub use crate::gcm::*; +pub use crate::fork_join_analysis::*; pub use crate::ir::*; pub use crate::loops::*; pub use crate::parse::*; diff --git a/hercules_ir/src/loops.rs b/hercules_ir/src/loops.rs index 7c9a0a85949efcc248439031601b2fed17f0acf6..13e935e0dd151ba3a29c4d07c9f9ee50341d5091 100644 --- a/hercules_ir/src/loops.rs +++ b/hercules_ir/src/loops.rs @@ -1,10 +1,8 @@ -extern crate bitvec; - use std::collections::hash_map; use std::collections::VecDeque; use std::collections::{HashMap, HashSet}; -use self::bitvec::prelude::*; +use bitvec::prelude::*; use crate::*; @@ -25,6 +23,7 @@ use crate::*; pub struct LoopTree { root: NodeID, loops: HashMap<NodeID, (BitVec<u8, Lsb0>, NodeID)>, + inverse_loops: HashMap<NodeID, NodeID>, nesting: HashMap<NodeID, usize>, } @@ -45,6 +44,10 @@ impl LoopTree { header == self.root || self.loops[&header].0[is_in.idx()] } + pub fn header_of(&self, control_node: NodeID) -> Option<NodeID> { + self.inverse_loops.get(&control_node).map(|h| *h) + } + /* * Sometimes, we need to iterate the loop tree bottom-up. Just assemble the * order upfront. @@ -149,7 +152,16 @@ pub fn loops( }) .collect(); - // Step 6: compute loop tree nesting. + // Step 6: compute the inverse loop map - this maps control nodes to which + // loop they are in (keyed by header), if they are in one. + let mut inverse_loops = HashMap::new(); + for (header, (contents, _)) in loops.iter() { + for idx in contents.iter_ones() { + inverse_loops.insert(NodeID::new(idx), *header); + } + } + + // Step 7: compute loop tree nesting. let mut nesting = HashMap::new(); let mut worklist: VecDeque<NodeID> = loops.keys().map(|id| *id).collect(); while let Some(header) = worklist.pop_front() { @@ -166,6 +178,7 @@ pub fn loops( LoopTree { root, loops, + inverse_loops, nesting, } } diff --git a/hercules_ir/src/parse.rs b/hercules_ir/src/parse.rs index 5e4b12178d1782df47d2e5d36abffdb568311799..21eb325a530907bec0aa1f34788708da76453a54 100644 --- a/hercules_ir/src/parse.rs +++ b/hercules_ir/src/parse.rs @@ -1,5 +1,3 @@ -extern crate nom; - use std::cell::RefCell; use std::collections::HashMap; use std::str::FromStr; diff --git a/hercules_ir/src/subgraph.rs b/hercules_ir/src/subgraph.rs index 89e8bcc64febd6fe36ec69d0d3a68a0dc0eda348..908011d3198ee4fe982fdc7b7aa263c87f80e3c7 100644 --- a/hercules_ir/src/subgraph.rs +++ b/hercules_ir/src/subgraph.rs @@ -1,9 +1,7 @@ -extern crate bitvec; - use std::collections::{HashMap, HashSet}; use std::mem::take; -use self::bitvec::prelude::*; +use bitvec::prelude::*; use crate::*; @@ -23,6 +21,7 @@ pub struct Subgraph { original_num_nodes: u32, } +#[derive(Debug, Clone)] pub struct SubgraphIterator<'a> { nodes: &'a Vec<NodeID>, edges: &'a [u32], @@ -203,6 +202,33 @@ impl Subgraph { edges } + pub fn rev_po(&self, root: NodeID) -> Vec<NodeID> { + let mut order = vec![]; + let mut stack = vec![]; + let mut visited = bitvec![u8, Lsb0; 0; self.original_num_nodes as usize]; + + stack.push(root); + visited.set(root.idx(), true); + + while let Some(pop) = stack.pop() { + if self.succs(pop).any(|succ| !visited[succ.idx()]) { + stack.push(pop); + for succ in self.succs(pop) { + if !visited[succ.idx()] { + visited.set(succ.idx(), true); + stack.push(succ); + break; + } + } + } else { + order.push(pop); + } + } + + order.reverse(); + order + } + pub fn pairwise_reachability<P>(&self, p: P) -> Vec<BitVec<u8, Lsb0>> where P: Fn(NodeID, NodeID) -> bool, diff --git a/hercules_ir/src/typecheck.rs b/hercules_ir/src/typecheck.rs index c657d5987f005a721ffe663ee22fa6b8fc877b43..d6862c354199dc748797e47d4f663f898df24d7b 100644 --- a/hercules_ir/src/typecheck.rs +++ b/hercules_ir/src/typecheck.rs @@ -984,10 +984,6 @@ fn typeflow( data: _, indices, } => { - if indices.len() == 0 { - return Error(String::from("Write node must have at least one index.")); - } - // Traverse the collect input's type tree downwards. if let (Concrete(mut collect_id), Concrete(data_id)) = (inputs[0], inputs[1]) { for index in indices.iter() { diff --git a/hercules_ir/src/verify.rs b/hercules_ir/src/verify.rs index 18ad92c3271df9dd9bea3bc5881489dd84176c70..572bb9d11d3aca8efb5bd70b6b18781da83bc0e7 100644 --- a/hercules_ir/src/verify.rs +++ b/hercules_ir/src/verify.rs @@ -1,9 +1,7 @@ -extern crate bitvec; - use std::collections::HashMap; use std::iter::zip; -use self::bitvec::prelude::*; +use bitvec::prelude::*; use crate::*; diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml index e1936a97d4e717b06195188016b711735af6367b..84f6aca83e508d905ad0e13f0670e7d45c18d22b 100644 --- a/hercules_opt/Cargo.toml +++ b/hercules_opt/Cargo.toml @@ -2,10 +2,12 @@ name = "hercules_opt" version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@illinois.edu>"] +edition = "2021" [dependencies] ordered-float = "*" bitvec = "*" +tempfile = "*" either = "*" itertools = "*" take_mut = "*" diff --git a/hercules_opt/src/ccp.rs b/hercules_opt/src/ccp.rs index a66bf63345c6f951e7f6e7bb8abeba5bcabdf968..39fab9da391a4ee17618fb5b5f899e37138fce1f 100644 --- a/hercules_opt/src/ccp.rs +++ b/hercules_opt/src/ccp.rs @@ -1,12 +1,10 @@ -extern crate hercules_ir; - use std::cmp::{max, min}; use std::collections::HashSet; use std::iter::zip; -use self::hercules_ir::dataflow::*; -use self::hercules_ir::def_use::get_uses; -use self::hercules_ir::ir::*; +use hercules_ir::dataflow::*; +use hercules_ir::def_use::get_uses; +use hercules_ir::ir::*; use crate::*; @@ -415,8 +413,8 @@ fn ccp_flow_function( }), // If node has only one output, if doesn't directly handle crossover of // reachability and constant propagation. Read handles that. - Node::If { control, cond } => inputs[control.idx()].clone(), - Node::Match { control, sum } => inputs[control.idx()].clone(), + Node::If { control, cond: _ } => inputs[control.idx()].clone(), + Node::Match { control, sum: _ } => inputs[control.idx()].clone(), Node::Fork { control, factors: _, diff --git a/hercules_opt/src/dce.rs b/hercules_opt/src/dce.rs index 75268694fdccb23920cccf3cd60cb5ae6738ef0f..026672a395d783c0abd5257894c4c32335654371 100644 --- a/hercules_opt/src/dce.rs +++ b/hercules_opt/src/dce.rs @@ -1,13 +1,10 @@ -extern crate hercules_ir; - -use self::hercules_ir::def_use::*; -use self::hercules_ir::ir::*; +use hercules_ir::def_use::*; +use hercules_ir::ir::*; use crate::*; /* - * Top level function to run dead code elimination. Deletes nodes by setting - * nodes to gravestones. Works with a function already containing gravestones. + * Top level function to run dead code elimination. */ pub fn dce(editor: &mut FunctionEditor) { // Create worklist (starts as all nodes). diff --git a/hercules_opt/src/delete_uncalled.rs b/hercules_opt/src/delete_uncalled.rs index 78ab428526971f121e0a2bc582e55b423395730b..1a19ee010e4772b8a2367b928f8226109dbef32c 100644 --- a/hercules_opt/src/delete_uncalled.rs +++ b/hercules_opt/src/delete_uncalled.rs @@ -1,9 +1,7 @@ -extern crate bitvec; -extern crate hercules_ir; -use self::bitvec::prelude::*; +use bitvec::prelude::*; -use self::hercules_ir::callgraph::*; -use self::hercules_ir::ir::*; +use hercules_ir::callgraph::*; +use hercules_ir::ir::*; use crate::*; diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs index 0c97abff6429a76f03481542f03c9ba7cd09a5f3..1318f032e0ac8a02c0375ae2de56311f0d97306a 100644 --- a/hercules_opt/src/editor.rs +++ b/hercules_opt/src/editor.rs @@ -1,18 +1,13 @@ -extern crate bitvec; -extern crate either; -extern crate hercules_ir; -extern crate itertools; - use std::cell::{Ref, RefCell}; use std::collections::{BTreeMap, HashSet}; use std::mem::take; use std::ops::Deref; -use self::bitvec::prelude::*; -use self::either::Either; +use bitvec::prelude::*; +use either::Either; -use self::hercules_ir::def_use::*; -use self::hercules_ir::ir::*; +use hercules_ir::def_use::*; +use hercules_ir::ir::*; /* * Helper object for editing Hercules functions in a trackable manner. Edits @@ -25,6 +20,7 @@ pub struct FunctionEditor<'a> { // Wraps a mutable reference to a function. Doesn't provide access to this // reference directly, so that we can monitor edits. function: &'a mut Function, + function_id: FunctionID, // Keep a RefCell to (dynamic) constants and types to allow function changes // to update these constants: &'a RefCell<Vec<Constant>>, @@ -61,6 +57,7 @@ pub struct FunctionEdit<'a: 'b, 'b> { added_types: Vec<Type>, // Compute a def-use map entries iteratively. updated_def_use: BTreeMap<NodeID, HashSet<NodeID>>, + updated_param_types: Option<Vec<TypeID>>, updated_return_type: Option<TypeID>, // Keep track of which deleted and added node IDs directly correspond. sub_edits: Vec<(NodeID, NodeID)>, @@ -69,6 +66,7 @@ pub struct FunctionEdit<'a: 'b, 'b> { impl<'a: 'b, 'b> FunctionEditor<'a> { pub fn new( function: &'a mut Function, + function_id: FunctionID, constants: &'a RefCell<Vec<Constant>>, dynamic_constants: &'a RefCell<Vec<DynamicConstant>>, types: &'a RefCell<Vec<Type>>, @@ -87,6 +85,7 @@ impl<'a: 'b, 'b> FunctionEditor<'a> { FunctionEditor { function, + function_id, constants, dynamic_constants, types, @@ -110,6 +109,7 @@ impl<'a: 'b, 'b> FunctionEditor<'a> { added_dynamic_constants: Vec::new().into(), added_types: Vec::new().into(), updated_def_use: BTreeMap::new(), + updated_param_types: None, updated_return_type: None, sub_edits: vec![], }; @@ -120,13 +120,14 @@ impl<'a: 'b, 'b> FunctionEditor<'a> { let FunctionEdit { editor, deleted_nodeids, - added_nodeids, + added_nodeids: _, added_and_updated_nodes, added_and_updated_schedules, added_constants, added_dynamic_constants, added_types, updated_def_use, + updated_param_types, updated_return_type, sub_edits, } = populated_edit; @@ -203,7 +204,12 @@ impl<'a: 'b, 'b> FunctionEditor<'a> { editor_dynamic_constants.extend(added_dynamic_constants); editor_types.extend(added_types); - // Step 8: update return type if necessary. + // Step 8: update parameter types if necessary. + if let Some(param_types) = updated_param_types { + editor.function.param_types = param_types; + } + + // Step 9: update return type if necessary. if let Some(return_type) = updated_return_type { editor.function.return_type = return_type; } @@ -218,6 +224,10 @@ impl<'a: 'b, 'b> FunctionEditor<'a> { &self.function } + pub fn func_id(&self) -> FunctionID { + self.function_id + } + pub fn get_dynamic_constants(&self) -> Ref<'_, Vec<DynamicConstant>> { self.dynamic_constants.borrow() } @@ -573,6 +583,10 @@ impl<'a, 'b> FunctionEdit<'a, 'b> { } } + pub fn set_param_types(&mut self, tys: Vec<TypeID>) { + self.updated_param_types = Some(tys); + } + pub fn set_return_type(&mut self, ty: TypeID) { self.updated_return_type = Some(ty); } @@ -585,8 +599,8 @@ mod editor_tests { use std::mem::replace; - use self::hercules_ir::dataflow::reverse_postorder; - use self::hercules_ir::parse::parse; + use hercules_ir::dataflow::reverse_postorder; + use hercules_ir::parse::parse; fn canonicalize(function: &mut Function) -> Vec<Option<NodeID>> { // The reverse postorder traversal from the Start node is a map from new @@ -660,6 +674,7 @@ fn func(x: i32) -> i32 // Edit the function by replacing the add with a multiply. let mut editor = FunctionEditor::new( func, + FunctionID::new(0), &constants_ref, &dynamic_constants_ref, &types_ref, diff --git a/hercules_opt/src/float_collections.rs b/hercules_opt/src/float_collections.rs new file mode 100644 index 0000000000000000000000000000000000000000..faa38375f68e93fad04b632c2f2e5491403a6a3e --- /dev/null +++ b/hercules_opt/src/float_collections.rs @@ -0,0 +1,105 @@ +use hercules_ir::*; + +use crate::*; + +/* + * Float collections constants out of device functions, where allocation isn't + * allowed. + */ +pub fn float_collections( + editors: &mut [FunctionEditor], + typing: &ModuleTyping, + callgraph: &CallGraph, + devices: &Vec<Device>, +) { + let topo = callgraph.topo(); + for to_float_id in topo { + // Collection constants float until reaching an AsyncRust function. + if devices[to_float_id.idx()] == Device::AsyncRust { + continue; + } + + // Find the target constant nodes in the function. + let cons: Vec<(NodeID, Node)> = editors[to_float_id.idx()] + .func() + .nodes + .iter() + .enumerate() + .filter(|(_, node)| { + node.try_constant() + .map(|cons_id| !editors[to_float_id.idx()].get_constant(cons_id).is_scalar()) + .unwrap_or(false) + }) + .map(|(idx, node)| (NodeID::new(idx), node.clone())) + .collect(); + if cons.is_empty() { + continue; + } + + // Each constant node becomes a new parameter. + let mut new_param_types = editors[to_float_id.idx()].func().param_types.clone(); + let old_num_params = new_param_types.len(); + for (id, _) in cons.iter() { + new_param_types.push(typing[to_float_id.idx()][id.idx()]); + } + let success = editors[to_float_id.idx()].edit(|mut edit| { + for (idx, (id, _)) in cons.iter().enumerate() { + let param = edit.add_node(Node::Parameter { + index: idx + old_num_params, + }); + edit = edit.replace_all_uses(*id, param)?; + edit = edit.delete_node(*id)?; + } + edit.set_param_types(new_param_types); + Ok(edit) + }); + if !success { + continue; + } + + // Add constants in callers and pass them into calls. + for caller in callgraph.get_callers(to_float_id) { + let calls: Vec<(NodeID, Node)> = editors[caller.idx()] + .func() + .nodes + .iter() + .enumerate() + .filter(|(_, node)| { + node.try_call() + .map(|(_, callee, _, _)| callee == to_float_id) + .unwrap_or(false) + }) + .map(|(idx, node)| (NodeID::new(idx), node.clone())) + .collect(); + let success = editors[caller.idx()].edit(|mut edit| { + let cons_ids: Vec<_> = cons + .iter() + .map(|(_, node)| edit.add_node(node.clone())) + .collect(); + for (id, node) in calls { + let Node::Call { + control, + function, + dynamic_constants, + args, + } = node + else { + panic!() + }; + let mut args = Vec::from(args); + args.extend(cons_ids.iter()); + let new_call = edit.add_node(Node::Call { + control, + function, + dynamic_constants, + args: args.into_boxed_slice(), + }); + edit = edit.replace_all_uses(id, new_call)?; + edit = edit.delete_node(id)?; + } + Ok(edit) + }); + assert!(success); + } + } +} diff --git a/hercules_opt/src/fork_concat_split.rs b/hercules_opt/src/fork_concat_split.rs index 232b43f70374427842b425f912928d316535d886..186cd6a6eaad70ba4b26e25dee8714c7988ee611 100644 --- a/hercules_opt/src/fork_concat_split.rs +++ b/hercules_opt/src/fork_concat_split.rs @@ -1,9 +1,7 @@ -extern crate hercules_ir; - use std::collections::{HashMap, HashSet}; use std::iter::zip; -use self::hercules_ir::ir::*; +use hercules_ir::ir::*; use crate::*; diff --git a/hercules_opt/src/fork_guard_elim.rs b/hercules_opt/src/fork_guard_elim.rs index cfa2a6ff344f5641a28d01b7b1fa44feb480b03e..842c83086f9ecedbbe5c8c96bf160de8968a953a 100644 --- a/hercules_opt/src/fork_guard_elim.rs +++ b/hercules_opt/src/fork_guard_elim.rs @@ -1,10 +1,8 @@ -extern crate hercules_ir; - use std::collections::{HashMap, HashSet}; -use self::hercules_ir::get_uses_mut; -use self::hercules_ir::ir::*; -use self::hercules_ir::ImmutableDefUseMap; +use hercules_ir::get_uses_mut; +use hercules_ir::ir::*; +use hercules_ir::ImmutableDefUseMap; /* * This is a Hercules IR transformation that: diff --git a/hercules_opt/src/forkify.rs b/hercules_opt/src/forkify.rs index e32bef383a077d6444bf3f3817c4adbe96fee78b..fb53a5e4a84bb08da7606aea914d368878c94b41 100644 --- a/hercules_opt/src/forkify.rs +++ b/hercules_opt/src/forkify.rs @@ -1,10 +1,8 @@ -extern crate hercules_ir; - use std::iter::zip; -use self::hercules_ir::def_use::*; -use self::hercules_ir::ir::*; -use self::hercules_ir::loops::*; +use hercules_ir::def_use::*; +use hercules_ir::ir::*; +use hercules_ir::loops::*; /* * Top level function to convert natural loops with simple induction variables diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs new file mode 100644 index 0000000000000000000000000000000000000000..a7df9bd9c8409daf4daa004dcd2b29a3ec2660c8 --- /dev/null +++ b/hercules_opt/src/gcm.rs @@ -0,0 +1,881 @@ +use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque}; +use std::iter::{empty, once, zip, FromIterator}; + +use bitvec::prelude::*; +use either::Either; + +use hercules_cg::*; +use hercules_ir::*; + +use crate::*; + +/* + * Top level function to legalize the reference semantics of a Hercules IR + * function. Hercules IR is a value semantics representation, meaning that all + * program state is in the form of copyable values, and mutation takes place by + * making a new value that is a copy of the old value with some modification. + * This representation is extremely convenient for optimization, but is not good + * for code generation, where we need to generate code with references to get + * good performance. Hercules IR can alternatively be interpreted using + * reference semantics, where pointers to collection objects are passed around, + * read from, and written to. However, the value semantics and reference + * semantics interpretation of a Hercules IR function may not be equal - this + * pass transforms a Hercules IR function such that its new value semantics is + * the same as its old value semantics and that its new reference semantics is + * the same as its new value semantics. This pass returns a placement of nodes + * into ordered basic blocks, since the reference semantics of a function + * depends on the order of execution with respect to anti-dependencies. This + * is analogous to global code motion from the original sea of nodes paper. + * + * Our strategy for handling multiple mutating users of a collection is to treat + * the problem similar to register allocation; we perform a liveness analysis, + * spill constants into newly allocated constants, and read back the spilled + * contents when they are used after the first mutation. It's not obvious how + * many spills are needed upfront, and newly spilled constants may affect the + * liveness analysis result, so every spill restarts the process of checking for + * spills. Once no more spills are found, the process terminates. When a spill + * is found, the basic block assignments, and all the other analyses, are not + * necessarily valid anymore, so this function is called in a loop in pass.rs + * until no more spills are found. + */ +pub fn gcm( + editor: &mut FunctionEditor, + def_use: &ImmutableDefUseMap, + reverse_postorder: &Vec<NodeID>, + typing: &Vec<TypeID>, + control_subgraph: &Subgraph, + dom: &DomTree, + fork_join_map: &HashMap<NodeID, NodeID>, + loops: &LoopTree, + objects: &CollectionObjects, +) -> Option<BasicBlocks> { + let bbs = basic_blocks( + editor.func(), + editor.func_id(), + def_use, + reverse_postorder, + dom, + loops, + fork_join_map, + objects, + ); + if spill_clones(editor, typing, control_subgraph, objects, &bbs) { + None + } else { + Some(bbs) + } +} + +/* + * Top level global code motion function. Assigns each data node to one of its + * immediate control use / user nodes, forming (unordered) basic blocks. Returns + * the control node / basic block each node is in. Takes in a partial + * partitioning that must be respected. Based on the schedule-early-schedule- + * late method from Cliff Click's PhD thesis. May fail if an anti-dependency + * edge can't be satisfied - in this case, a clone that has to be induced is + * returned instead. + */ +fn basic_blocks( + function: &Function, + func_id: FunctionID, + def_use: &ImmutableDefUseMap, + reverse_postorder: &Vec<NodeID>, + dom: &DomTree, + loops: &LoopTree, + fork_join_map: &HashMap<NodeID, NodeID>, + objects: &CollectionObjects, +) -> BasicBlocks { + let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()]; + + // Step 1: assign the basic block locations of all nodes that must be in a + // specific block. This includes control nodes as well as some special data + // nodes, such as phis. + for idx in 0..function.nodes.len() { + match function.nodes[idx] { + Node::Phi { control, data: _ } => bbs[idx] = Some(control), + Node::ThreadID { + control, + dimension: _, + } => bbs[idx] = Some(control), + Node::Reduce { + control, + init: _, + reduct: _, + } => bbs[idx] = Some(control), + Node::Call { + control, + function: _, + dynamic_constants: _, + args: _, + } => bbs[idx] = Some(control), + Node::Parameter { index: _ } => bbs[idx] = Some(NodeID::new(0)), + Node::Constant { id: _ } => bbs[idx] = Some(NodeID::new(0)), + Node::DynamicConstant { id: _ } => bbs[idx] = Some(NodeID::new(0)), + _ if function.nodes[idx].is_control() => bbs[idx] = Some(NodeID::new(idx)), + _ => {} + } + } + + // Step 2: schedule early. Place nodes in the earliest position they could + // go - use worklist to iterate nodes. + let mut schedule_early = bbs.clone(); + let mut worklist = VecDeque::from(reverse_postorder.clone()); + while let Some(id) = worklist.pop_front() { + if schedule_early[id.idx()].is_some() { + continue; + } + + // For every use, check what block is its "schedule early" block. This + // node goes in the lowest block amongst those blocks. + let use_places: Option<Vec<NodeID>> = get_uses(&function.nodes[id.idx()]) + .as_ref() + .into_iter() + .map(|id| *id) + .map(|id| schedule_early[id.idx()]) + .collect(); + if let Some(use_places) = use_places { + // If every use has been placed, we can place this node as the + // lowest place in the domtree that dominates all of the use places. + let lowest = dom.lowest_amongst(use_places.into_iter()); + schedule_early[id.idx()] = Some(lowest); + } else { + // If not, then just push this node back on the worklist. + worklist.push_back(id); + } + } + + // Step 3: find anti-dependence edges. An anti-dependence edge needs to be + // drawn between a collection reading node and a collection mutating node + // when the following conditions are true: + // + // 1: The reading and mutating nodes may involve the same collection. + // 2: The node producing the collection used by the reading node is in a + // schedule early block that dominates the schedule early block of the + // mutating node. The node producing the collection used by the reading + // node may be an originator of a collection, phi or reduce, or mutator, + // but not forwarding read - forwarding reads are collapsed, and the + // bottom read is treated as reading from the transitive parent of the + // forwarding read(s). + let mut antideps = BTreeSet::new(); + for id in reverse_postorder.iter() { + // Find a terminating read node and the collections it reads. + let terminating_reads: BTreeSet<_> = + terminating_reads(function, func_id, *id, objects).collect(); + if !terminating_reads.is_empty() { + // Walk forwarding reads to find anti-dependency roots. + let mut workset = terminating_reads.clone(); + let mut roots = BTreeSet::new(); + while let Some(pop) = workset.pop_first() { + let forwarded: BTreeSet<_> = + forwarding_reads(function, func_id, pop, objects).collect(); + if forwarded.is_empty() { + roots.insert(pop); + } else { + workset.extend(forwarded); + } + } + + // For each root, find mutating nodes dominated by the root that + // modify an object read on any input of the current node (the + // terminating read). + // TODO: make this less outrageously inefficient. + let func_objects = &objects[&func_id]; + for root in roots.iter() { + let root_early = schedule_early[root.idx()].unwrap(); + let mut root_block_iterated_users: BTreeSet<NodeID> = BTreeSet::new(); + let mut workset = BTreeSet::new(); + workset.insert(*root); + while let Some(pop) = workset.pop_first() { + let users = def_use.get_users(pop).into_iter().filter(|user| { + !function.nodes[user.idx()].is_phi() + && !function.nodes[user.idx()].is_reduce() + && schedule_early[user.idx()].unwrap() == root_early + }); + workset.extend(users.clone()); + root_block_iterated_users.extend(users); + } + let read_objs: BTreeSet<_> = terminating_reads + .iter() + .map(|read_use| func_objects.objects(*read_use).into_iter()) + .flatten() + .map(|id| *id) + .collect(); + for mutator in reverse_postorder.iter() { + let mutator_early = schedule_early[mutator.idx()].unwrap(); + if dom.does_dom(root_early, mutator_early) + && (root_early != mutator_early + || root_block_iterated_users.contains(&mutator)) + && mutating_objects(function, func_id, *mutator, objects) + .any(|mutated| read_objs.contains(&mutated)) + && id != mutator + { + antideps.insert((*id, *mutator)); + } + } + } + } + } + let mut antideps_uses = vec![vec![]; function.nodes.len()]; + let mut antideps_users = vec![vec![]; function.nodes.len()]; + for (reader, mutator) in antideps.iter() { + antideps_uses[mutator.idx()].push(*reader); + antideps_users[reader.idx()].push(*mutator); + } + + // Step 4: schedule late and pick each nodes final position. Since the late + // schedule of each node depends on the final positions of its users, these + // two steps must be fused. Compute their latest position, then use the + // control dependent + shallow loop heuristic to actually place them. A + // placement might not necessarily be found due to anti-dependency edges. + // These are optional and not necessary to consider, but we do since obeying + // them can reduce the number of clones. If the worklist stops making + // progress, stop considering the anti-dependency edges. + let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map + .into_iter() + .map(|(fork, join)| (*join, *fork)) + .collect(); + let mut worklist = VecDeque::from_iter(reverse_postorder.into_iter().map(|id| *id).rev()); + let mut num_skip_iters = 0; + let mut consider_antidependencies = true; + while let Some(id) = worklist.pop_front() { + if num_skip_iters >= worklist.len() { + consider_antidependencies = false; + } + + if bbs[id.idx()].is_some() { + num_skip_iters = 0; + continue; + } + + // Calculate the least common ancestor of user blocks, a.k.a. the "late" + // schedule. + let calculate_lca = || -> Option<_> { + let mut lca = None; + // Helper to incrementally update the LCA. + let mut update_lca = |a| { + if let Some(acc) = lca { + lca = Some(dom.least_common_ancestor(acc, a)); + } else { + lca = Some(a); + } + }; + + // For every user, consider where we need to be to directly dominate the + // user. + for user in def_use + .get_users(id) + .as_ref() + .into_iter() + .chain(if consider_antidependencies { + Either::Left(antideps_users[id.idx()].iter()) + } else { + Either::Right(empty()) + }) + .map(|id| *id) + { + if let Node::Phi { control, data } = &function.nodes[user.idx()] { + // For phis, we need to dominate the block jumping to the phi in + // the slot that corresponds to our use. + for (control, data) in + zip(get_uses(&function.nodes[control.idx()]).as_ref(), data) + { + if id == *data { + update_lca(*control); + } + } + } else if let Node::Reduce { + control, + init, + reduct, + } = &function.nodes[user.idx()] + { + // For reduces, we need to either dominate the block right + // before the fork if we're the init input, or we need to + // dominate the join if we're the reduct input. + if id == *init { + let before_fork = function.nodes[join_fork_map[control].idx()] + .try_fork() + .unwrap() + .0; + update_lca(before_fork); + } else { + assert_eq!(id, *reduct); + update_lca(*control); + } + } else { + // For everything else, we just need to dominate the user. + update_lca(bbs[user.idx()]?); + } + } + + Some(lca) + }; + + // Check if all users have been placed. If one of them hasn't, then add + // this node back on to the worklist. + let Some(lca) = calculate_lca() else { + worklist.push_back(id); + num_skip_iters += 1; + continue; + }; + + // Look between the LCA and the schedule early location to place the + // node. + let schedule_early = schedule_early[id.idx()].unwrap(); + let schedule_late = lca.unwrap_or(schedule_early); + let mut chain = dom + // If the node has no users, then it doesn't really matter where we + // place it - just place it at the early placement. + .chain(schedule_late, schedule_early); + + if let Some(mut location) = chain.next() { + while let Some(control_node) = chain.next() { + // If the next node further up the dominator tree is in a shallower + // loop nest or if we can get out of a reduce loop when we don't + // need to be in one, place this data node in a higher-up location. + let old_nest = loops + .header_of(location) + .map(|header| loops.nesting(header).unwrap()); + let new_nest = loops + .header_of(control_node) + .map(|header| loops.nesting(header).unwrap()); + let shallower_nest = if let (Some(old_nest), Some(new_nest)) = (old_nest, new_nest) + { + old_nest > new_nest + } else { + // If the new location isn't a loop, it's nesting level should + // be considered "shallower" if the current location is in a + // loop. + old_nest.is_some() + }; + // This will move all nodes that don't need to be in reduce loops + // outside of reduce loops. Nodes that do need to be in a reduce + // loop use the reduce node forming the loop, so the dominator chain + // will consist of one block, and this loop won't ever iterate. + let currently_at_join = function.nodes[location.idx()].is_join(); + if shallower_nest || currently_at_join { + location = control_node; + } + } + + bbs[id.idx()] = Some(location); + num_skip_iters = 0; + } else { + // If there is no valid location for this node, then it's a reading + // node of a collection that can't be placed above a mutation that + // anti-depend uses it. Push the node back on the list, and we'll + // stop considering anti-dependencies soon. Don't immediately stop + // considering anti-dependencies, as we may be able to eak out some + // more use of them. + worklist.push_back(id); + num_skip_iters += 1; + continue; + } + } + let bbs: Vec<_> = bbs.into_iter().map(Option::unwrap).collect(); + // Calculate the number of phis and reduces per basic block. We use this to + // emit phis and reduces at the top of basic blocks. We want to emit phis + // and reduces first into ordered basic blocks for two reasons: + // 1. This is useful for liveness analysis. + // 2. This is needed for some backends - LLVM expects phis to be at the top + // of basic blocks. + let mut num_phis_reduces = vec![0; function.nodes.len()]; + for (node_idx, bb) in bbs.iter().enumerate() { + let node = &function.nodes[node_idx]; + if node.is_phi() || node.is_reduce() { + num_phis_reduces[bb.idx()] += 1; + } + } + + // Step 5: determine the order of nodes inside each block. Use worklist to + // add nodes to blocks in order that obeys dependencies. + let mut order: Vec<Vec<NodeID>> = vec![vec![]; function.nodes.len()]; + let mut worklist = VecDeque::from_iter( + reverse_postorder + .into_iter() + .filter(|id| !function.nodes[id.idx()].is_control()), + ); + let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()]; + let mut num_skip_iters = 0; + let mut consider_antidependencies = true; + while let Some(id) = worklist.pop_front() { + // If the worklist isn't making progress, then there's at least one + // reading node of a collection that is in a anti-depend + normal depend + // use cycle with a mutating node. See above comment about anti- + // dependencies being optional; we just stop considering them here. + if num_skip_iters >= worklist.len() { + consider_antidependencies = false; + } + + // Phis and reduces always get emitted. Other nodes need to obey + // dependency relationships and need to come after phis and reduces. + let node = &function.nodes[id.idx()]; + let bb = bbs[id.idx()]; + if node.is_phi() + || node.is_reduce() + || (num_phis_reduces[bb.idx()] == 0 + && get_uses(node) + .as_ref() + .into_iter() + .chain(if consider_antidependencies { + Either::Left(antideps_uses[id.idx()].iter()) + } else { + Either::Right(empty()) + }) + .all(|u| { + function.nodes[u.idx()].is_control() + || bbs[u.idx()] != bbs[id.idx()] + || visited[u.idx()] + })) + { + order[bb.idx()].push(*id); + visited.set(id.idx(), true); + num_skip_iters = 0; + if node.is_phi() || node.is_reduce() { + num_phis_reduces[bb.idx()] -= 1; + } + } else { + worklist.push_back(id); + num_skip_iters += 1; + } + } + + (bbs, order) +} + +fn terminating_reads<'a>( + function: &'a Function, + func_id: FunctionID, + reader: NodeID, + objects: &'a CollectionObjects, +) -> Box<dyn Iterator<Item = NodeID> + 'a> { + match function.nodes[reader.idx()] { + Node::Read { + collect, + indices: _, + } if objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)), + Node::Write { + collect: _, + data, + indices: _, + } if !objects[&func_id].objects(data).is_empty() => Box::new(once(data)), + Node::Call { + control: _, + function: callee, + dynamic_constants: _, + ref args, + } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| { + let objects = &objects[&callee]; + let returns = objects.returned_objects(); + let param_obj = objects.param_to_object(idx)?; + if !objects.is_mutated(param_obj) && !returns.contains(¶m_obj) { + Some(*arg) + } else { + None + } + })), + _ => Box::new(empty()), + } +} + +fn forwarding_reads<'a>( + function: &'a Function, + func_id: FunctionID, + reader: NodeID, + objects: &'a CollectionObjects, +) -> Box<dyn Iterator<Item = NodeID> + 'a> { + match function.nodes[reader.idx()] { + Node::Read { + collect, + indices: _, + } if !objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)), + Node::Ternary { + op: TernaryOperator::Select, + first: _, + second, + third, + } if !objects[&func_id].objects(reader).is_empty() => { + Box::new(once(second).chain(once(third))) + } + Node::Call { + control: _, + function: callee, + dynamic_constants: _, + ref args, + } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| { + let objects = &objects[&callee]; + let returns = objects.returned_objects(); + let param_obj = objects.param_to_object(idx)?; + if !objects.is_mutated(param_obj) && returns.contains(¶m_obj) { + Some(*arg) + } else { + None + } + })), + _ => Box::new(empty()), + } +} + +fn mutating_objects<'a>( + function: &'a Function, + func_id: FunctionID, + mutator: NodeID, + objects: &'a CollectionObjects, +) -> Box<dyn Iterator<Item = CollectionObjectID> + 'a> { + match function.nodes[mutator.idx()] { + Node::Write { + collect, + data: _, + indices: _, + } => Box::new(objects[&func_id].objects(collect).into_iter().map(|id| *id)), + Node::Call { + control: _, + function: callee, + dynamic_constants: _, + ref args, + } => Box::new( + args.into_iter() + .enumerate() + .filter_map(move |(idx, arg)| { + let callee_objects = &objects[&callee]; + let param_obj = callee_objects.param_to_object(idx)?; + if callee_objects.is_mutated(param_obj) { + Some(objects[&func_id].objects(*arg).into_iter().map(|id| *id)) + } else { + None + } + }) + .flatten(), + ), + _ => Box::new(empty()), + } +} + +type Liveness = BTreeMap<NodeID, Vec<BTreeSet<NodeID>>>; + +/* + * Top level function to find implicit clones that need to be spilled. Returns + * whether a clone was spilled, in which case the whole scheduling process must + * be restarted. + */ +fn spill_clones( + editor: &mut FunctionEditor, + typing: &Vec<TypeID>, + control_subgraph: &Subgraph, + objects: &CollectionObjects, + bbs: &BasicBlocks, +) -> bool { + // Step 1: compute a liveness analysis of collection values in the IR. This + // requires a dataflow analysis over the scheduled IR, which is not a common + // need in Hercules, so just hardcode the analysis. + let liveness = liveness_dataflow( + editor.func(), + editor.func_id(), + control_subgraph, + objects, + bbs, + ); + + // Step 2: compute an interference graph from the liveness result. This + // graph contains a vertex per node ID producing a collection value and an + // edge per pair of node IDs that interfere. Nodes A and B interfere if node + // A is defined right above a point where node B is live. + let mut edges = vec![]; + for (bb, liveness) in liveness { + let insts = &bbs.1[bb.idx()]; + for (node, live) in zip(insts, liveness.into_iter().skip(1)) { + for live_node in live { + if *node != live_node { + edges.push((*node, live_node)); + } + } + } + } + + // Step 3: filter edges (A, B) to just see edges where A uses B and A isn't + // a terminating read. These are the edges that may require a spill. + let mut spill_edges = edges.into_iter().filter(|(a, b)| { + get_uses(&editor.func().nodes[a.idx()]) + .as_ref() + .into_iter() + .any(|u| *u == *b) + && !terminating_reads(editor.func(), editor.func_id(), *a, objects).any(|id| id == *b) + }); + + // Step 4: if there is a spill edge, spill it and return true. Otherwise, + // return false. + if let Some((user, obj)) = spill_edges.next() { + // Figure out the most immediate dominating region for every basic + // block. These are the points where spill slot phis get placed. + let nodes = &editor.func().nodes; + let mut imm_dom_reg = vec![NodeID::new(0); editor.func().nodes.len()]; + for (idx, node) in nodes.into_iter().enumerate() { + if node.is_region() { + imm_dom_reg[idx] = NodeID::new(idx); + } + } + let rev_po = control_subgraph.rev_po(NodeID::new(0)); + for bb in rev_po.iter() { + if !nodes[bb.idx()].is_region() && !nodes[bb.idx()].is_start() { + imm_dom_reg[bb.idx()] = + imm_dom_reg[control_subgraph.preds(*bb).next().unwrap().idx()]; + } + } + + let other_obj_users: Vec<_> = editor.get_users(obj).filter(|id| *id != user).collect(); + let mut dummy_phis = vec![NodeID::new(0); imm_dom_reg.len()]; + let mut success = editor.edit(|mut edit| { + // Construct the spill slot. This is just a constant that gets phi- + // ed throughout the entire function. + let cons_id = edit.add_zero_constant(typing[obj.idx()]); + let slot_id = edit.add_node(Node::Constant { id: cons_id }); + + // Allocate IDs for phis that move the spill slot throughout the + // function without implicit clones. These are dummy phis, since + // there are potentially cycles between them. We will replace them + // later. + for (idx, reg) in imm_dom_reg.iter().enumerate().skip(1) { + if idx == reg.idx() { + dummy_phis[idx] = edit.add_node(Node::Phi { + control: *reg, + data: empty().collect(), + }); + } + } + + // Spill `obj` before `user` potentially modifies it. + let spill_region = imm_dom_reg[bbs.0[obj.idx()].idx()]; + let spill_id = edit.add_node(Node::Write { + collect: if spill_region == NodeID::new(0) { + slot_id + } else { + dummy_phis[spill_region.idx()] + }, + data: obj, + indices: empty().collect(), + }); + + // Before each other user, unspill `obj`. + for other_user in other_obj_users { + let other_region = imm_dom_reg[bbs.0[other_user.idx()].idx()]; + // If this assert fails, then `obj` is not in the first basic + // block, but it has a user that is in the first basic block, + // which violates SSA. + assert!(other_region == spill_region || other_region != NodeID::new(0)); + + // If an other user is a phi, we need to be a little careful + // about how we insert unspilling code for `obj`. Instead of + // inserting an unspill in the same block as the user, we need + // to insert one in each predecessor of the phi that corresponds + // to a use of `obj`. Since this requires modifying individual + // uses in a phi, just rebuild the node entirely. + if let Node::Phi { control, data } = edit.get_node(other_user).clone() { + assert_eq!(control, other_region); + let mut new_data = vec![]; + for (pred, data) in zip(control_subgraph.preds(control), data) { + let pred = imm_dom_reg[pred.idx()]; + if data == obj { + let unspill_id = edit.add_node(Node::Write { + collect: obj, + data: if pred == spill_region { + spill_id + } else { + dummy_phis[pred.idx()] + }, + indices: empty().collect(), + }); + new_data.push(unspill_id); + } else { + new_data.push(data); + } + } + let new_phi = edit.add_node(Node::Phi { + control, + data: new_data.into_boxed_slice(), + }); + edit = edit.replace_all_uses(other_user, new_phi)?; + edit = edit.delete_node(other_user)?; + } else { + let unspill_id = edit.add_node(Node::Write { + collect: obj, + data: if other_region == spill_region { + spill_id + } else { + dummy_phis[other_region.idx()] + }, + indices: empty().collect(), + }); + edit = edit.replace_all_uses_where(obj, unspill_id, |id| *id == other_user)?; + } + } + + // Create and hook up all the real phis. Phi elimination will clean + // this up. + let mut real_phis = vec![NodeID::new(0); imm_dom_reg.len()]; + for (idx, reg) in imm_dom_reg.iter().enumerate().skip(1) { + if idx == reg.idx() { + real_phis[idx] = edit.add_node(Node::Phi { + control: *reg, + data: control_subgraph + .preds(*reg) + .map(|pred| { + let pred = imm_dom_reg[pred.idx()]; + if pred == spill_region { + spill_id + } else if pred == NodeID::new(0) { + slot_id + } else { + dummy_phis[pred.idx()] + } + }) + .collect(), + }); + } + } + for (dummy, real) in zip(dummy_phis.iter(), real_phis) { + if *dummy != real { + edit = edit.replace_all_uses(*dummy, real)?; + } + } + + Ok(edit) + }); + success = success + && editor.edit(|mut edit| { + for dummy in dummy_phis { + if dummy != NodeID::new(0) { + edit = edit.delete_node(dummy)?; + } + } + Ok(edit) + }); + assert!(success, "PANIC: GCM cannot fail to edit a function, as it needs to legalize the reference semantics of every function before code generation."); + true + } else { + false + } +} + +/* + * Liveness dataflow analysis on scheduled Hercules IR. Just look at nodes that + * involve collections. + */ +fn liveness_dataflow( + function: &Function, + func_id: FunctionID, + control_subgraph: &Subgraph, + objects: &CollectionObjects, + bbs: &BasicBlocks, +) -> Liveness { + let mut po = control_subgraph.rev_po(NodeID::new(0)); + po.reverse(); + let mut liveness = Liveness::default(); + for (bb_idx, insts) in bbs.1.iter().enumerate() { + liveness.insert(NodeID::new(bb_idx), vec![BTreeSet::new(); insts.len() + 1]); + } + let mut num_phis_reduces = vec![0; function.nodes.len()]; + let mut reducing = vec![false; function.nodes.len()]; + for (node_idx, bb) in bbs.0.iter().enumerate() { + let node = &function.nodes[node_idx]; + if node.is_phi() || node.is_reduce() { + num_phis_reduces[bb.idx()] += 1; + // Phis and reduces can't be in the same basic block. + if node.is_reduce() { + assert!(num_phis_reduces[bb.idx()] == 0 || reducing[bb.idx()]); + reducing[bb.idx()] = true; + } else { + assert!(!reducing[bb.idx()]); + } + } + } + let is_obj = |id: NodeID| !objects[&func_id].objects(id).is_empty(); + + loop { + let mut changed = false; + + for bb in po.iter() { + // First, calculate the liveness set for the bottom of this block. + let last_pt = bbs.1[bb.idx()].len(); + let old_value = &liveness[&bb][last_pt]; + let mut new_value = BTreeSet::new(); + for succ in control_subgraph.succs(*bb).chain(if reducing[bb.idx()] { + Either::Left(once(*bb)) + } else { + Either::Right(empty()) + }) { + // The liveness at the bottom of a basic block is the union of: + // 1. The liveness of each succecessor right after its phis and + // reduces. + // 2. Every data use in a phi or reduce that corresponds to this + // block as the predecessor. + let after_phis_reduces_pt = num_phis_reduces[succ.idx()]; + new_value.extend(&liveness[&succ][after_phis_reduces_pt]); + for inst_idx in 0..after_phis_reduces_pt { + let id = bbs.1[succ.idx()][inst_idx]; + new_value.remove(&id); + match function.nodes[id.idx()] { + Node::Phi { control, ref data } if is_obj(data[0]) => { + assert_eq!(control, succ); + new_value.extend( + zip(control_subgraph.preds(succ), data) + .filter(|(pred, _)| *pred == *bb) + .map(|(_, data)| *data), + ); + } + Node::Reduce { + control, + init, + reduct, + } if is_obj(init) => { + assert_eq!(control, succ); + if succ == *bb { + new_value.insert(reduct); + } else { + new_value.insert(init); + } + } + _ => {} + } + } + } + changed |= *old_value != new_value; + liveness.get_mut(&bb).unwrap()[last_pt] = new_value; + + // Second, calculate the liveness set above each instruction in this block. + for pt in (0..last_pt).rev() { + let old_value = &liveness[&bb][pt]; + let mut new_value = liveness[&bb][pt + 1].clone(); + let id = bbs.1[bb.idx()][pt]; + let uses = get_uses(&function.nodes[id.idx()]); + new_value.remove(&id); + new_value.extend( + if let Node::Write { + collect: _, + data, + ref indices, + } = function.nodes[id.idx()] + && indices.is_empty() + { + // If this write is a cloning write, the `collect` input + // isn't actually live, because its value doesn't + // matter. + Either::Left(once(data).filter(|id| is_obj(*id))) + } else { + Either::Right( + uses.as_ref() + .into_iter() + .map(|id| *id) + .filter(|id| is_obj(*id)), + ) + }, + ); + changed |= *old_value != new_value; + liveness.get_mut(&bb).unwrap()[pt] = new_value; + } + } + + if !changed { + return liveness; + } + } +} diff --git a/hercules_opt/src/gvn.rs b/hercules_opt/src/gvn.rs index a9db0cd879a0796f0ae1545c7bb1048d012b0d39..42835e853c36a994f656a16e1662e8837fd55239 100644 --- a/hercules_opt/src/gvn.rs +++ b/hercules_opt/src/gvn.rs @@ -1,8 +1,6 @@ -extern crate hercules_ir; - use std::collections::HashMap; -use self::hercules_ir::ir::*; +use hercules_ir::ir::*; use crate::*; @@ -11,7 +9,7 @@ use crate::*; * fairly simple compared to in a normal CFG. Needs access to constants for * identity function simplification. */ -pub fn gvn(editor: &mut FunctionEditor) { +pub fn gvn(editor: &mut FunctionEditor, gvn_constants_and_clones: bool) { // Create worklist (starts as all nodes) and value number hashmap. let mut worklist: Vec<NodeID> = (0..editor.func().nodes.len()).map(NodeID::new).collect(); let mut value_numbers: HashMap<Node, NodeID> = HashMap::new(); @@ -28,7 +26,18 @@ pub fn gvn(editor: &mut FunctionEditor) { // Next, check if there is a value number for this simplified value yet. if let Some(number) = value_numbers.get(&editor.func().nodes[value.idx()]) { // If the number is this worklist item, there's nothing to be done. - if *number == work { + // Also, don't GVN constants and clones if indicated to not do so. + if *number == work + || (!gvn_constants_and_clones + && (editor.func().nodes[work.idx()] + .try_constant() + .map(|cons_id| !editor.get_constant(cons_id).is_scalar()) + .unwrap_or(false) + || editor.func().nodes[work.idx()] + .try_write() + .map(|(_, _, indices)| indices.is_empty()) + .unwrap_or(false))) + { continue; } diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs index 9ab158742d140448c6aadead23cd237e7f7be1c9..63a05b0c10bd4efca7da76ceb21969b118770e53 100644 --- a/hercules_opt/src/inline.rs +++ b/hercules_opt/src/inline.rs @@ -1,11 +1,9 @@ -extern crate hercules_ir; - use std::collections::HashMap; use std::iter::zip; -use self::hercules_ir::callgraph::*; -use self::hercules_ir::def_use::*; -use self::hercules_ir::ir::*; +use hercules_ir::callgraph::*; +use hercules_ir::def_use::*; +use hercules_ir::ir::*; use crate::*; diff --git a/hercules_opt/src/interprocedural_sroa.rs b/hercules_opt/src/interprocedural_sroa.rs index c6cf448b0615cac528bbbff055d91c4bd0d371d6..9edb4d02134e4c5c05aaf398cddbd7bb19686625 100644 --- a/hercules_opt/src/interprocedural_sroa.rs +++ b/hercules_opt/src/interprocedural_sroa.rs @@ -1,9 +1,9 @@ -extern crate hercules_ir; use std::collections::HashMap; +use std::iter::zip; + +use hercules_ir::ir::*; -use self::hercules_ir::ir::*; use crate::*; -use std::iter::zip; /** * Given an editor for each function in a module, return V s.t. diff --git a/hercules_opt/src/lib.rs b/hercules_opt/src/lib.rs index 4a4011b1f19c62d741f8d30189998039a1dd1b30..08d183a7f8c2ddfc650bb1ef3ce385761a137def 100644 --- a/hercules_opt/src/lib.rs +++ b/hercules_opt/src/lib.rs @@ -4,13 +4,14 @@ pub mod ccp; pub mod dce; pub mod delete_uncalled; pub mod editor; +pub mod float_collections; pub mod fork_concat_split; pub mod fork_guard_elim; pub mod forkify; +pub mod gcm; pub mod gvn; pub mod inline; pub mod interprocedural_sroa; -pub mod materialize_clones; pub mod outline; pub mod pass; pub mod phi_elim; @@ -24,13 +25,14 @@ pub use crate::ccp::*; pub use crate::dce::*; pub use crate::delete_uncalled::*; pub use crate::editor::*; +pub use crate::float_collections::*; pub use crate::fork_concat_split::*; pub use crate::fork_guard_elim::*; pub use crate::forkify::*; +pub use crate::gcm::*; pub use crate::gvn::*; pub use crate::inline::*; pub use crate::interprocedural_sroa::*; -pub use crate::materialize_clones::*; pub use crate::outline::*; pub use crate::pass::*; pub use crate::phi_elim::*; diff --git a/hercules_opt/src/materialize_clones.rs b/hercules_opt/src/materialize_clones.rs deleted file mode 100644 index 687ac10c87d595c6f53a3fff4435e14f6dc4f375..0000000000000000000000000000000000000000 --- a/hercules_opt/src/materialize_clones.rs +++ /dev/null @@ -1,21 +0,0 @@ -extern crate hercules_ir; - -use self::hercules_ir::*; - -use crate::*; - -/* - * Top level function to materialize clones of collections. This transformation - * eliminates the possibility of multiple independent writes (including dynamic - * writes) to a single collection by introducing extra collection constants and - * inserting explicit clones. This allows us to make the simplifying assumption - * in the backend that collections have reference, rather than value, semantics. - * The pass calling this function is mandatory for correctness. - */ -pub fn materialize_clones( - editor: &mut FunctionEditor, - objects: &FunctionCollectionObjects, - bbs: &BasicBlocks, -) { - todo!() -} diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs index 70062bbcbce0001f0d07a3cfb25fbf2cd94d0433..80f97c7f079ad57bb1edb0929dfe56578eae338c 100644 --- a/hercules_opt/src/outline.rs +++ b/hercules_opt/src/outline.rs @@ -1,14 +1,12 @@ -extern crate hercules_ir; - use std::collections::{BTreeMap, BTreeSet}; use std::iter::zip; use std::sync::atomic::{AtomicUsize, Ordering}; -use self::hercules_ir::def_use::*; -use self::hercules_ir::dom::*; -use self::hercules_ir::gcm::*; -use self::hercules_ir::ir::*; -use self::hercules_ir::subgraph::*; +use hercules_ir::def_use::*; +use hercules_ir::dom::*; +use hercules_ir::fork_join_analysis::*; +use hercules_ir::ir::*; +use hercules_ir::subgraph::*; use crate::*; @@ -567,9 +565,7 @@ pub fn outline( } /* - * Just outlines all of a function except the entry, return, and aggregate - * constants. This is the minimum work needed to cause runtime Rust code to be - * generated as necessary. + * Just outlines all of a function except the start and return nodes. */ pub fn dumb_outline( editor: &mut FunctionEditor, @@ -585,11 +581,7 @@ pub fn dumb_outline( .node_ids() .filter(|id| { let node = &editor.func().nodes[id.idx()]; - if let Node::Constant { id } = editor.func().nodes[id.idx()] { - editor.get_constant(id).is_scalar() - } else { - !(node.is_start() || node.is_parameter() || node.is_return()) - } + !(node.is_start() || node.is_parameter() || node.is_return()) }) .collect(); outline( diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs index 08faa5b0760c0312688bbd902a53e2ff350bb1be..7366a3362e9942ce78d4fe0b2238fb8ce1324124 100644 --- a/hercules_opt/src/pass.rs +++ b/hercules_opt/src/pass.rs @@ -1,21 +1,16 @@ -extern crate hercules_cg; -extern crate hercules_ir; -extern crate postcard; -extern crate serde; -extern crate take_mut; - use std::cell::RefCell; use std::collections::{HashMap, HashSet}; -use std::env::temp_dir; use std::fs::File; use std::io::Write; use std::iter::zip; use std::process::{Command, Stdio}; -use self::serde::Deserialize; +use serde::Deserialize; + +use tempfile::TempDir; -use self::hercules_cg::*; -use self::hercules_ir::*; +use hercules_cg::*; +use hercules_ir::*; use crate::*; @@ -38,8 +33,9 @@ pub enum Pass { DeleteUncalled, ForkSplit, Unforkify, - MaterializeClones, InferSchedules, + GCM, + FloatCollections, Verify, // Parameterized over whether analyses that aid visualization are necessary. // Useful to set to false if displaying a potentially broken module. @@ -72,7 +68,6 @@ pub struct PassManager { pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>, pub loops: Option<Vec<LoopTree>>, pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, - pub antideps: Option<Vec<Vec<(NodeID, NodeID)>>>, pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub bbs: Option<Vec<BasicBlocks>>, pub collection_objects: Option<CollectionObjects>, @@ -94,7 +89,6 @@ impl PassManager { fork_join_nests: None, loops: None, reduce_cycles: None, - antideps: None, data_nodes_in_fork_joins: None, bbs: None, collection_objects: None, @@ -238,28 +232,6 @@ impl PassManager { } } - pub fn make_antideps(&mut self) { - if self.antideps.is_none() { - self.make_reverse_postorders(); - self.make_collection_objects(); - self.antideps = Some( - zip( - self.module.functions.iter(), - zip( - self.reverse_postorders.as_ref().unwrap().iter(), - self.collection_objects.as_ref().unwrap().iter(), - ), - ) - // Fine since collection_objects is a BTreeMap - iteration order - // is fixed. - .map(|(function, (reverse_postorder, objects))| { - antideps(function, reverse_postorder, objects.1) - }) - .collect(), - ); - } - } - pub fn make_data_nodes_in_fork_joins(&mut self) { if self.data_nodes_in_fork_joins.is_none() { self.make_def_uses(); @@ -280,64 +252,6 @@ impl PassManager { } } - pub fn make_bbs(&mut self) { - if self.bbs.is_none() { - self.make_def_uses(); - self.make_reverse_postorders(); - self.make_control_subgraphs(); - self.make_doms(); - self.make_antideps(); - self.make_loops(); - self.make_fork_join_maps(); - let def_uses = self.def_uses.as_ref().unwrap().iter(); - let reverse_postorders = self.reverse_postorders.as_ref().unwrap().iter(); - let control_subgraphs = self.control_subgraphs.as_ref().unwrap().iter(); - let doms = self.doms.as_ref().unwrap().iter(); - let antideps = self.antideps.as_ref().unwrap().iter(); - let loops = self.loops.as_ref().unwrap().iter(); - let fork_join_maps = self.fork_join_maps.as_ref().unwrap().iter(); - self.bbs = Some( - zip( - self.module.functions.iter(), - zip( - def_uses, - zip( - reverse_postorders, - zip( - control_subgraphs, - zip(doms, zip(antideps, zip(loops, fork_join_maps))), - ), - ), - ), - ) - .map( - |( - function, - ( - def_use, - ( - reverse_postorder, - (control_subgraph, (dom, (antideps, (loops, fork_join_map)))), - ), - ), - )| { - gcm( - function, - def_use, - reverse_postorder, - control_subgraph, - dom, - antideps, - loops, - fork_join_map, - ) - }, - ) - .collect(), - ); - } - } - pub fn make_collection_objects(&mut self) { if self.collection_objects.is_none() { self.make_reverse_postorders(); @@ -375,6 +289,7 @@ impl PassManager { let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let mut editor = FunctionEditor::new( &mut self.module.functions[idx], + FunctionID::new(idx), &constants_ref, &dynamic_constants_ref, &types_ref, @@ -409,6 +324,7 @@ impl PassManager { .map(|(i, f)| { FunctionEditor::new( f, + FunctionID::new(i), &constants_ref, &dynamic_constants_ref, &types_ref, @@ -442,6 +358,7 @@ impl PassManager { let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let mut editor = FunctionEditor::new( &mut self.module.functions[idx], + FunctionID::new(idx), &constants_ref, &dynamic_constants_ref, &types_ref, @@ -468,12 +385,13 @@ impl PassManager { let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let mut editor = FunctionEditor::new( &mut self.module.functions[idx], + FunctionID::new(idx), &constants_ref, &dynamic_constants_ref, &types_ref, &def_uses[idx], ); - gvn(&mut editor); + gvn(&mut editor, false); self.module.constants = constants_ref.take(); self.module.dynamic_constants = dynamic_constants_ref.take(); @@ -515,6 +433,7 @@ impl PassManager { let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let mut editor = FunctionEditor::new( &mut self.module.functions[idx], + FunctionID::new(idx), &constants_ref, &dynamic_constants_ref, &types_ref, @@ -590,6 +509,7 @@ impl PassManager { let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let mut editor = FunctionEditor::new( &mut self.module.functions[idx], + FunctionID::new(idx), &constants_ref, &dynamic_constants_ref, &types_ref, @@ -614,18 +534,21 @@ impl PassManager { let dynamic_constants_ref = RefCell::new(std::mem::take(&mut self.module.dynamic_constants)); let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); - let mut editors: Vec<_> = - zip(self.module.functions.iter_mut(), def_uses.iter()) - .map(|(func, def_use)| { - FunctionEditor::new( - func, - &constants_ref, - &dynamic_constants_ref, - &types_ref, - def_use, - ) - }) - .collect(); + let mut editors: Vec<_> = zip( + self.module.functions.iter_mut().enumerate(), + def_uses.iter(), + ) + .map(|((idx, func), def_use)| { + FunctionEditor::new( + func, + FunctionID::new(idx), + &constants_ref, + &dynamic_constants_ref, + &types_ref, + def_use, + ) + }) + .collect(); inline(&mut editors, callgraph); self.module.constants = constants_ref.take(); @@ -645,18 +568,21 @@ impl PassManager { RefCell::new(std::mem::take(&mut self.module.dynamic_constants)); let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let old_num_funcs = self.module.functions.len(); - let mut editors: Vec<_> = - zip(self.module.functions.iter_mut(), def_uses.iter()) - .map(|(func, def_use)| { - FunctionEditor::new( - func, - &constants_ref, - &dynamic_constants_ref, - &types_ref, - def_use, - ) - }) - .collect(); + let mut editors: Vec<_> = zip( + self.module.functions.iter_mut().enumerate(), + def_uses.iter(), + ) + .map(|((idx, func), def_use)| { + FunctionEditor::new( + func, + FunctionID::new(idx), + &constants_ref, + &dynamic_constants_ref, + &types_ref, + def_use, + ) + }) + .collect(); for editor in editors.iter_mut() { collapse_returns(editor); ensure_between_control_flow(editor); @@ -678,18 +604,21 @@ impl PassManager { let dynamic_constants_ref = RefCell::new(std::mem::take(&mut self.module.dynamic_constants)); let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); - let mut editors: Vec<_> = - zip(self.module.functions.iter_mut(), def_uses.iter()) - .map(|(func, def_use)| { - FunctionEditor::new( - func, - &constants_ref, - &dynamic_constants_ref, - &types_ref, - def_use, - ) - }) - .collect(); + let mut editors: Vec<_> = zip( + self.module.functions.iter_mut().enumerate(), + def_uses.iter(), + ) + .map(|((idx, func), def_use)| { + FunctionEditor::new( + func, + FunctionID::new(idx), + &constants_ref, + &dynamic_constants_ref, + &types_ref, + def_use, + ) + }) + .collect(); let mut new_funcs = vec![]; for (idx, editor) in editors.iter_mut().enumerate() { let new_func_id = FunctionID::new(old_num_funcs + new_funcs.len()); @@ -726,18 +655,21 @@ impl PassManager { // By default in an editor all nodes are mutable, which is desired in this case // since we are only modifying the IDs of functions that we call. - let mut editors: Vec<_> = - zip(self.module.functions.iter_mut(), def_uses.iter()) - .map(|(func, def_use)| { - FunctionEditor::new( - func, - &constants_ref, - &dynamic_constants_ref, - &types_ref, - def_use, - ) - }) - .collect(); + let mut editors: Vec<_> = zip( + self.module.functions.iter_mut().enumerate(), + def_uses.iter(), + ) + .map(|((idx, func), def_use)| { + FunctionEditor::new( + func, + FunctionID::new(idx), + &constants_ref, + &dynamic_constants_ref, + &types_ref, + def_use, + ) + }) + .collect(); let new_idx = delete_uncalled(&mut editors, callgraph); self.module.constants = constants_ref.take(); @@ -768,6 +700,7 @@ impl PassManager { let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let mut editor = FunctionEditor::new( &mut self.module.functions[idx], + FunctionID::new(idx), &constants_ref, &dynamic_constants_ref, &types_ref, @@ -796,6 +729,7 @@ impl PassManager { let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let mut editor = FunctionEditor::new( &mut self.module.functions[idx], + FunctionID::new(idx), &constants_ref, &dynamic_constants_ref, &types_ref, @@ -811,13 +745,24 @@ impl PassManager { } self.clear_analyses(); } - Pass::MaterializeClones => { + Pass::GCM => loop { self.make_def_uses(); + self.make_reverse_postorders(); + self.make_typing(); + self.make_control_subgraphs(); + self.make_doms(); + self.make_fork_join_maps(); + self.make_loops(); self.make_collection_objects(); - self.make_bbs(); let def_uses = self.def_uses.as_ref().unwrap(); + let reverse_postorders = self.reverse_postorders.as_ref().unwrap(); + let typing = self.typing.as_ref().unwrap(); + let doms = self.doms.as_ref().unwrap(); + let fork_join_maps = self.fork_join_maps.as_ref().unwrap(); + let loops = self.loops.as_ref().unwrap(); + let control_subgraphs = self.control_subgraphs.as_ref().unwrap(); let collection_objects = self.collection_objects.as_ref().unwrap(); - let bbs = self.bbs.as_ref().unwrap(); + let mut bbs = vec![]; for idx in 0..self.module.functions.len() { let constants_ref = RefCell::new(std::mem::take(&mut self.module.constants)); @@ -826,16 +771,25 @@ impl PassManager { let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let mut editor = FunctionEditor::new( &mut self.module.functions[idx], + FunctionID::new(idx), &constants_ref, &dynamic_constants_ref, &types_ref, &def_uses[idx], ); - materialize_clones( + if let Some(bb) = gcm( &mut editor, - &collection_objects[&FunctionID::new(idx)], - &bbs[idx], - ); + &def_uses[idx], + &reverse_postorders[idx], + &typing[idx], + &control_subgraphs[idx], + &doms[idx], + &fork_join_maps[idx], + &loops[idx], + collection_objects, + ) { + bbs.push(bb); + } self.module.constants = constants_ref.take(); self.module.dynamic_constants = dynamic_constants_ref.take(); @@ -844,6 +798,48 @@ impl PassManager { self.module.functions[idx].delete_gravestones(); } self.clear_analyses(); + if bbs.len() == self.module.functions.len() { + self.bbs = Some(bbs); + break; + } + }, + Pass::FloatCollections => { + self.make_def_uses(); + self.make_typing(); + self.make_callgraph(); + let def_uses = self.def_uses.as_ref().unwrap(); + let typing = self.typing.as_ref().unwrap(); + let callgraph = self.callgraph.as_ref().unwrap(); + let devices = device_placement(&self.module.functions, &callgraph); + let constants_ref = RefCell::new(std::mem::take(&mut self.module.constants)); + let dynamic_constants_ref = + RefCell::new(std::mem::take(&mut self.module.dynamic_constants)); + let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); + let mut editors: Vec<_> = zip( + self.module.functions.iter_mut().enumerate(), + def_uses.iter(), + ) + .map(|((idx, func), def_use)| { + FunctionEditor::new( + func, + FunctionID::new(idx), + &constants_ref, + &dynamic_constants_ref, + &types_ref, + def_use, + ) + }) + .collect(); + float_collections(&mut editors, typing, callgraph, &devices); + + self.module.constants = constants_ref.take(); + self.module.dynamic_constants = dynamic_constants_ref.take(); + self.module.types = types_ref.take(); + + for func in self.module.functions.iter_mut() { + func.delete_gravestones(); + } + self.clear_analyses(); } Pass::InferSchedules => { self.make_def_uses(); @@ -860,12 +856,17 @@ impl PassManager { let types_ref = RefCell::new(std::mem::take(&mut self.module.types)); let mut editor = FunctionEditor::new( &mut self.module.functions[idx], + FunctionID::new(idx), &constants_ref, &dynamic_constants_ref, &types_ref, &def_uses[idx], ); - infer_parallel_reduce(&mut editor, &fork_join_maps[idx], &reduce_cycles[idx]); + infer_parallel_reduce( + &mut editor, + &fork_join_maps[idx], + &reduce_cycles[idx], + ); infer_parallel_fork(&mut editor, &fork_join_maps[idx]); infer_vectorizable(&mut editor, &fork_join_maps[idx]); infer_tight_associative(&mut editor, &reduce_cycles[idx]); @@ -905,28 +906,21 @@ impl PassManager { if *force_analyses { self.make_doms(); self.make_fork_join_maps(); - self.make_bbs(); } xdot_module( &self.module, self.reverse_postorders.as_ref().unwrap(), self.doms.as_ref(), self.fork_join_maps.as_ref(), - self.bbs.as_ref(), ); } Pass::Codegen(output_dir, module_name) => { - self.make_reverse_postorders(); self.make_typing(); self.make_control_subgraphs(); - self.make_antideps(); - self.make_bbs(); self.make_collection_objects(); self.make_callgraph(); - let reverse_postorders = self.reverse_postorders.as_ref().unwrap(); let typing = self.typing.as_ref().unwrap(); let control_subgraphs = self.control_subgraphs.as_ref().unwrap(); - let antideps = self.antideps.as_ref().unwrap(); let bbs = self.bbs.as_ref().unwrap(); let collection_objects = self.collection_objects.as_ref().unwrap(); let callgraph = self.callgraph.as_ref().unwrap(); @@ -943,10 +937,8 @@ impl PassManager { &self.module.types, &self.module.constants, &self.module.dynamic_constants, - &reverse_postorders[idx], &typing[idx], &control_subgraphs[idx], - &antideps[idx], &bbs[idx], &mut llvm_ir, ) @@ -954,10 +946,8 @@ impl PassManager { Device::AsyncRust => rt_codegen( FunctionID::new(idx), &self.module, - &reverse_postorders[idx], &typing[idx], &control_subgraphs[idx], - &antideps[idx], &bbs[idx], &collection_objects, &callgraph, @@ -985,16 +975,18 @@ impl PassManager { println!("{}", rust_rt); // Write the LLVM IR into a temporary file. - let mut tmp_path = temp_dir(); + let tmp_dir = TempDir::new().unwrap(); + let mut tmp_path = tmp_dir.path().to_path_buf(); tmp_path.push(format!("{}.ll", module_name)); + println!("{}", tmp_path.display()); let mut file = File::create(&tmp_path) .expect("PANIC: Unable to open output LLVM IR file."); file.write_all(llvm_ir.as_bytes()) .expect("PANIC: Unable to write output LLVM IR file contents."); - println!("{}", tmp_path.display()); // Compile LLVM IR into an ELF object file. let output_archive = format!("{}/lib{}.a", output_dir, module_name); + println!("{}", output_archive); let mut clang_process = Command::new("clang") .arg(&tmp_path) .arg("--emit-static-lib") @@ -1007,15 +999,14 @@ impl PassManager { .spawn() .expect("Error running clang. Is it installed?"); assert!(clang_process.wait().unwrap().success()); - println!("{}", output_archive); // Write the Rust runtime into a file. let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name); + println!("{}", output_rt); let mut file = File::create(&output_rt) .expect("PANIC: Unable to open output Rust runtime file."); file.write_all(rust_rt.as_bytes()) .expect("PANIC: Unable to write output Rust runtime file contents."); - println!("{}", output_rt); } Pass::Serialize(output_file) => { let module_contents: Vec<u8> = postcard::to_allocvec(&self.module).unwrap(); @@ -1040,7 +1031,6 @@ impl PassManager { self.fork_join_nests = None; self.loops = None; self.reduce_cycles = None; - self.antideps = None; self.data_nodes_in_fork_joins = None; self.bbs = None; self.collection_objects = None; diff --git a/hercules_opt/src/phi_elim.rs b/hercules_opt/src/phi_elim.rs index 2788e56a9199d20997e1272fe495e45048d91767..a79b43d3ca6a6b7fa85f09127ce53ba70b0063be 100644 --- a/hercules_opt/src/phi_elim.rs +++ b/hercules_opt/src/phi_elim.rs @@ -1,12 +1,9 @@ -extern crate bitvec; -extern crate hercules_ir; - use std::collections::VecDeque; use std::iter::FromIterator; -use self::bitvec::prelude::*; +use bitvec::prelude::*; -use self::hercules_ir::ir::*; +use hercules_ir::ir::*; use crate::*; diff --git a/hercules_opt/src/pred.rs b/hercules_opt/src/pred.rs index 09d9753d0c09a561483c733eb87415beda435944..be1b4a0bb77e3ab225f495577eae9d6a24e52561 100644 --- a/hercules_opt/src/pred.rs +++ b/hercules_opt/src/pred.rs @@ -1,15 +1,12 @@ -extern crate bitvec; -extern crate hercules_ir; - use std::collections::HashMap; use std::collections::HashSet; use std::collections::VecDeque; -use self::bitvec::prelude::*; +use bitvec::prelude::*; -use self::hercules_ir::def_use::*; -use self::hercules_ir::dom::*; -use self::hercules_ir::ir::*; +use hercules_ir::def_use::*; +use hercules_ir::dom::*; +use hercules_ir::ir::*; /* * Top level function to convert acyclic control flow in vectorized fork-joins diff --git a/hercules_opt/src/schedule.rs b/hercules_opt/src/schedule.rs index ff895b1651b298cc59a8b0afbd921da2aaf04f82..2c8209aae002beffca3c8f7ee40d6986fed49fb0 100644 --- a/hercules_opt/src/schedule.rs +++ b/hercules_opt/src/schedule.rs @@ -1,9 +1,7 @@ -extern crate hercules_ir; - use std::collections::{HashMap, HashSet}; -use self::hercules_ir::def_use::*; -use self::hercules_ir::ir::*; +use hercules_ir::def_use::*; +use hercules_ir::ir::*; use crate::*; diff --git a/hercules_opt/src/sroa.rs b/hercules_opt/src/sroa.rs index a73ecb2b42ef42c94f3e97faa8d777a7040cc6f4..6461ad717d5b7cbeac0e237916d11d4a3a4ae6d6 100644 --- a/hercules_opt/src/sroa.rs +++ b/hercules_opt/src/sroa.rs @@ -1,8 +1,6 @@ -extern crate hercules_ir; - use std::collections::{BTreeMap, HashMap, VecDeque}; -use self::hercules_ir::ir::*; +use hercules_ir::ir::*; use crate::*; diff --git a/hercules_opt/src/unforkify.rs b/hercules_opt/src/unforkify.rs index 61c86a276c6f71be3790ec1d9342d6b364d020c4..a5df7a7c404e820a92d27842211dd0e3396dae41 100644 --- a/hercules_opt/src/unforkify.rs +++ b/hercules_opt/src/unforkify.rs @@ -1,9 +1,7 @@ -extern crate hercules_ir; - use std::collections::HashMap; use std::iter::zip; -use self::hercules_ir::ir::*; +use hercules_ir::ir::*; use crate::*; diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs index c32225b50bd1d02500ad210171cde0a58d67b17e..77fa1ff6525c575414b94e12905502b257ac1a12 100644 --- a/hercules_opt/src/utils.rs +++ b/hercules_opt/src/utils.rs @@ -1,7 +1,5 @@ -extern crate hercules_ir; - -use self::hercules_ir::def_use::*; -use self::hercules_ir::ir::*; +use hercules_ir::def_use::*; +use hercules_ir::ir::*; use crate::*; diff --git a/hercules_rt/Cargo.toml b/hercules_rt/Cargo.toml new file mode 100644 index 0000000000000000000000000000000000000000..c4678b18918f323e601a75b0efa1f0289b9b8de9 --- /dev/null +++ b/hercules_rt/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "hercules_rt" +version = "0.1.0" +authors = ["Russel Arbore <rarbore2@illinois.edu>"] +edition = "2021" + +[features] +cuda = [] + +[dependencies] + diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs new file mode 100644 index 0000000000000000000000000000000000000000..15b9f6396d2c90eee66dfecb5a255cef2890726b --- /dev/null +++ b/hercules_rt/build.rs @@ -0,0 +1,25 @@ +use std::env::var; +use std::path::Path; +use std::process::Command; + +fn main() { + if cfg!(feature = "cuda") { + let out_dir = var("OUT_DIR").unwrap(); + Command::new("nvcc") + .args(&["src/rtdefs.cu", "-c", "-o"]) + .arg(&format!("{}/rtdefs.o", out_dir)) + .status() + .expect("PANIC: NVCC failed when building runtime. Is NVCC installed?"); + Command::new("ar") + .args(&["crus", "librtdefs.a", "rtdefs.o"]) + .current_dir(&Path::new(&out_dir)) + .status() + .unwrap(); + + println!("cargo::rustc-link-search=native={}", out_dir); + println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/"); + println!("cargo::rustc-link-lib=static=rtdefs"); + println!("cargo::rustc-link-lib=cudart"); + println!("cargo::rerun-if-changed=src/rtdefs.cu"); + } +} diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs new file mode 100644 index 0000000000000000000000000000000000000000..2a96970e88e2b432c8c9c2ed896dc1c623781cbd --- /dev/null +++ b/hercules_rt/src/lib.rs @@ -0,0 +1,211 @@ +use std::alloc::{alloc, alloc_zeroed, dealloc, Layout}; +use std::marker::PhantomData; +use std::mem::swap; +use std::ptr::{copy_nonoverlapping, NonNull}; +use std::slice::from_raw_parts; + +#[cfg(feature = "cuda")] +extern "C" { + fn cuda_alloc(size: usize) -> *mut u8; + fn cuda_alloc_zeroed(size: usize) -> *mut u8; + fn cuda_dealloc(ptr: *mut u8); + fn copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize); + fn copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize); + fn copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize); +} + +/* + * An in-memory collection object that can be used by functions compiled by the + * Hercules compiler. + */ +pub struct HerculesBox<'a> { + cpu_shared: Option<NonNull<u8>>, + cpu_exclusive: Option<NonNull<u8>>, + cpu_owned: Option<NonNull<u8>>, + + #[cfg(feature = "cuda")] + cuda_owned: Option<NonNull<u8>>, + + size: usize, + _phantom: PhantomData<&'a u8>, +} + +impl<'b, 'a: 'b> HerculesBox<'a> { + pub fn from_slice<T>(slice: &'a [T]) -> Self { + HerculesBox { + cpu_shared: Some(unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) }), + cpu_exclusive: None, + cpu_owned: None, + + #[cfg(feature = "cuda")] + cuda_owned: None, + + size: slice.len() * size_of::<T>(), + _phantom: PhantomData, + } + } + + pub fn from_slice_mut<T>(slice: &'a mut [T]) -> Self { + HerculesBox { + cpu_shared: None, + cpu_exclusive: Some(unsafe { NonNull::new_unchecked(slice.as_mut_ptr() as *mut u8) }), + cpu_owned: None, + + #[cfg(feature = "cuda")] + cuda_owned: None, + + size: slice.len() * size_of::<T>(), + _phantom: PhantomData, + } + } + + pub fn as_slice<T>(&'b mut self) -> &'b [T] { + assert_eq!(self.size % size_of::<T>(), 0); + unsafe { from_raw_parts(self.__cpu_ptr() as *const T, self.size / size_of::<T>()) } + } + + unsafe fn get_cpu_ptr(&self) -> Option<NonNull<u8>> { + self.cpu_owned.or(self.cpu_exclusive).or(self.cpu_shared) + } + + #[cfg(feature = "cuda")] + unsafe fn get_cuda_ptr(&self) -> Option<NonNull<u8>> { + self.cuda_owned + } + + unsafe fn allocate_cpu(&mut self) -> NonNull<u8> { + if let Some(ptr) = self.cpu_owned { + ptr + } else { + let ptr = + NonNull::new(alloc(Layout::from_size_align_unchecked(self.size, 16))).unwrap(); + self.cpu_owned = Some(ptr); + ptr + } + } + + #[cfg(feature = "cuda")] + unsafe fn allocate_cuda(&mut self) -> NonNull<u8> { + if let Some(ptr) = self.cuda_owned { + ptr + } else { + let ptr = cuda_alloc(self.size); + self.cuda_owned = Some(NonNull::new(ptr).unwrap()); + self.cuda_owned.unwrap() + } + } + + unsafe fn deallocate_cpu(&mut self) { + if let Some(ptr) = self.cpu_owned { + dealloc( + ptr.as_ptr(), + Layout::from_size_align_unchecked(self.size, 16), + ); + self.cpu_owned = None; + } + } + + #[cfg(feature = "cuda")] + unsafe fn deallocate_cuda(&mut self) { + if let Some(ptr) = self.cuda_owned { + cuda_dealloc(ptr.as_ptr()); + self.cuda_owned = None; + } + } + + pub unsafe fn __zeros(size: u64) -> Self { + assert_ne!(size, 0); + let size = size as usize; + HerculesBox { + cpu_shared: None, + cpu_exclusive: None, + cpu_owned: Some( + NonNull::new(alloc_zeroed(Layout::from_size_align_unchecked(size, 16))).unwrap(), + ), + + #[cfg(feature = "cuda")] + cuda_owned: None, + + size: size, + _phantom: PhantomData, + } + } + + pub unsafe fn __null() -> Self { + HerculesBox { + cpu_shared: None, + cpu_exclusive: None, + cpu_owned: None, + + #[cfg(feature = "cuda")] + cuda_owned: None, + + size: 0, + _phantom: PhantomData, + } + } + + pub unsafe fn __take(&mut self) -> Self { + let mut ret = Self::__null(); + swap(&mut ret, self); + ret + } + + pub unsafe fn __cpu_ptr(&mut self) -> *mut u8 { + if let Some(ptr) = self.get_cpu_ptr() { + return ptr.as_ptr(); + } + #[cfg(feature = "cuda")] + { + let cuda_ptr = self.get_cuda_ptr().unwrap(); + let cpu_ptr = self.allocate_cpu(); + copy_cuda_to_cpu(cpu_ptr.as_ptr(), cuda_ptr.as_ptr(), self.size); + return cpu_ptr.as_ptr(); + } + panic!() + } + + pub unsafe fn __cpu_ptr_mut(&mut self) -> *mut u8 { + let cpu_ptr = self.__cpu_ptr(); + if Some(cpu_ptr) == self.cpu_shared.map(|nn| nn.as_ptr()) { + self.allocate_cpu(); + copy_nonoverlapping(cpu_ptr, self.cpu_owned.unwrap().as_ptr(), self.size); + } + self.cpu_shared = None; + self.cpu_exclusive = None; + #[cfg(feature = "cuda")] + self.deallocate_cuda(); + cpu_ptr + } + + #[cfg(feature = "cuda")] + pub unsafe fn __cuda_ptr(&mut self) -> *mut u8 { + if let Some(ptr) = self.get_cuda_ptr() { + ptr.as_ptr() + } else { + let cpu_ptr = self.get_cpu_ptr().unwrap(); + let cuda_ptr = self.allocate_cuda(); + copy_cpu_to_cuda(cuda_ptr.as_ptr(), cpu_ptr.as_ptr(), self.size); + cuda_ptr.as_ptr() + } + } + + #[cfg(feature = "cuda")] + pub unsafe fn __cuda_ptr_mut(&mut self) -> *mut u8 { + let cuda_ptr = self.__cuda_ptr(); + self.cpu_shared = None; + self.cpu_exclusive = None; + self.deallocate_cpu(); + cuda_ptr + } +} + +impl<'a> Drop for HerculesBox<'a> { + fn drop(&mut self) { + unsafe { + self.deallocate_cpu(); + #[cfg(feature = "cuda")] + self.deallocate_cuda(); + } + } +} diff --git a/hercules_rt/src/rtdefs.cu b/hercules_rt/src/rtdefs.cu new file mode 100644 index 0000000000000000000000000000000000000000..b7378d816c65666b445a989fc6b0530d821beb7b --- /dev/null +++ b/hercules_rt/src/rtdefs.cu @@ -0,0 +1,38 @@ +extern "C" { + void *cuda_alloc(size_t size) { + void *ptr = NULL; + cudaError_t res = cudaMalloc(&ptr, size); + if (res != cudaSuccess) { + ptr = NULL; + } + return ptr; + } + + void *cuda_alloc_zeroed(size_t size) { + void *ptr = cuda_alloc(size); + if (!ptr) { + return NULL; + } + cudaError_t res = cudaMemset(ptr, 0, size); + if (res != cudaSuccess) { + return NULL; + } + return ptr; + } + + void cuda_dealloc(void *ptr) { + cudaFree(ptr); + } + + void copy_cpu_to_cuda(void *dst, void *src, size_t size) { + cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); + } + + void copy_cuda_to_cpu(void *dst, void *src, size_t size) { + cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost); + } + + void copy_cuda_to_cuda(void *dst, void *src, size_t size) { + cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice); + } +} diff --git a/hercules_samples/call/build.rs b/hercules_samples/call/build.rs index dbefe008a14e57785261e2757bb0e0dbbb5fa27c..af48fe64f7c2b778c9841e45ecf13b8a6a5740d2 100644 --- a/hercules_samples/call/build.rs +++ b/hercules_samples/call/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/hercules_samples/call/src/main.rs b/hercules_samples/call/src/main.rs index b5c999fdac3738f6d3fced2164fbf9320d5a1034..0b657dd81e7ade1491f3c6c51fdd9a9f0abb5d57 100644 --- a/hercules_samples/call/src/main.rs +++ b/hercules_samples/call/src/main.rs @@ -1,8 +1,5 @@ #![feature(box_as_ptr, let_chains)] -extern crate async_std; -extern crate juno_build; - juno_build::juno!("call"); fn main() { diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs index 650b51b8b14715579de164f7fc65330e113613a1..f04d48c7d0ea6df8b16d70b05cedabfc04c1f6f2 100644 --- a/hercules_samples/ccp/build.rs +++ b/hercules_samples/ccp/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/hercules_samples/ccp/src/main.rs b/hercules_samples/ccp/src/main.rs index 9e2aced90044d1af68e6a47f9205ee1a21a69328..7f6459a0c3593c38794d5534ac7af41e404db8c1 100644 --- a/hercules_samples/ccp/src/main.rs +++ b/hercules_samples/ccp/src/main.rs @@ -1,8 +1,5 @@ #![feature(box_as_ptr, let_chains)] -extern crate async_std; -extern crate juno_build; - juno_build::juno!("ccp"); fn main() { diff --git a/hercules_samples/dot/Cargo.toml b/hercules_samples/dot/Cargo.toml index f74ab1f6f4ed5de3b02ab45b1f6fca461fdbc192..69cd39e388661b3f7f6dca53cf9210ab7050902c 100644 --- a/hercules_samples/dot/Cargo.toml +++ b/hercules_samples/dot/Cargo.toml @@ -10,6 +10,7 @@ juno_build = { path = "../../juno_build" } [dependencies] clap = { version = "*", features = ["derive"] } juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs index cfa03fd3b989748fe530071f289fb31f036283b4..2a239bc6c3ebd3780cb15358375c59bdfb2e25ae 100644 --- a/hercules_samples/dot/build.rs +++ b/hercules_samples/dot/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs index 0f5ee518506e6abe6cb815c244699ce12ccb45d1..0b5c6a93c3527d4c6863c48956f1d10947bdb017 100644 --- a/hercules_samples/dot/src/main.rs +++ b/hercules_samples/dot/src/main.rs @@ -1,31 +1,16 @@ #![feature(box_as_ptr, let_chains)] -extern crate async_std; -extern crate juno_build; - -use core::ptr::copy_nonoverlapping; +use hercules_rt::HerculesBox; juno_build::juno!("dot"); fn main() { async_std::task::block_on(async { - let a: Box<[f32]> = Box::new([0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]); - let b: Box<[f32]> = Box::new([0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]); - let mut a_bytes: Box<[u8]> = Box::new([0; 32]); - let mut b_bytes: Box<[u8]> = Box::new([0; 32]); - unsafe { - copy_nonoverlapping( - Box::as_ptr(&a) as *const u8, - Box::as_mut_ptr(&mut a_bytes) as *mut u8, - 32, - ); - copy_nonoverlapping( - Box::as_ptr(&b) as *const u8, - Box::as_mut_ptr(&mut b_bytes) as *mut u8, - 32, - ); - }; - let c = dot(8, a_bytes, b_bytes).await; + let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; + let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; + let a = HerculesBox::from_slice(&a); + let b = HerculesBox::from_slice(&b); + let c = dot(8, a, b).await; println!("{}", c); assert_eq!(c, 70.0); }); diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs index 49a6024828516f994522e66bf1de1d714f63e75b..4d8226f11183d9500e6affec4c46110e8626ee69 100644 --- a/hercules_samples/fac/build.rs +++ b/hercules_samples/fac/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/hercules_samples/fac/src/main.rs b/hercules_samples/fac/src/main.rs index 7071fd2c115bba1d6ff60fae688b262354d4fc71..b6e0257b0be48e39555050194be6d29459c3c765 100644 --- a/hercules_samples/fac/src/main.rs +++ b/hercules_samples/fac/src/main.rs @@ -1,7 +1,3 @@ -extern crate async_std; -extern crate clap; -extern crate juno_build; - juno_build::juno!("fac"); fn main() { diff --git a/hercules_samples/matmul/Cargo.toml b/hercules_samples/matmul/Cargo.toml index d3975c5ca58b68cdb3fef0f6d8a3cf8e106408d6..9066c1535e2c40400bdb3b5ca20a3e38237ef597 100644 --- a/hercules_samples/matmul/Cargo.toml +++ b/hercules_samples/matmul/Cargo.toml @@ -10,6 +10,7 @@ juno_build = { path = "../../juno_build" } [dependencies] clap = { version = "*", features = ["derive"] } juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } rand = "*" async-std = "*" with_builtin_macros = "0.1.0" diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs index ec6eb892326c5d6eb341c89239b0465de0cf4a49..08478deaac459d9a94f79fdabce37da9a1205f89 100644 --- a/hercules_samples/matmul/build.rs +++ b/hercules_samples/matmul/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs index 93d007c791579a75dea65d5680ab3018e9b00085..767fda07195544ae291e0e9e731a1ee43bd1615b 100644 --- a/hercules_samples/matmul/src/main.rs +++ b/hercules_samples/matmul/src/main.rs @@ -1,13 +1,9 @@ #![feature(box_as_ptr, let_chains)] -extern crate async_std; -extern crate juno_build; -extern crate rand; - -use core::ptr::copy_nonoverlapping; - use rand::random; +use hercules_rt::HerculesBox; + juno_build::juno!("matmul"); fn main() { @@ -15,31 +11,8 @@ fn main() { const I: usize = 256; const J: usize = 64; const K: usize = 128; - let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); - let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); - let mut a_bytes: Box<[u8]> = Box::new([0; I * J * 4]); - let mut b_bytes: Box<[u8]> = Box::new([0; J * K * 4]); - unsafe { - copy_nonoverlapping( - Box::as_ptr(&a) as *const u8, - Box::as_mut_ptr(&mut a_bytes) as *mut u8, - I * J * 4, - ); - copy_nonoverlapping( - Box::as_ptr(&b) as *const u8, - Box::as_mut_ptr(&mut b_bytes) as *mut u8, - J * K * 4, - ); - }; - let c_bytes = matmul(I as u64, J as u64, K as u64, a_bytes, b_bytes).await; - let mut c: Box<[i32]> = (0..I * K).map(|_| 0).collect(); - unsafe { - copy_nonoverlapping( - Box::as_ptr(&c_bytes) as *const u8, - Box::as_mut_ptr(&mut c) as *mut u8, - I * K * 4, - ); - }; + let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); + let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect(); for i in 0..I { for k in 0..K { @@ -48,7 +21,10 @@ fn main() { } } } - assert_eq!(c, correct_c); + let a = HerculesBox::from_slice_mut(&mut a); + let b = HerculesBox::from_slice_mut(&mut b); + let mut c = matmul(I as u64, J as u64, K as u64, a, b).await; + assert_eq!(c.as_slice::<i32>(), &*correct_c); }); } diff --git a/hercules_test/hercules_interpreter/src/interpreter.rs b/hercules_test/hercules_interpreter/src/interpreter.rs index a166427fe062f90577f1b29b193cc08e02e01193..621260e5a25597e3dedcc279348e0be7c1fb9472 100644 --- a/hercules_test/hercules_interpreter/src/interpreter.rs +++ b/hercules_test/hercules_interpreter/src/interpreter.rs @@ -1,19 +1,14 @@ -extern crate itertools; -extern crate ordered_float; -use crate::value; - -use self::itertools::Itertools; use std::collections::HashMap; use std::panic; use std::collections::hash_map::Entry::Occupied; -use value::*; +use itertools::Itertools; -extern crate hercules_ir; -extern crate hercules_opt; +use hercules_ir::*; -use self::hercules_ir::*; +use crate::value; +use value::*; /* High level design details / discussion for this: * diff --git a/hercules_test/hercules_interpreter/src/main.rs b/hercules_test/hercules_interpreter/src/main.rs index 4d2ebd31853aedf36fb5871e62af9422f49b3c0f..5db31cd730fe802dd9ccbf1b8e0d603c736fb196 100644 --- a/hercules_test/hercules_interpreter/src/main.rs +++ b/hercules_test/hercules_interpreter/src/main.rs @@ -1,18 +1,13 @@ -extern crate clap; -extern crate hercules_ir; -extern crate hercules_opt; -extern crate rand; +use std::fs::File; +use std::io::prelude::*; +use clap::Parser; + +use hercules_ir::*; use hercules_interpreter::interpreter::*; use hercules_interpreter::*; use hercules_interpreter::value; -use std::fs::File; -use std::io::prelude::*; - -use self::hercules_ir::*; - -use clap::Parser; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] diff --git a/hercules_test/hercules_interpreter/src/value.rs b/hercules_test/hercules_interpreter/src/value.rs index d236145c00e1bfd63a1b45ff5bebad93398062ea..e032bd5b711a713c55c6cd911c99e327ff1d2e0b 100644 --- a/hercules_test/hercules_interpreter/src/value.rs +++ b/hercules_test/hercules_interpreter/src/value.rs @@ -1,23 +1,18 @@ #![allow(unused)] -extern crate derive_more; use derive_more::From; /* Defines semantic meaning of IR operations. */ -extern crate itertools; use crate::dyn_const_value; -use self::itertools::Itertools; +use itertools::Itertools; use std::clone; use std::convert::TryInto; use std::panic; -extern crate hercules_ir; -extern crate hercules_opt; - -use self::hercules_ir::*; -use self::hercules_opt::*; +use hercules_ir::*; +use hercules_opt::*; #[derive(PartialEq, Debug, Clone, Eq)] pub enum InterpreterVal { diff --git a/hercules_test/hercules_tests/tests/opt_tests.rs b/hercules_test/hercules_tests/tests/opt_tests.rs index c14d4db5b0072ec0f724cdde9b467c7568e1b41c..388dfeddf38eecab3ffb191a344682395e71d5cd 100644 --- a/hercules_test/hercules_tests/tests/opt_tests.rs +++ b/hercules_test/hercules_tests/tests/opt_tests.rs @@ -1,11 +1,10 @@ use std::env; +use rand::Rng; + use hercules_interpreter::*; use hercules_opt::pass::Pass; -extern crate rand; -use rand::Rng; - #[test] fn matmul_int() { let module = parse_file("../test_inputs/matmul_int.hir"); @@ -198,4 +197,4 @@ fn sum_int2_smaller() { Pass::DCE, ], vec![1; 100]); -} \ No newline at end of file +} diff --git a/hercules_tools/hercules_driver/Cargo.toml b/hercules_tools/hercules_driver/Cargo.toml index 9236c34a948e61e962f2fddc3d23a4fd72cfaee6..ad9397b140052539a341084646d5f7fde1cbafff 100644 --- a/hercules_tools/hercules_driver/Cargo.toml +++ b/hercules_tools/hercules_driver/Cargo.toml @@ -2,6 +2,7 @@ name = "hercules_driver" version = "0.1.0" authors = ["Russel Arbore <rarbore2@illinois.edu>"] +edition = "2021" [dependencies] clap = { version = "*", features = ["derive"] } diff --git a/hercules_tools/hercules_driver/src/main.rs b/hercules_tools/hercules_driver/src/main.rs index 97c9fe2ef9f03c87f50d079d4b1aabad371b894f..a2550022129029387664fb4327528d09078c2e02 100644 --- a/hercules_tools/hercules_driver/src/main.rs +++ b/hercules_tools/hercules_driver/src/main.rs @@ -1,6 +1,3 @@ -extern crate clap; -extern crate postcard; - use std::fs::File; use std::io::prelude::*; use std::path::Path; diff --git a/juno_build/src/lib.rs b/juno_build/src/lib.rs index fdaf4d27cbcb7b31738e6df42d69272369d5026f..0c676e4c1b203c53e98a8430e0f2354104540e07 100644 --- a/juno_build/src/lib.rs +++ b/juno_build/src/lib.rs @@ -1,5 +1,3 @@ -extern crate hercules_ir; - use juno_compiler::*; use std::env::{current_dir, var}; diff --git a/juno_frontend/src/lib.rs b/juno_frontend/src/lib.rs index b18b29791b54aa267945ec8e658fdce069e250f3..906d780560626d7aff1716b4d159aa82d76a7df5 100644 --- a/juno_frontend/src/lib.rs +++ b/juno_frontend/src/lib.rs @@ -9,8 +9,6 @@ mod semant; mod ssa; mod types; -extern crate hercules_ir; - use std::fmt; use std::path::Path; @@ -187,18 +185,24 @@ pub fn compile_ir( //add_pass!(pm, verify, Forkify); //add_pass!(pm, verify, ForkGuardElim); add_verified_pass!(pm, verify, DCE); + add_pass!(pm, verify, ForkSplit); + add_pass!(pm, verify, Unforkify); + add_pass!(pm, verify, GVN); + add_verified_pass!(pm, verify, DCE); + add_pass!(pm, verify, DCE); add_pass!(pm, verify, Outline); add_pass!(pm, verify, InterproceduralSROA); add_pass!(pm, verify, SROA); add_pass!(pm, verify, InferSchedules); - add_pass!(pm, verify, ForkSplit); - add_pass!(pm, verify, Unforkify); - add_pass!(pm, verify, GVN); add_verified_pass!(pm, verify, DCE); if x_dot { pm.add_pass(hercules_opt::pass::Pass::Xdot(true)); } + add_pass!(pm, verify, GCM); + add_verified_pass!(pm, verify, DCE); + add_pass!(pm, verify, FloatCollections); + add_pass!(pm, verify, GCM); pm.add_pass(hercules_opt::pass::Pass::Codegen(output_dir, module_name)); pm.run_passes(); diff --git a/juno_frontend/src/main.rs b/juno_frontend/src/main.rs index 4624e7160d242b064fdae85e7eba889e752fff75..d98c1e29e5aedeac5ce2a4e791c669a7af34c320 100644 --- a/juno_frontend/src/main.rs +++ b/juno_frontend/src/main.rs @@ -1,5 +1,3 @@ -extern crate clap; - use juno_compiler::*; use clap::{ArgGroup, Parser}; diff --git a/juno_frontend/src/semant.rs b/juno_frontend/src/semant.rs index 1ccdf3ed80b33deab7fe51c51cf44c118f4c4518..660d8afe35a803e7395d67871236e0f596b09a87 100644 --- a/juno_frontend/src/semant.rs +++ b/juno_frontend/src/semant.rs @@ -1,5 +1,3 @@ -extern crate hercules_ir; - use std::collections::{HashMap, LinkedList}; use std::fmt; use std::fs::File; diff --git a/juno_frontend/src/ssa.rs b/juno_frontend/src/ssa.rs index f5e1d8302d8a407e2a6e1b6d89cc9b62d4e5c40e..578f7a9af120644d265d5c5b679fb97b35806d36 100644 --- a/juno_frontend/src/ssa.rs +++ b/juno_frontend/src/ssa.rs @@ -4,12 +4,11 @@ * Compiler Construction. CC 2013. Lecture Notes in Computer Science, vol 7791. Springer, Berlin, * Heidelberg. https://doi.org/10.1007/978-3-642-37051-9_6 */ -extern crate hercules_ir; use std::collections::{HashMap, HashSet}; -use self::hercules_ir::build::*; -use self::hercules_ir::ir::*; +use hercules_ir::build::*; +use hercules_ir::ir::*; use crate::labeled_builder::LabeledBuilder; pub struct SSA { diff --git a/juno_frontend/src/types.rs b/juno_frontend/src/types.rs index 582e7cfddb87cc47e467662d52e87029319e99ad..5f907cd9370a343eb501e0c4b5b1dcef2a55f651 100644 --- a/juno_frontend/src/types.rs +++ b/juno_frontend/src/types.rs @@ -1,11 +1,12 @@ use std::collections::{HashMap, HashSet, VecDeque}; use crate::dynconst::DynConst; -use crate::hercules_ir::build::*; -use crate::hercules_ir::ir::*; use crate::locs::Location; use crate::parser; +use hercules_ir::build::*; +use hercules_ir::ir::*; + #[derive(Copy, Clone, PartialEq, Eq, Debug)] pub enum Either<A, B> { Left(A), diff --git a/juno_samples/antideps/Cargo.toml b/juno_samples/antideps/Cargo.toml index 40b4d47c57bbe422bfe983ee73a1d140a2ab1b94..9bd1d5a0d484e257ee3ad1e425a09f503da4b503 100644 --- a/juno_samples/antideps/Cargo.toml +++ b/juno_samples/antideps/Cargo.toml @@ -13,5 +13,6 @@ juno_build = { path = "../../juno_build" } [dependencies] juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs index 757243b829c3f805f8d87d38676e72e2b532f072..7ed716a444460d7a90965f5b7f5faf3a7aadcb14 100644 --- a/juno_samples/antideps/build.rs +++ b/juno_samples/antideps/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/juno_samples/antideps/src/antideps.jn b/juno_samples/antideps/src/antideps.jn index 5949c91a4e64bea9e3afa966f1f9f6a160d8553a..9efe71f10963aacf1620c4348abef6a74d8cb502 100644 --- a/juno_samples/antideps/src/antideps.jn +++ b/juno_samples/antideps/src/antideps.jn @@ -7,7 +7,20 @@ fn simple_antideps(a : usize, b : usize) -> i32 { } #[entry] -fn complex_antideps(x : i32) -> i32 { +fn loop_antideps(input : i32) -> i32 { + let arr1 : i32[1]; + arr1[0] = 2; + let p1 = arr1[0]; + while input > 10 { + arr1[0] = arr1[0] + 1; + input -= 10; + } + let p2 = arr1[0]; + return p1 + p2; +} + +#[entry] +fn complex_antideps1(x : i32) -> i32 { let arr : i32[4]; let arr2 : i32[12]; arr[1] = 7 + arr2[0]; @@ -28,6 +41,23 @@ fn complex_antideps(x : i32) -> i32 { return r; } +#[entry] +fn complex_antideps2(input : i32) -> i32 { + let arr1 : i32[2]; + arr1[0] = 2; + arr1[1] = 3; + let p1 = arr1[0] + arr1[1]; + if input > 0 { + while input > 10 { + arr1[0] = arr1[1] + input; + arr1[1] = arr1[0] + input; + input -= 10; + } + } + let p2 = arr1[0]; + return p1 + p2; +} + #[entry] fn very_complex_antideps(x: usize) -> usize { let arr1 : usize[203]; diff --git a/juno_samples/antideps/src/main.rs b/juno_samples/antideps/src/main.rs index b0a991637bde67a0229fb749213927b8e14c06dd..0b065cbaa6e6cffcaf9ff7b3fbc5a2c882dc7248 100644 --- a/juno_samples/antideps/src/main.rs +++ b/juno_samples/antideps/src/main.rs @@ -1,8 +1,5 @@ #![feature(future_join, box_as_ptr)] -extern crate async_std; -extern crate juno_build; - juno_build::juno!("antideps"); fn main() { @@ -11,10 +8,18 @@ fn main() { println!("{}", output); assert_eq!(output, 5); - let output = complex_antideps(9).await; + let output = loop_antideps(11).await; + println!("{}", output); + assert_eq!(output, 5); + + let output = complex_antideps1(9).await; println!("{}", output); assert_eq!(output, 20); + let output = complex_antideps2(44).await; + println!("{}", output); + assert_eq!(output, 226); + let output = very_complex_antideps(3).await; println!("{}", output); assert_eq!(output, 144); diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs index fafa97bbc642751c37b2ef47a43954fa84f340d9..16d5c7a4f7fcb00344fc7669b67103a27f71a7c6 100644 --- a/juno_samples/casts_and_intrinsics/build.rs +++ b/juno_samples/casts_and_intrinsics/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/juno_samples/casts_and_intrinsics/src/main.rs b/juno_samples/casts_and_intrinsics/src/main.rs index 037d4c4025ca141887034353124436a2db8f84f3..8ee509bfb401aca4bd42751c0fe34beee94c7324 100644 --- a/juno_samples/casts_and_intrinsics/src/main.rs +++ b/juno_samples/casts_and_intrinsics/src/main.rs @@ -1,8 +1,5 @@ #![feature(future_join)] -extern crate async_std; -extern crate juno_build; - juno_build::juno!("casts_and_intrinsics"); fn main() { diff --git a/juno_samples/implicit_clone/Cargo.toml b/juno_samples/implicit_clone/Cargo.toml index 928fa1f2718b9637da2ac3fe740a0d893345d576..b312f5def295a6c31a7d2c9eab5c23f4e16ccd2f 100644 --- a/juno_samples/implicit_clone/Cargo.toml +++ b/juno_samples/implicit_clone/Cargo.toml @@ -13,5 +13,6 @@ juno_build = { path = "../../juno_build" } [dependencies] juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs index 3378966b64a71a239178edc6a010f9081939f58a..75c1afc41a75b2006b26042323df3bdc3fcf5a17 100644 --- a/juno_samples/implicit_clone/build.rs +++ b/juno_samples/implicit_clone/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn index 17e345e51e80db27c0d2f21854e22abf1eefcdb8..882e5abc51bb596be63512eaff96434a2c45d43a 100644 --- a/juno_samples/implicit_clone/src/implicit_clone.jn +++ b/juno_samples/implicit_clone/src/implicit_clone.jn @@ -1,5 +1,5 @@ #[entry] -fn implicit_clone(input : i32) -> i32 { +fn simple_implicit_clone(input : i32) -> i32 { let arr : i32[3]; arr[0] = 2; let arr2 = arr; @@ -7,3 +7,150 @@ fn implicit_clone(input : i32) -> i32 { arr[2] = 4; return arr[0] + arr2[0] + arr[1] + arr2[1] + arr[2] + arr2[2]; } + +#[entry] +fn loop_implicit_clone(input : i32) -> i32 { + let arr : i32[3]; + let r : i32 = 5; + while input > 0 { + r = arr[0]; + let arr2 = arr; + let x = arr2[input as usize - input as usize]; + arr2[input as usize - input as usize] = 9; + if x == 0 { + input -= arr2[0]; + } else { + r = 99; + break; + } + } + return r + 7; +} + +#[entry] +fn double_loop_implicit_clone(a : usize) -> usize { + for i = 0 to a { + let arr : i32[1]; + for j = 0 to a { + arr[0] = 1; + } + } + return 42; +} + +#[entry] +fn tricky_loop_implicit_clone(a : usize, b : usize) -> i32 { + let x = 0; + for j = 0 to 2 { + for i = 0 to 5 { + let arr : i32[3]; + let arr2 : i32[1]; + if a == b { + arr[a] += 7; + } else { + arr[a] += 1; + } + for k = 0 to (a + b - 1) { + arr[a] += 2; + arr2[0] += 1; + } + x += arr[b]; + } + } + return x; +} + +#[entry] +fn tricky2_loop_implicit_clone(a : usize, b : usize) -> i32 { + let x = 0; + for i = 0 to 3 { + let arr1 : i32[1]; + let arr2 : i32[1]; + if a == b { + arr1[0] = 6; + } else { + arr2[0] = 9; + } + arr1[0] = 2; + for j = 0 to 4 { + arr2[0] += 1; + } + x += arr2[0]; + } + return x; +} + +#[entry] +fn tricky3_loop_implicit_clone(a : usize, b : usize) -> usize { + let x = 0; + for i = 0 to b { + let arr1 : usize[10]; + let arr2 : usize[10]; + arr1[1] = 1; + for kk = 0 to 10 { + arr2[kk] += arr1[kk]; + } + x += arr2[1]; + } + return x; +} + +#[entry] +fn no_implicit_clone(input : i32) -> i32 { + let arr : i32[2]; + arr[0] = input; + while input > 0 { + arr[0] += 1; + input -= 1; + } + let arr2 : i32[1]; + if input == 0 { + arr2[0] = 5; + } else { + arr2[0] = 3; + } + return arr[0] + arr2[0]; +} + +#[entry] +fn mirage_implicit_clone(input : i32) -> i32 { + let arr1 : i32[2]; + let arr2 : i32[2]; + let arr3 : i32[2]; + let arr4 : i32[2]; + arr1[0] = 7; + arr1[1] = 3; + arr2[0] = input; + arr2[1] = 45; + arr3[0] = -14; + arr3[1] = -5; + arr4[0] = -1; + arr4[1] = 0; + arr2 = arr4; + arr3 = arr2; + arr2 = arr1; + let p1 = arr1[0] + arr1[1] + arr2[0] + arr2[1] + arr3[0] + arr3[1] + arr4[0] + arr4[1]; // 18 + arr4 = arr2; + let p2 = arr1[0] + arr1[1] + arr2[0] + arr2[1] + arr3[0] + arr3[1] + arr4[0] + arr4[1]; // 29 + if input > 0 { + while input > 10 { + arr1[0] = arr1[1] + input; + arr1[1] = arr1[0] + input; + input -= 10; + } + } + let p3 = arr1[0]; // 592 + let x : i32 = 0; + while input < 20 { + let arr5 : i32[2]; + arr5[0] = 7; + let y = arr5[0] + arr5[1]; + arr5 = arr4; + arr5[1] += 2; + y += arr5[1]; + x += 12; + input += 1; + } + let p4 = x; // 204 + return p1 + p2 + p3 + p4; +} diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs index ca7ddeb1571be6698f6a9c3971ef617b3a6fd4ca..bc687ed386f091e674c5eb81122adf30c06f8da9 100644 --- a/juno_samples/implicit_clone/src/main.rs +++ b/juno_samples/implicit_clone/src/main.rs @@ -1,15 +1,40 @@ #![feature(future_join, box_as_ptr)] -extern crate async_std; -extern crate juno_build; - juno_build::juno!("implicit_clone"); fn main() { async_std::task::block_on(async { - let output = implicit_clone(3).await; + let output = simple_implicit_clone(3).await; println!("{}", output); assert_eq!(output, 11); + + let output = loop_implicit_clone(100).await; + println!("{}", output); + assert_eq!(output, 7); + + let output = double_loop_implicit_clone(3).await; + println!("{}", output); + assert_eq!(output, 42); + + let output = tricky_loop_implicit_clone(2, 2).await; + println!("{}", output); + assert_eq!(output, 130); + + let output = tricky2_loop_implicit_clone(2, 3).await; + println!("{}", output); + assert_eq!(output, 39); + + let output = tricky3_loop_implicit_clone(5, 7).await; + println!("{}", output); + assert_eq!(output, 7); + + let output = no_implicit_clone(4).await; + println!("{}", output); + assert_eq!(output, 13); + + let output = mirage_implicit_clone(73).await; + println!("{}", output); + assert_eq!(output, 843); }); } diff --git a/juno_samples/matmul/Cargo.toml b/juno_samples/matmul/Cargo.toml index ea705dddd2fac0e4b5a4b8fe0ddfeef72039e3c4..8ad95853d25509d91713a080c0b63b01c0469110 100644 --- a/juno_samples/matmul/Cargo.toml +++ b/juno_samples/matmul/Cargo.toml @@ -13,6 +13,7 @@ juno_build = { path = "../../juno_build" } [dependencies] juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" rand = "*" diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index 81f645e0666dfb22e075953c3d0a1a531909f1a0..926fbc33ecfa5ab31b40a92f778bb4d3b7f6a77e 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs index 948459dd4badfbdf31fea25be70520df22ff5b6e..bace3765789010655ae058aa92f95862b93b3084 100644 --- a/juno_samples/matmul/src/main.rs +++ b/juno_samples/matmul/src/main.rs @@ -1,13 +1,9 @@ -#![feature(future_join, box_as_ptr, let_chains)] - -extern crate async_std; -extern crate juno_build; -extern crate rand; - -use core::ptr::copy_nonoverlapping; +#![feature(box_as_ptr, let_chains)] use rand::random; +use hercules_rt::HerculesBox; + juno_build::juno!("matmul"); fn main() { @@ -17,45 +13,6 @@ fn main() { const K: usize = 128; let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect(); let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect(); - let mut a_bytes: Box<[u8]> = Box::new([0; I * J * 4]); - let mut b_bytes: Box<[u8]> = Box::new([0; J * K * 4]); - unsafe { - copy_nonoverlapping( - Box::as_ptr(&a) as *const u8, - Box::as_mut_ptr(&mut a_bytes) as *mut u8, - I * J * 4, - ); - copy_nonoverlapping( - Box::as_ptr(&b) as *const u8, - Box::as_mut_ptr(&mut b_bytes) as *mut u8, - J * K * 4, - ); - }; - let c_bytes = matmul( - I as u64, - J as u64, - K as u64, - a_bytes.clone(), - b_bytes.clone(), - ) - .await; - let mut c: Box<[i32]> = (0..I * K).map(|_| 0).collect(); - unsafe { - copy_nonoverlapping( - Box::as_ptr(&c_bytes) as *const u8, - Box::as_mut_ptr(&mut c) as *mut u8, - I * K * 4, - ); - }; - let tiled_c_bytes = matmul(I as u64, J as u64, K as u64, a_bytes, b_bytes).await; - let mut tiled_c: Box<[i32]> = (0..I * K).map(|_| 0).collect(); - unsafe { - copy_nonoverlapping( - Box::as_ptr(&tiled_c_bytes) as *const u8, - Box::as_mut_ptr(&mut tiled_c) as *mut u8, - I * K * 4, - ); - }; let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect(); for i in 0..I { for k in 0..K { @@ -64,8 +21,18 @@ fn main() { } } } - assert_eq!(c, correct_c); - assert_eq!(tiled_c, correct_c); + let mut c = { + let a = HerculesBox::from_slice(&a); + let b = HerculesBox::from_slice(&b); + matmul(I as u64, J as u64, K as u64, a, b).await + }; + assert_eq!(c.as_slice::<i32>(), &*correct_c); + let mut tiled_c = { + let a = HerculesBox::from_slice(&a); + let b = HerculesBox::from_slice(&b); + tiled_64_matmul(I as u64, J as u64, K as u64, a, b).await + }; + assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c); }); } @@ -73,3 +40,4 @@ fn main() { fn matmul_test() { main(); } + diff --git a/juno_samples/matmul/src/matmul.jn b/juno_samples/matmul/src/matmul.jn index 775bb382a46a640cc45ddbf5d68b31c5c1d58bd3..ca9be73a86144ebe57048ed35adc2f36bfcd905b 100644 --- a/juno_samples/matmul/src/matmul.jn +++ b/juno_samples/matmul/src/matmul.jn @@ -17,25 +17,25 @@ fn matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]) -> i32[ #[entry] fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]) -> i32[n, l] { let res : i32[n, l]; + let atile : i32[64, 64]; + let btile : i32[64, 64]; + let ctile : i32[64, 64]; for bi = 0 to n / 64 { for bk = 0 to l / 64 { - // TODO: make these all the same size, clone analysis should undo GVN's - // combining of these three arrays. - let atile : i32[66, 64]; - let btile : i32[65, 64]; - let ctile : i32[64, 64]; + for ti = 0 to 64 { + for tk = 0 to 64 { + atile[ti, tk] = 0; + btile[ti, tk] = 0; + ctile[ti, tk] = 0; + } + } for tile_idx = 0 to m / 64 { for ti = 0 to 64 { for tk = 0 to 64 { atile[ti, tk] = a[bi * 64 + ti, tile_idx * 64 + tk]; btile[ti, tk] = b[tile_idx * 64 + ti, bk * 64 + tk]; - // TODO: remove setting ctile to zero explicitly, clone analysis - // should see a lack of a phi for ctile in the block loops and - // induce a copy of an initial value of ctile (all zeros) on each - // iteration of the block loops. - ctile[ti, tk] = 0; } } for ti = 0 to 64 { diff --git a/juno_samples/nested_ccp/Cargo.toml b/juno_samples/nested_ccp/Cargo.toml index 7ffc13f21b155dbe6028d808be97aaf0e5ffb8d6..8c9b969d23019b8bbd3bf28b3506e2e497ae8ec7 100644 --- a/juno_samples/nested_ccp/Cargo.toml +++ b/juno_samples/nested_ccp/Cargo.toml @@ -13,5 +13,6 @@ juno_build = { path = "../../juno_build" } [dependencies] juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs index 0fb54be347f011f4c29f1d7df98a0f29c85193c6..c5c7ca6a1b9ab5decf6a8cf0b8e8f13ff7122834 100644 --- a/juno_samples/nested_ccp/build.rs +++ b/juno_samples/nested_ccp/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs index 83132aca516bd59a6e5b5c0e5ca4562a2e1cc3cc..974a488c8f3574aaecfd40b1110b6a7022891f0f 100644 --- a/juno_samples/nested_ccp/src/main.rs +++ b/juno_samples/nested_ccp/src/main.rs @@ -1,32 +1,17 @@ #![feature(box_as_ptr, let_chains)] -extern crate async_std; -extern crate juno_build; - -use core::ptr::copy_nonoverlapping; +use hercules_rt::HerculesBox; juno_build::juno!("nested_ccp"); fn main() { async_std::task::block_on(async { - let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); - let b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]); - let mut a_bytes: Box<[u8]> = Box::new([0; 12]); - let mut b_bytes: Box<[u8]> = Box::new([0; 36]); - unsafe { - copy_nonoverlapping( - Box::as_ptr(&a) as *const u8, - Box::as_mut_ptr(&mut a_bytes) as *mut u8, - 12, - ); - copy_nonoverlapping( - Box::as_ptr(&b) as *const u8, - Box::as_mut_ptr(&mut b_bytes) as *mut u8, - 36, - ); - }; - let output_example = ccp_example(a_bytes).await; - let output_median = median_array(9, b_bytes).await; + let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); + let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]); + let a = HerculesBox::from_slice_mut(&mut a); + let b = HerculesBox::from_slice_mut(&mut b); + let output_example = ccp_example(a).await; + let output_median = median_array(9, b).await; println!("{}", output_example); println!("{}", output_median); assert_eq!(output_example, 1.0); diff --git a/juno_samples/simple3/Cargo.toml b/juno_samples/simple3/Cargo.toml index 201c8d3782d4b41d7bfef5b7df4b5b29758e6e00..8060c5b3472ad898cb48e011332a852cd7b6705e 100644 --- a/juno_samples/simple3/Cargo.toml +++ b/juno_samples/simple3/Cargo.toml @@ -13,5 +13,6 @@ juno_build = { path = "../../juno_build" } [dependencies] juno_build = { path = "../../juno_build" } +hercules_rt = { path = "../../hercules_rt" } with_builtin_macros = "0.1.0" async-std = "*" diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs index 0e476e8d41c7880741a3f474f341a0decf0bda4b..94760025d53abe7e10914052e1a7783386b316b0 100644 --- a/juno_samples/simple3/build.rs +++ b/juno_samples/simple3/build.rs @@ -1,4 +1,3 @@ -extern crate juno_build; use juno_build::JunoCompiler; fn main() { diff --git a/juno_samples/simple3/src/main.rs b/juno_samples/simple3/src/main.rs index 89be5527a2e4d1e5842778818aa934db98fcdf09..1f6e213c41a27ceeb9c2988fe85bd956dec879d1 100644 --- a/juno_samples/simple3/src/main.rs +++ b/juno_samples/simple3/src/main.rs @@ -1,31 +1,16 @@ #![feature(box_as_ptr, let_chains)] -extern crate async_std; -extern crate juno_build; - -use core::ptr::copy_nonoverlapping; +use hercules_rt::HerculesBox; juno_build::juno!("simple3"); fn main() { async_std::task::block_on(async { - let a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]); - let b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]); - let mut a_bytes: Box<[u8]> = Box::new([0; 32]); - let mut b_bytes: Box<[u8]> = Box::new([0; 32]); - unsafe { - copy_nonoverlapping( - Box::as_ptr(&a) as *const u8, - Box::as_mut_ptr(&mut a_bytes) as *mut u8, - 32, - ); - copy_nonoverlapping( - Box::as_ptr(&b) as *const u8, - Box::as_mut_ptr(&mut b_bytes) as *mut u8, - 32, - ); - }; - let c = simple3(8, a_bytes, b_bytes).await; + let mut a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]); + let mut b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]); + let a = HerculesBox::from_slice_mut(&mut a); + let b = HerculesBox::from_slice_mut(&mut b); + let c = simple3(8, a, b).await; println!("{}", c); assert_eq!(c, 120); }); diff --git a/juno_scheduler/src/lib.rs b/juno_scheduler/src/lib.rs index 7e558d6bcb7a5c9250f3efe862ff235463c3599c..d515633eec468012ceee7305544b1859c1ebd621 100644 --- a/juno_scheduler/src/lib.rs +++ b/juno_scheduler/src/lib.rs @@ -1,5 +1,3 @@ -extern crate hercules_ir; - use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::Read; @@ -7,7 +5,7 @@ use std::io::Read; use lrlex::DefaultLexerTypes; use lrpar::NonStreamingLexer; -use self::hercules_ir::ir::*; +use hercules_ir::ir::*; mod parser; use crate::parser::lexer; diff --git a/reports/technical1/main.tex b/reports/technical1/main.tex index 4743d2069ce05baae6e37ea3f8c24582cbd0e9ff..e70625d4cd8adcf9157e1234d45eac177fa1f255 100644 --- a/reports/technical1/main.tex +++ b/reports/technical1/main.tex @@ -502,9 +502,6 @@ The manifest plus the Hercules module itself are loaded from a \texttt{.hbin} fi Here's an example host code written in Rust: \begin{lstlisting}[style=colouredRust] -extern crate async_std; -extern crate hercules_rt; - hercules_rt::use_hbin!("my_module.hbin"); fn main() {