diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs index b15cf30106c76640016b1957fe9906ac01c74858..37bf814d85569a5aa8dd827e2fce17894da5a397 100644 --- a/hercules_cg/src/cpu.rs +++ b/hercules_cg/src/cpu.rs @@ -16,6 +16,7 @@ static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0); * LLVM bindings for Rust, and we are *not* writing any C++. */ pub fn cpu_codegen<W: Write>( + module_name: &str, function: &Function, types: &Vec<Type>, constants: &Vec<Constant>, @@ -27,6 +28,7 @@ pub fn cpu_codegen<W: Write>( w: &mut W, ) -> Result<(), Error> { let ctx = CPUContext { + module_name, function, types, constants, @@ -40,6 +42,7 @@ pub fn cpu_codegen<W: Write>( } struct CPUContext<'a> { + module_name: &'a str, function: &'a Function, types: &'a Vec<Type>, constants: &'a Vec<Constant>, @@ -65,16 +68,18 @@ impl<'a> CPUContext<'a> { if self.types[return_type.idx()].is_primitive() { write!( w, - "define dso_local {} @{}(", + "define dso_local {} @{}_{}(", self.get_type(return_type), - self.function.name + self.module_name, + self.function.name, )?; } else { write!( w, - "define dso_local nonnull noundef {} @{}(", + "define dso_local nonnull noundef {} @{}_{}(", self.get_type(return_type), - self.function.name + self.module_name, + self.function.name, )?; } } else { @@ -89,7 +94,11 @@ impl<'a> CPUContext<'a> { .collect::<Vec<_>>() .join(", "), )?; - write!(w, "define dso_local void @{}(", self.function.name,)?; + write!( + w, + "define dso_local void @{}_{}(", + self.module_name, self.function.name, + )?; } let mut first_param = true; // The first parameter is a pointer to CPU backing memory, if it's diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a3eea2745ea6dde2929b9cd6fcc17f5c6483643f..c9720273c03243d4874b27fc6c74f20fd21a6c33 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -14,6 +14,7 @@ use crate::*; * of similarities with the CPU LLVM generation plus custom GPU parallelization. */ pub fn gpu_codegen<W: Write>( + module_name: &str, function: &Function, types: &Vec<Type>, constants: &Vec<Constant>, @@ -170,6 +171,7 @@ pub fn gpu_codegen<W: Write>( }; let ctx = GPUContext { + module_name, function, types, constants, @@ -199,6 +201,7 @@ struct GPUKernelParams { } struct GPUContext<'a> { + module_name: &'a str, function: &'a Function, types: &'a Vec<Type>, constants: &'a Vec<Constant>, @@ -395,8 +398,8 @@ namespace cg = cooperative_groups; fn codegen_kernel_begin<W: Write>(&self, w: &mut W) -> Result<(), Error> { write!( w, - "__global__ void __launch_bounds__({}) {}_gpu(", - self.kernel_params.max_num_threads, self.function.name + "__global__ void __launch_bounds__({}) {}_{}_gpu(", + self.kernel_params.max_num_threads, self.module_name, self.function.name )?; let mut first_param = true; // The first parameter is a pointer to GPU backing memory, if it's @@ -645,7 +648,7 @@ namespace cg = cooperative_groups; } else { write!(w, "{}", self.get_type(self.function.return_types[0], false))?; } - write!(w, " {}(", self.function.name)?; + write!(w, " {}_{}(", self.module_name, self.function.name)?; let mut first_param = true; // The first parameter is a pointer to GPU backing memory, if it's @@ -721,8 +724,13 @@ namespace cg = cooperative_groups; write!(w, "\tcudaError_t err;\n")?; write!( w, - "\t{}_gpu<<<{}, {}, {}>>>({});\n", - self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args + "\t{}_{}_gpu<<<{}, {}, {}>>>({});\n", + self.module_name, + self.function.name, + num_blocks, + num_threads, + dynamic_shared_offset, + pass_args )?; write!(w, "\terr = cudaGetLastError();\n")?; write!( diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 8fa0c09ee512e3f2e43c5280bc3cb6947bc31dc5..6981a3da7e59176f73d6fecdde07fe636cc6aecf 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -74,6 +74,7 @@ use crate::*; * set some CUDA memory - the user can then take a CUDA reference to that box. */ pub fn rt_codegen<W: Write>( + module_name: &str, func_id: FunctionID, module: &Module, def_use: &ImmutableDefUseMap, @@ -96,6 +97,7 @@ pub fn rt_codegen<W: Write>( .map(|(fork, join)| (*join, *fork)) .collect(); let ctx = RTContext { + module_name, func_id, module, def_use, @@ -117,6 +119,7 @@ pub fn rt_codegen<W: Write>( } struct RTContext<'a> { + module_name: &'a str, func_id: FunctionID, module: &'a Module, def_use: &'a ImmutableDefUseMap, @@ -157,7 +160,8 @@ impl<'a> RTContext<'a> { // Dump the function signature. write!( w, - "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}(", + "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}_{}(", + self.module_name, func.name )?; let mut first_param = true; @@ -236,7 +240,7 @@ impl<'a> RTContext<'a> { // Create the return struct write!(w, "let mut ret_struct: ::std::mem::MaybeUninit<ReturnStruct> = ::std::mem::MaybeUninit::uninit();")?; // Call the device function - write!(w, "{}(", callee.name)?; + write!(w, "{}_{}(", self.module_name, callee.name)?; if self.backing_allocations[&callee_id].contains_key(&self.devices[callee_id.idx()]) { write!(w, "backing, ")?; @@ -672,8 +676,9 @@ impl<'a> RTContext<'a> { }; write!( block, - "{}{}(", + "{}{}_{}(", prefix, + self.module_name, self.module.functions[callee_id.idx()].name )?; for (device, (offset, size)) in self.backing_allocations[&self.func_id] @@ -1463,7 +1468,7 @@ impl<'a> RTContext<'a> { } // Call the wrapped function. - write!(w, "let ret = {}(", func.name)?; + write!(w, "let ret = {}_{}(", self.module_name, func.name)?; for (device, _) in self.backing_allocations[&self.func_id].iter() { write!( w, @@ -1630,8 +1635,9 @@ impl<'a> RTContext<'a> { let func = &self.module.functions[func_id.idx()]; write!( w, - "{}fn {}(", + "{}fn {}_{}(", if is_unsafe { "unsafe " } else { "" }, + self.module_name, func.name )?; let mut first_param = true; @@ -1667,7 +1673,7 @@ impl<'a> RTContext<'a> { func_id: FunctionID, ) -> Result<(), Error> { let func = &self.module.functions[func_id.idx()]; - write!(w, "fn {}(", func.name)?; + write!(w, "fn {}_{}(", self.module_name, func.name)?; let mut first_param = true; if self.backing_allocations[&func_id].contains_key(&self.devices[func_id.idx()]) { first_param = false; diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs index 17cea32500ab86779c01e37e2b03db842f9f3712..0e332a0033c50585160242f04bfdcceb37f87ad5 100644 --- a/hercules_opt/src/editor.rs +++ b/hercules_opt/src/editor.rs @@ -880,6 +880,18 @@ impl<'a, 'b> FunctionEdit<'a, 'b> { } } + pub fn get_param_types(&self) -> &Vec<TypeID> { + self.updated_param_types + .as_ref() + .unwrap_or(&self.editor.function.param_types) + } + + pub fn get_return_types(&self) -> &Vec<TypeID> { + self.updated_return_types + .as_ref() + .unwrap_or(&self.editor.function.return_types) + } + pub fn set_param_types(&mut self, tys: Vec<TypeID>) { self.updated_param_types = Some(tys); } diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs index 99187dd2bbfb2a9bbc2bf5937cb40f5246ba5b29..38ed1b22d2d81be971aa867857fd01e3752ecc0a 100644 --- a/hercules_opt/src/inline.rs +++ b/hercules_opt/src/inline.rs @@ -1,8 +1,7 @@ +use std::cell::Ref; use std::collections::HashMap; -use hercules_ir::callgraph::*; -use hercules_ir::def_use::*; -use hercules_ir::ir::*; +use hercules_ir::*; use crate::*; @@ -235,3 +234,205 @@ fn inline_func( }); } } + +#[derive(Clone, Debug, Copy, PartialEq, Eq)] +enum ParameterLattice { + Top, + Constant(ConstantID), + // Dynamic constant + DynamicConstant(DynamicConstantID, FunctionID), + Bottom, +} + +impl ParameterLattice { + fn from_node(node: &Node, func_id: FunctionID) -> Self { + use ParameterLattice::*; + match node { + Node::Undef { ty: _ } => Top, + Node::Constant { id } => Constant(*id), + Node::DynamicConstant { id } => DynamicConstant(*id, func_id), + _ => Bottom, + } + } + + fn meet(&mut self, b: Self, cons: Ref<'_, Vec<Constant>>, dcs: Ref<'_, Vec<DynamicConstant>>) { + use ParameterLattice::*; + *self = match (*self, b) { + (Top, b) => b, + (a, Top) => a, + (Bottom, _) | (_, Bottom) => Bottom, + (Constant(id_a), Constant(id_b)) => { + if id_a == id_b { + Constant(id_a) + } else { + Bottom + } + } + (DynamicConstant(dc_a, f_a), DynamicConstant(dc_b, f_b)) => { + if dc_a == dc_b && f_a == f_b { + DynamicConstant(dc_a, f_a) + } else if let ( + ir::DynamicConstant::Constant(dcv_a), + ir::DynamicConstant::Constant(dcv_b), + ) = (&dcs[dc_a.idx()], &dcs[dc_b.idx()]) + && *dcv_a == *dcv_b + { + DynamicConstant(dc_a, f_a) + } else { + Bottom + } + } + (DynamicConstant(dc, _), Constant(con)) | (Constant(con), DynamicConstant(dc, _)) => { + match (&cons[con.idx()], &dcs[dc.idx()]) { + (ir::Constant::UnsignedInteger64(conv), ir::DynamicConstant::Constant(dcv)) + if *conv as usize == *dcv => + { + Constant(con) + } + _ => Bottom, + } + } + } + } +} + +/* + * Top level function to inline constant parameters and constant dynamic + * constant parameters. Identifies functions that are: + * + * 1. Not marked as entry. + * 2. At every call site, a particular parameter is always a specific constant + * or dynamic constant. + * + * These functions can have that constant "inlined" - the parameter is removed + * and all uses of the parameter becomes uses of the constant directly. + */ +pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { + // Run const inlining on each function, starting at the most shallow + // function first, since we want to propagate constants down the call graph. + for func_id in callgraph.topo().into_iter().rev() { + let func = editors[func_id.idx()].func(); + if func.entry || callgraph.num_callers(func_id) == 0 { + continue; + } + + // Figure out what we know about the parameters to this function. + let mut param_lattice = vec![ParameterLattice::Top; func.param_types.len()]; + let mut callers = vec![]; + for caller in callgraph.get_callers(func_id) { + let editor = &editors[caller.idx()]; + let nodes = &editor.func().nodes; + for id in editor.node_ids() { + if let Some((_, callee, _, args)) = nodes[id.idx()].try_call() + && callee == func_id + { + if editor.is_mutable(id) { + for (idx, id) in args.into_iter().enumerate() { + let lattice = ParameterLattice::from_node(&nodes[id.idx()], callee); + param_lattice[idx].meet( + lattice, + editor.get_constants(), + editor.get_dynamic_constants(), + ); + } + } else { + // If we can't modify the call node in the caller, then + // we can't perform the inlining. + param_lattice = vec![ParameterLattice::Bottom; func.param_types.len()]; + } + callers.push((caller, id)); + } + } + } + if param_lattice.iter().all(|v| *v == ParameterLattice::Bottom) { + continue; + } + + // Replace the arguments. + let editor = &mut editors[func_id.idx()]; + let mut param_idx_to_ids: HashMap<usize, Vec<NodeID>> = HashMap::new(); + for id in editor.node_ids() { + if let Some(idx) = editor.func().nodes[id.idx()].try_parameter() { + param_idx_to_ids.entry(idx).or_default().push(id); + } + } + let mut params_to_remove = vec![]; + let success = editor.edit(|mut edit| { + let mut param_tys = edit.get_param_types().clone(); + let mut decrement_index_by = 0; + for idx in 0..param_tys.len() { + if let Some(node) = match param_lattice[idx] { + ParameterLattice::Top => Some(Node::Undef { ty: param_tys[idx] }), + ParameterLattice::Constant(id) => Some(Node::Constant { id }), + ParameterLattice::DynamicConstant(id, _) => { + // Rust moment. + let maybe_cons = edit.get_dynamic_constant(id).try_constant(); + if let Some(val) = maybe_cons { + Some(Node::DynamicConstant { + id: edit.add_dynamic_constant(DynamicConstant::Constant(val)), + }) + } else { + None + } + } + _ => None, + } && let Some(ids) = param_idx_to_ids.get(&idx) + { + let node = edit.add_node(node); + for id in ids { + edit = edit.replace_all_uses(*id, node)?; + edit = edit.delete_node(*id)?; + } + param_tys.remove(idx - decrement_index_by); + params_to_remove.push(idx); + decrement_index_by += 1; + } else if decrement_index_by != 0 + && let Some(ids) = param_idx_to_ids.get(&idx) + { + let node = edit.add_node(Node::Parameter { + index: idx - decrement_index_by, + }); + for id in ids { + edit = edit.replace_all_uses(*id, node)?; + edit = edit.delete_node(*id)?; + } + } + } + edit.set_param_types(param_tys); + Ok(edit) + }); + params_to_remove.reverse(); + + // Update callers. + if success { + for (caller, call) in callers { + let editor = &mut editors[caller.idx()]; + let success = editor.edit(|mut edit| { + let Node::Call { + control, + function, + dynamic_constants, + args, + } = edit.get_node(call).clone() + else { + panic!(); + }; + let mut args = args.into_vec(); + for idx in params_to_remove.iter() { + args.remove(*idx); + } + let node = edit.add_node(Node::Call { + control, + function, + dynamic_constants, + args: args.into_boxed_slice(), + }); + edit = edit.replace_all_uses(call, node)?; + edit = edit.delete_node(call)?; + Ok(edit) + }); + assert!(success); + } + } + } +} diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn index c7f4345bc5dc89d2fb55ee96231e6b5f6604ef4f..356bb3d91836ba0994cad56315b9a5588b0df8b7 100644 --- a/juno_samples/rodinia/backprop/src/backprop.jn +++ b/juno_samples/rodinia/backprop/src/backprop.jn @@ -4,7 +4,7 @@ fn squash(x: f32) -> f32 { } fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] { - let result : f32[m + 1]; + @res let result : f32[m + 1]; result[0] = 1.0; for j in 1..=m { diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 56fc2c9ae401985116fa7fbfdf69ed0e4e0ab926..fa3dccf123fb51fbe8d989b97a8ed35d1c763251 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -1,24 +1,24 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(backprop); -cpu(auto.backprop); - -inline(auto.backprop); -inline(auto.backprop); +simpl!(*); +inline(layer_forward); delete-uncalled(*); -sroa[true](*); -dce(*); -float-collections(*); -reuse-products(*); -dce(*); +no-memset(layer_forward@res); +lift-dc-math(*); +loop-bound-canon(*); +fixpoint { + forkify(*); + fork-guard-elim(*); + fork-coalesce(*); +} gcm(*); - diff --git a/juno_samples/rodinia/bfs/build.rs b/juno_samples/rodinia/bfs/build.rs index c19bae5d54b185ac2ec97ffc645fc86840c7ad15..bb8f9ff507e818b6010cf9e12bbf3e9cdf8c342d 100644 --- a/juno_samples/rodinia/bfs/build.rs +++ b/juno_samples/rodinia/bfs/build.rs @@ -13,6 +13,8 @@ fn main() { JunoCompiler::new() .file_in_src("bfs.jn") .unwrap() + .schedule_in_src("cpu.sch") + .unwrap() .build() .unwrap(); } diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index cf2ea086619dd431e8edd30c51d86d35972296fc..51dcd945429dfde02cb2313afa404e81f8722c84 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -13,8 +13,8 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] let visited: bool[n]; visited[source as u64] = true; - let cost: i32[n]; - for i in 0..n { + @cost @cost_init let cost: i32[n]; + @cost_init for i in 0..n { cost[i] = -1; } cost[source as u64] = 0; @@ -25,7 +25,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] while !stop { stop = true; - for i in 0..n { + @loop1 for i in 0..n { if mask[i] { mask[i] = false; @@ -42,7 +42,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] } } - for i in 0..n { + @loop2 for i in 0..n { if updated[i] { mask[i] = true; visited[i] = true; diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..44cfa8ad0161fac0afbccc2d383637ec8a2f1aa0 --- /dev/null +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -0,0 +1,29 @@ +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} + +phi-elim(bfs); +no-memset(bfs@cost); +outline(bfs@cost_init); +let loop1 = outline(bfs@loop1); +let loop2 = outline(bfs@loop2); + +simpl!(*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); + +unforkify(*); + +gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 9cbdb942bb484342afe42a2fc711852878474508..5fe48a8395cfb6fada1d668b4f73fa6eb3487f5e 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -1,23 +1,30 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + crc(X); + slf(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(euler); -cpu(auto.euler); - -inline(auto.euler); -inline(auto.euler); +simpl!(*); +inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); delete-uncalled(*); +simpl!(*); +ip-sroa[false](*); +sroa[false](*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); -sroa[false](auto.euler); -dce(*); -float-collections(*); -dce(*); +unforkify(*); gcm(*); - diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch index 252015c368d594e843af34367450257d3459034f..6329c5046e15ca0bce646f4a778dcf8c8d781656 100644 --- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch @@ -1,23 +1,30 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + crc(X); + slf(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(pre_euler); -cpu(auto.pre_euler); - -inline(auto.pre_euler); -inline(auto.pre_euler); +simpl!(*); +inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step); delete-uncalled(*); +simpl!(*); +ip-sroa[false](*); +sroa[false](*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); -sroa[false](auto.pre_euler); -dce(*); -float-collections(*); -dce(*); +unforkify(*); gcm(*); - diff --git a/juno_samples/rodinia/srad/build.rs b/juno_samples/rodinia/srad/build.rs index 36ba61207bb08766e95f7437859d6d6d2146339c..5e1f78f762a39dcc10e131b4d359cfb1097575c8 100644 --- a/juno_samples/rodinia/srad/build.rs +++ b/juno_samples/rodinia/srad/build.rs @@ -13,6 +13,8 @@ fn main() { JunoCompiler::new() .file_in_src("srad.jn") .unwrap() + .schedule_in_src("cpu.sch") + .unwrap() .build() .unwrap(); } diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..1a81ddad3b55bcf9ffb76660ebdc1069338affd4 --- /dev/null +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -0,0 +1,35 @@ +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} + +phi-elim(*); +let loop1 = outline(srad@loop1); +let loop2 = outline(srad@loop2); +let loop3 = outline(srad@loop3); +simpl!(*); +const-inline(*); +crc(*); +slf(*); +write-predication(*); +simpl!(*); +predication(*); +simpl!(*); +predication(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); + fork-coalesce(*); +} +simpl!(*); + +fork-split(*); +unforkify(*); + +gcm(*); diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn index 5eea647c58949ebd951149f57e1961cebe6fc443..3e016a99b574c1dcde982e7277a5cbcdc1743c19 100644 --- a/juno_samples/rodinia/srad/src/srad.jn +++ b/juno_samples/rodinia/srad/src/srad.jn @@ -38,7 +38,7 @@ fn srad<nrows, ncols: usize>( // These loops should really be interchanged, but they aren't in the // Rodinia source (though they are in the HPVM source) - for i in 0..nrows { + @loop1 for i in 0..nrows { for j in 0..ncols { let tmp = image[j, i]; sum += tmp; @@ -57,7 +57,7 @@ fn srad<nrows, ncols: usize>( let c : f32[ncols, nrows]; - for j in 0..ncols { + @loop2 for j in 0..ncols { for i in 0..nrows { let Jc = image[j, i]; dN[j, i] = image[j, iN[i] as u64] - Jc; @@ -75,14 +75,15 @@ fn srad<nrows, ncols: usize>( let qsqr = num / (den * den); let den = (qsqr - q0sqr) / (q0sqr * (1 + q0sqr)); - c[j, i] = 1.0 / (1.0 + den); + let val = 1.0 / (1.0 + den); - if c[j, i] < 0 { c[j, i] = 0; } - else if c[j, i] > 1 { c[j, i] = 1; } + if val < 0 { c[j, i] = 0; } + else if val > 1 { c[j, i] = 1; } + else { c[j, i] = val; } } } - for j in 0..ncols { + @loop3 for j in 0..ncols { for i in 0..nrows { let cN = c[j, i]; let cS = c[j, iS[i] as u64]; diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs index 3c288ca7a61c855b3982ba6ed66c215c9e2942fe..bd27350a26d58f4e729b24d0026f12cd13ca7195 100644 --- a/juno_scheduler/src/compile.rs +++ b/juno_scheduler/src/compile.rs @@ -112,6 +112,7 @@ impl FromStr for Appliable { "ccp" => Ok(Appliable::Pass(ir::Pass::CCP)), "crc" | "collapse-read-chains" => Ok(Appliable::Pass(ir::Pass::CRC)), "clean-monoid-reduces" => Ok(Appliable::Pass(ir::Pass::CleanMonoidReduces)), + "const-inline" => Ok(Appliable::Pass(ir::Pass::ConstInline)), "dce" => Ok(Appliable::Pass(ir::Pass::DCE)), "delete-uncalled" => Ok(Appliable::DeleteUncalled), "float-collections" | "collections" => Ok(Appliable::Pass(ir::Pass::FloatCollections)), diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs index 3a087c0d40093c6363e67332c1bd489f22727a42..a0db884492120a43d0bb8fff89e689746ef1579e 100644 --- a/juno_scheduler/src/ir.rs +++ b/juno_scheduler/src/ir.rs @@ -8,8 +8,9 @@ pub enum Pass { ArrayToProduct, AutoOutline, CCP, - CleanMonoidReduces, CRC, + CleanMonoidReduces, + ConstInline, DCE, FloatCollections, ForkChunk, diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 5f2fa4cce02e04ca4499cbff8806356f81cc86bc..e049f985e0db36ae78368b8d33c01d22744fdcc6 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -951,6 +951,7 @@ impl PassManager { for idx in 0..module.functions.len() { match devices[idx] { Device::LLVM => cpu_codegen( + &module_name, &module.functions[idx], &module.types, &module.constants, @@ -966,6 +967,7 @@ impl PassManager { error: format!("{}", e), })?, Device::CUDA => gpu_codegen( + &module_name, &module.functions[idx], &module.types, &module.constants, @@ -986,6 +988,7 @@ impl PassManager { error: format!("{}", e), })?, Device::AsyncRust => rt_codegen( + &module_name, FunctionID::new(idx), &module, &def_uses[idx], @@ -1833,6 +1836,24 @@ fn run_pass( pm.delete_gravestones(); pm.clear_analyses(); } + Pass::ConstInline => { + assert!(args.is_empty()); + pm.make_callgraph(); + let callgraph = pm.callgraph.take().unwrap(); + + let mut editors: Vec<_> = build_selection(pm, selection, true) + .into_iter() + .map(|editor| editor.unwrap()) + .collect(); + const_inline(&mut editors, &callgraph); + + for func in editors { + changed |= func.modified(); + } + + pm.delete_gravestones(); + pm.clear_analyses(); + } Pass::CRC => { assert!(args.is_empty()); for func in build_selection(pm, selection, false) {