From d3ddf535eef68a3efdb8953f6fde7cec35eaa3f7 Mon Sep 17 00:00:00 2001 From: rarbore2 <rarbore2@illinois.edu> Date: Thu, 27 Feb 2025 14:27:35 -0600 Subject: [PATCH] Round 1 of rodinia schedule optimization --- hercules_cg/src/cpu.rs | 19 +- hercules_cg/src/gpu.rs | 18 +- hercules_cg/src/rt.rs | 18 +- hercules_opt/src/editor.rs | 12 + hercules_opt/src/inline.rs | 207 +++++++++++++++++- juno_samples/rodinia/backprop/src/backprop.jn | 2 +- juno_samples/rodinia/backprop/src/cpu.sch | 38 ++-- juno_samples/rodinia/bfs/build.rs | 2 + juno_samples/rodinia/bfs/src/bfs.jn | 8 +- juno_samples/rodinia/bfs/src/cpu.sch | 29 +++ juno_samples/rodinia/cfd/src/cpu_euler.sch | 43 ++-- .../rodinia/cfd/src/cpu_pre_euler.sch | 43 ++-- juno_samples/rodinia/srad/build.rs | 2 + juno_samples/rodinia/srad/src/cpu.sch | 35 +++ juno_samples/rodinia/srad/src/srad.jn | 13 +- juno_scheduler/src/compile.rs | 1 + juno_scheduler/src/ir.rs | 3 +- juno_scheduler/src/pm.rs | 21 ++ 18 files changed, 428 insertions(+), 86 deletions(-) create mode 100644 juno_samples/rodinia/bfs/src/cpu.sch create mode 100644 juno_samples/rodinia/srad/src/cpu.sch diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs index b15cf301..37bf814d 100644 --- a/hercules_cg/src/cpu.rs +++ b/hercules_cg/src/cpu.rs @@ -16,6 +16,7 @@ static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0); * LLVM bindings for Rust, and we are *not* writing any C++. */ pub fn cpu_codegen<W: Write>( + module_name: &str, function: &Function, types: &Vec<Type>, constants: &Vec<Constant>, @@ -27,6 +28,7 @@ pub fn cpu_codegen<W: Write>( w: &mut W, ) -> Result<(), Error> { let ctx = CPUContext { + module_name, function, types, constants, @@ -40,6 +42,7 @@ pub fn cpu_codegen<W: Write>( } struct CPUContext<'a> { + module_name: &'a str, function: &'a Function, types: &'a Vec<Type>, constants: &'a Vec<Constant>, @@ -65,16 +68,18 @@ impl<'a> CPUContext<'a> { if self.types[return_type.idx()].is_primitive() { write!( w, - "define dso_local {} @{}(", + "define dso_local {} @{}_{}(", self.get_type(return_type), - self.function.name + self.module_name, + self.function.name, )?; } else { write!( w, - "define dso_local nonnull noundef {} @{}(", + "define dso_local nonnull noundef {} @{}_{}(", self.get_type(return_type), - self.function.name + self.module_name, + self.function.name, )?; } } else { @@ -89,7 +94,11 @@ impl<'a> CPUContext<'a> { .collect::<Vec<_>>() .join(", "), )?; - write!(w, "define dso_local void @{}(", self.function.name,)?; + write!( + w, + "define dso_local void @{}_{}(", + self.module_name, self.function.name, + )?; } let mut first_param = true; // The first parameter is a pointer to CPU backing memory, if it's diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a3eea274..c9720273 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -14,6 +14,7 @@ use crate::*; * of similarities with the CPU LLVM generation plus custom GPU parallelization. */ pub fn gpu_codegen<W: Write>( + module_name: &str, function: &Function, types: &Vec<Type>, constants: &Vec<Constant>, @@ -170,6 +171,7 @@ pub fn gpu_codegen<W: Write>( }; let ctx = GPUContext { + module_name, function, types, constants, @@ -199,6 +201,7 @@ struct GPUKernelParams { } struct GPUContext<'a> { + module_name: &'a str, function: &'a Function, types: &'a Vec<Type>, constants: &'a Vec<Constant>, @@ -395,8 +398,8 @@ namespace cg = cooperative_groups; fn codegen_kernel_begin<W: Write>(&self, w: &mut W) -> Result<(), Error> { write!( w, - "__global__ void __launch_bounds__({}) {}_gpu(", - self.kernel_params.max_num_threads, self.function.name + "__global__ void __launch_bounds__({}) {}_{}_gpu(", + self.kernel_params.max_num_threads, self.module_name, self.function.name )?; let mut first_param = true; // The first parameter is a pointer to GPU backing memory, if it's @@ -645,7 +648,7 @@ namespace cg = cooperative_groups; } else { write!(w, "{}", self.get_type(self.function.return_types[0], false))?; } - write!(w, " {}(", self.function.name)?; + write!(w, " {}_{}(", self.module_name, self.function.name)?; let mut first_param = true; // The first parameter is a pointer to GPU backing memory, if it's @@ -721,8 +724,13 @@ namespace cg = cooperative_groups; write!(w, "\tcudaError_t err;\n")?; write!( w, - "\t{}_gpu<<<{}, {}, {}>>>({});\n", - self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args + "\t{}_{}_gpu<<<{}, {}, {}>>>({});\n", + self.module_name, + self.function.name, + num_blocks, + num_threads, + dynamic_shared_offset, + pass_args )?; write!(w, "\terr = cudaGetLastError();\n")?; write!( diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 8fa0c09e..6981a3da 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -74,6 +74,7 @@ use crate::*; * set some CUDA memory - the user can then take a CUDA reference to that box. */ pub fn rt_codegen<W: Write>( + module_name: &str, func_id: FunctionID, module: &Module, def_use: &ImmutableDefUseMap, @@ -96,6 +97,7 @@ pub fn rt_codegen<W: Write>( .map(|(fork, join)| (*join, *fork)) .collect(); let ctx = RTContext { + module_name, func_id, module, def_use, @@ -117,6 +119,7 @@ pub fn rt_codegen<W: Write>( } struct RTContext<'a> { + module_name: &'a str, func_id: FunctionID, module: &'a Module, def_use: &'a ImmutableDefUseMap, @@ -157,7 +160,8 @@ impl<'a> RTContext<'a> { // Dump the function signature. write!( w, - "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}(", + "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}_{}(", + self.module_name, func.name )?; let mut first_param = true; @@ -236,7 +240,7 @@ impl<'a> RTContext<'a> { // Create the return struct write!(w, "let mut ret_struct: ::std::mem::MaybeUninit<ReturnStruct> = ::std::mem::MaybeUninit::uninit();")?; // Call the device function - write!(w, "{}(", callee.name)?; + write!(w, "{}_{}(", self.module_name, callee.name)?; if self.backing_allocations[&callee_id].contains_key(&self.devices[callee_id.idx()]) { write!(w, "backing, ")?; @@ -672,8 +676,9 @@ impl<'a> RTContext<'a> { }; write!( block, - "{}{}(", + "{}{}_{}(", prefix, + self.module_name, self.module.functions[callee_id.idx()].name )?; for (device, (offset, size)) in self.backing_allocations[&self.func_id] @@ -1463,7 +1468,7 @@ impl<'a> RTContext<'a> { } // Call the wrapped function. - write!(w, "let ret = {}(", func.name)?; + write!(w, "let ret = {}_{}(", self.module_name, func.name)?; for (device, _) in self.backing_allocations[&self.func_id].iter() { write!( w, @@ -1630,8 +1635,9 @@ impl<'a> RTContext<'a> { let func = &self.module.functions[func_id.idx()]; write!( w, - "{}fn {}(", + "{}fn {}_{}(", if is_unsafe { "unsafe " } else { "" }, + self.module_name, func.name )?; let mut first_param = true; @@ -1667,7 +1673,7 @@ impl<'a> RTContext<'a> { func_id: FunctionID, ) -> Result<(), Error> { let func = &self.module.functions[func_id.idx()]; - write!(w, "fn {}(", func.name)?; + write!(w, "fn {}_{}(", self.module_name, func.name)?; let mut first_param = true; if self.backing_allocations[&func_id].contains_key(&self.devices[func_id.idx()]) { first_param = false; diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs index 17cea325..0e332a00 100644 --- a/hercules_opt/src/editor.rs +++ b/hercules_opt/src/editor.rs @@ -880,6 +880,18 @@ impl<'a, 'b> FunctionEdit<'a, 'b> { } } + pub fn get_param_types(&self) -> &Vec<TypeID> { + self.updated_param_types + .as_ref() + .unwrap_or(&self.editor.function.param_types) + } + + pub fn get_return_types(&self) -> &Vec<TypeID> { + self.updated_return_types + .as_ref() + .unwrap_or(&self.editor.function.return_types) + } + pub fn set_param_types(&mut self, tys: Vec<TypeID>) { self.updated_param_types = Some(tys); } diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs index 99187dd2..38ed1b22 100644 --- a/hercules_opt/src/inline.rs +++ b/hercules_opt/src/inline.rs @@ -1,8 +1,7 @@ +use std::cell::Ref; use std::collections::HashMap; -use hercules_ir::callgraph::*; -use hercules_ir::def_use::*; -use hercules_ir::ir::*; +use hercules_ir::*; use crate::*; @@ -235,3 +234,205 @@ fn inline_func( }); } } + +#[derive(Clone, Debug, Copy, PartialEq, Eq)] +enum ParameterLattice { + Top, + Constant(ConstantID), + // Dynamic constant + DynamicConstant(DynamicConstantID, FunctionID), + Bottom, +} + +impl ParameterLattice { + fn from_node(node: &Node, func_id: FunctionID) -> Self { + use ParameterLattice::*; + match node { + Node::Undef { ty: _ } => Top, + Node::Constant { id } => Constant(*id), + Node::DynamicConstant { id } => DynamicConstant(*id, func_id), + _ => Bottom, + } + } + + fn meet(&mut self, b: Self, cons: Ref<'_, Vec<Constant>>, dcs: Ref<'_, Vec<DynamicConstant>>) { + use ParameterLattice::*; + *self = match (*self, b) { + (Top, b) => b, + (a, Top) => a, + (Bottom, _) | (_, Bottom) => Bottom, + (Constant(id_a), Constant(id_b)) => { + if id_a == id_b { + Constant(id_a) + } else { + Bottom + } + } + (DynamicConstant(dc_a, f_a), DynamicConstant(dc_b, f_b)) => { + if dc_a == dc_b && f_a == f_b { + DynamicConstant(dc_a, f_a) + } else if let ( + ir::DynamicConstant::Constant(dcv_a), + ir::DynamicConstant::Constant(dcv_b), + ) = (&dcs[dc_a.idx()], &dcs[dc_b.idx()]) + && *dcv_a == *dcv_b + { + DynamicConstant(dc_a, f_a) + } else { + Bottom + } + } + (DynamicConstant(dc, _), Constant(con)) | (Constant(con), DynamicConstant(dc, _)) => { + match (&cons[con.idx()], &dcs[dc.idx()]) { + (ir::Constant::UnsignedInteger64(conv), ir::DynamicConstant::Constant(dcv)) + if *conv as usize == *dcv => + { + Constant(con) + } + _ => Bottom, + } + } + } + } +} + +/* + * Top level function to inline constant parameters and constant dynamic + * constant parameters. Identifies functions that are: + * + * 1. Not marked as entry. + * 2. At every call site, a particular parameter is always a specific constant + * or dynamic constant. + * + * These functions can have that constant "inlined" - the parameter is removed + * and all uses of the parameter becomes uses of the constant directly. + */ +pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { + // Run const inlining on each function, starting at the most shallow + // function first, since we want to propagate constants down the call graph. + for func_id in callgraph.topo().into_iter().rev() { + let func = editors[func_id.idx()].func(); + if func.entry || callgraph.num_callers(func_id) == 0 { + continue; + } + + // Figure out what we know about the parameters to this function. + let mut param_lattice = vec![ParameterLattice::Top; func.param_types.len()]; + let mut callers = vec![]; + for caller in callgraph.get_callers(func_id) { + let editor = &editors[caller.idx()]; + let nodes = &editor.func().nodes; + for id in editor.node_ids() { + if let Some((_, callee, _, args)) = nodes[id.idx()].try_call() + && callee == func_id + { + if editor.is_mutable(id) { + for (idx, id) in args.into_iter().enumerate() { + let lattice = ParameterLattice::from_node(&nodes[id.idx()], callee); + param_lattice[idx].meet( + lattice, + editor.get_constants(), + editor.get_dynamic_constants(), + ); + } + } else { + // If we can't modify the call node in the caller, then + // we can't perform the inlining. + param_lattice = vec![ParameterLattice::Bottom; func.param_types.len()]; + } + callers.push((caller, id)); + } + } + } + if param_lattice.iter().all(|v| *v == ParameterLattice::Bottom) { + continue; + } + + // Replace the arguments. + let editor = &mut editors[func_id.idx()]; + let mut param_idx_to_ids: HashMap<usize, Vec<NodeID>> = HashMap::new(); + for id in editor.node_ids() { + if let Some(idx) = editor.func().nodes[id.idx()].try_parameter() { + param_idx_to_ids.entry(idx).or_default().push(id); + } + } + let mut params_to_remove = vec![]; + let success = editor.edit(|mut edit| { + let mut param_tys = edit.get_param_types().clone(); + let mut decrement_index_by = 0; + for idx in 0..param_tys.len() { + if let Some(node) = match param_lattice[idx] { + ParameterLattice::Top => Some(Node::Undef { ty: param_tys[idx] }), + ParameterLattice::Constant(id) => Some(Node::Constant { id }), + ParameterLattice::DynamicConstant(id, _) => { + // Rust moment. + let maybe_cons = edit.get_dynamic_constant(id).try_constant(); + if let Some(val) = maybe_cons { + Some(Node::DynamicConstant { + id: edit.add_dynamic_constant(DynamicConstant::Constant(val)), + }) + } else { + None + } + } + _ => None, + } && let Some(ids) = param_idx_to_ids.get(&idx) + { + let node = edit.add_node(node); + for id in ids { + edit = edit.replace_all_uses(*id, node)?; + edit = edit.delete_node(*id)?; + } + param_tys.remove(idx - decrement_index_by); + params_to_remove.push(idx); + decrement_index_by += 1; + } else if decrement_index_by != 0 + && let Some(ids) = param_idx_to_ids.get(&idx) + { + let node = edit.add_node(Node::Parameter { + index: idx - decrement_index_by, + }); + for id in ids { + edit = edit.replace_all_uses(*id, node)?; + edit = edit.delete_node(*id)?; + } + } + } + edit.set_param_types(param_tys); + Ok(edit) + }); + params_to_remove.reverse(); + + // Update callers. + if success { + for (caller, call) in callers { + let editor = &mut editors[caller.idx()]; + let success = editor.edit(|mut edit| { + let Node::Call { + control, + function, + dynamic_constants, + args, + } = edit.get_node(call).clone() + else { + panic!(); + }; + let mut args = args.into_vec(); + for idx in params_to_remove.iter() { + args.remove(*idx); + } + let node = edit.add_node(Node::Call { + control, + function, + dynamic_constants, + args: args.into_boxed_slice(), + }); + edit = edit.replace_all_uses(call, node)?; + edit = edit.delete_node(call)?; + Ok(edit) + }); + assert!(success); + } + } + } +} diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn index c7f4345b..356bb3d9 100644 --- a/juno_samples/rodinia/backprop/src/backprop.jn +++ b/juno_samples/rodinia/backprop/src/backprop.jn @@ -4,7 +4,7 @@ fn squash(x: f32) -> f32 { } fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] { - let result : f32[m + 1]; + @res let result : f32[m + 1]; result[0] = 1.0; for j in 1..=m { diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 56fc2c9a..fa3dccf1 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -1,24 +1,24 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(backprop); -cpu(auto.backprop); - -inline(auto.backprop); -inline(auto.backprop); +simpl!(*); +inline(layer_forward); delete-uncalled(*); -sroa[true](*); -dce(*); -float-collections(*); -reuse-products(*); -dce(*); +no-memset(layer_forward@res); +lift-dc-math(*); +loop-bound-canon(*); +fixpoint { + forkify(*); + fork-guard-elim(*); + fork-coalesce(*); +} gcm(*); - diff --git a/juno_samples/rodinia/bfs/build.rs b/juno_samples/rodinia/bfs/build.rs index c19bae5d..bb8f9ff5 100644 --- a/juno_samples/rodinia/bfs/build.rs +++ b/juno_samples/rodinia/bfs/build.rs @@ -13,6 +13,8 @@ fn main() { JunoCompiler::new() .file_in_src("bfs.jn") .unwrap() + .schedule_in_src("cpu.sch") + .unwrap() .build() .unwrap(); } diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index cf2ea086..51dcd945 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -13,8 +13,8 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] let visited: bool[n]; visited[source as u64] = true; - let cost: i32[n]; - for i in 0..n { + @cost @cost_init let cost: i32[n]; + @cost_init for i in 0..n { cost[i] = -1; } cost[source as u64] = 0; @@ -25,7 +25,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] while !stop { stop = true; - for i in 0..n { + @loop1 for i in 0..n { if mask[i] { mask[i] = false; @@ -42,7 +42,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] } } - for i in 0..n { + @loop2 for i in 0..n { if updated[i] { mask[i] = true; visited[i] = true; diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch new file mode 100644 index 00000000..44cfa8ad --- /dev/null +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -0,0 +1,29 @@ +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} + +phi-elim(bfs); +no-memset(bfs@cost); +outline(bfs@cost_init); +let loop1 = outline(bfs@loop1); +let loop2 = outline(bfs@loop2); + +simpl!(*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); + +unforkify(*); + +gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 9cbdb942..5fe48a83 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -1,23 +1,30 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + crc(X); + slf(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(euler); -cpu(auto.euler); - -inline(auto.euler); -inline(auto.euler); +simpl!(*); +inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); delete-uncalled(*); +simpl!(*); +ip-sroa[false](*); +sroa[false](*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); -sroa[false](auto.euler); -dce(*); -float-collections(*); -dce(*); +unforkify(*); gcm(*); - diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch index 252015c3..6329c504 100644 --- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch @@ -1,23 +1,30 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + crc(X); + slf(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(pre_euler); -cpu(auto.pre_euler); - -inline(auto.pre_euler); -inline(auto.pre_euler); +simpl!(*); +inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step); delete-uncalled(*); +simpl!(*); +ip-sroa[false](*); +sroa[false](*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); -sroa[false](auto.pre_euler); -dce(*); -float-collections(*); -dce(*); +unforkify(*); gcm(*); - diff --git a/juno_samples/rodinia/srad/build.rs b/juno_samples/rodinia/srad/build.rs index 36ba6120..5e1f78f7 100644 --- a/juno_samples/rodinia/srad/build.rs +++ b/juno_samples/rodinia/srad/build.rs @@ -13,6 +13,8 @@ fn main() { JunoCompiler::new() .file_in_src("srad.jn") .unwrap() + .schedule_in_src("cpu.sch") + .unwrap() .build() .unwrap(); } diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch new file mode 100644 index 00000000..1a81ddad --- /dev/null +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -0,0 +1,35 @@ +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} + +phi-elim(*); +let loop1 = outline(srad@loop1); +let loop2 = outline(srad@loop2); +let loop3 = outline(srad@loop3); +simpl!(*); +const-inline(*); +crc(*); +slf(*); +write-predication(*); +simpl!(*); +predication(*); +simpl!(*); +predication(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); + fork-coalesce(*); +} +simpl!(*); + +fork-split(*); +unforkify(*); + +gcm(*); diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn index 5eea647c..3e016a99 100644 --- a/juno_samples/rodinia/srad/src/srad.jn +++ b/juno_samples/rodinia/srad/src/srad.jn @@ -38,7 +38,7 @@ fn srad<nrows, ncols: usize>( // These loops should really be interchanged, but they aren't in the // Rodinia source (though they are in the HPVM source) - for i in 0..nrows { + @loop1 for i in 0..nrows { for j in 0..ncols { let tmp = image[j, i]; sum += tmp; @@ -57,7 +57,7 @@ fn srad<nrows, ncols: usize>( let c : f32[ncols, nrows]; - for j in 0..ncols { + @loop2 for j in 0..ncols { for i in 0..nrows { let Jc = image[j, i]; dN[j, i] = image[j, iN[i] as u64] - Jc; @@ -75,14 +75,15 @@ fn srad<nrows, ncols: usize>( let qsqr = num / (den * den); let den = (qsqr - q0sqr) / (q0sqr * (1 + q0sqr)); - c[j, i] = 1.0 / (1.0 + den); + let val = 1.0 / (1.0 + den); - if c[j, i] < 0 { c[j, i] = 0; } - else if c[j, i] > 1 { c[j, i] = 1; } + if val < 0 { c[j, i] = 0; } + else if val > 1 { c[j, i] = 1; } + else { c[j, i] = val; } } } - for j in 0..ncols { + @loop3 for j in 0..ncols { for i in 0..nrows { let cN = c[j, i]; let cS = c[j, iS[i] as u64]; diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs index 3c288ca7..bd27350a 100644 --- a/juno_scheduler/src/compile.rs +++ b/juno_scheduler/src/compile.rs @@ -112,6 +112,7 @@ impl FromStr for Appliable { "ccp" => Ok(Appliable::Pass(ir::Pass::CCP)), "crc" | "collapse-read-chains" => Ok(Appliable::Pass(ir::Pass::CRC)), "clean-monoid-reduces" => Ok(Appliable::Pass(ir::Pass::CleanMonoidReduces)), + "const-inline" => Ok(Appliable::Pass(ir::Pass::ConstInline)), "dce" => Ok(Appliable::Pass(ir::Pass::DCE)), "delete-uncalled" => Ok(Appliable::DeleteUncalled), "float-collections" | "collections" => Ok(Appliable::Pass(ir::Pass::FloatCollections)), diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs index 3a087c0d..a0db8844 100644 --- a/juno_scheduler/src/ir.rs +++ b/juno_scheduler/src/ir.rs @@ -8,8 +8,9 @@ pub enum Pass { ArrayToProduct, AutoOutline, CCP, - CleanMonoidReduces, CRC, + CleanMonoidReduces, + ConstInline, DCE, FloatCollections, ForkChunk, diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 5f2fa4cc..e049f985 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -951,6 +951,7 @@ impl PassManager { for idx in 0..module.functions.len() { match devices[idx] { Device::LLVM => cpu_codegen( + &module_name, &module.functions[idx], &module.types, &module.constants, @@ -966,6 +967,7 @@ impl PassManager { error: format!("{}", e), })?, Device::CUDA => gpu_codegen( + &module_name, &module.functions[idx], &module.types, &module.constants, @@ -986,6 +988,7 @@ impl PassManager { error: format!("{}", e), })?, Device::AsyncRust => rt_codegen( + &module_name, FunctionID::new(idx), &module, &def_uses[idx], @@ -1833,6 +1836,24 @@ fn run_pass( pm.delete_gravestones(); pm.clear_analyses(); } + Pass::ConstInline => { + assert!(args.is_empty()); + pm.make_callgraph(); + let callgraph = pm.callgraph.take().unwrap(); + + let mut editors: Vec<_> = build_selection(pm, selection, true) + .into_iter() + .map(|editor| editor.unwrap()) + .collect(); + const_inline(&mut editors, &callgraph); + + for func in editors { + changed |= func.modified(); + } + + pm.delete_gravestones(); + pm.clear_analyses(); + } Pass::CRC => { assert!(args.is_empty()); for func in build_selection(pm, selection, false) { -- GitLab