From 0303d9e8f568562d18606110627a1bba2702c303 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 09:45:12 -0600 Subject: [PATCH 1/9] Need LTE in loop-bound-canon for backprop --- juno_samples/rodinia/backprop/src/backprop.jn | 2 +- juno_samples/rodinia/backprop/src/cpu.sch | 37 ++++++++++--------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn index c7f4345b..356bb3d9 100644 --- a/juno_samples/rodinia/backprop/src/backprop.jn +++ b/juno_samples/rodinia/backprop/src/backprop.jn @@ -4,7 +4,7 @@ fn squash(x: f32) -> f32 { } fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] { - let result : f32[m + 1]; + @res let result : f32[m + 1]; result[0] = 1.0; for j in 1..=m { diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index 56fc2c9a..b6e69a27 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -1,24 +1,25 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(backprop); -cpu(auto.backprop); - -inline(auto.backprop); -inline(auto.backprop); +simpl!(*); +inline(layer_forward); delete-uncalled(*); -sroa[true](*); -dce(*); -float-collections(*); -reuse-products(*); -dce(*); +no-memset(layer_forward@res); +lift-dc-math(*); +loop-bound-canon(*); +fixpoint { + forkify(*); + fork-guard-elim(*); + fork-coalesce(*); +} gcm(*); -- GitLab From 7c66673ddcd5a166d88f00758a0c42ff64b0c25e Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 09:58:20 -0600 Subject: [PATCH 2/9] BFS needs const inlining --- juno_samples/rodinia/backprop/src/cpu.sch | 1 - juno_samples/rodinia/bfs/build.rs | 2 ++ juno_samples/rodinia/bfs/src/bfs.jn | 8 +++---- juno_samples/rodinia/bfs/src/cpu.sch | 26 +++++++++++++++++++++++ 4 files changed, 32 insertions(+), 5 deletions(-) create mode 100644 juno_samples/rodinia/bfs/src/cpu.sch diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch index b6e69a27..fa3dccf1 100644 --- a/juno_samples/rodinia/backprop/src/cpu.sch +++ b/juno_samples/rodinia/backprop/src/cpu.sch @@ -22,4 +22,3 @@ fixpoint { } gcm(*); - diff --git a/juno_samples/rodinia/bfs/build.rs b/juno_samples/rodinia/bfs/build.rs index c19bae5d..bb8f9ff5 100644 --- a/juno_samples/rodinia/bfs/build.rs +++ b/juno_samples/rodinia/bfs/build.rs @@ -13,6 +13,8 @@ fn main() { JunoCompiler::new() .file_in_src("bfs.jn") .unwrap() + .schedule_in_src("cpu.sch") + .unwrap() .build() .unwrap(); } diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index cf2ea086..2e61f1a0 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -13,8 +13,8 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] let visited: bool[n]; visited[source as u64] = true; - let cost: i32[n]; - for i in 0..n { + @cost let cost: i32[n]; + @cost_init for i in 0..n { cost[i] = -1; } cost[source as u64] = 0; @@ -25,7 +25,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] while !stop { stop = true; - for i in 0..n { + @loop1 for i in 0..n { if mask[i] { mask[i] = false; @@ -42,7 +42,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] } } - for i in 0..n { + @loop2 for i in 0..n { if updated[i] { mask[i] = true; visited[i] = true; diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch new file mode 100644 index 00000000..1a94ea55 --- /dev/null +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -0,0 +1,26 @@ +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} + +phi-elim(bfs); +no-memset(bfs@cost); +outline(bfs@cost_init); +let loop1 = outline(bfs@loop1); +let loop2 = outline(bfs@loop2); + +simpl!(*); +predication(*); + +fixpoint { + forkify(*); + fork-guard-elim(*); + fork-coalesce(*); +} + +gcm(*); -- GitLab From b336327e69c03752217bcc613ac3374e880367b1 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 10:52:10 -0600 Subject: [PATCH 3/9] Skeleton and analysis for const inline pass --- hercules_opt/src/inline.rs | 134 ++++++++++++++++++++++++++- juno_samples/rodinia/bfs/src/cpu.sch | 1 + juno_scheduler/src/compile.rs | 1 + juno_scheduler/src/ir.rs | 3 +- juno_scheduler/src/pm.rs | 18 ++++ 5 files changed, 153 insertions(+), 4 deletions(-) diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs index 99187dd2..c94ad962 100644 --- a/hercules_opt/src/inline.rs +++ b/hercules_opt/src/inline.rs @@ -1,8 +1,7 @@ +use std::cell::Ref; use std::collections::HashMap; -use hercules_ir::callgraph::*; -use hercules_ir::def_use::*; -use hercules_ir::ir::*; +use hercules_ir::*; use crate::*; @@ -235,3 +234,132 @@ fn inline_func( }); } } + +#[derive(Clone, Debug, Copy)] +enum ParameterLattice { + Top, + Constant(ConstantID), + // Dynamic constant + DynamicConstant(DynamicConstantID, FunctionID), + Bottom, +} + +impl ParameterLattice { + fn from_node(node: &Node, func_id: FunctionID) -> Self { + use ParameterLattice::*; + match node { + Node::Undef { ty: _ } => Top, + Node::Constant { id } => Constant(*id), + Node::DynamicConstant { id } => DynamicConstant(*id, func_id), + _ => Bottom, + } + } + + fn try_const_dc(self, dcs: Ref<'_, Vec<DynamicConstant>>) -> Option<usize> { + if let ParameterLattice::DynamicConstant(id, _) = self + && let DynamicConstant::Constant(val) = &dcs[id.idx()] + { + Some(*val) + } else { + None + } + } + + fn meet(&mut self, b: Self, cons: Ref<'_, Vec<Constant>>, dcs: Ref<'_, Vec<DynamicConstant>>) { + use ParameterLattice::*; + *self = match (*self, b) { + (Top, b) => b, + (a, Top) => a, + (Bottom, _) | (_, Bottom) => Bottom, + (Constant(id_a), Constant(id_b)) => { + if id_a == id_b { + Constant(id_a) + } else { + Bottom + } + } + (DynamicConstant(dc_a, f_a), DynamicConstant(dc_b, f_b)) => { + if dc_a == dc_b && f_a == f_b { + DynamicConstant(dc_a, f_a) + } else if let ( + ir::DynamicConstant::Constant(dcv_a), + ir::DynamicConstant::Constant(dcv_b), + ) = (&dcs[dc_a.idx()], &dcs[dc_b.idx()]) + && *dcv_a == *dcv_b + { + DynamicConstant(dc_a, f_a) + } else { + Bottom + } + } + (DynamicConstant(dc, _), Constant(con)) | (Constant(con), DynamicConstant(dc, _)) => { + match (&cons[con.idx()], &dcs[dc.idx()]) { + (ir::Constant::UnsignedInteger64(conv), ir::DynamicConstant::Constant(dcv)) + if *conv as usize == *dcv => + { + Constant(con) + } + _ => Bottom, + } + } + } + } +} + +/* + * Top level function to inline constant parameters and constant dynamic + * constant parameters. Identifies functions that are: + * + * 1. Not marked as entry. + * 2. At every call site, a particular parameter is always a specific constant + * or dynamic constant OR a particular dynamic constant parameter is always a + * specific constant. + * + * These functions can have that constant "inlined" - the parameter is removed + * and all uses of the parameter becomes uses of the constant directly. + */ +pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { + // Run const inlining on each function, starting at the most shallow + // function first, since we want to propagate constants down the call graph. + for func_id in callgraph.topo().into_iter().rev() { + let func = editors[func_id.idx()].func(); + if func.entry || callgraph.num_callers(func_id) == 0 { + continue; + } + + // Figure out what we know about the parameters (both normal and dynamic + // constant) to this function. + let mut param_lattice = vec![ParameterLattice::Top; func.param_types.len()]; + let mut dc_param_lattice = vec![ParameterLattice::Top; func.num_dynamic_constants as usize]; + for caller in callgraph.get_callers(func_id) { + let editor = &editors[caller.idx()]; + let nodes = &editor.func().nodes; + for id in editor.node_ids() { + if let Some((_, callee, dc_args, args)) = nodes[id.idx()].try_call() + && callee == func_id + { + for (idx, id) in args.into_iter().enumerate() { + let lattice = ParameterLattice::from_node(&nodes[id.idx()], callee); + param_lattice[idx].meet( + lattice, + editor.get_constants(), + editor.get_dynamic_constants(), + ); + } + + for (idx, id) in dc_args.into_iter().enumerate() { + let lattice = ParameterLattice::DynamicConstant(*id, func_id); + dc_param_lattice[idx].meet( + lattice, + editor.get_constants(), + editor.get_dynamic_constants(), + ); + } + } + } + } + println!("{}:", func.name); + println!("{:?}", param_lattice); + println!("{:?}", dc_param_lattice); + } +} diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 1a94ea55..d201765e 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -22,5 +22,6 @@ fixpoint { fork-guard-elim(*); fork-coalesce(*); } +const-inline(*); gcm(*); diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs index 3c288ca7..bd27350a 100644 --- a/juno_scheduler/src/compile.rs +++ b/juno_scheduler/src/compile.rs @@ -112,6 +112,7 @@ impl FromStr for Appliable { "ccp" => Ok(Appliable::Pass(ir::Pass::CCP)), "crc" | "collapse-read-chains" => Ok(Appliable::Pass(ir::Pass::CRC)), "clean-monoid-reduces" => Ok(Appliable::Pass(ir::Pass::CleanMonoidReduces)), + "const-inline" => Ok(Appliable::Pass(ir::Pass::ConstInline)), "dce" => Ok(Appliable::Pass(ir::Pass::DCE)), "delete-uncalled" => Ok(Appliable::DeleteUncalled), "float-collections" | "collections" => Ok(Appliable::Pass(ir::Pass::FloatCollections)), diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs index 3a087c0d..a0db8844 100644 --- a/juno_scheduler/src/ir.rs +++ b/juno_scheduler/src/ir.rs @@ -8,8 +8,9 @@ pub enum Pass { ArrayToProduct, AutoOutline, CCP, - CleanMonoidReduces, CRC, + CleanMonoidReduces, + ConstInline, DCE, FloatCollections, ForkChunk, diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 5f2fa4cc..3c142700 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -1833,6 +1833,24 @@ fn run_pass( pm.delete_gravestones(); pm.clear_analyses(); } + Pass::ConstInline => { + assert!(args.is_empty()); + pm.make_callgraph(); + let callgraph = pm.callgraph.take().unwrap(); + + let mut editors: Vec<_> = build_selection(pm, selection, true) + .into_iter() + .map(|editor| editor.unwrap()) + .collect(); + const_inline(&mut editors, &callgraph); + + for func in editors { + changed |= func.modified(); + } + + pm.delete_gravestones(); + pm.clear_analyses(); + } Pass::CRC => { assert!(args.is_empty()); for func in build_selection(pm, selection, false) { -- GitLab From 4572a7ec7ba13913b86127bdd3fa2edfa73f26ac Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 12:58:53 -0600 Subject: [PATCH 4/9] Inline constants --- hercules_opt/src/editor.rs | 12 +++ hercules_opt/src/inline.rs | 126 +++++++++++++++++++++++---- juno_samples/rodinia/bfs/src/cpu.sch | 1 + 3 files changed, 121 insertions(+), 18 deletions(-) diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs index 17cea325..0e332a00 100644 --- a/hercules_opt/src/editor.rs +++ b/hercules_opt/src/editor.rs @@ -880,6 +880,18 @@ impl<'a, 'b> FunctionEdit<'a, 'b> { } } + pub fn get_param_types(&self) -> &Vec<TypeID> { + self.updated_param_types + .as_ref() + .unwrap_or(&self.editor.function.param_types) + } + + pub fn get_return_types(&self) -> &Vec<TypeID> { + self.updated_return_types + .as_ref() + .unwrap_or(&self.editor.function.return_types) + } + pub fn set_param_types(&mut self, tys: Vec<TypeID>) { self.updated_param_types = Some(tys); } diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs index c94ad962..895f1f73 100644 --- a/hercules_opt/src/inline.rs +++ b/hercules_opt/src/inline.rs @@ -235,7 +235,7 @@ fn inline_func( } } -#[derive(Clone, Debug, Copy)] +#[derive(Clone, Debug, Copy, PartialEq, Eq)] enum ParameterLattice { Top, Constant(ConstantID), @@ -331,6 +331,7 @@ pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { // constant) to this function. let mut param_lattice = vec![ParameterLattice::Top; func.param_types.len()]; let mut dc_param_lattice = vec![ParameterLattice::Top; func.num_dynamic_constants as usize]; + let mut callers = vec![]; for caller in callgraph.get_callers(func_id) { let editor = &editors[caller.idx()]; let nodes = &editor.func().nodes; @@ -338,28 +339,117 @@ pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { if let Some((_, callee, dc_args, args)) = nodes[id.idx()].try_call() && callee == func_id { - for (idx, id) in args.into_iter().enumerate() { - let lattice = ParameterLattice::from_node(&nodes[id.idx()], callee); - param_lattice[idx].meet( - lattice, - editor.get_constants(), - editor.get_dynamic_constants(), - ); + if editor.is_mutable(id) { + for (idx, id) in args.into_iter().enumerate() { + let lattice = ParameterLattice::from_node(&nodes[id.idx()], callee); + param_lattice[idx].meet( + lattice, + editor.get_constants(), + editor.get_dynamic_constants(), + ); + } + + for (idx, id) in dc_args.into_iter().enumerate() { + let lattice = ParameterLattice::DynamicConstant(*id, func_id); + dc_param_lattice[idx].meet( + lattice, + editor.get_constants(), + editor.get_dynamic_constants(), + ); + } + } else { + // If we can't modify the call node in the caller, then + // we can't perform the inlining. + param_lattice = vec![ParameterLattice::Bottom; func.param_types.len()]; + dc_param_lattice = + vec![ParameterLattice::Bottom; func.num_dynamic_constants as usize]; } + callers.push((caller, id)); + } + } + } + if param_lattice.iter().all(|v| *v == ParameterLattice::Bottom) + && dc_param_lattice + .iter() + .all(|v| *v == ParameterLattice::Bottom) + { + continue; + } - for (idx, id) in dc_args.into_iter().enumerate() { - let lattice = ParameterLattice::DynamicConstant(*id, func_id); - dc_param_lattice[idx].meet( - lattice, - editor.get_constants(), - editor.get_dynamic_constants(), - ); + // Replace the arguments. + let editor = &mut editors[func_id.idx()]; + let mut param_idx_to_ids: HashMap<usize, Vec<NodeID>> = HashMap::new(); + for id in editor.node_ids() { + if let Some(idx) = editor.func().nodes[id.idx()].try_parameter() { + param_idx_to_ids.entry(idx).or_default().push(id); + } + } + let mut params_to_remove = vec![]; + let success = editor.edit(|mut edit| { + let mut param_tys = edit.get_param_types().clone(); + let mut decrement_index_by = 0; + for idx in 0..param_tys.len() { + if let Some(node) = match param_lattice[idx] { + ParameterLattice::Top => Some(Node::Undef { ty: param_tys[idx] }), + ParameterLattice::Constant(id) => Some(Node::Constant { id }), + _ => None, + } && let Some(ids) = param_idx_to_ids.get(&idx) + { + let node = edit.add_node(node); + for id in ids { + edit = edit.replace_all_uses(*id, node)?; + edit = edit.delete_node(*id)?; + } + param_tys.remove(idx - decrement_index_by); + params_to_remove.push(idx); + decrement_index_by += 1; + } else if decrement_index_by != 0 + && let Some(ids) = param_idx_to_ids.get(&idx) + { + let node = edit.add_node(Node::Parameter { + index: idx - decrement_index_by, + }); + for id in ids { + edit = edit.replace_all_uses(*id, node)?; + edit = edit.delete_node(*id)?; } } } + edit.set_param_types(param_tys); + Ok(edit) + }); + params_to_remove.reverse(); + + // Update callers. + if success { + for (caller, call) in callers { + let editor = &mut editors[caller.idx()]; + let success = editor.edit(|mut edit| { + let Node::Call { + control, + function, + dynamic_constants, + args, + } = edit.get_node(call).clone() + else { + panic!(); + }; + let mut args = args.into_vec(); + for idx in params_to_remove.iter() { + args.remove(*idx); + } + let node = edit.add_node(Node::Call { + control, + function, + dynamic_constants, + args: args.into_boxed_slice(), + }); + edit = edit.replace_all_uses(call, node)?; + edit = edit.delete_node(call)?; + Ok(edit) + }); + assert!(success); + } } - println!("{}:", func.name); - println!("{:?}", param_lattice); - println!("{:?}", dc_param_lattice); } } diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index d201765e..04ffeaa6 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -23,5 +23,6 @@ fixpoint { fork-coalesce(*); } const-inline(*); +simpl!(*); gcm(*); -- GitLab From 84dfd900348e6bf47b1b6e9dad5ff27776f50541 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 13:08:55 -0600 Subject: [PATCH 5/9] Inline dynamic constants --- hercules_opt/src/inline.rs | 47 +++++++++-------------------- juno_samples/rodinia/bfs/src/bfs.jn | 2 +- 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs index 895f1f73..38ed1b22 100644 --- a/hercules_opt/src/inline.rs +++ b/hercules_opt/src/inline.rs @@ -255,16 +255,6 @@ impl ParameterLattice { } } - fn try_const_dc(self, dcs: Ref<'_, Vec<DynamicConstant>>) -> Option<usize> { - if let ParameterLattice::DynamicConstant(id, _) = self - && let DynamicConstant::Constant(val) = &dcs[id.idx()] - { - Some(*val) - } else { - None - } - } - fn meet(&mut self, b: Self, cons: Ref<'_, Vec<Constant>>, dcs: Ref<'_, Vec<DynamicConstant>>) { use ParameterLattice::*; *self = match (*self, b) { @@ -312,8 +302,7 @@ impl ParameterLattice { * * 1. Not marked as entry. * 2. At every call site, a particular parameter is always a specific constant - * or dynamic constant OR a particular dynamic constant parameter is always a - * specific constant. + * or dynamic constant. * * These functions can have that constant "inlined" - the parameter is removed * and all uses of the parameter becomes uses of the constant directly. @@ -327,16 +316,14 @@ pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { continue; } - // Figure out what we know about the parameters (both normal and dynamic - // constant) to this function. + // Figure out what we know about the parameters to this function. let mut param_lattice = vec![ParameterLattice::Top; func.param_types.len()]; - let mut dc_param_lattice = vec![ParameterLattice::Top; func.num_dynamic_constants as usize]; let mut callers = vec![]; for caller in callgraph.get_callers(func_id) { let editor = &editors[caller.idx()]; let nodes = &editor.func().nodes; for id in editor.node_ids() { - if let Some((_, callee, dc_args, args)) = nodes[id.idx()].try_call() + if let Some((_, callee, _, args)) = nodes[id.idx()].try_call() && callee == func_id { if editor.is_mutable(id) { @@ -348,31 +335,16 @@ pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { editor.get_dynamic_constants(), ); } - - for (idx, id) in dc_args.into_iter().enumerate() { - let lattice = ParameterLattice::DynamicConstant(*id, func_id); - dc_param_lattice[idx].meet( - lattice, - editor.get_constants(), - editor.get_dynamic_constants(), - ); - } } else { // If we can't modify the call node in the caller, then // we can't perform the inlining. param_lattice = vec![ParameterLattice::Bottom; func.param_types.len()]; - dc_param_lattice = - vec![ParameterLattice::Bottom; func.num_dynamic_constants as usize]; } callers.push((caller, id)); } } } - if param_lattice.iter().all(|v| *v == ParameterLattice::Bottom) - && dc_param_lattice - .iter() - .all(|v| *v == ParameterLattice::Bottom) - { + if param_lattice.iter().all(|v| *v == ParameterLattice::Bottom) { continue; } @@ -392,6 +364,17 @@ pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { if let Some(node) = match param_lattice[idx] { ParameterLattice::Top => Some(Node::Undef { ty: param_tys[idx] }), ParameterLattice::Constant(id) => Some(Node::Constant { id }), + ParameterLattice::DynamicConstant(id, _) => { + // Rust moment. + let maybe_cons = edit.get_dynamic_constant(id).try_constant(); + if let Some(val) = maybe_cons { + Some(Node::DynamicConstant { + id: edit.add_dynamic_constant(DynamicConstant::Constant(val)), + }) + } else { + None + } + } _ => None, } && let Some(ids) = param_idx_to_ids.get(&idx) { diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index 2e61f1a0..51dcd945 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -13,7 +13,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] let visited: bool[n]; visited[source as u64] = true; - @cost let cost: i32[n]; + @cost @cost_init let cost: i32[n]; @cost_init for i in 0..n { cost[i] = -1; } -- GitLab From d4268c90491efae0db1eeaa99755be9336a7bdbe Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 13:12:40 -0600 Subject: [PATCH 6/9] forkify bfs --- juno_samples/rodinia/bfs/src/cpu.sch | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 04ffeaa6..44cfa8ad 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -16,13 +16,14 @@ let loop2 = outline(bfs@loop2); simpl!(*); predication(*); - +const-inline(*); +simpl!(*); fixpoint { forkify(*); fork-guard-elim(*); - fork-coalesce(*); } -const-inline(*); simpl!(*); +unforkify(*); + gcm(*); -- GitLab From 99b1d2945e2d63ff6dbe3e68f5bd3a2b2c2fe2cb Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 13:56:11 -0600 Subject: [PATCH 7/9] cfd cpu schedules, add module name to name of generated functions to avoid collisions --- hercules_cg/src/cpu.rs | 19 +++++--- hercules_cg/src/gpu.rs | 9 ++-- hercules_cg/src/rt.rs | 18 +++++--- juno_samples/rodinia/cfd/src/cpu_euler.sch | 43 +++++++++++-------- .../rodinia/cfd/src/cpu_pre_euler.sch | 43 +++++++++++-------- juno_scheduler/src/pm.rs | 3 ++ 6 files changed, 85 insertions(+), 50 deletions(-) diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs index b15cf301..37bf814d 100644 --- a/hercules_cg/src/cpu.rs +++ b/hercules_cg/src/cpu.rs @@ -16,6 +16,7 @@ static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0); * LLVM bindings for Rust, and we are *not* writing any C++. */ pub fn cpu_codegen<W: Write>( + module_name: &str, function: &Function, types: &Vec<Type>, constants: &Vec<Constant>, @@ -27,6 +28,7 @@ pub fn cpu_codegen<W: Write>( w: &mut W, ) -> Result<(), Error> { let ctx = CPUContext { + module_name, function, types, constants, @@ -40,6 +42,7 @@ pub fn cpu_codegen<W: Write>( } struct CPUContext<'a> { + module_name: &'a str, function: &'a Function, types: &'a Vec<Type>, constants: &'a Vec<Constant>, @@ -65,16 +68,18 @@ impl<'a> CPUContext<'a> { if self.types[return_type.idx()].is_primitive() { write!( w, - "define dso_local {} @{}(", + "define dso_local {} @{}_{}(", self.get_type(return_type), - self.function.name + self.module_name, + self.function.name, )?; } else { write!( w, - "define dso_local nonnull noundef {} @{}(", + "define dso_local nonnull noundef {} @{}_{}(", self.get_type(return_type), - self.function.name + self.module_name, + self.function.name, )?; } } else { @@ -89,7 +94,11 @@ impl<'a> CPUContext<'a> { .collect::<Vec<_>>() .join(", "), )?; - write!(w, "define dso_local void @{}(", self.function.name,)?; + write!( + w, + "define dso_local void @{}_{}(", + self.module_name, self.function.name, + )?; } let mut first_param = true; // The first parameter is a pointer to CPU backing memory, if it's diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a3eea274..1ba8302a 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -14,6 +14,7 @@ use crate::*; * of similarities with the CPU LLVM generation plus custom GPU parallelization. */ pub fn gpu_codegen<W: Write>( + module_name: &str, function: &Function, types: &Vec<Type>, constants: &Vec<Constant>, @@ -170,6 +171,7 @@ pub fn gpu_codegen<W: Write>( }; let ctx = GPUContext { + module_name, function, types, constants, @@ -199,6 +201,7 @@ struct GPUKernelParams { } struct GPUContext<'a> { + module_name: &'a str, function: &'a Function, types: &'a Vec<Type>, constants: &'a Vec<Constant>, @@ -395,8 +398,8 @@ namespace cg = cooperative_groups; fn codegen_kernel_begin<W: Write>(&self, w: &mut W) -> Result<(), Error> { write!( w, - "__global__ void __launch_bounds__({}) {}_gpu(", - self.kernel_params.max_num_threads, self.function.name + "__global__ void __launch_bounds__({}) {}_{}_gpu(", + self.kernel_params.max_num_threads, self.module_name, self.function.name )?; let mut first_param = true; // The first parameter is a pointer to GPU backing memory, if it's @@ -645,7 +648,7 @@ namespace cg = cooperative_groups; } else { write!(w, "{}", self.get_type(self.function.return_types[0], false))?; } - write!(w, " {}(", self.function.name)?; + write!(w, " {}_{}(", self.module_name, self.function.name)?; let mut first_param = true; // The first parameter is a pointer to GPU backing memory, if it's diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 8fa0c09e..6981a3da 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -74,6 +74,7 @@ use crate::*; * set some CUDA memory - the user can then take a CUDA reference to that box. */ pub fn rt_codegen<W: Write>( + module_name: &str, func_id: FunctionID, module: &Module, def_use: &ImmutableDefUseMap, @@ -96,6 +97,7 @@ pub fn rt_codegen<W: Write>( .map(|(fork, join)| (*join, *fork)) .collect(); let ctx = RTContext { + module_name, func_id, module, def_use, @@ -117,6 +119,7 @@ pub fn rt_codegen<W: Write>( } struct RTContext<'a> { + module_name: &'a str, func_id: FunctionID, module: &'a Module, def_use: &'a ImmutableDefUseMap, @@ -157,7 +160,8 @@ impl<'a> RTContext<'a> { // Dump the function signature. write!( w, - "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}(", + "#[allow(unused_assignments,unused_variables,unused_mut,unused_parens,unused_unsafe,non_snake_case)]async unsafe fn {}_{}(", + self.module_name, func.name )?; let mut first_param = true; @@ -236,7 +240,7 @@ impl<'a> RTContext<'a> { // Create the return struct write!(w, "let mut ret_struct: ::std::mem::MaybeUninit<ReturnStruct> = ::std::mem::MaybeUninit::uninit();")?; // Call the device function - write!(w, "{}(", callee.name)?; + write!(w, "{}_{}(", self.module_name, callee.name)?; if self.backing_allocations[&callee_id].contains_key(&self.devices[callee_id.idx()]) { write!(w, "backing, ")?; @@ -672,8 +676,9 @@ impl<'a> RTContext<'a> { }; write!( block, - "{}{}(", + "{}{}_{}(", prefix, + self.module_name, self.module.functions[callee_id.idx()].name )?; for (device, (offset, size)) in self.backing_allocations[&self.func_id] @@ -1463,7 +1468,7 @@ impl<'a> RTContext<'a> { } // Call the wrapped function. - write!(w, "let ret = {}(", func.name)?; + write!(w, "let ret = {}_{}(", self.module_name, func.name)?; for (device, _) in self.backing_allocations[&self.func_id].iter() { write!( w, @@ -1630,8 +1635,9 @@ impl<'a> RTContext<'a> { let func = &self.module.functions[func_id.idx()]; write!( w, - "{}fn {}(", + "{}fn {}_{}(", if is_unsafe { "unsafe " } else { "" }, + self.module_name, func.name )?; let mut first_param = true; @@ -1667,7 +1673,7 @@ impl<'a> RTContext<'a> { func_id: FunctionID, ) -> Result<(), Error> { let func = &self.module.functions[func_id.idx()]; - write!(w, "fn {}(", func.name)?; + write!(w, "fn {}_{}(", self.module_name, func.name)?; let mut first_param = true; if self.backing_allocations[&func_id].contains_key(&self.devices[func_id.idx()]) { first_param = false; diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 9cbdb942..5fe48a83 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -1,23 +1,30 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + crc(X); + slf(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(euler); -cpu(auto.euler); - -inline(auto.euler); -inline(auto.euler); +simpl!(*); +inline(compute_step_factor, compute_flux, compute_flux_contribution, time_step); delete-uncalled(*); +simpl!(*); +ip-sroa[false](*); +sroa[false](*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); -sroa[false](auto.euler); -dce(*); -float-collections(*); -dce(*); +unforkify(*); gcm(*); - diff --git a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch index 252015c3..6329c504 100644 --- a/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_pre_euler.sch @@ -1,23 +1,30 @@ -gvn(*); -dce(*); -phi-elim(*); -dce(*); -crc(*); -dce(*); -slf(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + crc(X); + slf(X); + dce(X); + infer-schedules(X); +} -let auto = auto-outline(pre_euler); -cpu(auto.pre_euler); - -inline(auto.pre_euler); -inline(auto.pre_euler); +simpl!(*); +inline(compute_step_factor, compute_flux, compute_flux_contributions, compute_flux_contribution, time_step); delete-uncalled(*); +simpl!(*); +ip-sroa[false](*); +sroa[false](*); +predication(*); +const-inline(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); +} +simpl!(*); -sroa[false](auto.pre_euler); -dce(*); -float-collections(*); -dce(*); +unforkify(*); gcm(*); - diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 3c142700..e049f985 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -951,6 +951,7 @@ impl PassManager { for idx in 0..module.functions.len() { match devices[idx] { Device::LLVM => cpu_codegen( + &module_name, &module.functions[idx], &module.types, &module.constants, @@ -966,6 +967,7 @@ impl PassManager { error: format!("{}", e), })?, Device::CUDA => gpu_codegen( + &module_name, &module.functions[idx], &module.types, &module.constants, @@ -986,6 +988,7 @@ impl PassManager { error: format!("{}", e), })?, Device::AsyncRust => rt_codegen( + &module_name, FunctionID::new(idx), &module, &def_uses[idx], -- GitLab From dfff5eecca3f6a133fb4c82f29d221a4f406ed13 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 14:20:22 -0600 Subject: [PATCH 8/9] srad cpu schedule --- juno_samples/rodinia/srad/build.rs | 2 ++ juno_samples/rodinia/srad/src/cpu.sch | 35 +++++++++++++++++++++++++++ juno_samples/rodinia/srad/src/srad.jn | 13 +++++----- 3 files changed, 44 insertions(+), 6 deletions(-) create mode 100644 juno_samples/rodinia/srad/src/cpu.sch diff --git a/juno_samples/rodinia/srad/build.rs b/juno_samples/rodinia/srad/build.rs index 36ba6120..5e1f78f7 100644 --- a/juno_samples/rodinia/srad/build.rs +++ b/juno_samples/rodinia/srad/build.rs @@ -13,6 +13,8 @@ fn main() { JunoCompiler::new() .file_in_src("srad.jn") .unwrap() + .schedule_in_src("cpu.sch") + .unwrap() .build() .unwrap(); } diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch new file mode 100644 index 00000000..1a81ddad --- /dev/null +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -0,0 +1,35 @@ +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} + +phi-elim(*); +let loop1 = outline(srad@loop1); +let loop2 = outline(srad@loop2); +let loop3 = outline(srad@loop3); +simpl!(*); +const-inline(*); +crc(*); +slf(*); +write-predication(*); +simpl!(*); +predication(*); +simpl!(*); +predication(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); + fork-coalesce(*); +} +simpl!(*); + +fork-split(*); +unforkify(*); + +gcm(*); diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn index 5eea647c..3e016a99 100644 --- a/juno_samples/rodinia/srad/src/srad.jn +++ b/juno_samples/rodinia/srad/src/srad.jn @@ -38,7 +38,7 @@ fn srad<nrows, ncols: usize>( // These loops should really be interchanged, but they aren't in the // Rodinia source (though they are in the HPVM source) - for i in 0..nrows { + @loop1 for i in 0..nrows { for j in 0..ncols { let tmp = image[j, i]; sum += tmp; @@ -57,7 +57,7 @@ fn srad<nrows, ncols: usize>( let c : f32[ncols, nrows]; - for j in 0..ncols { + @loop2 for j in 0..ncols { for i in 0..nrows { let Jc = image[j, i]; dN[j, i] = image[j, iN[i] as u64] - Jc; @@ -75,14 +75,15 @@ fn srad<nrows, ncols: usize>( let qsqr = num / (den * den); let den = (qsqr - q0sqr) / (q0sqr * (1 + q0sqr)); - c[j, i] = 1.0 / (1.0 + den); + let val = 1.0 / (1.0 + den); - if c[j, i] < 0 { c[j, i] = 0; } - else if c[j, i] > 1 { c[j, i] = 1; } + if val < 0 { c[j, i] = 0; } + else if val > 1 { c[j, i] = 1; } + else { c[j, i] = val; } } } - for j in 0..ncols { + @loop3 for j in 0..ncols { for i in 0..nrows { let cN = c[j, i]; let cS = c[j, iS[i] as u64]; -- GitLab From 2acc3edec7b8f001eb56cae90935e47716ff2800 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 27 Feb 2025 14:23:38 -0600 Subject: [PATCH 9/9] fix --- hercules_cg/src/gpu.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 1ba8302a..c9720273 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -724,8 +724,13 @@ namespace cg = cooperative_groups; write!(w, "\tcudaError_t err;\n")?; write!( w, - "\t{}_gpu<<<{}, {}, {}>>>({});\n", - self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args + "\t{}_{}_gpu<<<{}, {}, {}>>>({});\n", + self.module_name, + self.function.name, + num_blocks, + num_threads, + dynamic_shared_offset, + pass_args )?; write!(w, "\terr = cudaGetLastError();\n")?; write!( -- GitLab