From d4c680f1a547972a080bf28351591c4b3175ecd1 Mon Sep 17 00:00:00 2001 From: Xavier Routh <xrouth2@illinois.edu> Date: Sun, 16 Feb 2025 16:52:41 -0600 Subject: [PATCH 1/3] loop bound canonicalization --- hercules_opt/src/forkify.rs | 5 +- hercules_opt/src/ivar.rs | 116 ++++++++---- hercules_opt/src/lib.rs | 2 + hercules_opt/src/schedule.rs | 11 +- hercules_rt/src/lib.rs | 167 ++++++++++-------- .../hercules_tests/tests/loop_tests.rs | 5 +- juno_samples/fork_join_tests/src/main.rs | 4 +- juno_samples/median_window/build.rs | 6 +- juno_samples/median_window/src/main.rs | 14 +- juno_samples/products/src/main.rs | 10 +- juno_scheduler/src/compile.rs | 1 + juno_scheduler/src/ir.rs | 1 + juno_scheduler/src/pm.rs | 26 +++ 13 files changed, 231 insertions(+), 137 deletions(-) diff --git a/hercules_opt/src/forkify.rs b/hercules_opt/src/forkify.rs index 2f6466c0..a6308d66 100644 --- a/hercules_opt/src/forkify.rs +++ b/hercules_opt/src/forkify.rs @@ -115,15 +115,14 @@ pub fn forkify_loop( return false; }; - // FIXME: Make sure IV is not used outside the loop. - // Get bound let bound = match canonical_iv { InductionVariable::Basic { node: _, initializer: _, - update: _, final_value, + update_expression, + update_value, } => final_value .map(|final_value| get_node_as_dc(editor, final_value)) .and_then(|r| r.ok()), diff --git a/hercules_opt/src/ivar.rs b/hercules_opt/src/ivar.rs index edadd717..511371bf 100644 --- a/hercules_opt/src/ivar.rs +++ b/hercules_opt/src/ivar.rs @@ -43,7 +43,8 @@ nest! 
{ pub Basic { node: NodeID, initializer: NodeID, - update: NodeID, + update_expression: NodeID, + update_value: NodeID, final_value: Option<NodeID>, }, SCEV(NodeID), // TODO @(xrouth) @@ -56,7 +57,8 @@ impl InductionVariable { InductionVariable::Basic { node, initializer: _, - update: _, + update_expression: _, + update_value: _, final_value: _, } => *node, InductionVariable::SCEV(_) => todo!(), @@ -75,12 +77,17 @@ pub fn calculate_loop_nodes(editor: &FunctionEditor, natural_loop: &Loop) -> Has // External Phi if let Node::Phi { control, data: _ } = data { match natural_loop.control.get(control.idx()) { - Some(v) => if !*v { - return true; - }, + Some(v) => { + if !*v { + return true; + } + } None => { - panic!("unexpceted index: {:?} for loop {:?}", control, natural_loop.header); - }, + panic!( + "unexpceted index: {:?} for loop {:?}", + control, natural_loop.header + ); + } } } // External Reduce @@ -91,24 +98,34 @@ pub fn calculate_loop_nodes(editor: &FunctionEditor, natural_loop: &Loop) -> Has } = data { match natural_loop.control.get(control.idx()) { - Some(v) => if !*v { - return true; - }, + Some(v) => { + if !*v { + return true; + } + } None => { - panic!("unexpceted index: {:?} for loop {:?}", control, natural_loop.header); - }, + panic!( + "unexpceted index: {:?} for loop {:?}", + control, natural_loop.header + ); + } } } // External Control if data.is_control() { match natural_loop.control.get(node.idx()) { - Some(v) => if !*v { - return true; - }, + Some(v) => { + if !*v { + return true; + } + } None => { - panic!("unexpceted index: {:?} for loop {:?}", node, natural_loop.header); - }, + panic!( + "unexpceted index: {:?} for loop {:?}", + node, natural_loop.header + ); + } } } @@ -332,13 +349,14 @@ pub fn has_const_fields(editor: &FunctionEditor, ivar: InductionVariable) -> boo InductionVariable::Basic { node: _, initializer, - update, final_value, + update_expression, + update_value, } => { if final_value.is_none() { return false; } - [initializer, 
update] + [initializer, update_value] .iter() .any(|node| !editor.node(node).is_constant()) } @@ -357,8 +375,9 @@ pub fn has_canonical_iv<'a>( InductionVariable::Basic { node: _, initializer, - update, final_value, + update_expression, + update_value, } => { (editor .node(initializer) @@ -366,9 +385,11 @@ pub fn has_canonical_iv<'a>( || editor .node(initializer) .is_zero_dc(&editor.get_dynamic_constants())) - && (editor.node(update).is_one_constant(&editor.get_constants()) + && (editor + .node(update_value) + .is_one_constant(&editor.get_constants()) || editor - .node(update) + .node(update_value) .is_one_dc(&editor.get_dynamic_constants())) && (final_value .map(|val| { @@ -458,8 +479,9 @@ pub fn compute_induction_vars( return Some(InductionVariable::Basic { node: phi_id, initializer: initializer_id, - update: b, final_value: None, + update_expression: *data_id, + update_value: b, }); } else { None @@ -476,13 +498,12 @@ pub fn compute_induction_vars( induction_variables } -// Find loop iterations -pub fn compute_iv_ranges( +pub fn get_loop_condition_ivs( editor: &FunctionEditor, l: &Loop, - induction_vars: Vec<InductionVariable>, + induction_vars: &Vec<InductionVariable>, loop_condition: &LoopExit, -) -> Vec<InductionVariable> { +) -> HashSet<NodeID> { let condition_node = match loop_condition { LoopExit::Conditional { if_node: _, @@ -507,11 +528,39 @@ pub fn compute_iv_ranges( // Bound IVs used in loop bound. 
let loop_bound_uses: HashSet<_> = walk_all_uses_stop_on(*condition_node, editor, stop_on).collect(); + + HashSet::from_iter(induction_vars.iter().filter_map(|iv| { + if loop_bound_uses.contains(&iv.phi()) { + Some(iv.phi()) + } else { + None + } + })) +} + +// Find loop iterations +pub fn compute_iv_ranges( + editor: &FunctionEditor, + l: &Loop, + induction_vars: Vec<InductionVariable>, + loop_condition: &LoopExit, +) -> Vec<InductionVariable> { + let condition_node = match loop_condition.clone() { + LoopExit::Conditional { + if_node: _, + condition_node, + } => condition_node, + LoopExit::Unconditional(_) => todo!(), + }; + + let loop_bound_iv_phis = get_loop_condition_ivs(editor, l, &induction_vars, loop_condition); + let (loop_bound_ivs, other_ivs): (Vec<InductionVariable>, Vec<InductionVariable>) = induction_vars .into_iter() - .partition(|f| loop_bound_uses.contains(&f.phi())); + .partition(|f| loop_bound_iv_phis.contains(&f.phi())); + // Assume there is only one loop bound iv. let Some(iv) = loop_bound_ivs.first() else { return other_ivs; }; @@ -520,6 +569,8 @@ pub fn compute_iv_ranges( return loop_bound_ivs.into_iter().chain(other_ivs).collect(); } + // Bound IVs used in the loop condition. + // FIXME: Do linear algebra to solve for loop bounds with multiple variables involved. 
let final_value = match &editor.func().nodes[condition_node.idx()] { Node::Phi { @@ -570,8 +621,9 @@ pub fn compute_iv_ranges( InductionVariable::Basic { node: _, initializer: _, - update, final_value: _, + update_expression: _, + update_value: update, } => update, InductionVariable::SCEV(_) => todo!(), }; @@ -604,12 +656,14 @@ pub fn compute_iv_ranges( InductionVariable::Basic { node, initializer, - update, final_value: _, + update_expression, + update_value, } => InductionVariable::Basic { node: *node, initializer: *initializer, - update: *update, + update_expression: *update_expression, + update_value: *update_value, final_value, }, InductionVariable::SCEV(_) => todo!(), diff --git a/hercules_opt/src/lib.rs b/hercules_opt/src/lib.rs index a349230e..b56f9408 100644 --- a/hercules_opt/src/lib.rs +++ b/hercules_opt/src/lib.rs @@ -16,6 +16,7 @@ pub mod inline; pub mod interprocedural_sroa; pub mod ivar; pub mod lift_dc_math; +pub mod loop_bound_canon; pub mod outline; pub mod phi_elim; pub mod pred; @@ -43,6 +44,7 @@ pub use crate::inline::*; pub use crate::interprocedural_sroa::*; pub use crate::ivar::*; pub use crate::lift_dc_math::*; +pub use crate::loop_bound_canon::*; pub use crate::outline::*; pub use crate::phi_elim::*; pub use crate::pred::*; diff --git a/hercules_opt/src/schedule.rs b/hercules_opt/src/schedule.rs index 7ecf07a4..d95a3585 100644 --- a/hercules_opt/src/schedule.rs +++ b/hercules_opt/src/schedule.rs @@ -157,10 +157,7 @@ pub fn infer_monoid_reduce( let is_binop_monoid = |op| { matches!( op, - BinaryOperator::Add - | BinaryOperator::Mul - | BinaryOperator::Or - | BinaryOperator::And + BinaryOperator::Add | BinaryOperator::Mul | BinaryOperator::Or | BinaryOperator::And ) }; let is_intrinsic_monoid = |intrinsic| matches!(intrinsic, Intrinsic::Max | Intrinsic::Min); @@ -172,9 +169,9 @@ pub fn infer_monoid_reduce( init: _, reduct, } = func.nodes[id.idx()] - && (matches!(func.nodes[reduct.idx()], Node::Binary { left, right, op } - if ((left == id && 
!reduce_cycles[&id].contains(&right)) || - (right == id && !reduce_cycles[&id].contains(&left))) && + && (matches!(func.nodes[reduct.idx()], Node::Binary { left, right, op } + if ((left == id && !reduce_cycles[&id].contains(&right)) || + (right == id && !reduce_cycles[&id].contains(&left))) && is_binop_monoid(op)) || matches!(&func.nodes[reduct.idx()], Node::IntrinsicCall { intrinsic, args } if (args.contains(&id) && is_intrinsic_monoid(*intrinsic) && diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index f8fdf2ef..e9b8f11f 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -394,7 +394,7 @@ unsafe impl Sync for __RawPtrSendSync {} * * The data held at all of its non-None allocations and references is maintained so that it is the * same, and so methods will attempt to use the reference or allocation that is most convenient. - * + * * HerculesImmBox hold references to immutable memory only. All operations on these is through * immutable references, though internally it uses OnceLocks to protect its resources since the Box * may be used in multiple parallel threads if it is used in parallel Hercules code invocation. 
@@ -499,9 +499,9 @@ impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesImmBox<'a, T> { } } -impl<'a, T> HerculesImmBox<'a, T> -where - T: Default + Clone +impl<'a, T> HerculesImmBox<'a, T> +where + T: Default + Clone, { pub fn as_slice(&'a self) -> &'a [T] { self.as_cpu_ref().as_slice() @@ -517,18 +517,23 @@ where } else { #[cfg(feature = "cuda")] if let Some(cuda_ref) = self.cuda_ref.get() { - return - self.cpu_ref.get_or_init(|| { + return self + .cpu_ref + .get_or_init(|| { let elements = unsafe { cuda_ref.__size() / size_of::<T>() }; let mut alloc = Vec::new(); alloc.resize_with(elements, Default::default); let _ = cuda_ref.clone().to_cpu_ref(&mut alloc); - self.cpu_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cpu_alloc was set unexpectedly"); + self.cpu_alloc + .set(alloc) + .map_err(|_| ()) + .expect("HerculesImmBox cpu_alloc was set unexpectedly"); let alloc = self.cpu_alloc.get().unwrap(); HerculesCPURef::from_slice(alloc) - }).clone(); + }) + .clone(); } panic!("HerculesImmBox has no reference to data") @@ -541,13 +546,19 @@ where cuda_ref.clone() } else { if let Some(cpu_ref) = self.cpu_ref.get() { - return self.cuda_ref.get_or_init(|| { - // Copy data to CUDA device - let alloc = CUDABox::from_cpu_ref(cpu_ref.clone()); - self.cuda_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cuda_alloc was set unexpectedly"); - - self.cuda_alloc.get().unwrap().get_ref() - }).clone(); + return self + .cuda_ref + .get_or_init(|| { + // Copy data to CUDA device + let alloc = CUDABox::from_cpu_ref(cpu_ref.clone()); + self.cuda_alloc + .set(alloc) + .map_err(|_| ()) + .expect("HerculesImmBox cuda_alloc was set unexpectedly"); + + self.cuda_alloc.get().unwrap().get_ref() + }) + .clone(); } panic!("HerculesImmBox has no reference to data") @@ -651,7 +662,7 @@ impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> { impl<'a, T> HerculesMutBox<'a, T> where - T: Default + Clone + T: Default + Clone, { pub fn as_slice(&'a mut self) -> &'a mut [T] 
{ self.as_cpu_ref().as_slice() @@ -659,42 +670,41 @@ where pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> { match self.loc { - HerculesMutBoxLocation::CPU => { - match self.cpu_alloc { - Allocation::None => panic!("No CPU reference"), - Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val), - Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val), - } - } + HerculesMutBoxLocation::CPU => match self.cpu_alloc { + Allocation::None => panic!("No CPU reference"), + Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val), + Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val), + }, #[cfg(feature = "cuda")] HerculesMutBoxLocation::CUDA => { - let cuda_ref : HerculesCUDARef<'a> = - match self.cuda_alloc { - Allocation::None => panic!("No GPU reference"), - Allocation::Reference(ref mut val) => val.dup().as_ref(), - Allocation::Allocation(ref val) => val.get_ref(), - }; + let cuda_ref: HerculesCUDARef<'a> = match self.cuda_alloc { + Allocation::None => panic!("No GPU reference"), + Allocation::Reference(ref mut val) => val.dup().as_ref(), + Allocation::Allocation(ref val) => val.get_ref(), + }; let elements = unsafe { cuda_ref.__size() / size_of::<T>() }; // Allocate host memory (if needed) - let cpu_alloc : Allocation<&'a mut [T], Vec<T>> = - match self.cpu_alloc.take() { - Allocation::Reference(val) if val.len() == elements => Allocation::Reference(val), - Allocation::Allocation(val) if val.len() == elements => Allocation::Allocation(val), - _ => { - let mut alloc = Vec::new(); - alloc.resize_with(elements, Default::default); - Allocation::Allocation(alloc) - } - }; + let cpu_alloc: Allocation<&'a mut [T], Vec<T>> = match self.cpu_alloc.take() { + Allocation::Reference(val) if val.len() == elements => { + Allocation::Reference(val) + } + Allocation::Allocation(val) if val.len() == elements => { + Allocation::Allocation(val) + } + _ => { + let mut alloc = Vec::new(); + 
alloc.resize_with(elements, Default::default); + Allocation::Allocation(alloc) + } + }; self.cpu_alloc = cpu_alloc; - let cpu_ref : &'a mut [T] = - match &mut self.cpu_alloc { - Allocation::None => panic!(), - Allocation::Reference(val) => val, - Allocation::Allocation(val) => val, - }; + let cpu_ref: &'a mut [T] = match &mut self.cpu_alloc { + Allocation::None => panic!(), + Allocation::Reference(val) => val, + Allocation::Allocation(val) => val, + }; // Transfer data from CUDA device let cpu_ref = cuda_ref.to_cpu_ref(cpu_ref); @@ -709,31 +719,32 @@ where pub fn as_cuda_ref(&'a mut self) -> HerculesCUDARefMut<'a> { match self.loc { HerculesMutBoxLocation::CPU => { - let cpu_ref : &'a [T] = - match self.cpu_alloc { - Allocation::None => panic!("No CPU reference"), - Allocation::Reference(ref val) => val, - Allocation::Allocation(ref val) => val, - }; + let cpu_ref: &'a [T] = match self.cpu_alloc { + Allocation::None => panic!("No CPU reference"), + Allocation::Reference(ref val) => val, + Allocation::Allocation(ref val) => val, + }; let size = cpu_ref.len() * size_of::<T>(); - let (cuda_alloc, copied) = - match self.cuda_alloc.take() { - Allocation::Reference(val) if unsafe { val.__size() == size } => (Allocation::Reference(val), false), - Allocation::Allocation(val) if val.get_bytes() == size => (Allocation::Allocation(val), false), - _ => { - let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref)); - (Allocation::Allocation(alloc), true) - } - }; + let (cuda_alloc, copied) = match self.cuda_alloc.take() { + Allocation::Reference(val) if unsafe { val.__size() == size } => { + (Allocation::Reference(val), false) + } + Allocation::Allocation(val) if val.get_bytes() == size => { + (Allocation::Allocation(val), false) + } + _ => { + let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref)); + (Allocation::Allocation(alloc), true) + } + }; self.cuda_alloc = cuda_alloc; - let cuda_ref = - match self.cuda_alloc { - Allocation::None => 
panic!(), - Allocation::Reference(ref mut val) => val.dup(), - Allocation::Allocation(ref mut val) => val.get_ref_mut(), - }; + let cuda_ref = match self.cuda_alloc { + Allocation::None => panic!(), + Allocation::Reference(ref mut val) => val.dup(), + Allocation::Allocation(ref mut val) => val.get_ref_mut(), + }; if !copied { unsafe { @@ -744,13 +755,11 @@ where self.loc = HerculesMutBoxLocation::CUDA; cuda_ref } - HerculesMutBoxLocation::CUDA => { - match self.cuda_alloc { - Allocation::None => panic!("No GPU reference"), - Allocation::Reference(ref mut val) => val.dup(), - Allocation::Allocation(ref mut val) => val.get_ref_mut(), - } - } + HerculesMutBoxLocation::CUDA => match self.cuda_alloc { + Allocation::None => panic!("No GPU reference"), + Allocation::Reference(ref mut val) => val.dup(), + Allocation::Allocation(ref mut val) => val.get_ref_mut(), + }, } } } @@ -760,7 +769,8 @@ pub trait HerculesImmBoxTo<'a, T> { } impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T> -where T: Default + Clone +where + T: Default + Clone, { fn to(&'a self) -> HerculesCPURef<'a> { self.as_cpu_ref() @@ -769,7 +779,8 @@ where T: Default + Clone #[cfg(feature = "cuda")] impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T> -where T: Default + Clone +where + T: Default + Clone, { fn to(&'a self) -> HerculesCUDARef<'a> { self.as_cuda_ref() @@ -781,7 +792,8 @@ pub trait HerculesMutBoxTo<'a, T> { } impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> -where T: Default + Clone +where + T: Default + Clone, { fn to(&'a mut self) -> HerculesCPURefMut<'a> { self.as_cpu_ref() @@ -790,7 +802,8 @@ where T: Default + Clone #[cfg(feature = "cuda")] impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> -where T: Default + Clone +where + T: Default + Clone, { fn to(&'a mut self) -> HerculesCUDARefMut<'a> { self.as_cuda_ref() diff --git a/hercules_test/hercules_tests/tests/loop_tests.rs 
b/hercules_test/hercules_tests/tests/loop_tests.rs index f42a6520..64339520 100644 --- a/hercules_test/hercules_tests/tests/loop_tests.rs +++ b/hercules_test/hercules_tests/tests/loop_tests.rs @@ -409,10 +409,7 @@ fn matmul_pipeline() { // }; // assert_eq!(correct_c[0], value); - let schedule = Some(default_schedule![ - Xdot, - Verify, - ]); + let schedule = Some(default_schedule![Xdot, Verify,]); module = run_schedule_on_hercules(module, schedule).unwrap(); diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs index 21ccd7c4..cd715cac 100644 --- a/juno_samples/fork_join_tests/src/main.rs +++ b/juno_samples/fork_join_tests/src/main.rs @@ -45,12 +45,12 @@ fn main() { let mut r = runner!(test6); let output = r.run(73).await; - let correct = (73i32..73i32+1024i32).collect(); + let correct = (73i32..73i32 + 1024i32).collect(); assert(&correct, output); let mut r = runner!(test7); let output = r.run(42).await; - let correct: i32 = (42i32..42i32+32i32).sum(); + let correct: i32 = (42i32..42i32 + 32i32).sum(); assert_eq!(correct, output); let mut r = runner!(test8); diff --git a/juno_samples/median_window/build.rs b/juno_samples/median_window/build.rs index a6c29d5b..3ce241e4 100644 --- a/juno_samples/median_window/build.rs +++ b/juno_samples/median_window/build.rs @@ -4,7 +4,11 @@ fn main() { JunoCompiler::new() .file_in_src("median.jn") .unwrap() - .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) + .schedule_in_src(if cfg!(feature = "cuda") { + "gpu.sch" + } else { + "cpu.sch" + }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/median_window/src/main.rs b/juno_samples/median_window/src/main.rs index c515ac4b..63479dba 100644 --- a/juno_samples/median_window/src/main.rs +++ b/juno_samples/median_window/src/main.rs @@ -5,18 +5,14 @@ juno_build::juno!("median"); use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo}; fn main() { - let m = vec![86, 72, 14, 5, 55, - 25, 98, 89, 3, 66, - 44, 
81, 27, 3, 40, - 18, 4, 57, 93, 34, - 70, 50, 50, 18, 34]; + let m = vec![ + 86, 72, 14, 5, 55, 25, 98, 89, 3, 66, 44, 81, 27, 3, 40, 18, 4, 57, 93, 34, 70, 50, 50, 18, + 34, + ]; let m = HerculesImmBox::from(m.as_slice()); let mut r = runner!(median_window); - let res = - async_std::task::block_on(async { - r.run(m.to()).await - }); + let res = async_std::task::block_on(async { r.run(m.to()).await }); assert_eq!(res, 57); } diff --git a/juno_samples/products/src/main.rs b/juno_samples/products/src/main.rs index b8abb59d..9a1e6ac2 100644 --- a/juno_samples/products/src/main.rs +++ b/juno_samples/products/src/main.rs @@ -7,14 +7,18 @@ juno_build::juno!("products"); fn main() { async_std::task::block_on(async { let input = vec![(0, 1), (2, 3)]; - let input : HerculesImmBox<(i32, i32)> = HerculesImmBox::from(input.as_slice()); + let input: HerculesImmBox<(i32, i32)> = HerculesImmBox::from(input.as_slice()); let mut r = runner!(product_read); - let res : Vec<i32> = HerculesMutBox::from(r.run(input.to()).await).as_slice().to_vec(); + let res: Vec<i32> = HerculesMutBox::from(r.run(input.to()).await) + .as_slice() + .to_vec(); assert_eq!(res, vec![0, 1, 2, 3]); // Technically this returns a product of two i32s, but we can interpret that as an array let mut r = runner!(product_return); - let res : Vec<i32> = HerculesMutBox::from(r.run(42, 17).await).as_slice().to_vec(); + let res: Vec<i32> = HerculesMutBox::from(r.run(42, 17).await) + .as_slice() + .to_vec(); assert_eq!(res, vec![42, 17]); }); } diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs index fc2a729e..1a4cb623 100644 --- a/juno_scheduler/src/compile.rs +++ b/juno_scheduler/src/compile.rs @@ -134,6 +134,7 @@ impl FromStr for Appliable { "fork-unroll" | "unroll" => Ok(Appliable::Pass(ir::Pass::ForkUnroll)), "fork-fusion" | "fusion" => Ok(Appliable::Pass(ir::Pass::ForkFusion)), "lift-dc-math" => Ok(Appliable::Pass(ir::Pass::LiftDCMath)), + "loop-bound-canon" => 
Ok(Appliable::Pass(ir::Pass::LoopBoundCanon)), "outline" => Ok(Appliable::Pass(ir::Pass::Outline)), "phi-elim" => Ok(Appliable::Pass(ir::Pass::PhiElim)), "predication" => Ok(Appliable::Pass(ir::Pass::Predication)), diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs index bf3fe037..5b6bd297 100644 --- a/juno_scheduler/src/ir.rs +++ b/juno_scheduler/src/ir.rs @@ -26,6 +26,7 @@ pub enum Pass { Inline, InterproceduralSROA, LiftDCMath, + LoopBoundCanon, Outline, PhiElim, Predication, diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 8db79b46..5011b52b 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -2726,6 +2726,32 @@ fn run_pass( Pass::Print => { println!("{:?}", args.get(0)); } + Pass::LoopBoundCanon => { + assert_eq!(args.len(), 0); + + pm.make_fork_join_maps(); + pm.make_loops(); + pm.make_control_subgraphs(); + let fork_join_maps = pm.fork_join_maps.take().unwrap(); + let loops = pm.loops.take().unwrap(); + let control_subgraphs = pm.control_subgraphs.take().unwrap(); + + for (((func, fork_join_map), loops), control_subgraph) in + build_selection(pm, selection, false) + .into_iter() + .zip(fork_join_maps.iter()) + .zip(loops.iter()) + .zip(control_subgraphs.iter()) + { + let Some(mut func) = func else { + continue; + }; + loop_bound_canon_toplevel(&mut func, fork_join_map, control_subgraph, loops); + changed |= func.modified(); + } + pm.delete_gravestones(); + pm.clear_analyses(); + } } println!("Ran Pass: {:?}", pass); -- GitLab From 78e484d35b62e43d5dc8c0bb1aa4c122879986bd Mon Sep 17 00:00:00 2001 From: Xavier Routh <xrouth2@illinois.edu> Date: Sun, 16 Feb 2025 18:13:57 -0600 Subject: [PATCH 2/3] add file oops --- hercules_opt/src/loop_bound_canon.rs | 314 +++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100644 hercules_opt/src/loop_bound_canon.rs diff --git a/hercules_opt/src/loop_bound_canon.rs b/hercules_opt/src/loop_bound_canon.rs new file mode 100644 index 00000000..0dce9c28 
--- /dev/null +++ b/hercules_opt/src/loop_bound_canon.rs @@ -0,0 +1,314 @@ +use std::collections::HashMap; +use std::collections::HashSet; +use std::iter::zip; +use std::iter::FromIterator; + +use itertools::Itertools; +use nestify::nest; + +use hercules_ir::*; + +use crate::*; + +/* + * TODO: Forkify currently makes a bunch of small edits - this needs to be + * changed so that every loop that gets forkified corresponds to a single edit + * + sub-edits. This would allow us to run forkify on a subset of a function. + */ +pub fn loop_bound_canon_toplevel( + editor: &mut FunctionEditor, + fork_join_map: &HashMap<NodeID, NodeID>, + control_subgraph: &Subgraph, + loops: &LoopTree, +) -> bool { + let natural_loops = loops + .bottom_up_loops() + .into_iter() + .filter(|(k, _)| editor.func().nodes[k.idx()].is_region()); + + let natural_loops: Vec<_> = natural_loops.collect(); + + for l in natural_loops { + if editor.is_mutable(l.0) + && canonicalize_single_loop_bounds( + editor, + control_subgraph, + &Loop { + header: l.0, + control: l.1.clone(), + }, + ) + { + return true; + } + } + return false; +} + +pub fn canonicalize_single_loop_bounds( + editor: &mut FunctionEditor, + control_subgraph: &Subgraph, + l: &Loop, +) -> bool { + let function = editor.func(); + + let Some(loop_condition) = get_loop_exit_conditions(function, l, control_subgraph) else { + return false; + }; + + let LoopExit::Conditional { + if_node: loop_if, + condition_node, + } = loop_condition.clone() + else { + return false; + }; + + let loop_variance = compute_loop_variance(editor, l); + let ivs = compute_induction_vars(editor.func(), l, &loop_variance); + let ivs = compute_iv_ranges(editor, l, ivs, &loop_condition); + + if has_canonical_iv(editor, l, &ivs).is_some() { + // println!("has canon iv!"); + return true; + } + + let loop_bound_iv_phis = get_loop_condition_ivs(editor, l, &ivs, &loop_condition); + + let (loop_bound_ivs, _): (Vec<InductionVariable>, Vec<InductionVariable>) = ivs + .into_iter() + 
.partition(|f| loop_bound_iv_phis.contains(&f.phi())); + + // Assume there is only one loop bound iv. + if loop_bound_ivs.len() != 1 { + // println!("has multiple iv!"); + return false; + } + + let Some(iv) = loop_bound_ivs.first() else { + return false; + }; + + let InductionVariable::Basic { + node: iv_phi, + initializer, + final_value, + update_expression, + update_value, + } = iv + else { + return false; + }; + + let Some(final_value) = final_value else { + return false; + }; + + let Some(loop_pred) = editor + .get_uses(l.header) + .filter(|node| !l.control[node.idx()]) + .next() + else { + return false; + }; + + // If there is a guard, we need to edit it. + + // (init_id, bound_id, binop node, if node). + + // FIXME: This is quite fragile. + let guard_info: Option<(NodeID, NodeID, NodeID, NodeID)> = (|| { + let Node::Projection { + control, + selection: _, + } = editor.node(loop_pred) + else { + return None; + }; + + let Node::If { control, cond } = editor.node(control) else { + return None; + }; + + let Node::Binary { left, right, op } = editor.node(cond) else { + return None; + }; + + let Node::Binary { + left: _, + right: _, + op: loop_op, + } = editor.node(condition_node) + else { + return None; + }; + + if op != loop_op { + return None; + } + + if left != initializer { + return None; + } + + if right != final_value { + return None; + } + + return Some((*left, *right, *cond, *control)); + })(); + + // // If guard is none, if some, make sure it is a good guard! move on + // if let Some((init_id, bound_id, binop_node, if_node))= potential_guard_info { + + // }; + + // let fork_guard_condition = + + // Lift dc math should make all constant into DCs, so these should all be DCs. 
+ let Node::DynamicConstant { id: init_dc_id } = *editor.node(initializer) else { + return false; + }; + let Node::DynamicConstant { id: update_dc_id } = *editor.node(update_value) else { + return false; + }; + + // We are assuming this is a simple loop bound (i.e. only one induction variable involved), so that the final value is itself a dynamic constant. + let Node::DynamicConstant { + id: loop_bound_dc_id, + } = *editor.node(final_value) + else { + return false; + }; + + // We need to do 4 (5) things, which are mostly separate. + + // 0) Make the update into addition. + // 1) Make the update a positive value. + // 2) Transform the condition into a `<` + // 3) Adjust update to be 1 (and bounds). + // 4) Change init to start from 0. + + // 5) Find some way to get fork-guard-elim to work with the new fork. + // ideally, this goes in fork-guard-elim, but for now we hack it to change the guard condition bounds + // here when we edit the loop bounds. + + // Right now we are just going to do (4), because I am lazy! + + // Collect info about the loop condition transformation. + let mut dc_bound_node = match *editor.node(condition_node) { + Node::Binary { left, right, op } => match op { + BinaryOperator::LT => { + if left == *update_expression && editor.node(right).is_dynamic_constant() { + right + } else { + return false; + } + } + BinaryOperator::LTE => todo!(), + BinaryOperator::GT => todo!(), + BinaryOperator::GTE => todo!(), + BinaryOperator::EQ => todo!(), + BinaryOperator::NE => todo!(), + BinaryOperator::Or => todo!(), + BinaryOperator::And => todo!(), + BinaryOperator::Xor => todo!(), + _ => panic!(), + }, + _ => return false, + }; + + let Node::DynamicConstant { + id: bound_node_dc_id, + } = *editor.node(dc_bound_node) + else { + return false; + }; + + // If increment is negative (how in the world do we know that...) + // Increment can be DefinitelyPositive, Unknown, DefinitelyNegative. + + // // First, massage loop condition to be <, because that is normal! 
+ // Also includes + // editor.edit(|mut edit| { + + // } + // Collect immediate IV users + + let update_expr_users: Vec<_> = editor + .get_users(*update_expression) + .filter(|node| *node != iv.phi() && *node != condition_node) + .collect(); + // println!("update_expr_users: {:?}", update_expr_users); + let iv_phi_users: Vec<_> = editor + .get_users(iv.phi()) + .filter(|node| *node != iv.phi() && *node != *update_expression) + .collect(); + + // println!(" iv_phi_users: {:?}", iv_phi_users); + + let result = editor.edit(|mut edit| { + // 4) Second, change loop IV to go from 0..N. + // we subtract off init from init and dc_bound_node, + // and then we add it back to uses of the IV. + let new_init_dc = DynamicConstant::Constant(0); + let new_init = Node::DynamicConstant { + id: edit.add_dynamic_constant(new_init_dc), + }; + let new_init = edit.add_node(new_init); + edit = edit.replace_all_uses_where(*initializer, new_init, |usee| *usee == iv.phi())?; + + let new_condition_id = DynamicConstant::sub(bound_node_dc_id, init_dc_id); + let new_condition = Node::DynamicConstant { + id: edit.add_dynamic_constant(new_condition_id), + }; + let new_condition = edit.add_node(new_condition); + edit = edit + .replace_all_uses_where(dc_bound_node, new_condition, |usee| *usee == condition_node)?; + + // Change loop guard: + if let Some((init_id, bound_id, binop_node, if_node)) = guard_info { + edit = edit.replace_all_uses_where(init_id, new_init, |usee| *usee == binop_node)?; + edit = + edit.replace_all_uses_where(bound_id, new_condition, |usee| *usee == binop_node)?; + } + + // Add back to uses of the IV + for user in update_expr_users { + let new_user = Node::Binary { + left: user, + right: *initializer, + op: BinaryOperator::Add, + }; + let new_user = edit.add_node(new_user); + edit = edit.replace_all_uses(user, new_user)?; + } + + let new_user = Node::Binary { + left: *update_expression, + right: *initializer, + op: BinaryOperator::Add, + }; + let new_user = 
edit.add_node(new_user); + edit = edit.replace_all_uses_where(*update_expression, new_user, |usee| { + *usee != iv.phi() + && *usee != *update_expression + && *usee != new_user + && *usee != condition_node + })?; + + let new_user = Node::Binary { + left: *iv_phi, + right: *initializer, + op: BinaryOperator::Add, + }; + let new_user = edit.add_node(new_user); + edit = edit.replace_all_uses_where(*iv_phi, new_user, |usee| { + *usee != iv.phi() && *usee != *update_expression && *usee != new_user + })?; + + Ok(edit) + }); + + return result; +} -- GitLab From e19a549c7e26b1b19f0fcfe25eddc7e3b946f10b Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 16 Feb 2025 18:23:03 -0600 Subject: [PATCH 3/3] remove comment --- hercules_opt/src/loop_bound_canon.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/hercules_opt/src/loop_bound_canon.rs b/hercules_opt/src/loop_bound_canon.rs index 0dce9c28..680236f1 100644 --- a/hercules_opt/src/loop_bound_canon.rs +++ b/hercules_opt/src/loop_bound_canon.rs @@ -10,11 +10,6 @@ use hercules_ir::*; use crate::*; -/* - * TODO: Forkify currently makes a bunch of small edits - this needs to be - * changed so that every loop that gets forkified corresponds to a single edit - * + sub-edits. This would allow us to run forkify on a subset of a function. - */ pub fn loop_bound_canon_toplevel( editor: &mut FunctionEditor, fork_join_map: &HashMap<NodeID, NodeID>, -- GitLab