From 91f3b55cb2d0e66301508d312d0c487cf295a81a Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 20 Feb 2025 10:52:31 -0600 Subject: [PATCH 1/9] slow GPU schedule --- juno_samples/edge_detection/src/gpu.sch | 85 +++++++++++++++++++++---- 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index 1e51efb9..3da40fd3 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -1,22 +1,79 @@ -gvn(*); -phi-elim(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} -gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); +simpl!(*); ip-sroa(*); sroa(*); -dce(*); -gvn(*); -phi-elim(*); -dce(*); +simpl!(*); -//forkify(*); -infer-schedules(*); +no-memset(gaussian_smoothing@res); +fixpoint { + forkify(gaussian_smoothing); + fork-guard-elim(gaussian_smoothing); + fork-coalesce(gaussian_smoothing); +} +predication(gaussian_smoothing); +simpl!(gaussian_smoothing); +predication(gaussian_smoothing); +simpl!(gaussian_smoothing); -gcm(*); +no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); +fixpoint { + forkify(laplacian_estimate); + fork-guard-elim(laplacian_estimate); + fork-coalesce(laplacian_estimate); +} +simpl!(laplacian_estimate); + +no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); +fixpoint { + forkify(zero_crossings); + fork-guard-elim(zero_crossings); + fork-coalesce(zero_crossings); +} +simpl!(zero_crossings); + +no-memset(gradient@res); +fixpoint { + forkify(gradient); + fork-guard-elim(gradient); + fork-coalesce(gradient); +} +predication(gradient); +simpl!(gradient); +predication(gradient); +simpl!(gradient); + +fixpoint { + forkify(max_gradient); + fork-guard-elim(max_gradient); + fork-coalesce(max_gradient); +} 
+simpl!(max_gradient); + +no-memset(reject_zero_crossings@res); fixpoint { - float-collections(*); - dce(*); - gcm(*); + forkify(reject_zero_crossings); + fork-guard-elim(reject_zero_crossings); + fork-coalesce(reject_zero_crossings); } +predication(reject_zero_crossings); +simpl!(reject_zero_crossings); + +async-call(edge_detection@le, edge_detection@zc); + +gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); + +simpl!(*); + +delete-uncalled(*); +gcm(*); + -- GitLab From 5ab36921092a378d02f1efb6791944e6347b6085 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 20 Feb 2025 11:43:05 -0600 Subject: [PATCH 2/9] Speed up max gradient with tiling + cooperative groups --- hercules_cg/src/gpu.rs | 6 +++--- juno_samples/edge_detection/src/gpu.sch | 10 ++++++++-- juno_samples/edge_detection/src/lib.rs | 5 +++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 17f0f893..73dcf528 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1383,15 +1383,15 @@ extern \"C\" {} {}(", let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use); #[allow(unreachable_patterns)] let cg_op = match intrinsic { - Intrinsic::Max => "max", - Intrinsic::Min => "min", + Intrinsic::Max => "greater", + Intrinsic::Min => "less", _ => unreachable!(), }; let id_type_name = self.get_type(id_type, false); write!( w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", - tabs, define_variable, non_reduce_arg, cg_tile, cg_op, id_type_name + tabs, define_variable, cg_tile, non_reduce_arg, cg_op, id_type_name )?; } else { let ty = &self.types[id_type.idx()]; diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index 3da40fd3..ad3ec65c 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -8,6 +8,8 @@ macro simpl!(X) { infer-schedules(X); } 
+gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); + simpl!(*); ip-sroa(*); @@ -58,6 +60,12 @@ fixpoint { fork-coalesce(max_gradient); } simpl!(max_gradient); +fork-dim-merge(max_gradient); +simpl!(max_gradient); +fork-tile[32, 0, false, true](max_gradient); +simpl!(max_gradient); +fork-split(max_gradient); +simpl!(max_gradient); no-memset(reject_zero_crossings@res); fixpoint { @@ -70,8 +78,6 @@ simpl!(reject_zero_crossings); async-call(edge_detection@le, edge_detection@zc); -gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); - simpl!(*); delete-uncalled(*); diff --git a/juno_samples/edge_detection/src/lib.rs b/juno_samples/edge_detection/src/lib.rs index 6c2a15bd..dab84cf6 100644 --- a/juno_samples/edge_detection/src/lib.rs +++ b/juno_samples/edge_detection/src/lib.rs @@ -143,6 +143,11 @@ pub fn edge_detection_harness(args: EdgeDetectionInputs) { num_frames }; + println!( + "Running edge with {} rows, {} columns, {} gs, {} sz, and {} sb.", + height, width, gs, sz, sb, + ); + let mut r = runner!(edge_detection); let mut output = output.map(|filename| { -- GitLab From 0440711edb6404df0c60156dd6914ef955ab6abb Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 20 Feb 2025 12:09:38 -0600 Subject: [PATCH 3/9] progress --- hercules_opt/src/editor.rs | 20 ++++++++++++++++++++ juno_samples/edge_detection/src/gpu.sch | 11 +++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs index 16e5c326..b33dc956 100644 --- a/hercules_opt/src/editor.rs +++ b/hercules_opt/src/editor.rs @@ -795,6 +795,26 @@ impl<'a, 'b> FunctionEdit<'a, 'b> { self.add_constant(constant_to_construct) } + pub fn add_pos_inf_constant(&mut self, id: TypeID) -> ConstantID { + let ty = self.get_type(id).clone(); + let constant_to_construct = match ty { + Type::Float32 => 
Constant::Float32(ordered_float::OrderedFloat(f32::INFINITY)), + Type::Float64 => Constant::Float64(ordered_float::OrderedFloat(f64::INFINITY)), + _ => panic!(), + }; + self.add_constant(constant_to_construct) + } + + pub fn add_neg_inf_constant(&mut self, id: TypeID) -> ConstantID { + let ty = self.get_type(id).clone(); + let constant_to_construct = match ty { + Type::Float32 => Constant::Float32(ordered_float::OrderedFloat(f32::NEG_INFINITY)), + Type::Float64 => Constant::Float64(ordered_float::OrderedFloat(f64::NEG_INFINITY)), + _ => panic!(), + }; + self.add_constant(constant_to_construct) + } + pub fn get_constant(&self, id: ConstantID) -> impl Deref<Target = Constant> + '_ { if id.idx() < self.editor.constants.borrow().len() { Either::Left(Ref::map(self.editor.constants.borrow(), |constants| { diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index ad3ec65c..2a8960ee 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -62,10 +62,17 @@ fixpoint { simpl!(max_gradient); fork-dim-merge(max_gradient); simpl!(max_gradient); -fork-tile[32, 0, false, true](max_gradient); +fork-tile[1024, 0, false, true](max_gradient); +let out = fork-split(max_gradient); +fork-tile[32, 0, false, true](out._4_max_gradient.fj1); +let out = fork-split(max_gradient); simpl!(max_gradient); -fork-split(max_gradient); +xdot[true](max_gradient); +clean-monoid-reduces(max_gradient); +xdot[true](max_gradient); +fork-fission-bufferize[out._4_max_gradient.fj0, out._4_max_gradient.fj1](max_gradient); simpl!(max_gradient); +xdot[true](max_gradient); no-memset(reject_zero_crossings@res); fixpoint { -- GitLab From 3adec0d65e6c765d6b9a61436c6df9597d602749 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 20 Feb 2025 14:55:30 -0600 Subject: [PATCH 4/9] Add min/max to clean monoid reduce --- hercules_ir/src/ir.rs | 52 ++++++++++++++++++++----- hercules_opt/src/editor.rs | 34 
++++++++++++++-- hercules_opt/src/fork_transforms.rs | 38 ++++++++++++++++++ hercules_opt/src/utils.rs | 26 +++++++++++++ juno_samples/edge_detection/src/cpu.sch | 2 + 5 files changed, 138 insertions(+), 14 deletions(-) diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index bf9698b3..f91efe58 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -1050,6 +1050,38 @@ impl Constant { _ => false, } } + + pub fn is_largest(&self) -> bool { + match self { + Constant::Integer8(i8::MAX) => true, + Constant::Integer16(i16::MAX) => true, + Constant::Integer32(i32::MAX) => true, + Constant::Integer64(i64::MAX) => true, + Constant::UnsignedInteger8(u8::MAX) => true, + Constant::UnsignedInteger16(u16::MAX) => true, + Constant::UnsignedInteger32(u32::MAX) => true, + Constant::UnsignedInteger64(u64::MAX) => true, + Constant::Float32(ord) => *ord == OrderedFloat::<f32>(f32::INFINITY), + Constant::Float64(ord) => *ord == OrderedFloat::<f64>(f64::INFINITY), + _ => false, + } + } + + pub fn is_smallest(&self) -> bool { + match self { + Constant::Integer8(i8::MIN) => true, + Constant::Integer16(i16::MIN) => true, + Constant::Integer32(i32::MIN) => true, + Constant::Integer64(i64::MIN) => true, + Constant::UnsignedInteger8(u8::MIN) => true, + Constant::UnsignedInteger16(u16::MIN) => true, + Constant::UnsignedInteger32(u32::MIN) => true, + Constant::UnsignedInteger64(u64::MIN) => true, + Constant::Float32(ord) => *ord == OrderedFloat::<f32>(f32::NEG_INFINITY), + Constant::Float64(ord) => *ord == OrderedFloat::<f64>(f64::NEG_INFINITY), + _ => false, + } + } } impl DynamicConstant { @@ -1098,19 +1130,19 @@ impl DynamicConstant { } pub fn is_zero(&self) -> bool { - if *self == DynamicConstant::Constant(0) { - true - } else { - false - } + *self == DynamicConstant::Constant(0) } pub fn is_one(&self) -> bool { - if *self == DynamicConstant::Constant(1) { - true - } else { - false - } + *self == DynamicConstant::Constant(1) + } + + pub fn is_largest(&self) -> bool { + 
*self == DynamicConstant::Constant(usize::MAX) + } + + pub fn is_smallest(&self) -> bool { + *self == DynamicConstant::Constant(usize::MIN) } pub fn try_parameter(&self) -> Option<usize> { diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs index b33dc956..57fe2042 100644 --- a/hercules_opt/src/editor.rs +++ b/hercules_opt/src/editor.rs @@ -795,22 +795,48 @@ impl<'a, 'b> FunctionEdit<'a, 'b> { self.add_constant(constant_to_construct) } - pub fn add_pos_inf_constant(&mut self, id: TypeID) -> ConstantID { + pub fn add_largest_constant(&mut self, id: TypeID) -> ConstantID { let ty = self.get_type(id).clone(); let constant_to_construct = match ty { + Type::Boolean => Constant::Boolean(true), + Type::Integer8 => Constant::Integer8(i8::MAX), + Type::Integer16 => Constant::Integer16(i16::MAX), + Type::Integer32 => Constant::Integer32(i32::MAX), + Type::Integer64 => Constant::Integer64(i64::MAX), + Type::UnsignedInteger8 => Constant::UnsignedInteger8(u8::MAX), + Type::UnsignedInteger16 => Constant::UnsignedInteger16(u16::MAX), + Type::UnsignedInteger32 => Constant::UnsignedInteger32(u32::MAX), + Type::UnsignedInteger64 => Constant::UnsignedInteger64(u64::MAX), + Type::Float8 | Type::BFloat16 => panic!(), Type::Float32 => Constant::Float32(ordered_float::OrderedFloat(f32::INFINITY)), Type::Float64 => Constant::Float64(ordered_float::OrderedFloat(f64::INFINITY)), - _ => panic!(), + Type::Control => panic!("PANIC: Can't create largest constant for the control type."), + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => { + panic!("PANIC: Can't create largest constant of a collection type.") + } }; self.add_constant(constant_to_construct) } - pub fn add_neg_inf_constant(&mut self, id: TypeID) -> ConstantID { + pub fn add_smallest_constant(&mut self, id: TypeID) -> ConstantID { let ty = self.get_type(id).clone(); let constant_to_construct = match ty { + Type::Boolean => Constant::Boolean(false), + Type::Integer8 => Constant::Integer8(i8::MIN), +
Type::Integer16 => Constant::Integer16(i16::MIN), + Type::Integer32 => Constant::Integer32(i32::MIN), + Type::Integer64 => Constant::Integer64(i64::MIN), + Type::UnsignedInteger8 => Constant::UnsignedInteger8(u8::MIN), + Type::UnsignedInteger16 => Constant::UnsignedInteger16(u16::MIN), + Type::UnsignedInteger32 => Constant::UnsignedInteger32(u32::MIN), + Type::UnsignedInteger64 => Constant::UnsignedInteger64(u64::MIN), + Type::Float8 | Type::BFloat16 => panic!(), Type::Float32 => Constant::Float32(ordered_float::OrderedFloat(f32::NEG_INFINITY)), Type::Float64 => Constant::Float64(ordered_float::OrderedFloat(f64::NEG_INFINITY)), - _ => panic!(), + Type::Control => panic!("PANIC: Can't create smallest constant for the control type."), + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => { + panic!("PANIC: Can't create smallest constant of a collection type.") + } }; self.add_constant(constant_to_construct) } diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index 283734a0..e635b3c0 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -1556,6 +1556,44 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) }); } + Node::IntrinsicCall { + intrinsic: Intrinsic::Max, + args: _, + } if !is_smallest(editor, init) => { + editor.edit(|mut edit| { + let smallest = edit.add_smallest_constant(typing[init.idx()]); + let smallest = edit.add_node(Node::Constant { id: smallest }); + edit.sub_edit(id, smallest); + edit = edit.replace_all_uses_where(init, smallest, |u| *u == id)?; + let final_op = edit.add_node(Node::IntrinsicCall { + intrinsic: Intrinsic::Max, + args: Box::new([init, id]), + }); + for u in out_uses { + edit.sub_edit(u, final_op); + } + edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) + }); + } + Node::IntrinsicCall { + intrinsic: Intrinsic::Min, + 
args: _, + } if !is_largest(editor, init) => { + editor.edit(|mut edit| { + let largest = edit.add_largest_constant(typing[init.idx()]); + let largest = edit.add_node(Node::Constant { id: largest }); + edit.sub_edit(id, largest); + edit = edit.replace_all_uses_where(init, largest, |u| *u == id)?; + let final_op = edit.add_node(Node::IntrinsicCall { + intrinsic: Intrinsic::Min, + args: Box::new([init, id]), + }); + for u in out_uses { + edit.sub_edit(u, final_op); + } + edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) + }); + } _ => {} } } diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs index 1806d5c7..793fe9fa 100644 --- a/hercules_opt/src/utils.rs +++ b/hercules_opt/src/utils.rs @@ -567,3 +567,29 @@ pub fn is_one(editor: &FunctionEditor, id: NodeID) -> bool { .unwrap_or(false) || nodes[id.idx()].is_undef() } + +pub fn is_largest(editor: &FunctionEditor, id: NodeID) -> bool { + let nodes = &editor.func().nodes; + nodes[id.idx()] + .try_constant() + .map(|id| editor.get_constant(id).is_largest()) + .unwrap_or(false) + || nodes[id.idx()] + .try_dynamic_constant() + .map(|id| editor.get_dynamic_constant(id).is_largest()) + .unwrap_or(false) + || nodes[id.idx()].is_undef() +} + +pub fn is_smallest(editor: &FunctionEditor, id: NodeID) -> bool { + let nodes = &editor.func().nodes; + nodes[id.idx()] + .try_constant() + .map(|id| editor.get_constant(id).is_smallest()) + .unwrap_or(false) + || nodes[id.idx()] + .try_dynamic_constant() + .map(|id| editor.get_dynamic_constant(id).is_smallest()) + .unwrap_or(false) + || nodes[id.idx()].is_undef() +} diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index 3c3d09b3..d08e86e6 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -58,6 +58,8 @@ fixpoint { fork-coalesce(max_gradient); } simpl!(max_gradient); +clean-monoid-reduces(max_gradient); +xdot[true](max_gradient); 
no-memset(reject_zero_crossings@res); fixpoint { -- GitLab From e76923405e879bddd94888edd89a8088359622cd Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 20 Feb 2025 15:20:59 -0600 Subject: [PATCH 5/9] Lower intrinsics in RT backend --- hercules_cg/src/rt.rs | 37 +++++++++++++++++++++++-- juno_samples/dot/src/cpu.sch | 2 +- juno_samples/edge_detection/src/cpu.sch | 15 ++++++++-- juno_scheduler/src/pm.rs | 10 ++----- 4 files changed, 51 insertions(+), 13 deletions(-) diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index d3013239..7cbb43ad 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -489,8 +489,24 @@ impl<'a> RTContext<'a> { Constant::UnsignedInteger16(val) => write!(block, "{}u16", val)?, Constant::UnsignedInteger32(val) => write!(block, "{}u32", val)?, Constant::UnsignedInteger64(val) => write!(block, "{}u64", val)?, - Constant::Float32(val) => write!(block, "{}f32", val)?, - Constant::Float64(val) => write!(block, "{}f64", val)?, + Constant::Float32(val) => { + if val == f32::INFINITY { + write!(block, "f32::INFINITY")? + } else if val == f32::NEG_INFINITY { + write!(block, "f32::NEG_INFINITY")? + } else { + write!(block, "{}f32", val)? + } + } + Constant::Float64(val) => { + if val == f64::INFINITY { + write!(block, "f64::INFINITY")? + } else if val == f64::NEG_INFINITY { + write!(block, "f64::NEG_INFINITY")? + } else { + write!(block, "{}f64", val)? 
+ } + } Constant::Product(ty, _) | Constant::Summation(ty, _, _) | Constant::Array(ty) => { @@ -628,6 +644,23 @@ impl<'a> RTContext<'a> { } write!(block, "){};", postfix)?; } + Node::IntrinsicCall { + intrinsic, + ref args, + } => { + let block = &mut blocks.get_mut(&bb).unwrap().data; + write!( + block, + "{} = {}::{}(", + self.get_value(id, bb, true), + self.get_type(self.typing[id.idx()]), + intrinsic.lower_case_name(), + )?; + for arg in args { + write!(block, "{}, ", self.get_value(*arg, bb, false))?; + } + write!(block, ");")?; + } Node::LibraryCall { library_function, ref args, diff --git a/juno_samples/dot/src/cpu.sch b/juno_samples/dot/src/cpu.sch index 1f8953d9..5c763772 100644 --- a/juno_samples/dot/src/cpu.sch +++ b/juno_samples/dot/src/cpu.sch @@ -24,7 +24,7 @@ dce(*); let fission_out = fork-fission[out@loop](dot); simplify-cfg(dot); dce(dot); -unforkify(fission_out.dot.fj_loop_bottom); +unforkify(fission_out.dot.fj_bottom); ccp(dot); simplify-cfg(dot); gvn(dot); diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index d08e86e6..ead722ce 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -58,8 +58,17 @@ fixpoint { fork-coalesce(max_gradient); } simpl!(max_gradient); +fork-dim-merge(max_gradient); +simpl!(max_gradient); +fork-tile[8, 0, false, false](max_gradient); +let split = fork-split(max_gradient); clean-monoid-reduces(max_gradient); -xdot[true](max_gradient); +let out = outline(split._4_max_gradient.fj1); +simpl!(max_gradient, out); +unforkify(out); +let out = fork-fission[split._4_max_gradient.fj0](max_gradient); +simpl!(max_gradient); +unforkify(out._4_max_gradient.fj_bottom); no-memset(reject_zero_crossings@res); fixpoint { @@ -72,8 +81,8 @@ simpl!(reject_zero_crossings); async-call(edge_detection@le, edge_detection@zc); -fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); 
-unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); +fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); +unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); simpl!(*); diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 6931ce2e..392273d3 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -2560,7 +2560,7 @@ fn run_pass( let nodes_in_fork_joins = pm.nodes_in_fork_joins.take().unwrap(); let mut new_fork_joins = HashMap::new(); - let fork_label_name = &pm.labels.borrow()[fork_label.idx()].clone(); + let _fork_label_name = &pm.labels.borrow()[fork_label.idx()].clone(); for (mut func, created_fork_joins) in build_editors(pm).into_iter().zip(created_fork_joins) @@ -2583,13 +2583,9 @@ fn run_pass( // level of the split fork-joins being referred to. let mut func_record = HashMap::new(); for (idx, label) in labels { - let fmt = if idx % 2 == 0 { - format!("fj_{}_top", fork_label_name) - } else { - format!("fj_{}_bottom", fork_label_name) - }; + let fmt = if idx % 2 == 0 { "fj_top" } else { "fj_bottom" }; func_record.insert( - fmt, + fmt.to_string(), Value::Label { labels: vec![LabelInfo { func: func_id, -- GitLab From c59137b59269a2c93f8c5cf34e079378b4334e93 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 20 Feb 2025 20:24:47 -0600 Subject: [PATCH 6/9] Emit infinity in GPU backend --- hercules_cg/src/gpu.rs | 16 +++++++++++----- juno_samples/edge_detection/src/gpu.sch | 3 --- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 73dcf528..6dc5d53e 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1797,7 +1797,7 @@ extern \"C\" {} {}(", Constant::Integer64(val) => write!(w, "{}{} = {}ll;\n", tabs, name, val)?, Constant::UnsignedInteger64(val) => write!(w, "{}{} = 
{}ull;\n", tabs, name, val)?, Constant::Float32(val) => { - write!(w, "{}{} = {}f;\n", tabs, name, format_float(**val as f64))? + write!(w, "{}{} = {};\n", tabs, name, format_float(**val as f64))? } Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, format_float(**val))?, // All three following collections involve align then allocate from the @@ -2223,9 +2223,15 @@ fn convert_type(ty: &Type, make_pointer: bool) -> String { } fn format_float(val: f64) -> String { - let mut s = val.to_string(); - if !s.contains('.') && !s.contains('e') && !s.contains('E') { - s.push_str(".0"); + if val == f64::INFINITY { + "INFINITY".to_string() + } else if val == f64::NEG_INFINITY { + "-INFINITY".to_string() + } else { + let mut s = val.to_string(); + if !s.contains('.') && !s.contains('e') && !s.contains('E') { + s.push_str(".0"); + } + s } - s } diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index 2a8960ee..ed414084 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -67,12 +67,9 @@ let out = fork-split(max_gradient); fork-tile[32, 0, false, true](out._4_max_gradient.fj1); let out = fork-split(max_gradient); simpl!(max_gradient); -xdot[true](max_gradient); clean-monoid-reduces(max_gradient); -xdot[true](max_gradient); fork-fission-bufferize[out._4_max_gradient.fj0, out._4_max_gradient.fj1](max_gradient); simpl!(max_gradient); -xdot[true](max_gradient); no-memset(reject_zero_crossings@res); fixpoint { -- GitLab From 977f1540607e53cbc8f1299fe9ce1a114d7bf1de Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 20 Feb 2025 20:34:48 -0600 Subject: [PATCH 7/9] Fix thread_block_tiles emit in GPU backend --- hercules_cg/src/gpu.rs | 53 +++++++++++++++++++------ juno_samples/edge_detection/src/gpu.sch | 8 +--- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 6dc5d53e..931071cb 100644 --- 
a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -3,8 +3,6 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::{Error, Write}; -use std::fs::{File, OpenOptions}; -use std::io::Write as _; use self::hercules_ir::*; @@ -269,7 +267,6 @@ impl GPUContext<'_> { self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; self.codegen_helpers(&mut top)?; - self.codegen_goto_start(&mut top)?; write!(w, "{}", top)?; // Setup for CUDA's "goto" for control flow between basic blocks. @@ -281,10 +278,15 @@ impl GPUContext<'_> { (node_id, goto) }) .collect(); + let mut thread_block_tiles = String::new(); // If there are no forks, fast forward to single-block, single-thread codegen let (num_blocks, num_threads) = if self.fork_join_map.is_empty() { - self.codegen_data_control_no_forks(&mut dynamic_shared_offset, &mut gotos)?; + self.codegen_data_control_no_forks( + &mut dynamic_shared_offset, + &mut thread_block_tiles, + &mut gotos, + )?; ("1".to_string(), "1".to_string()) } else { // Create structures and determine block and thread parallelization strategy @@ -307,12 +309,15 @@ impl GPUContext<'_> { &mut dynamic_shared_offset, is_block_parallel, num_threads, + &mut thread_block_tiles, &mut gotos, )?; (num_blocks, num_threads.to_string()) }; // Emit all GPU kernel code from previous steps + self.codegen_goto_start(&mut thread_block_tiles)?; + write!(w, "{}", thread_block_tiles)?; let mut kernel_body = String::new(); let rev_po = self.control_subgraph.rev_po(NodeID::new(0)); write!(w, "\n")?; @@ -696,7 +701,7 @@ extern \"C\" {} {}(", let Node::Fork { factors, .. 
} = &self.function.nodes[root_fork.idx()] else { panic!("Expected fork node"); }; - let reduces = &self.fork_reduce_map[root_fork]; + let _reduces = &self.fork_reduce_map[root_fork]; if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { let fork_size = factors .iter() @@ -847,6 +852,7 @@ extern \"C\" {} {}(", fn codegen_data_control_no_forks( &self, dynamic_shared_offset: &mut String, + thread_block_tiles: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { (0..self.function.nodes.len()) @@ -858,8 +864,16 @@ extern \"C\" {} {}(", let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = - self.codegen_control_node(control, None, None, None, init, post_init, term)?; + let mut tabs = self.codegen_control_node( + control, + None, + None, + None, + thread_block_tiles, + init, + post_init, + term, + )?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, @@ -889,6 +903,7 @@ extern \"C\" {} {}(", dynamic_shared_offset: &mut String, is_block_parallel: bool, num_threads: usize, + thread_block_tiles: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { // First emit data and control gen for each control node outside any fork. 
@@ -900,8 +915,16 @@ extern \"C\" {} {}(", let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = - self.codegen_control_node(*control, None, None, None, init, post_init, term)?; + let mut tabs = self.codegen_control_node( + *control, + None, + None, + None, + thread_block_tiles, + init, + post_init, + term, + )?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, @@ -931,6 +954,7 @@ extern \"C\" {} {}(", Some(num_threads), Some(num_threads), Some(1), + thread_block_tiles, init, post_init, term, @@ -961,6 +985,7 @@ extern \"C\" {} {}(", 1, num_threads, dynamic_shared_offset, + thread_block_tiles, gotos, )?; } @@ -981,6 +1006,7 @@ extern \"C\" {} {}(", parent_quota: usize, num_threads: usize, dynamic_shared_offset: &mut String, + thread_block_tiles: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map @@ -1017,6 +1043,7 @@ extern \"C\" {} {}(", Some(available_thread_quota), Some(use_thread_quota), parallel_factor, + thread_block_tiles, init, post_init, term, @@ -1044,6 +1071,7 @@ extern \"C\" {} {}(", use_thread_quota, num_threads, dynamic_shared_offset, + thread_block_tiles, gotos, )?; } @@ -1504,6 +1532,7 @@ extern \"C\" {} {}(", available_thread_quota: Option<usize>, use_thread_quota: Option<usize>, parallel_factor: Option<usize>, + thread_block_tiles: &mut String, w_init: &mut String, w_post_init: &mut String, w_term: &mut String, @@ -1579,20 +1608,20 @@ extern \"C\" {} {}(", use_thread_quota }; write!( - w_init, + thread_block_tiles, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id )?; let cg_tile_use = self.get_cg_tile(id, CGType::Use); write!( - w_init, + thread_block_tiles, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota )?; let 
available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!( - w_init, + thread_block_tiles, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota )?; diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index ed414084..a3c804d5 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -62,13 +62,9 @@ fixpoint { simpl!(max_gradient); fork-dim-merge(max_gradient); simpl!(max_gradient); -fork-tile[1024, 0, false, true](max_gradient); -let out = fork-split(max_gradient); -fork-tile[32, 0, false, true](out._4_max_gradient.fj1); -let out = fork-split(max_gradient); -simpl!(max_gradient); +fork-tile[32, 0, false, true](max_gradient); +fork-split(max_gradient); clean-monoid-reduces(max_gradient); -fork-fission-bufferize[out._4_max_gradient.fj0, out._4_max_gradient.fj1](max_gradient); simpl!(max_gradient); no-memset(reject_zero_crossings@res); -- GitLab From 76338dee4402c64b9b8530898ff0864768b4cc13 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 20 Feb 2025 20:54:58 -0600 Subject: [PATCH 8/9] The reduction tree works! --- hercules_opt/src/outline.rs | 12 ++++++++++-- .../edge_detection/src/edge_detection.jn | 2 +- juno_samples/edge_detection/src/gpu.sch | 17 ++++++++++++++--- juno_scheduler/src/pm.rs | 2 +- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs index 874e75e7..c6693336 100644 --- a/hercules_opt/src/outline.rs +++ b/hercules_opt/src/outline.rs @@ -23,7 +23,7 @@ pub fn outline( typing: &Vec<TypeID>, control_subgraph: &Subgraph, dom: &DomTree, - partition: &BTreeSet<NodeID>, + mut partition: BTreeSet<NodeID>, to_be_function_id: FunctionID, ) -> Option<Function> { // Step 1: do a whole bunch of analysis on the partition. 
@@ -34,6 +34,14 @@ pub fn outline( .any(|id| nodes[id.idx()].is_start() || nodes[id.idx()].is_parameter() || nodes[id.idx()].is_return()), "PANIC: Can't outline a partition containing the start node, parameter nodes, or return nodes." ); + for (idx, node) in nodes.into_iter().enumerate() { + if let Node::Constant { id } = node + && editor.get_constant(*id).is_scalar() + { + // Usually, you don't want to explicitly outline scalar constants. + partition.remove(&NodeID::new(idx)); + } + } let mut top_nodes = partition.iter().filter(|id| { nodes[id.idx()].is_control() && control_subgraph @@ -611,7 +619,7 @@ pub fn dumb_outline( typing, control_subgraph, dom, - &partition, + partition, to_be_function_id, ) } diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn index 3bc5bbfb..e1413488 100644 --- a/juno_samples/edge_detection/src/edge_detection.jn +++ b/juno_samples/edge_detection/src/edge_detection.jn @@ -189,7 +189,7 @@ fn gradient<n, m, sb: usize>( } fn max_gradient<n, m: usize>(gradient: f32[n, m]) -> f32 { - let max = gradient[0, 0]; + let max = -1.0; for i = 0 to n { for j = 0 to m { diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index a3c804d5..7ee2904f 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -8,7 +8,7 @@ macro simpl!(X) { infer-schedules(X); } -gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); +gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); simpl!(*); @@ -63,9 +63,21 @@ simpl!(max_gradient); fork-dim-merge(max_gradient); simpl!(max_gradient); fork-tile[32, 0, false, true](max_gradient); -fork-split(max_gradient); +let out = fork-split(max_gradient); clean-monoid-reduces(max_gradient); simpl!(max_gradient); +let fission = fork-fission[out._4_max_gradient.fj0](max_gradient); 
+simpl!(max_gradient); +fork-tile[32, 0, false, true](fission._4_max_gradient.fj_bottom); +let out = fork-split(fission._4_max_gradient.fj_bottom); +clean-monoid-reduces(max_gradient); +simpl!(max_gradient); +let top = outline(fission._4_max_gradient.fj_top); +let bottom = outline(out._4_max_gradient.fj0); +gpu(top, bottom); +ip-sroa(*); +sroa(*); +simpl!(*); no-memset(reject_zero_crossings@res); fixpoint { @@ -82,4 +94,3 @@ simpl!(*); delete-uncalled(*); gcm(*); - diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 392273d3..44b14257 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -2158,7 +2158,7 @@ fn run_pass( &typing[func.idx()], &control_subgraphs[func.idx()], &doms[func.idx()], - &nodes, + nodes, new_func_id, ); let Some(new_func) = new_func else { -- GitLab From ca9b2bf9b83dcf4baeb36c16994659a0276b6f02 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Thu, 20 Feb 2025 21:07:14 -0600 Subject: [PATCH 9/9] Hm --- hercules_opt/src/outline.rs | 16 ++++++++++------ juno_scheduler/src/pm.rs | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs index c6693336..088e5775 100644 --- a/hercules_opt/src/outline.rs +++ b/hercules_opt/src/outline.rs @@ -25,6 +25,7 @@ pub fn outline( dom: &DomTree, mut partition: BTreeSet<NodeID>, to_be_function_id: FunctionID, + outline_scalar_constants: bool, ) -> Option<Function> { // Step 1: do a whole bunch of analysis on the partition. let nodes = &editor.func().nodes; @@ -34,12 +35,14 @@ pub fn outline( .any(|id| nodes[id.idx()].is_start() || nodes[id.idx()].is_parameter() || nodes[id.idx()].is_return()), "PANIC: Can't outline a partition containing the start node, parameter nodes, or return nodes." 
); - for (idx, node) in nodes.into_iter().enumerate() { - if let Node::Constant { id } = node - && editor.get_constant(*id).is_scalar() - { - // Usually, you don't want to explicitly outline scalar constants. - partition.remove(&NodeID::new(idx)); + if !outline_scalar_constants { + for (idx, node) in nodes.into_iter().enumerate() { + if let Node::Constant { id } = node + && editor.get_constant(*id).is_scalar() + { + // Usually, you don't want to explicitly outline scalar constants. + partition.remove(&NodeID::new(idx)); + } } } let mut top_nodes = partition.iter().filter(|id| { @@ -621,5 +624,6 @@ pub fn dumb_outline( dom, partition, to_be_function_id, + true, ) } diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 44b14257..675cfe1c 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -2160,6 +2160,7 @@ fn run_pass( &doms[func.idx()], nodes, new_func_id, + false, ); let Some(new_func) = new_func else { return Err(SchedulerError::PassError { -- GitLab