diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 17f0f8939f4961e36da41b7c975bf867376b5d60..931071cb2747ee03cb3e87f342f3737e6eb82404 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -3,8 +3,6 @@ extern crate hercules_ir; use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::{Error, Write}; -use std::fs::{File, OpenOptions}; -use std::io::Write as _; use self::hercules_ir::*; @@ -269,7 +267,6 @@ impl GPUContext<'_> { self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; self.codegen_helpers(&mut top)?; - self.codegen_goto_start(&mut top)?; write!(w, "{}", top)?; // Setup for CUDA's "goto" for control flow between basic blocks. @@ -281,10 +278,15 @@ impl GPUContext<'_> { (node_id, goto) }) .collect(); + let mut thread_block_tiles = String::new(); // If there are no forks, fast forward to single-block, single-thread codegen let (num_blocks, num_threads) = if self.fork_join_map.is_empty() { - self.codegen_data_control_no_forks(&mut dynamic_shared_offset, &mut gotos)?; + self.codegen_data_control_no_forks( + &mut dynamic_shared_offset, + &mut thread_block_tiles, + &mut gotos, + )?; ("1".to_string(), "1".to_string()) } else { // Create structures and determine block and thread parallelization strategy @@ -307,12 +309,15 @@ impl GPUContext<'_> { &mut dynamic_shared_offset, is_block_parallel, num_threads, + &mut thread_block_tiles, &mut gotos, )?; (num_blocks, num_threads.to_string()) }; // Emit all GPU kernel code from previous steps + self.codegen_goto_start(&mut thread_block_tiles)?; + write!(w, "{}", thread_block_tiles)?; let mut kernel_body = String::new(); let rev_po = self.control_subgraph.rev_po(NodeID::new(0)); write!(w, "\n")?; @@ -696,7 +701,7 @@ extern \"C\" {} {}(", let Node::Fork { factors, .. 
} = &self.function.nodes[root_fork.idx()] else { panic!("Expected fork node"); }; - let reduces = &self.fork_reduce_map[root_fork]; + let _reduces = &self.fork_reduce_map[root_fork]; if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { let fork_size = factors .iter() @@ -847,6 +852,7 @@ extern \"C\" {} {}(", fn codegen_data_control_no_forks( &self, dynamic_shared_offset: &mut String, + thread_block_tiles: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { (0..self.function.nodes.len()) @@ -858,8 +864,16 @@ extern \"C\" {} {}(", let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = - self.codegen_control_node(control, None, None, None, init, post_init, term)?; + let mut tabs = self.codegen_control_node( + control, + None, + None, + None, + thread_block_tiles, + init, + post_init, + term, + )?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, @@ -889,6 +903,7 @@ extern \"C\" {} {}(", dynamic_shared_offset: &mut String, is_block_parallel: bool, num_threads: usize, + thread_block_tiles: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { // First emit data and control gen for each control node outside any fork. 
@@ -900,8 +915,16 @@ extern \"C\" {} {}(", let post_init = &mut goto.post_init; let body = &mut goto.body; let term = &mut goto.term; - let mut tabs = - self.codegen_control_node(*control, None, None, None, init, post_init, term)?; + let mut tabs = self.codegen_control_node( + *control, + None, + None, + None, + thread_block_tiles, + init, + post_init, + term, + )?; for data in self.bbs.1[control.idx()].iter() { self.codegen_data_node( *data, @@ -931,6 +954,7 @@ extern \"C\" {} {}(", Some(num_threads), Some(num_threads), Some(1), + thread_block_tiles, init, post_init, term, @@ -961,6 +985,7 @@ extern \"C\" {} {}(", 1, num_threads, dynamic_shared_offset, + thread_block_tiles, gotos, )?; } @@ -981,6 +1006,7 @@ extern \"C\" {} {}(", parent_quota: usize, num_threads: usize, dynamic_shared_offset: &mut String, + thread_block_tiles: &mut String, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map @@ -1017,6 +1043,7 @@ extern \"C\" {} {}(", Some(available_thread_quota), Some(use_thread_quota), parallel_factor, + thread_block_tiles, init, post_init, term, @@ -1044,6 +1071,7 @@ extern \"C\" {} {}(", use_thread_quota, num_threads, dynamic_shared_offset, + thread_block_tiles, gotos, )?; } @@ -1383,15 +1411,15 @@ extern \"C\" {} {}(", let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use); #[allow(unreachable_patterns)] let cg_op = match intrinsic { - Intrinsic::Max => "max", - Intrinsic::Min => "min", + Intrinsic::Max => "greater", + Intrinsic::Min => "less", _ => unreachable!(), }; let id_type_name = self.get_type(id_type, false); write!( w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", - tabs, define_variable, non_reduce_arg, cg_tile, cg_op, id_type_name + tabs, define_variable, cg_tile, non_reduce_arg, cg_op, id_type_name )?; } else { let ty = &self.types[id_type.idx()]; @@ -1504,6 +1532,7 @@ extern \"C\" {} {}(", available_thread_quota: Option<usize>, 
use_thread_quota: Option<usize>, parallel_factor: Option<usize>, + thread_block_tiles: &mut String, w_init: &mut String, w_post_init: &mut String, w_term: &mut String, @@ -1579,20 +1608,20 @@ extern \"C\" {} {}(", use_thread_quota }; write!( - w_init, + thread_block_tiles, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id )?; let cg_tile_use = self.get_cg_tile(id, CGType::Use); write!( - w_init, + thread_block_tiles, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota )?; let available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!( - w_init, + thread_block_tiles, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota )?; @@ -1797,7 +1826,7 @@ extern \"C\" {} {}(", Constant::Integer64(val) => write!(w, "{}{} = {}ll;\n", tabs, name, val)?, Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?, Constant::Float32(val) => { - write!(w, "{}{} = {}f;\n", tabs, name, format_float(**val as f64))? + write!(w, "{}{} = {};\n", tabs, name, format_float(**val as f64))? 
} Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, format_float(**val))?, // All three following collections involve align then allocate from the @@ -2223,9 +2252,15 @@ fn convert_type(ty: &Type, make_pointer: bool) -> String { } fn format_float(val: f64) -> String { - let mut s = val.to_string(); - if !s.contains('.') && !s.contains('e') && !s.contains('E') { - s.push_str(".0"); + if val == f64::INFINITY { + "INFINITY".to_string() + } else if val == f64::NEG_INFINITY { + "-INFINITY".to_string() + } else { + let mut s = val.to_string(); + if !s.contains('.') && !s.contains('e') && !s.contains('E') { + s.push_str(".0"); + } + s } - s } diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index d3013239f5f78ce8e63181c181c0d5a8cbf77f81..7cbb43ad54b439c1a61eb4a5f0bf927fc2f37eae 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -489,8 +489,24 @@ impl<'a> RTContext<'a> { Constant::UnsignedInteger16(val) => write!(block, "{}u16", val)?, Constant::UnsignedInteger32(val) => write!(block, "{}u32", val)?, Constant::UnsignedInteger64(val) => write!(block, "{}u64", val)?, - Constant::Float32(val) => write!(block, "{}f32", val)?, - Constant::Float64(val) => write!(block, "{}f64", val)?, + Constant::Float32(val) => { + if val == f32::INFINITY { + write!(block, "f32::INFINITY")? + } else if val == f32::NEG_INFINITY { + write!(block, "f32::NEG_INFINITY")? + } else { + write!(block, "{}f32", val)? + } + } + Constant::Float64(val) => { + if val == f64::INFINITY { + write!(block, "f64::INFINITY")? + } else if val == f64::NEG_INFINITY { + write!(block, "f64::NEG_INFINITY")? + } else { + write!(block, "{}f64", val)? 
+ } + } Constant::Product(ty, _) | Constant::Summation(ty, _, _) | Constant::Array(ty) => { @@ -628,6 +644,23 @@ impl<'a> RTContext<'a> { } write!(block, "){};", postfix)?; } + Node::IntrinsicCall { + intrinsic, + ref args, + } => { + let block = &mut blocks.get_mut(&bb).unwrap().data; + write!( + block, + "{} = {}::{}(", + self.get_value(id, bb, true), + self.get_type(self.typing[id.idx()]), + intrinsic.lower_case_name(), + )?; + for arg in args { + write!(block, "{}, ", self.get_value(*arg, bb, false))?; + } + write!(block, ");")?; + } Node::LibraryCall { library_function, ref args, diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index bf9698b32125832b047ee9a94668bd0a09b93ac9..f91efe584c7422e2d7e1e542fed7141fcf684f53 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -1050,6 +1050,38 @@ impl Constant { _ => false, } } + + pub fn is_largest(&self) -> bool { + match self { + Constant::Integer8(i8::MAX) => true, + Constant::Integer16(i16::MAX) => true, + Constant::Integer32(i32::MAX) => true, + Constant::Integer64(i64::MAX) => true, + Constant::UnsignedInteger8(u8::MAX) => true, + Constant::UnsignedInteger16(u16::MAX) => true, + Constant::UnsignedInteger32(u32::MAX) => true, + Constant::UnsignedInteger64(u64::MAX) => true, + Constant::Float32(ord) => *ord == OrderedFloat::<f32>(f32::INFINITY), + Constant::Float64(ord) => *ord == OrderedFloat::<f64>(f64::INFINITY), + _ => false, + } + } + + pub fn is_smallest(&self) -> bool { + match self { + Constant::Integer8(i8::MIN) => true, + Constant::Integer16(i16::MIN) => true, + Constant::Integer32(i32::MIN) => true, + Constant::Integer64(i64::MIN) => true, + Constant::UnsignedInteger8(u8::MIN) => true, + Constant::UnsignedInteger16(u16::MIN) => true, + Constant::UnsignedInteger32(u32::MIN) => true, + Constant::UnsignedInteger64(u64::MIN) => true, + Constant::Float32(ord) => *ord == OrderedFloat::<f32>(f32::NEG_INFINITY), + Constant::Float64(ord) => *ord == OrderedFloat::<f64>(f64::NEG_INFINITY), + 
_ => false, + } + } } impl DynamicConstant { @@ -1098,19 +1130,19 @@ impl DynamicConstant { } pub fn is_zero(&self) -> bool { - if *self == DynamicConstant::Constant(0) { - true - } else { - false - } + *self == DynamicConstant::Constant(0) } pub fn is_one(&self) -> bool { - if *self == DynamicConstant::Constant(1) { - true - } else { - false - } + *self == DynamicConstant::Constant(1) + } + + pub fn is_largest(&self) -> bool { + *self == DynamicConstant::Constant(usize::MAX) + } + + pub fn is_smallest(&self) -> bool { + *self == DynamicConstant::Constant(usize::MIN) } pub fn try_parameter(&self) -> Option<usize> { diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs index 16e5c3264d33a7c9bef85fc0fa3cec02963dbf48..57fe204245fc0adf75c8a046cfd9db25ebed119e 100644 --- a/hercules_opt/src/editor.rs +++ b/hercules_opt/src/editor.rs @@ -795,6 +795,52 @@ impl<'a, 'b> FunctionEdit<'a, 'b> { self.add_constant(constant_to_construct) } + pub fn add_largest_constant(&mut self, id: TypeID) -> ConstantID { + let ty = self.get_type(id).clone(); + let constant_to_construct = match ty { + Type::Boolean => Constant::Boolean(true), + Type::Integer8 => Constant::Integer8(i8::MAX), + Type::Integer16 => Constant::Integer16(i16::MAX), + Type::Integer32 => Constant::Integer32(i32::MAX), + Type::Integer64 => Constant::Integer64(i64::MAX), + Type::UnsignedInteger8 => Constant::UnsignedInteger8(u8::MAX), + Type::UnsignedInteger16 => Constant::UnsignedInteger16(u16::MAX), + Type::UnsignedInteger32 => Constant::UnsignedInteger32(u32::MAX), + Type::UnsignedInteger64 => Constant::UnsignedInteger64(u64::MAX), + Type::Float8 | Type::BFloat16 => panic!(), + Type::Float32 => Constant::Float32(ordered_float::OrderedFloat(f32::INFINITY)), + Type::Float64 => Constant::Float64(ordered_float::OrderedFloat(f64::INFINITY)), + Type::Control => panic!("PANIC: Can't create largest constant for the control type."), + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => { + panic!("PANIC: 
Can't create largest constant of a collection type.") + } + }; + self.add_constant(constant_to_construct) + } + + pub fn add_smallest_constant(&mut self, id: TypeID) -> ConstantID { + let ty = self.get_type(id).clone(); + let constant_to_construct = match ty { + Type::Boolean => Constant::Boolean(false), + Type::Integer8 => Constant::Integer8(i8::MIN), + Type::Integer16 => Constant::Integer16(i16::MIN), + Type::Integer32 => Constant::Integer32(i32::MIN), + Type::Integer64 => Constant::Integer64(i64::MIN), + Type::UnsignedInteger8 => Constant::UnsignedInteger8(u8::MIN), + Type::UnsignedInteger16 => Constant::UnsignedInteger16(u16::MIN), + Type::UnsignedInteger32 => Constant::UnsignedInteger32(u32::MIN), + Type::UnsignedInteger64 => Constant::UnsignedInteger64(u64::MIN), + Type::Float8 | Type::BFloat16 => panic!(), + Type::Float32 => Constant::Float32(ordered_float::OrderedFloat(f32::NEG_INFINITY)), + Type::Float64 => Constant::Float64(ordered_float::OrderedFloat(f64::NEG_INFINITY)), + Type::Control => panic!("PANIC: Can't create smallest constant for the control type."), + Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => { + panic!("PANIC: Can't create smallest constant of a collection type.") + } + }; + self.add_constant(constant_to_construct) + } + pub fn get_constant(&self, id: ConstantID) -> impl Deref<Target = Constant> + '_ { if id.idx() < self.editor.constants.borrow().len() { Either::Left(Ref::map(self.editor.constants.borrow(), |constants| { diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs index 283734a009ab3f619910b02465bf9d4d05856bef..e635b3c00d7bfa0090376d8056e65d8d01e60ce2 100644 --- a/hercules_opt/src/fork_transforms.rs +++ b/hercules_opt/src/fork_transforms.rs @@ -1556,6 +1556,44 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) }); } + Node::IntrinsicCall { + intrinsic: Intrinsic::Max, + 
args: _, + } if !is_smallest(editor, init) => { + editor.edit(|mut edit| { + let smallest = edit.add_smallest_constant(typing[init.idx()]); + let smallest = edit.add_node(Node::Constant { id: smallest }); + edit.sub_edit(id, smallest); + edit = edit.replace_all_uses_where(init, smallest, |u| *u == id)?; + let final_op = edit.add_node(Node::IntrinsicCall { + intrinsic: Intrinsic::Max, + args: Box::new([init, id]), + }); + for u in out_uses { + edit.sub_edit(u, final_op); + } + edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) + }); + } + Node::IntrinsicCall { + intrinsic: Intrinsic::Min, + args: _, + } if !is_largest(editor, init) => { + editor.edit(|mut edit| { + let largest = edit.add_largest_constant(typing[init.idx()]); + let largest = edit.add_node(Node::Constant { id: largest }); + edit.sub_edit(id, largest); + edit = edit.replace_all_uses_where(init, largest, |u| *u == id)?; + let final_op = edit.add_node(Node::IntrinsicCall { + intrinsic: Intrinsic::Min, + args: Box::new([init, id]), + }); + for u in out_uses { + edit.sub_edit(u, final_op); + } + edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op) + }); + } _ => {} } } diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs index 874e75e739b0f05f72f0f8cfa4c6ae6540ed9c6f..088e57750df0265883472ab111d5d0a6e5f21c6e 100644 --- a/hercules_opt/src/outline.rs +++ b/hercules_opt/src/outline.rs @@ -23,8 +23,9 @@ pub fn outline( typing: &Vec<TypeID>, control_subgraph: &Subgraph, dom: &DomTree, - partition: &BTreeSet<NodeID>, + mut partition: BTreeSet<NodeID>, to_be_function_id: FunctionID, + outline_scalar_constants: bool, ) -> Option<Function> { // Step 1: do a whole bunch of analysis on the partition. 
let nodes = &editor.func().nodes; @@ -34,6 +35,16 @@ pub fn outline( .any(|id| nodes[id.idx()].is_start() || nodes[id.idx()].is_parameter() || nodes[id.idx()].is_return()), "PANIC: Can't outline a partition containing the start node, parameter nodes, or return nodes." ); + if !outline_scalar_constants { + for (idx, node) in nodes.into_iter().enumerate() { + if let Node::Constant { id } = node + && editor.get_constant(*id).is_scalar() + { + // Usually, you don't want to explicitly outline scalar constants. + partition.remove(&NodeID::new(idx)); + } + } + } let mut top_nodes = partition.iter().filter(|id| { nodes[id.idx()].is_control() && control_subgraph @@ -611,7 +622,8 @@ pub fn dumb_outline( typing, control_subgraph, dom, - &partition, + partition, to_be_function_id, + true, ) } diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs index 1806d5c740e57a666f98ebf6c0ba40ee9a6461bd..793fe9fabfee830eef97687097aa5e40177a369d 100644 --- a/hercules_opt/src/utils.rs +++ b/hercules_opt/src/utils.rs @@ -567,3 +567,29 @@ pub fn is_one(editor: &FunctionEditor, id: NodeID) -> bool { .unwrap_or(false) || nodes[id.idx()].is_undef() } + +pub fn is_largest(editor: &FunctionEditor, id: NodeID) -> bool { + let nodes = &editor.func().nodes; + nodes[id.idx()] + .try_constant() + .map(|id| editor.get_constant(id).is_largest()) + .unwrap_or(false) + || nodes[id.idx()] + .try_dynamic_constant() + .map(|id| editor.get_dynamic_constant(id).is_largest()) + .unwrap_or(false) + || nodes[id.idx()].is_undef() +} + +pub fn is_smallest(editor: &FunctionEditor, id: NodeID) -> bool { + let nodes = &editor.func().nodes; + nodes[id.idx()] + .try_constant() + .map(|id| editor.get_constant(id).is_smallest()) + .unwrap_or(false) + || nodes[id.idx()] + .try_dynamic_constant() + .map(|id| editor.get_dynamic_constant(id).is_smallest()) + .unwrap_or(false) + || nodes[id.idx()].is_undef() +} diff --git a/juno_samples/dot/src/cpu.sch b/juno_samples/dot/src/cpu.sch index 
1f8953d99dd6dabf02c35a97e7235adcdfc69fde..5c763772250c20ec90ccb70f5642d1db5f56fc80 100644 --- a/juno_samples/dot/src/cpu.sch +++ b/juno_samples/dot/src/cpu.sch @@ -24,7 +24,7 @@ dce(*); let fission_out = fork-fission[out@loop](dot); simplify-cfg(dot); dce(dot); -unforkify(fission_out.dot.fj_loop_bottom); +unforkify(fission_out.dot.fj_bottom); ccp(dot); simplify-cfg(dot); gvn(dot); diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch index 3c3d09b34f9bd8c4b412d4b19e3898769cf2670a..ead722ce0adfcf61ccc6dc79c70e1ba76d0e8eeb 100644 --- a/juno_samples/edge_detection/src/cpu.sch +++ b/juno_samples/edge_detection/src/cpu.sch @@ -58,6 +58,17 @@ fixpoint { fork-coalesce(max_gradient); } simpl!(max_gradient); +fork-dim-merge(max_gradient); +simpl!(max_gradient); +fork-tile[8, 0, false, false](max_gradient); +let split = fork-split(max_gradient); +clean-monoid-reduces(max_gradient); +let out = outline(split._4_max_gradient.fj1); +simpl!(max_gradient, out); +unforkify(out); +let out = fork-fission[split._4_max_gradient.fj0](max_gradient); +simpl!(max_gradient); +unforkify(out._4_max_gradient.fj_bottom); no-memset(reject_zero_crossings@res); fixpoint { @@ -70,8 +81,8 @@ simpl!(reject_zero_crossings); async-call(edge_detection@le, edge_detection@zc); -fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); -unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); +fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); +unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); simpl!(*); diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn index 3bc5bbfbe16b6eb15d0c6387d9f0d8397ce69cc6..e1413488e95d324e154ac478c2131db666fcbbf8 100644 --- 
a/juno_samples/edge_detection/src/edge_detection.jn +++ b/juno_samples/edge_detection/src/edge_detection.jn @@ -189,7 +189,7 @@ fn gradient<n, m, sb: usize>( } fn max_gradient<n, m: usize>(gradient: f32[n, m]) -> f32 { - let max = gradient[0, 0]; + let max = -1.0; for i = 0 to n { for j = 0 to m { diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch index 1e51efb9d84da1d20496d04c6aa939ffc2bc4123..7ee2904f7d1bb59780390360dbd9abc6b3934aba 100644 --- a/juno_samples/edge_detection/src/gpu.sch +++ b/juno_samples/edge_detection/src/gpu.sch @@ -1,22 +1,96 @@ -gvn(*); -phi-elim(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} + +gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings); -gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings); +simpl!(*); ip-sroa(*); sroa(*); -dce(*); -gvn(*); -phi-elim(*); -dce(*); +simpl!(*); -//forkify(*); -infer-schedules(*); +no-memset(gaussian_smoothing@res); +fixpoint { + forkify(gaussian_smoothing); + fork-guard-elim(gaussian_smoothing); + fork-coalesce(gaussian_smoothing); +} +predication(gaussian_smoothing); +simpl!(gaussian_smoothing); +predication(gaussian_smoothing); +simpl!(gaussian_smoothing); -gcm(*); +no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2); +fixpoint { + forkify(laplacian_estimate); + fork-guard-elim(laplacian_estimate); + fork-coalesce(laplacian_estimate); +} +simpl!(laplacian_estimate); + +no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2); +fixpoint { + forkify(zero_crossings); + fork-guard-elim(zero_crossings); + fork-coalesce(zero_crossings); +} +simpl!(zero_crossings); + +no-memset(gradient@res); fixpoint { - float-collections(*); - dce(*); - gcm(*); + forkify(gradient); + fork-guard-elim(gradient); + fork-coalesce(gradient); } 
+predication(gradient); +simpl!(gradient); +predication(gradient); +simpl!(gradient); + +fixpoint { + forkify(max_gradient); + fork-guard-elim(max_gradient); + fork-coalesce(max_gradient); +} +simpl!(max_gradient); +fork-dim-merge(max_gradient); +simpl!(max_gradient); +fork-tile[32, 0, false, true](max_gradient); +let out = fork-split(max_gradient); +clean-monoid-reduces(max_gradient); +simpl!(max_gradient); +let fission = fork-fission[out._4_max_gradient.fj0](max_gradient); +simpl!(max_gradient); +fork-tile[32, 0, false, true](fission._4_max_gradient.fj_bottom); +let out = fork-split(fission._4_max_gradient.fj_bottom); +clean-monoid-reduces(max_gradient); +simpl!(max_gradient); +let top = outline(fission._4_max_gradient.fj_top); +let bottom = outline(out._4_max_gradient.fj0); +gpu(top, bottom); +ip-sroa(*); +sroa(*); +simpl!(*); + +no-memset(reject_zero_crossings@res); +fixpoint { + forkify(reject_zero_crossings); + fork-guard-elim(reject_zero_crossings); + fork-coalesce(reject_zero_crossings); +} +predication(reject_zero_crossings); +simpl!(reject_zero_crossings); + +async-call(edge_detection@le, edge_detection@zc); + +simpl!(*); + +delete-uncalled(*); +gcm(*); diff --git a/juno_samples/edge_detection/src/lib.rs b/juno_samples/edge_detection/src/lib.rs index 6c2a15bd394a8fed3828ea79f2f8470856ead846..dab84cf6206c3cb9b816c88c53c8ddfbec102994 100644 --- a/juno_samples/edge_detection/src/lib.rs +++ b/juno_samples/edge_detection/src/lib.rs @@ -143,6 +143,11 @@ pub fn edge_detection_harness(args: EdgeDetectionInputs) { num_frames }; + println!( + "Running edge with {} rows, {} columns, {} gs, {} sz, and {} sb.", + height, width, gs, sz, sb, + ); + let mut r = runner!(edge_detection); let mut output = output.map(|filename| { diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 6931ce2e17680146f68da4ad4c61a50dffcaadb0..675cfe1ca9f25f74f7c6ad3c969f3d2b9b9524fd 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -2158,8 +2158,9 @@ fn 
run_pass( &typing[func.idx()], &control_subgraphs[func.idx()], &doms[func.idx()], - &nodes, + nodes, new_func_id, + false, ); let Some(new_func) = new_func else { return Err(SchedulerError::PassError { @@ -2560,7 +2561,7 @@ fn run_pass( let nodes_in_fork_joins = pm.nodes_in_fork_joins.take().unwrap(); let mut new_fork_joins = HashMap::new(); - let fork_label_name = &pm.labels.borrow()[fork_label.idx()].clone(); + let _fork_label_name = &pm.labels.borrow()[fork_label.idx()].clone(); for (mut func, created_fork_joins) in build_editors(pm).into_iter().zip(created_fork_joins) @@ -2583,13 +2584,9 @@ fn run_pass( // level of the split fork-joins being referred to. let mut func_record = HashMap::new(); for (idx, label) in labels { - let fmt = if idx % 2 == 0 { - format!("fj_{}_top", fork_label_name) - } else { - format!("fj_{}_bottom", fork_label_name) - }; + let fmt = if idx % 2 == 0 { "fj_top" } else { "fj_bottom" }; func_record.insert( - fmt, + fmt.to_string(), Value::Label { labels: vec![LabelInfo { func: func_id,