diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 17f0f8939f4961e36da41b7c975bf867376b5d60..931071cb2747ee03cb3e87f342f3737e6eb82404 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -3,8 +3,6 @@ extern crate hercules_ir;
 
 use std::collections::{BTreeMap, HashMap, HashSet};
 use std::fmt::{Error, Write};
-use std::fs::{File, OpenOptions};
-use std::io::Write as _;
 
 use self::hercules_ir::*;
 
@@ -269,7 +267,6 @@ impl GPUContext<'_> {
         self.codegen_dynamic_constants(&mut top)?;
         self.codegen_declare_data(&mut top)?;
         self.codegen_helpers(&mut top)?;
-        self.codegen_goto_start(&mut top)?;
         write!(w, "{}", top)?;
 
         // Setup for CUDA's "goto" for control flow between basic blocks.
@@ -281,10 +278,15 @@ impl GPUContext<'_> {
                 (node_id, goto)
             })
             .collect();
+        let mut thread_block_tiles = String::new();
 
         // If there are no forks, fast forward to single-block, single-thread codegen
         let (num_blocks, num_threads) = if self.fork_join_map.is_empty() {
-            self.codegen_data_control_no_forks(&mut dynamic_shared_offset, &mut gotos)?;
+            self.codegen_data_control_no_forks(
+                &mut dynamic_shared_offset,
+                &mut thread_block_tiles,
+                &mut gotos,
+            )?;
             ("1".to_string(), "1".to_string())
         } else {
             // Create structures and determine block and thread parallelization strategy
@@ -307,12 +309,15 @@ impl GPUContext<'_> {
                 &mut dynamic_shared_offset,
                 is_block_parallel,
                 num_threads,
+                &mut thread_block_tiles,
                 &mut gotos,
             )?;
             (num_blocks, num_threads.to_string())
         };
 
         // Emit all GPU kernel code from previous steps
+        self.codegen_goto_start(&mut thread_block_tiles)?;
+        write!(w, "{}", thread_block_tiles)?;
         let mut kernel_body = String::new();
         let rev_po = self.control_subgraph.rev_po(NodeID::new(0));
         write!(w, "\n")?;
@@ -696,7 +701,7 @@ extern \"C\" {} {}(",
         let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else {
             panic!("Expected fork node");
         };
-        let reduces = &self.fork_reduce_map[root_fork];
+        let _reduces = &self.fork_reduce_map[root_fork];
         if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) {
             let fork_size = factors
                 .iter()
@@ -847,6 +852,7 @@ extern \"C\" {} {}(",
     fn codegen_data_control_no_forks(
         &self,
         dynamic_shared_offset: &mut String,
+        thread_block_tiles: &mut String,
         gotos: &mut BTreeMap<NodeID, CudaGoto>,
     ) -> Result<(), Error> {
         (0..self.function.nodes.len())
@@ -858,8 +864,16 @@ extern \"C\" {} {}(",
                 let post_init = &mut goto.post_init;
                 let body = &mut goto.body;
                 let term = &mut goto.term;
-                let mut tabs =
-                    self.codegen_control_node(control, None, None, None, init, post_init, term)?;
+                let mut tabs = self.codegen_control_node(
+                    control,
+                    None,
+                    None,
+                    None,
+                    thread_block_tiles,
+                    init,
+                    post_init,
+                    term,
+                )?;
                 for data in self.bbs.1[control.idx()].iter() {
                     self.codegen_data_node(
                         *data,
@@ -889,6 +903,7 @@ extern \"C\" {} {}(",
         dynamic_shared_offset: &mut String,
         is_block_parallel: bool,
         num_threads: usize,
+        thread_block_tiles: &mut String,
         gotos: &mut BTreeMap<NodeID, CudaGoto>,
     ) -> Result<(), Error> {
         // First emit data and control gen for each control node outside any fork.
@@ -900,8 +915,16 @@ extern \"C\" {} {}(",
             let post_init = &mut goto.post_init;
             let body = &mut goto.body;
             let term = &mut goto.term;
-            let mut tabs =
-                self.codegen_control_node(*control, None, None, None, init, post_init, term)?;
+            let mut tabs = self.codegen_control_node(
+                *control,
+                None,
+                None,
+                None,
+                thread_block_tiles,
+                init,
+                post_init,
+                term,
+            )?;
             for data in self.bbs.1[control.idx()].iter() {
                 self.codegen_data_node(
                     *data,
@@ -931,6 +954,7 @@ extern \"C\" {} {}(",
                     Some(num_threads),
                     Some(num_threads),
                     Some(1),
+                    thread_block_tiles,
                     init,
                     post_init,
                     term,
@@ -961,6 +985,7 @@ extern \"C\" {} {}(",
                 1,
                 num_threads,
                 dynamic_shared_offset,
+                thread_block_tiles,
                 gotos,
             )?;
         }
@@ -981,6 +1006,7 @@ extern \"C\" {} {}(",
         parent_quota: usize,
         num_threads: usize,
         dynamic_shared_offset: &mut String,
+        thread_block_tiles: &mut String,
         gotos: &mut BTreeMap<NodeID, CudaGoto>,
     ) -> Result<(), Error> {
         let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map
@@ -1017,6 +1043,7 @@ extern \"C\" {} {}(",
                 Some(available_thread_quota),
                 Some(use_thread_quota),
                 parallel_factor,
+                thread_block_tiles,
                 init,
                 post_init,
                 term,
@@ -1044,6 +1071,7 @@ extern \"C\" {} {}(",
                 use_thread_quota,
                 num_threads,
                 dynamic_shared_offset,
+                thread_block_tiles,
                 gotos,
             )?;
         }
@@ -1383,15 +1411,15 @@ extern \"C\" {} {}(",
                     let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use);
                     #[allow(unreachable_patterns)]
                     let cg_op = match intrinsic {
-                        Intrinsic::Max => "max",
-                        Intrinsic::Min => "min",
+                        Intrinsic::Max => "greater",
+                        Intrinsic::Min => "less",
                         _ => unreachable!(),
                     };
                     let id_type_name = self.get_type(id_type, false);
                     write!(
                         w,
                         "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n",
-                        tabs, define_variable, non_reduce_arg, cg_tile, cg_op, id_type_name
+                        tabs, define_variable, cg_tile, non_reduce_arg, cg_op, id_type_name
                     )?;
                 } else {
                     let ty = &self.types[id_type.idx()];
@@ -1504,6 +1532,7 @@ extern \"C\" {} {}(",
         available_thread_quota: Option<usize>,
         use_thread_quota: Option<usize>,
         parallel_factor: Option<usize>,
+        thread_block_tiles: &mut String,
         w_init: &mut String,
         w_post_init: &mut String,
         w_term: &mut String,
@@ -1579,20 +1608,20 @@ extern \"C\" {} {}(",
                         use_thread_quota
                     };
                     write!(
-                        w_init,
+                        thread_block_tiles,
                         "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
                         use_thread_per_id, cg_tile, use_thread_per_id
                     )?;
                     let cg_tile_use = self.get_cg_tile(id, CGType::Use);
                     write!(
-                        w_init,
+                        thread_block_tiles,
                         "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
                         use_thread_quota, cg_tile_use, use_thread_quota
                     )?;
                     let available_thread_quota = available_thread_quota.unwrap();
                     let cg_tile_available = self.get_cg_tile(id, CGType::Available);
                     write!(
-                        w_init,
+                        thread_block_tiles,
                         "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
@@ -1797,7 +1826,7 @@ extern \"C\" {} {}(",
             Constant::Integer64(val) => write!(w, "{}{} = {}ll;\n", tabs, name, val)?,
             Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?,
             Constant::Float32(val) => {
-                write!(w, "{}{} = {}f;\n", tabs, name, format_float(**val as f64))?
+                write!(w, "{}{} = {};\n", tabs, name, format_float(**val as f64))?
             }
             Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, format_float(**val))?,
             // All three following collections involve align then allocate from the
@@ -2223,9 +2252,25 @@ fn convert_type(ty: &Type, make_pointer: bool) -> String {
 }
 
+/// Renders `val` as source text for a floating-point value in generated code.
+///
+/// NaN and the infinities are written as the `NAN` / `INFINITY` macro names
+/// (assumes the emitted code has the C math macros in scope, which `INFINITY`
+/// already requires). Any other value that prints without a `.` or an
+/// exponent gets ".0" appended.
 fn format_float(val: f64) -> String {
-    let mut s = val.to_string();
-    if !s.contains('.') && !s.contains('e') && !s.contains('E') {
-        s.push_str(".0");
+    if val.is_nan() {
+        // `to_string` would give "NaN", which the ".0" fix-up below would turn
+        // into the invalid token "NaN.0".
+        "NAN".to_string()
+    } else if val == f64::INFINITY {
+        "INFINITY".to_string()
+    } else if val == f64::NEG_INFINITY {
+        "-INFINITY".to_string()
+    } else {
+        let mut s = val.to_string();
+        if !s.contains('.') && !s.contains('e') && !s.contains('E') {
+            s.push_str(".0");
+        }
+        s
     }
-    s
 }
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index d3013239f5f78ce8e63181c181c0d5a8cbf77f81..7cbb43ad54b439c1a61eb4a5f0bf927fc2f37eae 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -489,8 +489,24 @@ impl<'a> RTContext<'a> {
                     Constant::UnsignedInteger16(val) => write!(block, "{}u16", val)?,
                     Constant::UnsignedInteger32(val) => write!(block, "{}u32", val)?,
                     Constant::UnsignedInteger64(val) => write!(block, "{}u64", val)?,
-                    Constant::Float32(val) => write!(block, "{}f32", val)?,
-                    Constant::Float64(val) => write!(block, "{}f64", val)?,
+                    Constant::Float32(val) => {
+                        if val == f32::INFINITY {
+                            write!(block, "f32::INFINITY")?
+                        } else if val == f32::NEG_INFINITY {
+                            write!(block, "f32::NEG_INFINITY")?
+                        } else {
+                            write!(block, "{}f32", val)?
+                        }
+                    }
+                    Constant::Float64(val) => {
+                        if val == f64::INFINITY {
+                            write!(block, "f64::INFINITY")?
+                        } else if val == f64::NEG_INFINITY {
+                            write!(block, "f64::NEG_INFINITY")?
+                        } else {
+                            write!(block, "{}f64", val)?
+                        }
+                    }
                     Constant::Product(ty, _)
                     | Constant::Summation(ty, _, _)
                     | Constant::Array(ty) => {
@@ -628,6 +644,23 @@ impl<'a> RTContext<'a> {
                 }
                 write!(block, "){};", postfix)?;
             }
+            Node::IntrinsicCall {
+                intrinsic,
+                ref args,
+            } => {
+                let block = &mut blocks.get_mut(&bb).unwrap().data;
+                write!(
+                    block,
+                    "{} = {}::{}(",
+                    self.get_value(id, bb, true),
+                    self.get_type(self.typing[id.idx()]),
+                    intrinsic.lower_case_name(),
+                )?;
+                for arg in args {
+                    write!(block, "{}, ", self.get_value(*arg, bb, false))?;
+                }
+                write!(block, ");")?;
+            }
             Node::LibraryCall {
                 library_function,
                 ref args,
diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs
index bf9698b32125832b047ee9a94668bd0a09b93ac9..f91efe584c7422e2d7e1e542fed7141fcf684f53 100644
--- a/hercules_ir/src/ir.rs
+++ b/hercules_ir/src/ir.rs
@@ -1050,6 +1050,38 @@ impl Constant {
             _ => false,
         }
     }
+
+    pub fn is_largest(&self) -> bool {
+        match self { // true when this constant is the maximum of its scalar type
+            Constant::Integer8(i8::MAX) => true,
+            Constant::Integer16(i16::MAX) => true,
+            Constant::Integer32(i32::MAX) => true,
+            Constant::Integer64(i64::MAX) => true,
+            Constant::UnsignedInteger8(u8::MAX) => true,
+            Constant::UnsignedInteger16(u16::MAX) => true,
+            Constant::UnsignedInteger32(u32::MAX) => true,
+            Constant::UnsignedInteger64(u64::MAX) => true,
+            Constant::Float32(ord) => *ord == OrderedFloat::<f32>(f32::INFINITY), // +inf
+            Constant::Float64(ord) => *ord == OrderedFloat::<f64>(f64::INFINITY), // +inf
+            _ => false, // every other variant, Constant::Boolean included
+        }
+    }
+
+    pub fn is_smallest(&self) -> bool {
+        match self { // true when this constant is the minimum of its scalar type
+            Constant::Integer8(i8::MIN) => true,
+            Constant::Integer16(i16::MIN) => true,
+            Constant::Integer32(i32::MIN) => true,
+            Constant::Integer64(i64::MIN) => true,
+            Constant::UnsignedInteger8(u8::MIN) => true, // unsigned MIN is 0
+            Constant::UnsignedInteger16(u16::MIN) => true,
+            Constant::UnsignedInteger32(u32::MIN) => true,
+            Constant::UnsignedInteger64(u64::MIN) => true,
+            Constant::Float32(ord) => *ord == OrderedFloat::<f32>(f32::NEG_INFINITY), // -inf
+            Constant::Float64(ord) => *ord == OrderedFloat::<f64>(f64::NEG_INFINITY), // -inf
+            _ => false, // every other variant, Constant::Boolean included
+        }
+    }
 }
 
 impl DynamicConstant {
@@ -1098,19 +1130,19 @@ impl DynamicConstant {
     }
 
     pub fn is_zero(&self) -> bool {
-        if *self == DynamicConstant::Constant(0) {
-            true
-        } else {
-            false
-        }
+        *self == DynamicConstant::Constant(0)
     }
 
     pub fn is_one(&self) -> bool {
-        if *self == DynamicConstant::Constant(1) {
-            true
-        } else {
-            false
-        }
+        *self == DynamicConstant::Constant(1)
+    }
+
+    pub fn is_largest(&self) -> bool {
+        *self == DynamicConstant::Constant(usize::MAX) // only the literal usize::MAX matches
+    }
+
+    pub fn is_smallest(&self) -> bool {
+        *self == DynamicConstant::Constant(usize::MIN) // usize::MIN is 0
     }
 
     pub fn try_parameter(&self) -> Option<usize> {
diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs
index 16e5c3264d33a7c9bef85fc0fa3cec02963dbf48..57fe204245fc0adf75c8a046cfd9db25ebed119e 100644
--- a/hercules_opt/src/editor.rs
+++ b/hercules_opt/src/editor.rs
@@ -795,6 +795,52 @@ impl<'a, 'b> FunctionEdit<'a, 'b> {
         self.add_constant(constant_to_construct)
     }
 
+    pub fn add_largest_constant(&mut self, id: TypeID) -> ConstantID {
+        let ty = self.get_type(id).clone();
+        let constant_to_construct = match ty { // bool: true; integers: MAX; floats: +infinity
+            Type::Boolean => Constant::Boolean(true),
+            Type::Integer8 => Constant::Integer8(i8::MAX),
+            Type::Integer16 => Constant::Integer16(i16::MAX),
+            Type::Integer32 => Constant::Integer32(i32::MAX),
+            Type::Integer64 => Constant::Integer64(i64::MAX),
+            Type::UnsignedInteger8 => Constant::UnsignedInteger8(u8::MAX),
+            Type::UnsignedInteger16 => Constant::UnsignedInteger16(u16::MAX),
+            Type::UnsignedInteger32 => Constant::UnsignedInteger32(u32::MAX),
+            Type::UnsignedInteger64 => Constant::UnsignedInteger64(u64::MAX),
+            Type::Float8 | Type::BFloat16 => panic!(), // no constant is built for these types
+            Type::Float32 => Constant::Float32(ordered_float::OrderedFloat(f32::INFINITY)),
+            Type::Float64 => Constant::Float64(ordered_float::OrderedFloat(f64::INFINITY)),
+            Type::Control => panic!("PANIC: Can't create largest constant for the control type."),
+            Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => {
+                panic!("PANIC: Can't create largest constant of a collection type.")
+            }
+        };
+        self.add_constant(constant_to_construct) // hand back the id from add_constant
+    }
+
+    pub fn add_smallest_constant(&mut self, id: TypeID) -> ConstantID {
+        let ty = self.get_type(id).clone();
+        let constant_to_construct = match ty { // bool: false; integers: MIN; floats: -infinity
+            Type::Boolean => Constant::Boolean(false), // false orders below true
+            Type::Integer8 => Constant::Integer8(i8::MIN),
+            Type::Integer16 => Constant::Integer16(i16::MIN),
+            Type::Integer32 => Constant::Integer32(i32::MIN),
+            Type::Integer64 => Constant::Integer64(i64::MIN),
+            Type::UnsignedInteger8 => Constant::UnsignedInteger8(u8::MIN),
+            Type::UnsignedInteger16 => Constant::UnsignedInteger16(u16::MIN),
+            Type::UnsignedInteger32 => Constant::UnsignedInteger32(u32::MIN),
+            Type::UnsignedInteger64 => Constant::UnsignedInteger64(u64::MIN),
+            Type::Float8 | Type::BFloat16 => panic!(), // no constant is built for these types
+            Type::Float32 => Constant::Float32(ordered_float::OrderedFloat(f32::NEG_INFINITY)),
+            Type::Float64 => Constant::Float64(ordered_float::OrderedFloat(f64::NEG_INFINITY)),
+            Type::Control => panic!("PANIC: Can't create smallest constant for the control type."),
+            Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => {
+                panic!("PANIC: Can't create smallest constant of a collection type.")
+            }
+        };
+        self.add_constant(constant_to_construct) // hand back the id from add_constant
+    }
+
     pub fn get_constant(&self, id: ConstantID) -> impl Deref<Target = Constant> + '_ {
         if id.idx() < self.editor.constants.borrow().len() {
             Either::Left(Ref::map(self.editor.constants.borrow(), |constants| {
diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index 283734a009ab3f619910b02465bf9d4d05856bef..e635b3c00d7bfa0090376d8056e65d8d01e60ce2 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -1556,6 +1556,44 @@ pub fn clean_monoid_reduces(editor: &mut FunctionEditor, typing: &Vec<TypeID>) {
                     edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op)
                 });
             }
+            Node::IntrinsicCall {
+                intrinsic: Intrinsic::Max,
+                args: _,
+            } if !is_smallest(editor, init) => {
+                editor.edit(|mut edit| {
+                    let smallest = edit.add_smallest_constant(typing[init.idx()]);
+                    let smallest = edit.add_node(Node::Constant { id: smallest });
+                    edit.sub_edit(id, smallest);
+                    edit = edit.replace_all_uses_where(init, smallest, |u| *u == id)?;
+                    let final_op = edit.add_node(Node::IntrinsicCall {
+                        intrinsic: Intrinsic::Max,
+                        args: Box::new([init, id]),
+                    });
+                    for u in out_uses {
+                        edit.sub_edit(u, final_op);
+                    }
+                    edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op)
+                });
+            }
+            Node::IntrinsicCall {
+                intrinsic: Intrinsic::Min,
+                args: _,
+            } if !is_largest(editor, init) => {
+                editor.edit(|mut edit| {
+                    let largest = edit.add_largest_constant(typing[init.idx()]);
+                    let largest = edit.add_node(Node::Constant { id: largest });
+                    edit.sub_edit(id, largest);
+                    edit = edit.replace_all_uses_where(init, largest, |u| *u == id)?;
+                    let final_op = edit.add_node(Node::IntrinsicCall {
+                        intrinsic: Intrinsic::Min,
+                        args: Box::new([init, id]),
+                    });
+                    for u in out_uses {
+                        edit.sub_edit(u, final_op);
+                    }
+                    edit.replace_all_uses_where(id, final_op, |u| *u != reduct && *u != final_op)
+                });
+            }
             _ => {}
         }
     }
diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs
index 874e75e739b0f05f72f0f8cfa4c6ae6540ed9c6f..088e57750df0265883472ab111d5d0a6e5f21c6e 100644
--- a/hercules_opt/src/outline.rs
+++ b/hercules_opt/src/outline.rs
@@ -23,8 +23,9 @@ pub fn outline(
     typing: &Vec<TypeID>,
     control_subgraph: &Subgraph,
     dom: &DomTree,
-    partition: &BTreeSet<NodeID>,
+    mut partition: BTreeSet<NodeID>,
     to_be_function_id: FunctionID,
+    outline_scalar_constants: bool,
 ) -> Option<Function> {
     // Step 1: do a whole bunch of analysis on the partition.
     let nodes = &editor.func().nodes;
@@ -34,6 +35,16 @@ pub fn outline(
             .any(|id| nodes[id.idx()].is_start() || nodes[id.idx()].is_parameter() || nodes[id.idx()].is_return()),
         "PANIC: Can't outline a partition containing the start node, parameter nodes, or return nodes."
     );
+    if !outline_scalar_constants {
+        for (idx, node) in nodes.into_iter().enumerate() {
+            if let Node::Constant { id } = node
+                && editor.get_constant(*id).is_scalar()
+            {
+                // Usually, you don't want to explicitly outline scalar constants.
+                partition.remove(&NodeID::new(idx));
+            }
+        }
+    }
     let mut top_nodes = partition.iter().filter(|id| {
         nodes[id.idx()].is_control()
             && control_subgraph
@@ -611,7 +622,8 @@ pub fn dumb_outline(
         typing,
         control_subgraph,
         dom,
-        &partition,
+        partition,
         to_be_function_id,
+        true,
     )
 }
diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs
index 1806d5c740e57a666f98ebf6c0ba40ee9a6461bd..793fe9fabfee830eef97687097aa5e40177a369d 100644
--- a/hercules_opt/src/utils.rs
+++ b/hercules_opt/src/utils.rs
@@ -567,3 +567,29 @@ pub fn is_one(editor: &FunctionEditor, id: NodeID) -> bool {
             .unwrap_or(false)
         || nodes[id.idx()].is_undef()
 }
+
+pub fn is_largest(editor: &FunctionEditor, id: NodeID) -> bool {
+    let nodes = &editor.func().nodes;
+    nodes[id.idx()] // case 1: a constant node whose constant reports is_largest()
+        .try_constant()
+        .map(|id| editor.get_constant(id).is_largest()) // this `id` shadows the node id
+        .unwrap_or(false)
+        || nodes[id.idx()] // case 2: a dynamic constant node that reports is_largest()
+            .try_dynamic_constant()
+            .map(|id| editor.get_dynamic_constant(id).is_largest())
+            .unwrap_or(false)
+        || nodes[id.idx()].is_undef() // case 3: an undef node is accepted as well
+}
+
+pub fn is_smallest(editor: &FunctionEditor, id: NodeID) -> bool {
+    let nodes = &editor.func().nodes;
+    nodes[id.idx()] // case 1: a constant node whose constant reports is_smallest()
+        .try_constant()
+        .map(|id| editor.get_constant(id).is_smallest()) // this `id` shadows the node id
+        .unwrap_or(false)
+        || nodes[id.idx()] // case 2: a dynamic constant node that reports is_smallest()
+            .try_dynamic_constant()
+            .map(|id| editor.get_dynamic_constant(id).is_smallest())
+            .unwrap_or(false)
+        || nodes[id.idx()].is_undef() // case 3: an undef node is accepted as well
+}
diff --git a/juno_samples/dot/src/cpu.sch b/juno_samples/dot/src/cpu.sch
index 1f8953d99dd6dabf02c35a97e7235adcdfc69fde..5c763772250c20ec90ccb70f5642d1db5f56fc80 100644
--- a/juno_samples/dot/src/cpu.sch
+++ b/juno_samples/dot/src/cpu.sch
@@ -24,7 +24,7 @@ dce(*);
 let fission_out = fork-fission[out@loop](dot);
 simplify-cfg(dot);
 dce(dot);
-unforkify(fission_out.dot.fj_loop_bottom);
+unforkify(fission_out.dot.fj_bottom);
 ccp(dot);
 simplify-cfg(dot);
 gvn(dot);
diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index 3c3d09b34f9bd8c4b412d4b19e3898769cf2670a..ead722ce0adfcf61ccc6dc79c70e1ba76d0e8eeb 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -58,6 +58,17 @@ fixpoint {
   fork-coalesce(max_gradient);
 }
 simpl!(max_gradient);
+fork-dim-merge(max_gradient);
+simpl!(max_gradient);
+fork-tile[8, 0, false, false](max_gradient);
+let split = fork-split(max_gradient);
+clean-monoid-reduces(max_gradient);
+let out = outline(split._4_max_gradient.fj1);
+simpl!(max_gradient, out);
+unforkify(out);
+let out = fork-fission[split._4_max_gradient.fj0](max_gradient);
+simpl!(max_gradient);
+unforkify(out._4_max_gradient.fj_bottom);
 
 no-memset(reject_zero_crossings@res);
 fixpoint {
@@ -70,8 +81,8 @@ simpl!(reject_zero_crossings);
 
 async-call(edge_detection@le, edge_detection@zc);
 
-fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings);
-unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings);
+fork-split(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
+unforkify(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
 
 simpl!(*);
 
diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn
index 3bc5bbfbe16b6eb15d0c6387d9f0d8397ce69cc6..e1413488e95d324e154ac478c2131db666fcbbf8 100644
--- a/juno_samples/edge_detection/src/edge_detection.jn
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -189,7 +189,7 @@ fn gradient<n, m, sb: usize>(
 }
 
 fn max_gradient<n, m: usize>(gradient: f32[n, m]) -> f32 {
-  let max = gradient[0, 0];
+  let max = -1.0;
 
   for i = 0 to n {
     for j = 0 to m {
diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch
index 1e51efb9d84da1d20496d04c6aa939ffc2bc4123..7ee2904f7d1bb59780390360dbd9abc6b3934aba 100644
--- a/juno_samples/edge_detection/src/gpu.sch
+++ b/juno_samples/edge_detection/src/gpu.sch
@@ -1,22 +1,96 @@
-gvn(*);
-phi-elim(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  infer-schedules(X);
+}
+
+gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, reject_zero_crossings);
 
-gpu(gaussian_smoothing, laplacian_estimate, zero_crossings, gradient, max_gradient, reject_zero_crossings);
+simpl!(*);
 
 ip-sroa(*);
 sroa(*);
-dce(*);
-gvn(*);
-phi-elim(*);
-dce(*);
+simpl!(*);
 
-//forkify(*);
-infer-schedules(*);
+no-memset(gaussian_smoothing@res);
+fixpoint {
+  forkify(gaussian_smoothing);
+  fork-guard-elim(gaussian_smoothing);
+  fork-coalesce(gaussian_smoothing);
+}
+predication(gaussian_smoothing);
+simpl!(gaussian_smoothing);
+predication(gaussian_smoothing);
+simpl!(gaussian_smoothing);
 
-gcm(*);
+no-memset(laplacian_estimate@res, laplacian_estimate@shr1, laplacian_estimate@shr2);
+fixpoint {
+  forkify(laplacian_estimate);
+  fork-guard-elim(laplacian_estimate);
+  fork-coalesce(laplacian_estimate);
+}
+simpl!(laplacian_estimate);
+
+no-memset(zero_crossings@res, zero_crossings@shr1, zero_crossings@shr2);
+fixpoint {
+  forkify(zero_crossings);
+  fork-guard-elim(zero_crossings);
+  fork-coalesce(zero_crossings);
+}
+simpl!(zero_crossings);
+
+no-memset(gradient@res);
 fixpoint {
-  float-collections(*);
-  dce(*);
-  gcm(*);
+  forkify(gradient);
+  fork-guard-elim(gradient);
+  fork-coalesce(gradient);
 }
+predication(gradient);
+simpl!(gradient);
+predication(gradient);
+simpl!(gradient);
+
+fixpoint {
+  forkify(max_gradient);
+  fork-guard-elim(max_gradient);
+  fork-coalesce(max_gradient);
+}
+simpl!(max_gradient);
+fork-dim-merge(max_gradient);
+simpl!(max_gradient);
+fork-tile[32, 0, false, true](max_gradient);
+let out = fork-split(max_gradient);
+clean-monoid-reduces(max_gradient);
+simpl!(max_gradient);
+let fission = fork-fission[out._4_max_gradient.fj0](max_gradient);
+simpl!(max_gradient);
+fork-tile[32, 0, false, true](fission._4_max_gradient.fj_bottom);
+let out = fork-split(fission._4_max_gradient.fj_bottom);
+clean-monoid-reduces(max_gradient);
+simpl!(max_gradient);
+let top = outline(fission._4_max_gradient.fj_top);
+let bottom = outline(out._4_max_gradient.fj0);
+gpu(top, bottom);
+ip-sroa(*);
+sroa(*);
+simpl!(*);
+
+no-memset(reject_zero_crossings@res);
+fixpoint {
+  forkify(reject_zero_crossings);
+  fork-guard-elim(reject_zero_crossings);
+  fork-coalesce(reject_zero_crossings);
+}
+predication(reject_zero_crossings);
+simpl!(reject_zero_crossings);
+
+async-call(edge_detection@le, edge_detection@zc);
+
+simpl!(*);
+
+delete-uncalled(*);
+gcm(*);
diff --git a/juno_samples/edge_detection/src/lib.rs b/juno_samples/edge_detection/src/lib.rs
index 6c2a15bd394a8fed3828ea79f2f8470856ead846..dab84cf6206c3cb9b816c88c53c8ddfbec102994 100644
--- a/juno_samples/edge_detection/src/lib.rs
+++ b/juno_samples/edge_detection/src/lib.rs
@@ -143,6 +143,11 @@ pub fn edge_detection_harness(args: EdgeDetectionInputs) {
         num_frames
     };
 
+    println!(
+        "Running edge with {} rows, {} columns, {} gs, {} sz, and {} sb.",
+        height, width, gs, sz, sb,
+    );
+
     let mut r = runner!(edge_detection);
 
     let mut output = output.map(|filename| {
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 6931ce2e17680146f68da4ad4c61a50dffcaadb0..675cfe1ca9f25f74f7c6ad3c969f3d2b9b9524fd 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -2158,8 +2158,9 @@ fn run_pass(
                 &typing[func.idx()],
                 &control_subgraphs[func.idx()],
                 &doms[func.idx()],
-                &nodes,
+                nodes,
                 new_func_id,
+                false,
             );
             let Some(new_func) = new_func else {
                 return Err(SchedulerError::PassError {
@@ -2560,7 +2561,7 @@ fn run_pass(
             let nodes_in_fork_joins = pm.nodes_in_fork_joins.take().unwrap();
             let mut new_fork_joins = HashMap::new();
 
-            let fork_label_name = &pm.labels.borrow()[fork_label.idx()].clone();
+            let _fork_label_name = &pm.labels.borrow()[fork_label.idx()].clone();
 
             for (mut func, created_fork_joins) in
                 build_editors(pm).into_iter().zip(created_fork_joins)
@@ -2583,13 +2584,9 @@ fn run_pass(
                 // level of the split fork-joins being referred to.
                 let mut func_record = HashMap::new();
                 for (idx, label) in labels {
-                    let fmt = if idx % 2 == 0 {
-                        format!("fj_{}_top", fork_label_name)
-                    } else {
-                        format!("fj_{}_bottom", fork_label_name)
-                    };
+                    let fmt = if idx % 2 == 0 { "fj_top" } else { "fj_bottom" };
                     func_record.insert(
-                        fmt,
+                        fmt.to_string(),
                         Value::Label {
                             labels: vec![LabelInfo {
                                 func: func_id,