diff --git a/hercules_opt/src/forkify.rs b/hercules_opt/src/forkify.rs
index 2f6466c0845c544225879eb941cb8ca798384dc6..a6308d66eae3bd47b3d607fb8970679843599168 100644
--- a/hercules_opt/src/forkify.rs
+++ b/hercules_opt/src/forkify.rs
@@ -115,15 +115,14 @@ pub fn forkify_loop(
         return false;
     };
 
-    // FIXME: Make sure IV is not used outside the loop.
-
     // Get bound
     let bound = match canonical_iv {
         InductionVariable::Basic {
             node: _,
             initializer: _,
-            update: _,
             final_value,
+            update_expression,
+            update_value,
         } => final_value
             .map(|final_value| get_node_as_dc(editor, final_value))
             .and_then(|r| r.ok()),
diff --git a/hercules_opt/src/ivar.rs b/hercules_opt/src/ivar.rs
index edadd71722698158c77fa1efaf81e48f3c7afcc9..511371bfee4c25d567b3e1e1513ddbd6cdf6e6fa 100644
--- a/hercules_opt/src/ivar.rs
+++ b/hercules_opt/src/ivar.rs
@@ -43,7 +43,8 @@ nest! {
         pub Basic {
             node: NodeID,
             initializer: NodeID,
-            update: NodeID,
+            update_expression: NodeID,
+            update_value: NodeID,
             final_value: Option<NodeID>,
         },
         SCEV(NodeID), // TODO @(xrouth)
@@ -56,7 +57,8 @@ impl InductionVariable {
             InductionVariable::Basic {
                 node,
                 initializer: _,
-                update: _,
+                update_expression: _,
+                update_value: _,
                 final_value: _,
             } => *node,
             InductionVariable::SCEV(_) => todo!(),
@@ -75,12 +77,17 @@ pub fn calculate_loop_nodes(editor: &FunctionEditor, natural_loop: &Loop) -> Has
             // External Phi
             if let Node::Phi { control, data: _ } = data {
                 match natural_loop.control.get(control.idx()) {
-                    Some(v) => if !*v {
-                        return true;
-                    },
+                    Some(v) => {
+                        if !*v {
+                            return true;
+                        }
+                    }
                     None => {
-                        panic!("unexpceted index: {:?} for loop {:?}", control, natural_loop.header);
-                    },
+                        panic!(
+                            "unexpceted index: {:?} for loop {:?}",
+                            control, natural_loop.header
+                        );
+                    }
                 }
             }
             // External Reduce
@@ -91,24 +98,34 @@ pub fn calculate_loop_nodes(editor: &FunctionEditor, natural_loop: &Loop) -> Has
             } = data
             {
                 match natural_loop.control.get(control.idx()) {
-                    Some(v) => if !*v {
-                        return true;
-                    },
+                    Some(v) => {
+                        if !*v {
+                            return true;
+                        }
+                    }
                     None => {
-                        panic!("unexpceted index: {:?} for loop {:?}", control, natural_loop.header);
-                    },
+                        panic!(
+                            "unexpceted index: {:?} for loop {:?}",
+                            control, natural_loop.header
+                        );
+                    }
                 }
             }
 
             // External Control
             if data.is_control() {
                 match natural_loop.control.get(node.idx()) {
-                    Some(v) => if !*v {
-                        return true;
-                    },
+                    Some(v) => {
+                        if !*v {
+                            return true;
+                        }
+                    }
                     None => {
-                        panic!("unexpceted index: {:?} for loop {:?}", node, natural_loop.header);
-                    },
+                        panic!(
+                            "unexpceted index: {:?} for loop {:?}",
+                            node, natural_loop.header
+                        );
+                    }
                 }
             }
 
@@ -332,13 +349,14 @@ pub fn has_const_fields(editor: &FunctionEditor, ivar: InductionVariable) -> boo
         InductionVariable::Basic {
             node: _,
             initializer,
-            update,
             final_value,
+            update_expression,
+            update_value,
         } => {
             if final_value.is_none() {
                 return false;
             }
-            [initializer, update]
+            [initializer, update_value]
                 .iter()
                 .any(|node| !editor.node(node).is_constant())
         }
@@ -357,8 +375,9 @@ pub fn has_canonical_iv<'a>(
         InductionVariable::Basic {
             node: _,
             initializer,
-            update,
             final_value,
+            update_expression,
+            update_value,
         } => {
             (editor
                 .node(initializer)
@@ -366,9 +385,11 @@ pub fn has_canonical_iv<'a>(
                 || editor
                     .node(initializer)
                     .is_zero_dc(&editor.get_dynamic_constants()))
-                && (editor.node(update).is_one_constant(&editor.get_constants())
+                && (editor
+                    .node(update_value)
+                    .is_one_constant(&editor.get_constants())
                     || editor
-                        .node(update)
+                        .node(update_value)
                         .is_one_dc(&editor.get_dynamic_constants()))
                 && (final_value
                     .map(|val| {
@@ -458,8 +479,9 @@ pub fn compute_induction_vars(
                                 return Some(InductionVariable::Basic {
                                     node: phi_id,
                                     initializer: initializer_id,
-                                    update: b,
                                     final_value: None,
+                                    update_expression: *data_id,
+                                    update_value: b,
                                 });
                             } else {
                                 None
@@ -476,13 +498,12 @@ pub fn compute_induction_vars(
     induction_variables
 }
 
-// Find loop iterations
-pub fn compute_iv_ranges(
+pub fn get_loop_condition_ivs(
     editor: &FunctionEditor,
     l: &Loop,
-    induction_vars: Vec<InductionVariable>,
+    induction_vars: &Vec<InductionVariable>,
     loop_condition: &LoopExit,
-) -> Vec<InductionVariable> {
+) -> HashSet<NodeID> {
     let condition_node = match loop_condition {
         LoopExit::Conditional {
             if_node: _,
@@ -507,11 +528,39 @@ pub fn compute_iv_ranges(
     // Bound IVs used in loop bound.
     let loop_bound_uses: HashSet<_> =
         walk_all_uses_stop_on(*condition_node, editor, stop_on).collect();
+
+    HashSet::from_iter(induction_vars.iter().filter_map(|iv| {
+        if loop_bound_uses.contains(&iv.phi()) {
+            Some(iv.phi())
+        } else {
+            None
+        }
+    }))
+}
+
+// Find loop iterations
+pub fn compute_iv_ranges(
+    editor: &FunctionEditor,
+    l: &Loop,
+    induction_vars: Vec<InductionVariable>,
+    loop_condition: &LoopExit,
+) -> Vec<InductionVariable> {
+    let condition_node = match loop_condition.clone() {
+        LoopExit::Conditional {
+            if_node: _,
+            condition_node,
+        } => condition_node,
+        LoopExit::Unconditional(_) => todo!(),
+    };
+
+    let loop_bound_iv_phis = get_loop_condition_ivs(editor, l, &induction_vars, loop_condition);
+
     let (loop_bound_ivs, other_ivs): (Vec<InductionVariable>, Vec<InductionVariable>) =
         induction_vars
             .into_iter()
-            .partition(|f| loop_bound_uses.contains(&f.phi()));
+            .partition(|f| loop_bound_iv_phis.contains(&f.phi()));
 
+    // Assume there is only one loop bound iv.
     let Some(iv) = loop_bound_ivs.first() else {
         return other_ivs;
     };
@@ -520,6 +569,8 @@ pub fn compute_iv_ranges(
         return loop_bound_ivs.into_iter().chain(other_ivs).collect();
     }
 
+    // Bound IVs used in the loop condition.
+
     // FIXME: DO linear algerbra to solve for loop bounds with multiple variables involved.
     let final_value = match &editor.func().nodes[condition_node.idx()] {
         Node::Phi {
@@ -570,8 +621,9 @@ pub fn compute_iv_ranges(
                                         InductionVariable::Basic {
                                             node: _,
                                             initializer: _,
-                                            update,
                                             final_value: _,
+                                            update_expression: _,
+                                            update_value: update,
                                         } => update,
                                         InductionVariable::SCEV(_) => todo!(),
                                     };
@@ -604,12 +656,14 @@ pub fn compute_iv_ranges(
         InductionVariable::Basic {
             node,
             initializer,
-            update,
             final_value: _,
+            update_expression,
+            update_value,
         } => InductionVariable::Basic {
             node: *node,
             initializer: *initializer,
-            update: *update,
+            update_expression: *update_expression,
+            update_value: *update_value,
             final_value,
         },
         InductionVariable::SCEV(_) => todo!(),
diff --git a/hercules_opt/src/lib.rs b/hercules_opt/src/lib.rs
index a349230e0add49e07b6a17dad05a358b17b3a8fa..b56f94086204444b49672ee6baefccdaa0b0cb8b 100644
--- a/hercules_opt/src/lib.rs
+++ b/hercules_opt/src/lib.rs
@@ -16,6 +16,7 @@ pub mod inline;
 pub mod interprocedural_sroa;
 pub mod ivar;
 pub mod lift_dc_math;
+pub mod loop_bound_canon;
 pub mod outline;
 pub mod phi_elim;
 pub mod pred;
@@ -43,6 +44,7 @@ pub use crate::inline::*;
 pub use crate::interprocedural_sroa::*;
 pub use crate::ivar::*;
 pub use crate::lift_dc_math::*;
+pub use crate::loop_bound_canon::*;
 pub use crate::outline::*;
 pub use crate::phi_elim::*;
 pub use crate::pred::*;
diff --git a/hercules_opt/src/loop_bound_canon.rs b/hercules_opt/src/loop_bound_canon.rs
new file mode 100644
index 0000000000000000000000000000000000000000..680236f168c04fb26f8f8befd9ea865835235fca
--- /dev/null
+++ b/hercules_opt/src/loop_bound_canon.rs
@@ -0,0 +1,309 @@
+use std::collections::HashMap;
+use std::collections::HashSet;
+use std::iter::zip;
+use std::iter::FromIterator;
+
+use itertools::Itertools;
+use nestify::nest;
+
+use hercules_ir::*;
+
+use crate::*;
+/// Runs `canonicalize_single_loop_bounds` over the loops yielded by `loops.bottom_up_loops()` whose header is a region node; returns `true` at the first loop with a mutable header for which it returns `true`, otherwise `false`.
+pub fn loop_bound_canon_toplevel(
+    editor: &mut FunctionEditor,
+    fork_join_map: &HashMap<NodeID, NodeID>, // NOTE(review): not read anywhere in this function body
+    control_subgraph: &Subgraph,
+    loops: &LoopTree,
+) -> bool {
+    let natural_loops = loops
+        .bottom_up_loops()
+        .into_iter()
+        .filter(|(k, _)| editor.func().nodes[k.idx()].is_region());
+    // Collecting ends the filter closure's shared borrow of `editor` before the loop below passes `editor` on mutably.
+    let natural_loops: Vec<_> = natural_loops.collect();
+    // Stop at the first loop whose header `editor.is_mutable` accepts and whose canonicalization returns `true`.
+    for l in natural_loops {
+        if editor.is_mutable(l.0)
+            && canonicalize_single_loop_bounds(
+                editor,
+                control_subgraph,
+                &Loop {
+                    header: l.0,
+                    control: l.1.clone(),
+                },
+            )
+        {
+            return true;
+        }
+    }
+    return false;
+}
+/// Shifts the single loop-bound induction variable of `l` to start at 0 (the bound becomes `bound - init`, and `+ init` is added back at IV uses); returns `false` on any bail-out, `true` if a canonical IV already exists, else the result of `editor.edit`.
+pub fn canonicalize_single_loop_bounds(
+    editor: &mut FunctionEditor,
+    control_subgraph: &Subgraph,
+    l: &Loop,
+) -> bool {
+    let function = editor.func();
+    // Bail out unless the loop has an exit condition and that exit is the `Conditional` kind.
+    let Some(loop_condition) = get_loop_exit_conditions(function, l, control_subgraph) else {
+        return false;
+    };
+
+    let LoopExit::Conditional {
+        if_node: loop_if,
+        condition_node,
+    } = loop_condition.clone()
+    else {
+        return false;
+    };
+    // Compute the loop's induction variables, then run `compute_iv_ranges` over them.
+    let loop_variance = compute_loop_variance(editor, l);
+    let ivs = compute_induction_vars(editor.func(), l, &loop_variance);
+    let ivs = compute_iv_ranges(editor, l, ivs, &loop_condition);
+
+    if has_canonical_iv(editor, l, &ivs).is_some() {
+        // A canonical IV already exists, so no rewrite is attempted; note that this path returns `true`.
+        return true;
+    }
+
+    let loop_bound_iv_phis = get_loop_condition_ivs(editor, l, &ivs, &loop_condition);
+
+    let (loop_bound_ivs, _): (Vec<InductionVariable>, Vec<InductionVariable>) = ivs
+        .into_iter()
+        .partition(|f| loop_bound_iv_phis.contains(&f.phi()));
+
+    // Only the single-IV case is handled: bail out unless exactly one IV feeds the loop condition.
+    if loop_bound_ivs.len() != 1 {
+        // Zero or several loop-bound IVs: not supported here.
+        return false;
+    }
+
+    let Some(iv) = loop_bound_ivs.first() else {
+        return false;
+    };
+
+    let InductionVariable::Basic {
+        node: iv_phi,
+        initializer,
+        final_value,
+        update_expression,
+        update_value,
+    } = iv
+    else {
+        return false;
+    };
+
+    let Some(final_value) = final_value else {
+        return false;
+    };
+    // Take the first use of the loop header that is not marked in `l.control`; the guard search below starts from it.
+    let Some(loop_pred) = editor
+        .get_uses(l.header)
+        .filter(|node| !l.control[node.idx()])
+        .next()
+    else {
+        return false;
+    };
+
+    // If the loop is protected by a guard, the guard's comparison must be rewritten as well.
+
+    // Tuple layout: (init_id, bound_id, guard comparison node, `control` field of the guard's If).
+    // NOTE(review): the last element comes from the matched `If`'s `control` field, not the If node's own id.
+    // FIXME: This is quite fragile: only a guard shaped exactly `initializer <loop op> final_value` is recognized.
+    let guard_info: Option<(NodeID, NodeID, NodeID, NodeID)> = (|| {
+        let Node::Projection {
+            control,
+            selection: _,
+        } = editor.node(loop_pred)
+        else {
+            return None;
+        };
+
+        let Node::If { control, cond } = editor.node(control) else {
+            return None;
+        };
+
+        let Node::Binary { left, right, op } = editor.node(cond) else {
+            return None;
+        };
+
+        let Node::Binary {
+            left: _,
+            right: _,
+            op: loop_op,
+        } = editor.node(condition_node)
+        else {
+            return None;
+        };
+
+        if op != loop_op {
+            return None;
+        }
+
+        if left != initializer {
+            return None;
+        }
+
+        if right != final_value {
+            return None;
+        }
+
+        return Some((*left, *right, *cond, *control));
+    })();
+
+    // // If guard is none, if some, make sure it is a good guard! move on
+    // if let Some((init_id, bound_id, binop_node, if_node))= potential_guard_info {
+
+    // };
+
+    // let fork_guard_condition =
+
+    // Lift dc math should turn all constants into DCs, so these should all be DCs; bail out if they are not.
+    let Node::DynamicConstant { id: init_dc_id } = *editor.node(initializer) else {
+        return false;
+    };
+    let Node::DynamicConstant { id: update_dc_id } = *editor.node(update_value) else {
+        return false;
+    };
+
+    // We are assuming this is a simple loop bound (i.e. only one induction variable involved), so the final value is expected to be a DC node.
+    let Node::DynamicConstant {
+        id: loop_bound_dc_id,
+    } = *editor.node(final_value)
+    else {
+        return false;
+    };
+
+    // We need to do 4 (5) things, which are mostly separate.
+
+    // 0) Make the update into addition.
+    // 1) Make the update a positive value.
+    // 2) Transform the condition into a `<`
+    // 3) Adjust update to be 1 (and bounds).
+    // 4) Change init to start from 0.
+
+    // 5) Find some way to get fork-guard-elim to work with the new fork.
+    // ideally, this goes in fork-guard-elim, but for now we hack it to change the guard condition bounds
+    // here when we edit the loop bounds.
+
+    // Right now only step (4) is implemented below.
+
+    // Extract the bound from the loop condition; only `update_expression < <dynamic constant>` is accepted.
+    let mut dc_bound_node = match *editor.node(condition_node) {
+        Node::Binary { left, right, op } => match op {
+            BinaryOperator::LT => {
+                if left == *update_expression && editor.node(right).is_dynamic_constant() {
+                    right
+                } else {
+                    return false;
+                }
+            }
+            BinaryOperator::LTE => todo!(), // NOTE: this and the other listed operators currently panic via todo!()
+            BinaryOperator::GT => todo!(),
+            BinaryOperator::GTE => todo!(),
+            BinaryOperator::EQ => todo!(),
+            BinaryOperator::NE => todo!(),
+            BinaryOperator::Or => todo!(),
+            BinaryOperator::And => todo!(),
+            BinaryOperator::Xor => todo!(),
+            _ => panic!(),
+        },
+        _ => return false,
+    };
+
+    let Node::DynamicConstant {
+        id: bound_node_dc_id,
+    } = *editor.node(dc_bound_node)
+    else {
+        return false;
+    };
+
+    // If increment is negative (how in the world do we know that...)
+    // Increment can be DefinitelyPositive, Unknown, DefinitelyNegative.
+
+    // // First, massage loop condition to be <, because that is normal!
+    // Also includes
+    // editor.edit(|mut edit| {
+
+    // }
+    // Collect the immediate users of the update expression (excluding the phi and the loop condition) before editing.
+
+    let update_expr_users: Vec<_> = editor
+        .get_users(*update_expression)
+        .filter(|node| *node != iv.phi() && *node != condition_node)
+        .collect();
+    // Users of the phi itself, excluding the phi and the update expression.
+    let iv_phi_users: Vec<_> = editor
+        .get_users(iv.phi())
+        .filter(|node| *node != iv.phi() && *node != *update_expression)
+        .collect();
+
+    // NOTE(review): `iv_phi_users` is not read again below.
+
+    let result = editor.edit(|mut edit| {
+        // Step 4) Change the loop IV to go from 0..N:
+        // subtract init from both the initializer and the bound (dc_bound_node),
+        // and then add init back at the uses of the IV.
+        let new_init_dc = DynamicConstant::Constant(0);
+        let new_init = Node::DynamicConstant {
+            id: edit.add_dynamic_constant(new_init_dc),
+        };
+        let new_init = edit.add_node(new_init);
+        edit = edit.replace_all_uses_where(*initializer, new_init, |usee| *usee == iv.phi())?;
+        // New bound = old bound - init; only the loop condition is pointed at it here.
+        let new_condition_id = DynamicConstant::sub(bound_node_dc_id, init_dc_id);
+        let new_condition = Node::DynamicConstant {
+            id: edit.add_dynamic_constant(new_condition_id),
+        };
+        let new_condition = edit.add_node(new_condition);
+        edit = edit
+            .replace_all_uses_where(dc_bound_node, new_condition, |usee| *usee == condition_node)?;
+
+        // Change loop guard: point its comparison at the new init and the new bound.
+        if let Some((init_id, bound_id, binop_node, if_node)) = guard_info {
+            edit = edit.replace_all_uses_where(init_id, new_init, |usee| *usee == binop_node)?;
+            edit =
+                edit.replace_all_uses_where(bound_id, new_condition, |usee| *usee == binop_node)?;
+        }
+
+        // For each collected user of the update expression, build `user + initializer` and redirect all uses of `user` to it.
+        for user in update_expr_users {
+            let new_user = Node::Binary {
+                left: user,
+                right: *initializer,
+                op: BinaryOperator::Add,
+            };
+            let new_user = edit.add_node(new_user);
+            edit = edit.replace_all_uses(user, new_user)?;
+        }
+        // Redirect uses of the update expression (except by the phi, itself, the new add, and the condition) to `update_expression + initializer`.
+        let new_user = Node::Binary {
+            left: *update_expression,
+            right: *initializer,
+            op: BinaryOperator::Add,
+        };
+        let new_user = edit.add_node(new_user);
+        edit = edit.replace_all_uses_where(*update_expression, new_user, |usee| {
+            *usee != iv.phi()
+                && *usee != *update_expression
+                && *usee != new_user
+                && *usee != condition_node
+        })?;
+        // Likewise redirect uses of the phi (except by itself, the update expression, and the new add) to `iv_phi + initializer`.
+        let new_user = Node::Binary {
+            left: *iv_phi,
+            right: *initializer,
+            op: BinaryOperator::Add,
+        };
+        let new_user = edit.add_node(new_user);
+        edit = edit.replace_all_uses_where(*iv_phi, new_user, |usee| {
+            *usee != iv.phi() && *usee != *update_expression && *usee != new_user
+        })?;
+
+        Ok(edit)
+    });
+
+    return result;
+}
diff --git a/hercules_opt/src/schedule.rs b/hercules_opt/src/schedule.rs
index 7ecf07a424a07d0430bf23cfab2c991384f7c59d..d95a3585b77f6ff1ad17dec71b93ce424c3cc4fe 100644
--- a/hercules_opt/src/schedule.rs
+++ b/hercules_opt/src/schedule.rs
@@ -157,10 +157,7 @@ pub fn infer_monoid_reduce(
     let is_binop_monoid = |op| {
         matches!(
             op,
-            BinaryOperator::Add
-                | BinaryOperator::Mul
-                | BinaryOperator::Or
-                | BinaryOperator::And
+            BinaryOperator::Add | BinaryOperator::Mul | BinaryOperator::Or | BinaryOperator::And
         )
     };
     let is_intrinsic_monoid = |intrinsic| matches!(intrinsic, Intrinsic::Max | Intrinsic::Min);
@@ -172,9 +169,9 @@ pub fn infer_monoid_reduce(
             init: _,
             reduct,
         } = func.nodes[id.idx()]
-            && (matches!(func.nodes[reduct.idx()], Node::Binary { left, right, op } 
-                if ((left == id && !reduce_cycles[&id].contains(&right)) || 
-                    (right == id && !reduce_cycles[&id].contains(&left))) && 
+            && (matches!(func.nodes[reduct.idx()], Node::Binary { left, right, op }
+                if ((left == id && !reduce_cycles[&id].contains(&right)) ||
+                    (right == id && !reduce_cycles[&id].contains(&left))) &&
                     is_binop_monoid(op))
                 || matches!(&func.nodes[reduct.idx()], Node::IntrinsicCall { intrinsic, args }
                 if (args.contains(&id) && is_intrinsic_monoid(*intrinsic) && 
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index f8fdf2effa9bfc9bbf900b79915b8716706bde6c..e9b8f11f00edac8763ff0e376991fb1220af293c 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -394,7 +394,7 @@ unsafe impl Sync for __RawPtrSendSync {}
  *
  * The data held at all of its non-None allocations and references is maintained so that it is the
  * same, and so methods will attempt to use the reference or allocation that is most convenient.
- * 
+ *
  * HerculesImmBox hold references to immutable memory only. All operations on these is through
  * immutable references, though internally it uses OnceLocks to protect its resources since the Box
  * may be used in multiple parallel threads if it is used in parallel Hercules code invocation.
@@ -499,9 +499,9 @@ impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesImmBox<'a, T> {
     }
 }
 
-impl<'a, T> HerculesImmBox<'a, T> 
-where 
-    T: Default + Clone
+impl<'a, T> HerculesImmBox<'a, T>
+where
+    T: Default + Clone,
 {
     pub fn as_slice(&'a self) -> &'a [T] {
         self.as_cpu_ref().as_slice()
@@ -517,18 +517,23 @@ where
         } else {
             #[cfg(feature = "cuda")]
             if let Some(cuda_ref) = self.cuda_ref.get() {
-                return 
-                    self.cpu_ref.get_or_init(|| {
+                return self
+                    .cpu_ref
+                    .get_or_init(|| {
                         let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
 
                         let mut alloc = Vec::new();
                         alloc.resize_with(elements, Default::default);
                         let _ = cuda_ref.clone().to_cpu_ref(&mut alloc);
 
-                        self.cpu_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cpu_alloc was set unexpectedly");
+                        self.cpu_alloc
+                            .set(alloc)
+                            .map_err(|_| ())
+                            .expect("HerculesImmBox cpu_alloc was set unexpectedly");
                         let alloc = self.cpu_alloc.get().unwrap();
                         HerculesCPURef::from_slice(alloc)
-                    }).clone();
+                    })
+                    .clone();
             }
 
             panic!("HerculesImmBox has no reference to data")
@@ -541,13 +546,19 @@ where
             cuda_ref.clone()
         } else {
             if let Some(cpu_ref) = self.cpu_ref.get() {
-                return self.cuda_ref.get_or_init(|| {
-                    // Copy data to CUDA device
-                    let alloc = CUDABox::from_cpu_ref(cpu_ref.clone());
-                    self.cuda_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cuda_alloc was set unexpectedly");
-
-                    self.cuda_alloc.get().unwrap().get_ref()
-                }).clone();
+                return self
+                    .cuda_ref
+                    .get_or_init(|| {
+                        // Copy data to CUDA device
+                        let alloc = CUDABox::from_cpu_ref(cpu_ref.clone());
+                        self.cuda_alloc
+                            .set(alloc)
+                            .map_err(|_| ())
+                            .expect("HerculesImmBox cuda_alloc was set unexpectedly");
+
+                        self.cuda_alloc.get().unwrap().get_ref()
+                    })
+                    .clone();
             }
 
             panic!("HerculesImmBox has no reference to data")
@@ -651,7 +662,7 @@ impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
 
 impl<'a, T> HerculesMutBox<'a, T>
 where
-    T: Default + Clone
+    T: Default + Clone,
 {
     pub fn as_slice(&'a mut self) -> &'a mut [T] {
         self.as_cpu_ref().as_slice()
@@ -659,42 +670,41 @@ where
 
     pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> {
         match self.loc {
-            HerculesMutBoxLocation::CPU => {
-                match self.cpu_alloc {
-                    Allocation::None => panic!("No CPU reference"),
-                    Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val),
-                    Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val),
-                }
-            }
+            HerculesMutBoxLocation::CPU => match self.cpu_alloc {
+                Allocation::None => panic!("No CPU reference"),
+                Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val),
+                Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val),
+            },
             #[cfg(feature = "cuda")]
             HerculesMutBoxLocation::CUDA => {
-                let cuda_ref : HerculesCUDARef<'a> =
-                    match self.cuda_alloc {
-                        Allocation::None => panic!("No GPU reference"),
-                        Allocation::Reference(ref mut val) => val.dup().as_ref(),
-                        Allocation::Allocation(ref val) => val.get_ref(),
-                    };
+                let cuda_ref: HerculesCUDARef<'a> = match self.cuda_alloc {
+                    Allocation::None => panic!("No GPU reference"),
+                    Allocation::Reference(ref mut val) => val.dup().as_ref(),
+                    Allocation::Allocation(ref val) => val.get_ref(),
+                };
 
                 let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
 
                 // Allocate host memory (if needed)
-                let cpu_alloc : Allocation<&'a mut [T], Vec<T>> =
-                    match self.cpu_alloc.take() {
-                        Allocation::Reference(val)  if val.len() == elements => Allocation::Reference(val),
-                        Allocation::Allocation(val) if val.len() == elements => Allocation::Allocation(val),
-                        _ => {
-                            let mut alloc = Vec::new();
-                            alloc.resize_with(elements, Default::default);
-                            Allocation::Allocation(alloc)
-                        }
-                    };
+                let cpu_alloc: Allocation<&'a mut [T], Vec<T>> = match self.cpu_alloc.take() {
+                    Allocation::Reference(val) if val.len() == elements => {
+                        Allocation::Reference(val)
+                    }
+                    Allocation::Allocation(val) if val.len() == elements => {
+                        Allocation::Allocation(val)
+                    }
+                    _ => {
+                        let mut alloc = Vec::new();
+                        alloc.resize_with(elements, Default::default);
+                        Allocation::Allocation(alloc)
+                    }
+                };
                 self.cpu_alloc = cpu_alloc;
-                let cpu_ref : &'a mut [T] =
-                    match &mut self.cpu_alloc {
-                        Allocation::None => panic!(),
-                        Allocation::Reference(val)  => val,
-                        Allocation::Allocation(val) => val,
-                    };
+                let cpu_ref: &'a mut [T] = match &mut self.cpu_alloc {
+                    Allocation::None => panic!(),
+                    Allocation::Reference(val) => val,
+                    Allocation::Allocation(val) => val,
+                };
 
                 // Transfer data from CUDA device
                 let cpu_ref = cuda_ref.to_cpu_ref(cpu_ref);
@@ -709,31 +719,32 @@ where
     pub fn as_cuda_ref(&'a mut self) -> HerculesCUDARefMut<'a> {
         match self.loc {
             HerculesMutBoxLocation::CPU => {
-                let cpu_ref : &'a [T] =
-                    match self.cpu_alloc {
-                        Allocation::None => panic!("No CPU reference"),
-                        Allocation::Reference(ref val) => val,
-                        Allocation::Allocation(ref val) => val,
-                    };
+                let cpu_ref: &'a [T] = match self.cpu_alloc {
+                    Allocation::None => panic!("No CPU reference"),
+                    Allocation::Reference(ref val) => val,
+                    Allocation::Allocation(ref val) => val,
+                };
 
                 let size = cpu_ref.len() * size_of::<T>();
-                let (cuda_alloc, copied) =
-                    match self.cuda_alloc.take() {
-                        Allocation::Reference(val)  if unsafe { val.__size() == size } => (Allocation::Reference(val), false),
-                        Allocation::Allocation(val) if val.get_bytes() == size => (Allocation::Allocation(val), false),
-                        _ => {
-                            let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref));
-                            (Allocation::Allocation(alloc), true)
-                        }
-                    };
+                let (cuda_alloc, copied) = match self.cuda_alloc.take() {
+                    Allocation::Reference(val) if unsafe { val.__size() == size } => {
+                        (Allocation::Reference(val), false)
+                    }
+                    Allocation::Allocation(val) if val.get_bytes() == size => {
+                        (Allocation::Allocation(val), false)
+                    }
+                    _ => {
+                        let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref));
+                        (Allocation::Allocation(alloc), true)
+                    }
+                };
                 self.cuda_alloc = cuda_alloc;
 
-                let cuda_ref =
-                    match self.cuda_alloc {
-                        Allocation::None => panic!(),
-                        Allocation::Reference(ref mut val) => val.dup(),
-                        Allocation::Allocation(ref mut val) => val.get_ref_mut(),
-                    };
+                let cuda_ref = match self.cuda_alloc {
+                    Allocation::None => panic!(),
+                    Allocation::Reference(ref mut val) => val.dup(),
+                    Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+                };
 
                 if !copied {
                     unsafe {
@@ -744,13 +755,11 @@ where
                 self.loc = HerculesMutBoxLocation::CUDA;
                 cuda_ref
             }
-            HerculesMutBoxLocation::CUDA => {
-                match self.cuda_alloc {
-                    Allocation::None => panic!("No GPU reference"),
-                    Allocation::Reference(ref mut val) => val.dup(),
-                    Allocation::Allocation(ref mut val) => val.get_ref_mut(),
-                }
-            }
+            HerculesMutBoxLocation::CUDA => match self.cuda_alloc {
+                Allocation::None => panic!("No GPU reference"),
+                Allocation::Reference(ref mut val) => val.dup(),
+                Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+            },
         }
     }
 }
@@ -760,7 +769,8 @@ pub trait HerculesImmBoxTo<'a, T> {
 }
 
 impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
-where T: Default + Clone
+where
+    T: Default + Clone,
 {
     fn to(&'a self) -> HerculesCPURef<'a> {
         self.as_cpu_ref()
@@ -769,7 +779,8 @@ where T: Default + Clone
 
 #[cfg(feature = "cuda")]
 impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
-where T: Default + Clone
+where
+    T: Default + Clone,
 {
     fn to(&'a self) -> HerculesCUDARef<'a> {
         self.as_cuda_ref()
@@ -781,7 +792,8 @@ pub trait HerculesMutBoxTo<'a, T> {
 }
 
 impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
-where T: Default + Clone
+where
+    T: Default + Clone,
 {
     fn to(&'a mut self) -> HerculesCPURefMut<'a> {
         self.as_cpu_ref()
@@ -790,7 +802,8 @@ where T: Default + Clone
 
 #[cfg(feature = "cuda")]
 impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
-where T: Default + Clone
+where
+    T: Default + Clone,
 {
     fn to(&'a mut self) -> HerculesCUDARefMut<'a> {
         self.as_cuda_ref()
diff --git a/hercules_test/hercules_tests/tests/loop_tests.rs b/hercules_test/hercules_tests/tests/loop_tests.rs
index f42a652002a51cea4daedad16374d8451c124928..64339520a3ccbb9a3f481049ca4c09d06c0be95c 100644
--- a/hercules_test/hercules_tests/tests/loop_tests.rs
+++ b/hercules_test/hercules_tests/tests/loop_tests.rs
@@ -409,10 +409,7 @@ fn matmul_pipeline() {
     // };
     // assert_eq!(correct_c[0], value);
 
-    let schedule = Some(default_schedule![
-        Xdot,
-        Verify,
-    ]);
+    let schedule = Some(default_schedule![Xdot, Verify,]);
 
     module = run_schedule_on_hercules(module, schedule).unwrap();
 
diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs
index 21ccd7c49b5627c4e07839cc7583c101e85c5013..cd715cace80ba3e58d0971b7956ec1e957b839e2 100644
--- a/juno_samples/fork_join_tests/src/main.rs
+++ b/juno_samples/fork_join_tests/src/main.rs
@@ -45,12 +45,12 @@ fn main() {
 
         let mut r = runner!(test6);
         let output = r.run(73).await;
-        let correct = (73i32..73i32+1024i32).collect();
+        let correct = (73i32..73i32 + 1024i32).collect();
         assert(&correct, output);
 
         let mut r = runner!(test7);
         let output = r.run(42).await;
-        let correct: i32 = (42i32..42i32+32i32).sum();
+        let correct: i32 = (42i32..42i32 + 32i32).sum();
         assert_eq!(correct, output);
 
         let mut r = runner!(test8);
diff --git a/juno_samples/median_window/build.rs b/juno_samples/median_window/build.rs
index a6c29d5b2184490b673dc381532562ba50d169ef..3ce241e42d4b0a082f74ec9a48c56e2afc9d23f9 100644
--- a/juno_samples/median_window/build.rs
+++ b/juno_samples/median_window/build.rs
@@ -4,7 +4,11 @@ fn main() {
     JunoCompiler::new()
         .file_in_src("median.jn")
         .unwrap()
-        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
+        .schedule_in_src(if cfg!(feature = "cuda") {
+            "gpu.sch"
+        } else {
+            "cpu.sch"
+        })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/median_window/src/main.rs b/juno_samples/median_window/src/main.rs
index c515ac4b57f049d27bb800c9eb7dc1ac9463589c..63479dbab1b3f4aaede401566ea00f3917cfa3b9 100644
--- a/juno_samples/median_window/src/main.rs
+++ b/juno_samples/median_window/src/main.rs
@@ -5,18 +5,14 @@ juno_build::juno!("median");
 use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
 
 fn main() {
-    let m = vec![86, 72, 14,  5, 55,
-                 25, 98, 89,  3, 66,
-                 44, 81, 27,  3, 40,
-                 18,  4, 57, 93, 34,
-                 70, 50, 50, 18, 34];
+    let m = vec![
+        86, 72, 14, 5, 55, 25, 98, 89, 3, 66, 44, 81, 27, 3, 40, 18, 4, 57, 93, 34, 70, 50, 50, 18,
+        34,
+    ];
     let m = HerculesImmBox::from(m.as_slice());
 
     let mut r = runner!(median_window);
-    let res =
-        async_std::task::block_on(async {
-            r.run(m.to()).await
-        });
+    let res = async_std::task::block_on(async { r.run(m.to()).await });
     assert_eq!(res, 57);
 }
 
diff --git a/juno_samples/products/src/main.rs b/juno_samples/products/src/main.rs
index b8abb59d5bc3b2cbbb0ef13ec1f44c8d0734e1bb..9a1e6ac296bb69a00aa3621bc69fa5f0403468c5 100644
--- a/juno_samples/products/src/main.rs
+++ b/juno_samples/products/src/main.rs
@@ -7,14 +7,18 @@ juno_build::juno!("products");
 fn main() {
     async_std::task::block_on(async {
         let input = vec![(0, 1), (2, 3)];
-        let input : HerculesImmBox<(i32, i32)> = HerculesImmBox::from(input.as_slice());
+        let input: HerculesImmBox<(i32, i32)> = HerculesImmBox::from(input.as_slice());
         let mut r = runner!(product_read);
-        let res : Vec<i32> = HerculesMutBox::from(r.run(input.to()).await).as_slice().to_vec();
+        let res: Vec<i32> = HerculesMutBox::from(r.run(input.to()).await)
+            .as_slice()
+            .to_vec();
         assert_eq!(res, vec![0, 1, 2, 3]);
 
         // Technically this returns a product of two i32s, but we can interpret that as an array
         let mut r = runner!(product_return);
-        let res : Vec<i32> = HerculesMutBox::from(r.run(42, 17).await).as_slice().to_vec();
+        let res: Vec<i32> = HerculesMutBox::from(r.run(42, 17).await)
+            .as_slice()
+            .to_vec();
         assert_eq!(res, vec![42, 17]);
     });
 }
diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs
index fc2a729ec40db410f2beb7c148b50534b4d25312..1a4cb623ce0443e83612fa498c6ba45e1a94813c 100644
--- a/juno_scheduler/src/compile.rs
+++ b/juno_scheduler/src/compile.rs
@@ -134,6 +134,7 @@ impl FromStr for Appliable {
             "fork-unroll" | "unroll" => Ok(Appliable::Pass(ir::Pass::ForkUnroll)),
             "fork-fusion" | "fusion" => Ok(Appliable::Pass(ir::Pass::ForkFusion)),
             "lift-dc-math" => Ok(Appliable::Pass(ir::Pass::LiftDCMath)),
+            "loop-bound-canon" => Ok(Appliable::Pass(ir::Pass::LoopBoundCanon)),
             "outline" => Ok(Appliable::Pass(ir::Pass::Outline)),
             "phi-elim" => Ok(Appliable::Pass(ir::Pass::PhiElim)),
             "predication" => Ok(Appliable::Pass(ir::Pass::Predication)),
diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs
index bf3fe03739c9159202d60e8a5908c8bb0da4cb28..5b6bd297b934b6c631df176afbc0c72bfa4584df 100644
--- a/juno_scheduler/src/ir.rs
+++ b/juno_scheduler/src/ir.rs
@@ -26,6 +26,7 @@ pub enum Pass {
     Inline,
     InterproceduralSROA,
     LiftDCMath,
+    LoopBoundCanon,
     Outline,
     PhiElim,
     Predication,
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 8db79b46199c4d9ab54e139590c1cc34a539f96e..5011b52bcec13f4e3bb455bca99f9d38868976bc 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -2726,6 +2726,32 @@ fn run_pass(
         Pass::Print => {
             println!("{:?}", args.get(0));
         }
+        Pass::LoopBoundCanon => {
+            assert_eq!(args.len(), 0);
+
+            pm.make_fork_join_maps();
+            pm.make_loops();
+            pm.make_control_subgraphs();
+            let fork_join_maps = pm.fork_join_maps.take().unwrap();
+            let loops = pm.loops.take().unwrap();
+            let control_subgraphs = pm.control_subgraphs.take().unwrap();
+
+            for (((func, fork_join_map), loops), control_subgraph) in
+                build_selection(pm, selection, false)
+                    .into_iter()
+                    .zip(fork_join_maps.iter())
+                    .zip(loops.iter())
+                    .zip(control_subgraphs.iter())
+            {
+                let Some(mut func) = func else {
+                    continue;
+                };
+                loop_bound_canon_toplevel(&mut func, fork_join_map, control_subgraph, loops);
+                changed |= func.modified();
+            }
+            pm.delete_gravestones();
+            pm.clear_analyses();
+        }
     }
     println!("Ran Pass: {:?}", pass);