diff --git a/hercules_opt/src/fork_guard_elim.rs b/hercules_opt/src/fork_guard_elim.rs
index eb755509e48eccb02a311fb6004abd37d263c7f8..df40e60f89f0490cacb35d6e9754f3b134ed1483 100644
--- a/hercules_opt/src/fork_guard_elim.rs
+++ b/hercules_opt/src/fork_guard_elim.rs
@@ -39,7 +39,6 @@ struct GuardedFork {
     guard_if: NodeID,
     fork_taken_proj: NodeID,
     fork_skipped_proj: NodeID,
-    guard_pred: NodeID,
     guard_join_region: NodeID,
     phi_reduce_map: HashMap<NodeID, NodeID>,
     factor: Factor, // The factor that matches the guard
@@ -305,7 +304,6 @@ fn guarded_fork(
         guard_if: if_node,
         fork_taken_proj: *control,
         fork_skipped_proj: other_pred,
-        guard_pred: if_pred,
         guard_join_region: join_control,
         phi_reduce_map: phi_nodes,
         factor,
@@ -326,13 +324,15 @@ pub fn fork_guard_elim(editor: &mut FunctionEditor, fork_join_map: &HashMap<Node
         join,
         fork_taken_proj,
         fork_skipped_proj,
-        guard_pred,
         phi_reduce_map,
         factor,
         guard_if,
         guard_join_region,
     } in guard_info
     {
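+        // Recompute the guard's predecessor from the if node itself; this
+        // assumes the if's first use is its control input (a use-ordering
+        // assumption, matching how the deleted field was populated).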
+        let Some(guard_pred) = editor.get_uses(guard_if).next() else {
+            unreachable!()
+        };
         let new_fork_info = if let Factor::Max(idx, dc) = factor {
             let Node::Fork {
                 control: _,
diff --git a/hercules_opt/src/fork_transforms.rs b/hercules_opt/src/fork_transforms.rs
index e23f586ffb55a9525f3ac2e2f5f1626b9b20ef66..c4a6ba7f23d0671e408639f9b7af6b1f73015dce 100644
--- a/hercules_opt/src/fork_transforms.rs
+++ b/hercules_opt/src/fork_transforms.rs
@@ -693,3 +693,224 @@
         None
     }
 }
+
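+// Tiles dimension `dim_idx` of every fork in `fork_join_map` by `tile_size`,
+// registering the tile size as a dynamic constant once and reusing it for
+// each fork.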
+pub fn chunk_all_forks_unguarded(
+    editor: &mut FunctionEditor,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    dim_idx: usize,
+    tile_size: usize,
+) {
+    // Register the tile size as a dynamic constant once, outside the loop.
+    let mut dc_id = DynamicConstantID::new(0);
+    editor.edit(|mut edit| {
+        dc_id = edit.add_dynamic_constant(DynamicConstant::Constant(tile_size));
+        Ok(edit)
+    });
+
+    for (fork, _) in fork_join_map {
+        chunk_fork_unguarded(editor, *fork, dim_idx, dc_id);
+    }
+}
+
+// Splits one dimension of a fork-join into two: an outer dimension iterating
+// original_dim / tile_size times, and an inner dimension of size tile_size.
+// Assumes tile_size evenly divides the original dimension.
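+//
+// Illustrative sketch (values assumed, not taken from this patch): chunking
+// factors [N] at dim_idx = 0 with tile_size = 4 yields factors [N/4, 4], and
+// an old ThreadID i is rebuilt as i = tid_outer * 4 + tid_inner.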
+pub fn chunk_fork_unguarded(
+    editor: &mut FunctionEditor,
+    fork: NodeID,
+    dim_idx: usize,
+    tile_size: DynamicConstantID,
+) {
+    // Rewrites tid(dim_idx) as tid(dim_idx) * tile_size + tid(dim_idx + 1).
+    let Node::Fork {
+        control: old_control,
+        factors: ref old_factors,
+    } = *editor.node(fork)
+    else {
+        return;
+    };
+    assert!(dim_idx < old_factors.len());
+    let mut new_factors: Vec<_> = old_factors.to_vec();
+    let fork_users: Vec<_> = editor
+        .get_users(fork)
+        .map(|f| (f, editor.node(f).clone()))
+        .collect();
+
+    editor.edit(|mut edit| {
+        let outer = DynamicConstant::div(new_factors[dim_idx], tile_size);
+        new_factors.insert(dim_idx + 1, tile_size);
+        new_factors[dim_idx] = edit.add_dynamic_constant(outer);
+
+        let new_fork = Node::Fork {
+            control: old_control,
+            factors: new_factors.into(),
+        };
+        let new_fork = edit.add_node(new_fork);
+
+        edit = edit.replace_all_uses(fork, new_fork)?;
+
+        for (tid, node) in fork_users {
+            let Node::ThreadID {
+                control: _,
+                dimension: tid_dim,
+            } = node
+            else {
+                continue;
+            };
+            if tid_dim > dim_idx {
+                let new_tid = Node::ThreadID {
+                    control: new_fork,
+                    dimension: tid_dim + 1,
+                };
+                let new_tid = edit.add_node(new_tid);
+                edit = edit.replace_all_uses(tid, new_tid)?;
+                edit = edit.delete_node(tid)?;
+            } else if tid_dim == dim_idx {
+                let tile_tid = Node::ThreadID {
+                    control: new_fork,
+                    dimension: tid_dim + 1,
+                };
+                let tile_tid = edit.add_node(tile_tid);
+
+                let tile_size = edit.add_node(Node::DynamicConstant { id: tile_size });
+                let mul = edit.add_node(Node::Binary {
+                    left: tid,
+                    right: tile_size,
+                    op: BinaryOperator::Mul,
+                });
+                let add = edit.add_node(Node::Binary {
+                    left: mul,
+                    right: tile_tid,
+                    op: BinaryOperator::Add,
+                });
+                edit = edit.replace_all_uses_where(tid, add, |usee| *usee != mul)?;
+            }
+        }
+        edit = edit.delete_node(fork)?;
+        Ok(edit)
+    });
+}
+
+pub fn merge_all_fork_dims(editor: &mut FunctionEditor, fork_join_map: &HashMap<NodeID, NodeID>) {
+    for (fork, _) in fork_join_map {
+        let Node::Fork {
+            control: _,
+            factors: dims,
+        } = editor.node(fork)
+        else {
+            unreachable!();
+        };
+
+        let mut fork = *fork;
+        // Repeatedly merge the two leftmost dimensions until one remains.
+        for _ in 0..dims.len() - 1 {
+            fork = fork_dim_merge(editor, fork, 0, 1);
+        }
+    }
+}
+
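+// Merges fork dimensions `dim_idx1` and `dim_idx2` into one. Illustrative
+// sketch (shapes assumed): factors [A, B] become [A * B], and the merged tid
+// m maps back as old_outer = m % A, old_inner = m / A. This is a bijection
+// over 0..A*B, so the parallel iteration space is preserved.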
+pub fn fork_dim_merge(
+    editor: &mut FunctionEditor,
+    fork: NodeID,
+    dim_idx1: usize,
+    dim_idx2: usize,
+) -> NodeID {
+    // The merged ThreadID m replaces the two old ThreadIDs as:
+    //   tid(outer_idx) <- m % dim(outer_idx)
+    //   tid(inner_idx) <- m / dim(outer_idx)
+    assert_ne!(dim_idx1, dim_idx2);
+
+    // outer_idx is the smaller index, i.e. further left in the factors array.
+    let (outer_idx, inner_idx) = if dim_idx2 < dim_idx1 {
+        (dim_idx2, dim_idx1)
+    } else {
+        (dim_idx1, dim_idx2)
+    };
+    let Node::Fork {
+        control: old_control,
+        factors: ref old_factors,
+    } = *editor.node(fork)
+    else {
+        return fork;
+    };
+    let mut new_factors: Vec<_> = old_factors.to_vec();
+    let fork_users: Vec<_> = editor
+        .get_users(fork)
+        .map(|f| (f, editor.node(f).clone()))
+        .collect();
+    let outer_dc_id = new_factors[outer_idx];
+    let inner_dc_id = new_factors[inner_idx];
+    let mut new_fork = NodeID::new(0);
+
+    editor.edit(|mut edit| {
+        new_factors[outer_idx] = edit.add_dynamic_constant(DynamicConstant::mul(
+            new_factors[outer_idx],
+            new_factors[inner_idx],
+        ));
+        new_factors.remove(inner_idx);
+        new_fork = edit.add_node(Node::Fork {
+            control: old_control,
+            factors: new_factors.into(),
+        });
+        edit.sub_edit(fork, new_fork);
+        edit = edit.replace_all_uses(fork, new_fork)?;
+        edit = edit.delete_node(fork)?;
+
+        for (tid, node) in fork_users {
+            let Node::ThreadID {
+                control: _,
+                dimension: tid_dim,
+            } = node
+            else {
+                continue;
+            };
+            if tid_dim > inner_idx {
+                let new_tid = Node::ThreadID {
+                    control: new_fork,
+                    dimension: tid_dim - 1,
+                };
+                let new_tid = edit.add_node(new_tid);
+                edit = edit.replace_all_uses(tid, new_tid)?;
+                edit.sub_edit(tid, new_tid);
+            } else if tid_dim == outer_idx {
+                let outer_tid = Node::ThreadID {
+                    control: new_fork,
+                    dimension: outer_idx,
+                };
+                let outer_tid = edit.add_node(outer_tid);
+
+                let outer_dc = edit.add_node(Node::DynamicConstant { id: outer_dc_id });
+
+                // old outer tid <- merged tid % dim(outer_idx)
+                let rem = edit.add_node(Node::Binary {
+                    left: outer_tid,
+                    right: outer_dc,
+                    op: BinaryOperator::Rem,
+                });
+                edit.sub_edit(tid, rem);
+                edit = edit.replace_all_uses(tid, rem)?;
+            } else if tid_dim == inner_idx {
+                let outer_tid = Node::ThreadID {
+                    control: new_fork,
+                    dimension: outer_idx,
+                };
+                let outer_tid = edit.add_node(outer_tid);
+
+                let outer_dc = edit.add_node(Node::DynamicConstant { id: outer_dc_id });
+                // old inner tid <- merged tid / dim(outer_idx)
+                let div = edit.add_node(Node::Binary {
+                    left: outer_tid,
+                    right: outer_dc,
+                    op: BinaryOperator::Div,
+                });
+                edit.sub_edit(tid, div);
+                edit = edit.replace_all_uses(tid, div)?;
+            }
+        }
+        Ok(edit)
+    });
+
+    new_fork
+}
diff --git a/hercules_opt/src/forkify.rs b/hercules_opt/src/forkify.rs
index 082f1ae958c7c9f90fd4719a17535fc7e604dfdd..f6db06ca955b40c0f331be6d48d39dfc126a17d1 100644
--- a/hercules_opt/src/forkify.rs
+++ b/hercules_opt/src/forkify.rs
@@ -152,6 +152,7 @@ pub fn forkify_loop(
         .filter(|id| !l.control[id.idx()])
         .collect();
 
+    // FIXME: @xrouth
     if loop_preds.len() != 1 {
         return false;
     }
@@ -297,7 +298,36 @@ pub fn forkify_loop(
     let (_, factors) = function.nodes[fork_id.idx()].try_fork().unwrap();
     let dimension = factors.len() - 1;
 
-    // Create ThreadID
+    // Gather each reductionable phi's initial value up front, before the
+    // single fallible edit below begins.
+
+    let reductionable_phis_and_init: Vec<(_, NodeID)> = reductionable_phis
+        .iter()
+        .map(|reduction_phi| {
+            let LoopPHI::Reductionable {
+                phi,
+                data_cycle: _,
+                continue_latch,
+                is_associative: _,
+            } = reduction_phi
+            else {
+                panic!();
+            };
+
+            let function = editor.func();
+
+            let init = *zip(
+                editor.get_uses(l.header),
+                function.nodes[phi.idx()].try_phi().unwrap().1.iter(),
+            )
+            .filter(|(c, _)| *c == loop_pred)
+            .next()
+            .unwrap()
+            .1;
+
+            (reduction_phi, init)
+        })
+        .collect();
+
     editor.edit(|mut edit| {
         let thread_id = Node::ThreadID {
             control: fork_id,
@@ -310,43 +340,25 @@ pub fn forkify_loop(
             loop_nodes.contains(node)
         })?;
 
-        // Replace uses that are outside with DC - 1. Or just give up.
-        let bound_dc_node = edit.add_node(Node::DynamicConstant { id: bound_dc_id });
-        edit = edit.replace_all_uses_where(canonical_iv.phi(), bound_dc_node, |node| {
-            !loop_nodes.contains(node)
-        })?;
-
-        edit.delete_node(canonical_iv.phi())
-    });
-
-    for reduction_phi in reductionable_phis {
-        let LoopPHI::Reductionable {
-            phi,
-            data_cycle: _,
-            continue_latch,
-            is_associative: _,
-        } = reduction_phi
-        else {
-            panic!();
-        };
-
-        let function = editor.func();
+        edit = edit.delete_node(canonical_iv.phi())?;
 
-        let init = *zip(
-            editor.get_uses(l.header),
-            function.nodes[phi.idx()].try_phi().unwrap().1.iter(),
-        )
-        .filter(|(c, _)| *c == loop_pred)
-        .next()
-        .unwrap()
-        .1;
+        for (reduction_phi, init) in reductionable_phis_and_init {
+            let LoopPHI::Reductionable {
+                phi,
+                data_cycle: _,
+                continue_latch,
+                is_associative: _,
+            } = *reduction_phi
+            else {
+                panic!();
+            };
 
-        editor.edit(|mut edit| {
             let reduce = Node::Reduce {
                 control: join_id,
                 init,
                 reduct: continue_latch,
             };
+
             let reduce_id = edit.add_node(reduce);
 
             if (!edit.get_node(init).is_reduce()
@@ -374,20 +386,13 @@ pub fn forkify_loop(
             edit = edit.replace_all_uses_where(continue_latch, reduce_id, |usee| {
                 !loop_nodes.contains(usee) && *usee != reduce_id
             })?;
-            edit.delete_node(phi)
-        });
-    }
-
-    // Replace all uses of the loop header with the fork
-    editor.edit(|edit| edit.replace_all_uses(l.header, fork_id));
-
-    editor.edit(|edit| edit.replace_all_uses(loop_continue_projection, fork_id));
+            edit = edit.delete_node(phi)?
+        }
 
-    editor.edit(|edit| edit.replace_all_uses(loop_exit_projection, join_id));
+        edit = edit.replace_all_uses(l.header, fork_id)?;
+        edit = edit.replace_all_uses(loop_continue_projection, fork_id)?;
+        edit = edit.replace_all_uses(loop_exit_projection, join_id)?;
 
-    // Get rid of loop condition
-    // DCE should get these, but delete them ourselves because we are nice :)
-    editor.edit(|mut edit| {
         edit = edit.delete_node(loop_continue_projection)?;
         edit = edit.delete_node(condition_node)?; // Might have to get rid of other users of this.
         edit = edit.delete_node(loop_exit_projection)?;
@@ -409,6 +414,7 @@ nest! {
             is_associative: bool,
         },
         LoopDependant(NodeID),
+        ControlDependant(NodeID), // This phi is reductionable, but its cycle may depend on control internal to the loop.
         UsedByDependant(NodeID),
     }
 }
@@ -419,6 +425,7 @@ impl LoopPHI {
             LoopPHI::Reductionable { phi, .. } => *phi,
             LoopPHI::LoopDependant(node_id) => *node_id,
             LoopPHI::UsedByDependant(node_id) => *node_id,
+            LoopPHI::ControlDependant(node_id) => *node_id,
         }
     }
 }
@@ -528,6 +535,15 @@ pub fn analyze_phis<'a>(
             // PHIs on the frontier of the uses by the candidate phi, i.e in uses_for_dependance need
             // to have headers that postdominate the loop continue latch. The value of the PHI used needs to be defined
             // by the time the reduce is triggered (at the end of the loop's internal control).
+            // If anything in the intersection is a phi other than the candidate
+            // phi itself, or the continue latch is itself a phi, then the
+            // reduction cycle depends on control internal to the loop (e.g. a
+            // phi merging the two sides of an if in the loop body), which is
+            // not allowed.
+            if intersection
+                .iter()
+                .any(|cycle_node| editor.node(cycle_node).is_phi() && *cycle_node != *phi)
+                || editor.node(loop_continue_latch).is_phi()
+            {
+                return LoopPHI::ControlDependant(*phi);
+            }
 
             // No nodes in data cycles with this phi (in the loop) are used outside the loop, besides the loop_continue_latch.
             // If some other node in the cycle is used, there is not a valid node to assign it after making the cycle a reduce.
diff --git a/hercules_test/hercules_interpreter/src/interpreter.rs b/hercules_test/hercules_interpreter/src/interpreter.rs
index 709c64fb0f8b45d7903d6b1e7f1a5d5ee2f28185..2e352644cc816a3fe5c43427fa7190708508bb82 100644
--- a/hercules_test/hercules_interpreter/src/interpreter.rs
+++ b/hercules_test/hercules_interpreter/src/interpreter.rs
@@ -775,16 +775,13 @@ impl<'a> FunctionExecutionState<'a> {
                     //     panic!("multi-dimensional forks unimplemented")
                     // }
 
-                    let factors = factors
-                        .iter()
-                        .map(|f| {
-                            dyn_const_value(
-                                &f,
-                                &self.module.dynamic_constants,
-                                &self.dynamic_constant_params,
-                            )
-                        })
-                        .rev();
+                    let factors = factors.iter().map(|f| {
+                        dyn_const_value(
+                            &f,
+                            &self.module.dynamic_constants,
+                            &self.dynamic_constant_params,
+                        )
+                    });
 
                     let n_tokens: usize = factors.clone().product();
 
diff --git a/hercules_test/hercules_interpreter/src/value.rs b/hercules_test/hercules_interpreter/src/value.rs
index 53911e05c2333a0e9b30c5bfdacb854f8409f692..4a802f7a7b7c2c4090380929fcc93824b2c79244 100644
--- a/hercules_test/hercules_interpreter/src/value.rs
+++ b/hercules_test/hercules_interpreter/src/value.rs
@@ -156,8 +156,16 @@ impl<'a> InterpreterVal {
             Constant::Float64(v) => Self::Float64(v),
 
             Constant::Product(ref type_id, ref constant_ids) => {
-                // Self::Product((), ())
-                todo!()
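+                // Recursively evaluate each field's constant, then pack the
+                // results into a Product value.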
+                let contents = constant_ids.iter().map(|const_id| {
+                    InterpreterVal::from_constant(
+                        &constants[const_id.idx()],
+                        constants,
+                        types,
+                        dynamic_constants,
+                        dynamic_constant_params,
+                    )
+                });
+                InterpreterVal::Product(*type_id, contents.collect_vec().into_boxed_slice())
             }
             Constant::Summation(_, _, _) => todo!(),
             Constant::Array(type_id) => {
diff --git a/hercules_test/hercules_tests/tests/loop_tests.rs b/hercules_test/hercules_tests/tests/loop_tests.rs
index 7e40f602294243bd93e5ba3d075c10cfffa5d614..795642b25ba25bf6f5c845065a24d4c98119aa82 100644
--- a/hercules_test/hercules_tests/tests/loop_tests.rs
+++ b/hercules_test/hercules_tests/tests/loop_tests.rs
@@ -385,7 +385,7 @@ fn matmul_pipeline() {
     let dyn_consts = [I, J, K];
 
     // FIXME: This path should not leave the crate
-    let mut module = parse_module_from_hbin("../../juno_samples/matmul/out.hbin");
+    let mut module = parse_module_from_hbin("../../juno_samples/test/out.hbin");
     //
     let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
     for i in 0..I {
@@ -409,7 +409,15 @@ fn matmul_pipeline() {
     };
     assert_eq!(correct_c[0], value);
 
-    let schedule = Some(default_schedule![Xdot, ForkSplit, Unforkify, Xdot,]);
+    let schedule = Some(default_schedule![
+        AutoOutline,
+        InterproceduralSROA,
+        SROA,
+        InferSchedules,
+        DCE,
+        Xdot,
+        GCM
+    ]);
 
     module = run_schedule_on_hercules(module, schedule).unwrap();
 
diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs
index 80cf2cb42d30edfd43565f397d4973ea43b3c515..713c30d436fdb9316f92f7ecbeee3659f2924d83 100644
--- a/juno_scheduler/src/compile.rs
+++ b/juno_scheduler/src/compile.rs
@@ -109,6 +109,8 @@ impl FromStr for Appliable {
             "ip-sroa" | "interprocedural-sroa" => {
                 Ok(Appliable::Pass(ir::Pass::InterproceduralSROA))
             }
+            "fork-dim-merge" => Ok(Appliable::Pass(ir::Pass::ForkDimMerge)),
+            "fork-chunk" | "fork-tile" => Ok(Appliable::Pass(ir::Pass::ForkChunk)),
             "lift-dc-math" => Ok(Appliable::Pass(ir::Pass::LiftDCMath)),
             "outline" => Ok(Appliable::Pass(ir::Pass::Outline)),
             "phi-elim" => Ok(Appliable::Pass(ir::Pass::PhiElim)),
diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs
index d6a41baf99d8ed3cec6ab8183ae52e9956e6c5b0..9e85509f83a322da222e56257ffffc697ab14e30 100644
--- a/juno_scheduler/src/ir.rs
+++ b/juno_scheduler/src/ir.rs
@@ -12,6 +12,8 @@ pub enum Pass {
     ForkSplit,
     ForkCoalesce,
     Forkify,
+    ForkDimMerge,
+    ForkChunk,
     GCM,
     GVN,
     InferSchedules,
@@ -34,6 +36,7 @@ impl Pass {
     pub fn num_args(&self) -> usize {
         match self {
             Pass::Xdot => 1,
+            Pass::ForkChunk => 3,
             _ => 0,
         }
     }
diff --git a/juno_scheduler/src/lib.rs b/juno_scheduler/src/lib.rs
index 571d1fbf6da74e9ee454871cbe2fc59f43b5597e..ad9195fb3757d2bdc8e21777949ee894ed4d5808 100644
--- a/juno_scheduler/src/lib.rs
+++ b/juno_scheduler/src/lib.rs
@@ -60,7 +60,7 @@ fn build_schedule(sched_filename: String) -> Result<ScheduleStmt, String> {
     }
 }
 
-fn process_schedule(sched_filename: Option<String>) -> Result<ScheduleStmt, String> {
+pub fn process_schedule(sched_filename: Option<String>) -> Result<ScheduleStmt, String> {
     if let Some(name) = sched_filename {
         build_schedule(name)
     } else {
@@ -146,6 +146,41 @@ pub fn run_schedule_on_hercules(
     .map_err(|e| format!("Scheduling Error: {}", e))
 }
 
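+/// Like `run_schedule_on_hercules`, but builds the schedule from a file via
+/// `process_schedule`. Hypothetical usage (module obtained elsewhere):
+///     let module = run_schedule_from_file_on_hercules(module, Some("sched.sch".into()))?;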
+pub fn run_schedule_from_file_on_hercules(
+    module: Module,
+    sched_filename: Option<String>,
+) -> Result<Module, String> {
+    let sched = process_schedule(sched_filename)?;
+
+    // Prepare the scheduler's string table and environment
+    // For this, we put all of the Hercules function names into the environment
+    // and string table
+    let mut strings = StringTable::new();
+    let mut env = Env::new();
+
+    env.open_scope();
+
+    for (idx, func) in module.functions.iter().enumerate() {
+        let func_name = strings.lookup_string(func.name.clone());
+        env.insert(
+            func_name,
+            Value::HerculesFunction {
+                func: FunctionID::new(idx),
+            },
+        );
+    }
+
+    env.open_scope();
+    schedule_module(
+        module,
+        sched,
+        strings,
+        env,
+        JunoFunctions { func_ids: vec![] },
+    )
+    .map_err(|e| format!("Scheduling Error: {}", e))
+}
+
 pub fn schedule_hercules(
     module: Module,
     sched_filename: Option<String>,
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 9c445519358f469375a83c3b1512bf0da38dce72..b2845913a20467a5cdce1dfe11c83b95a213f492 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -1543,29 +1543,35 @@ fn run_pass(
         }
         Pass::Forkify => {
             assert!(args.is_empty());
-            pm.make_fork_join_maps();
-            pm.make_control_subgraphs();
-            pm.make_loops();
-            let fork_join_maps = pm.fork_join_maps.take().unwrap();
-            let loops = pm.loops.take().unwrap();
-            let control_subgraphs = pm.control_subgraphs.take().unwrap();
-            for (((func, fork_join_map), loop_nest), control_subgraph) in
-                build_selection(pm, selection)
-                    .into_iter()
-                    .zip(fork_join_maps.iter())
-                    .zip(loops.iter())
-                    .zip(control_subgraphs.iter())
-            {
-                let Some(mut func) = func else {
-                    continue;
-                };
-                // TODO: uses direct return from forkify for now instead of
-                // func.modified, see comment on top of `forkify` for why. Fix
-                // this eventually.
-                changed |= forkify(&mut func, control_subgraph, fork_join_map, loop_nest);
+            loop {
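+                // Run forkify to a fixpoint: each round can canonicalize loops
+                // that enable further forkification, so rebuild the analyses
+                // and retry until no function changes.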
+                let mut inner_changed = false;
+                pm.make_fork_join_maps();
+                pm.make_control_subgraphs();
+                pm.make_loops();
+                let fork_join_maps = pm.fork_join_maps.take().unwrap();
+                let loops = pm.loops.take().unwrap();
+                let control_subgraphs = pm.control_subgraphs.take().unwrap();
+                for (((func, fork_join_map), loop_nest), control_subgraph) in
+                    build_selection(pm, selection.clone())
+                        .into_iter()
+                        .zip(fork_join_maps.iter())
+                        .zip(loops.iter())
+                        .zip(control_subgraphs.iter())
+                {
+                    let Some(mut func) = func else {
+                        continue;
+                    };
+                    forkify(&mut func, control_subgraph, fork_join_map, loop_nest);
+                    let modified = func.modified();
+                    changed |= modified;
+                    inner_changed |= modified;
+                }
+                pm.delete_gravestones();
+                pm.clear_analyses();
+
+                if !inner_changed {
+                    break;
+                }
             }
-            pm.delete_gravestones();
-            pm.clear_analyses();
         }
         Pass::GCM => {
             assert!(args.is_empty());
@@ -1906,6 +1912,65 @@ fn run_pass(
             pm.delete_gravestones();
             pm.clear_analyses();
         }
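+        // fork-chunk[tile_size, dim_idx, guarded]: argument order follows the
+        // destructuring below; the guarded flag is currently required to be
+        // true.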
+        Pass::ForkChunk => {
+            assert_eq!(args.len(), 3);
+
+            let Some(Value::Boolean { val: guarded_flag }) = args.get(2) else {
+                return Err(SchedulerError::PassError {
+                    pass: "forkChunk".to_string(),
+                    error: "expected boolean argument".to_string(),
+                });
+            };
+
+            let Some(Value::Integer { val: dim_idx }) = args.get(1) else {
+                return Err(SchedulerError::PassError {
+                    pass: "fork-chunk".to_string(),
+                    error: "expected second argument to be an integer".to_string(),
+                });
+            };
+
+            let Some(Value::Integer { val: tile_size }) = args.get(0) else {
+                return Err(SchedulerError::PassError {
+                    pass: "fork-chunk".to_string(),
+                    error: "expected first argument to be an integer".to_string(),
+                });
+            };
+
+            assert!(*guarded_flag);
+            pm.make_fork_join_maps();
+            let fork_join_maps = pm.fork_join_maps.take().unwrap();
+            for (func, fork_join_map) in build_selection(pm, selection)
+                .into_iter()
+                .zip(fork_join_maps.iter())
+            {
+                let Some(mut func) = func else {
+                    continue;
+                };
+                chunk_all_forks_unguarded(&mut func, fork_join_map, *dim_idx, *tile_size);
+                changed |= func.modified();
+            }
+            pm.delete_gravestones();
+            pm.clear_analyses();
+        }
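+        // fork-dim-merge: merge every fork's dimensions down to a single
+        // dimension via merge_all_fork_dims.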
+        Pass::ForkDimMerge => {
+            assert!(args.is_empty());
+            pm.make_fork_join_maps();
+            let fork_join_maps = pm.fork_join_maps.take().unwrap();
+            for (func, fork_join_map) in build_selection(pm, selection)
+                .into_iter()
+                .zip(fork_join_maps.iter())
+            {
+                let Some(mut func) = func else {
+                    continue;
+                };
+                merge_all_fork_dims(&mut func, fork_join_map);
+                changed |= func.modified();
+            }
+            pm.delete_gravestones();
+            pm.clear_analyses();
+        }
         Pass::ForkCoalesce => {
             assert!(args.is_empty());
             pm.make_fork_join_maps();