diff --git a/.gitignore b/.gitignore
index 16e4eda72be12bf1449508941f676c9453459632..f8a684ce5223884e46f65b7fa67b6328eadf3100 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,6 @@
 *.c
 *.o
 *.a
-*.hman
+*.hrt
 .*.swp
 .vscode
diff --git a/Cargo.lock b/Cargo.lock
index 23c5f4c79b036d0099f4a73d7a6c858cdd703b11..e9e4f311440fbafe440d4734b35fbcc54365bd3e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -267,6 +267,26 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "acbc26382d871df4b7442e3df10a9402bf3cf5e55cbd66f12be38861425f0564"
 
+[[package]]
+name = "call"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "juno_build",
+ "rand",
+ "with_builtin_macros",
+]
+
+[[package]]
+name = "ccp"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "juno_build",
+ "rand",
+ "with_builtin_macros",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.0"
@@ -375,7 +395,6 @@ version = "0.1.0"
 dependencies = [
  "async-std",
  "clap",
- "hercules_rt",
  "juno_build",
  "rand",
  "with_builtin_macros",
@@ -448,7 +467,6 @@ version = "0.1.0"
 dependencies = [
  "async-std",
  "clap",
- "hercules_rt",
  "juno_build",
  "rand",
  "with_builtin_macros",
@@ -633,29 +651,6 @@ dependencies = [
  "take_mut",
 ]
 
-[[package]]
-name = "hercules_rt"
-version = "0.1.0"
-dependencies = [
- "hercules_rt_proc",
- "libc",
- "postcard",
- "serde",
-]
-
-[[package]]
-name = "hercules_rt_proc"
-version = "0.1.0"
-dependencies = [
- "anyhow",
- "hercules_cg",
- "hercules_ir",
- "hercules_opt",
- "postcard",
- "serde",
- "uuid",
-]
-
 [[package]]
 name = "hermit-abi"
 version = "0.4.0"
@@ -707,7 +702,6 @@ name = "juno_build"
 version = "0.1.0"
 dependencies = [
  "hercules_ir",
- "hercules_rt",
  "juno_frontend",
  "with_builtin_macros",
 ]
@@ -717,7 +711,6 @@ name = "juno_casts_and_intrinsics"
 version = "0.1.0"
 dependencies = [
  "async-std",
- "hercules_rt",
  "juno_build",
  "with_builtin_macros",
 ]
@@ -744,7 +737,6 @@ name = "juno_matmul"
 version = "0.1.0"
 dependencies = [
  "async-std",
- "hercules_rt",
  "juno_build",
  "with_builtin_macros",
 ]
@@ -754,7 +746,6 @@ name = "juno_nested_ccp"
 version = "0.1.0"
 dependencies = [
  "async-std",
- "hercules_rt",
  "juno_build",
  "with_builtin_macros",
 ]
@@ -774,7 +765,6 @@ name = "juno_simple3"
 version = "0.1.0"
 dependencies = [
  "async-std",
- "hercules_rt",
  "juno_build",
  "with_builtin_macros",
 ]
@@ -896,7 +886,6 @@ version = "0.1.0"
 dependencies = [
  "async-std",
  "clap",
- "hercules_rt",
  "juno_build",
  "rand",
  "with_builtin_macros",
@@ -1440,28 +1429,6 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 
-[[package]]
-name = "uuid"
-version = "1.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a"
-dependencies = [
- "getrandom",
- "rand",
- "uuid-macro-internal",
-]
-
-[[package]]
-name = "uuid-macro-internal"
-version = "1.11.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b91f57fe13a38d0ce9e28a03463d8d3c2468ed03d75375110ec71d93b449a08"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.79",
-]
-
 [[package]]
 name = "value-bag"
 version = "1.9.0"
diff --git a/Cargo.toml b/Cargo.toml
index bffe036473679129eaf32885918360c9782fc6a8..0b9262c841d1871c580677854625180e70a9309a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,10 +4,12 @@ members = [
 	"hercules_cg",
 	"hercules_ir",
 	"hercules_opt",
-	"hercules_rt",
-	"hercules_rt_proc",
 	
 	"hercules_tools/hercules_driver",
+
+	"juno_frontend",
+	"juno_scheduler",
+	"juno_build",
 	
 	#"hercules_test/hercules_interpreter",
 	#"hercules_test/hercules_tests",
@@ -15,10 +17,8 @@ members = [
 	"hercules_samples/dot",
 	"hercules_samples/matmul",
 	"hercules_samples/fac",
-
-	"juno_frontend",
-	"juno_scheduler",
-	"juno_build",
+	"hercules_samples/call",
+	"hercules_samples/ccp",
 
 	"juno_samples/simple3",
 	"juno_samples/matmul",
diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index 8437e9e44b2c7792b54f563f86f59ee5fa6bd23a..d9bf505c0e80350328d34ccd3a710bc2f0dbd267 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -1,10 +1,10 @@
 extern crate bitvec;
 extern crate hercules_ir;
 
-use std::cell::{Cell, RefCell};
-use std::collections::{HashMap, HashSet, VecDeque};
+use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
 use std::fmt::{Error, Write};
-use std::iter::once;
+use std::iter::{zip, FromIterator};
+use std::sync::atomic::{AtomicUsize, Ordering};
 
 use self::bitvec::prelude::*;
 
@@ -12,1310 +12,866 @@ use self::hercules_ir::*;
 
 use crate::*;
 
-/*
- * List of big TODOs that aren't urgent:
- *
- * 1. Return `undef` when a PartitionExit data output isn't dominated by that
- *    datum's definition. PartitionExit always returns the complete set of data
- *    outputs that could ever be needed from a partition - this is because we
- *    don't want to introduce sum types into partition signatures or do funky
- *    pointer tricks. So, we could run into the following situation:
- *
- *                                 Block 1
- *                                /       \
- *                            Block 2   Block 3
- *                               |         |
- *                           Define v1  Define v2
- *                              /           \
- *               PartitionExit(v1,v2)    PartitionExit(v1, v2)
- *
- *    Let's assume that the PartitionExits branch to different partitions where
- *    one of them needs v1 and the other needs v2. Notice that both
- *    PartitionExits need to return both v1 and v2, since their signatures must
- *    be identical, even though for both, one of v1 or v2 doesn't dominate the
- *    PartitionExit. What *should* happen here is that each PartitionExit gets
- *    lowered to an LLVM `ret`, where the non-dominating output is set to
- *    `undef`. This works since in the original, un-partitioned, Hercules IR,
- *    defs must dominate uses, so we won't run into a situation where a returned
- *    `undef` value is actually read. What happens currently is that the
- *    generated LLVM will `ret` `%v1` and `%v2`, which LLVM won't compile (since
- *    the code wouldn't be in SSA form). This should get fixed when we start
- *    compiling more complicated codes.
- *
- * 2. Handle >= 3D fork-joins and array accesses. This isn't conceptually
- *    difficult, but generating the LLVM code to implement these is annoying.
- *
- * 3. Handle ABI properly when taking in / returning structs taking more than 16
- *    bytes. When a passed / returned struct takes more than 16 bytes, it needs
- *    to be passed around via pointers. This is one of many platform specific C
- *    ABI rules we need to handle to be properly called from Rust (that 16 byte
- *    rule is actually x86-64 specific). I'm honestly not sure how to handle
- *    this well. We avoid running into the manifestation of this problem for
- *    some samples by removing unneeded parameters / return values from
- *    partitions at the schedule IR level, which we should do anyway, but this
- *    isn't a complete solution.
- */
+static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0);
 
 /*
- * The top level function to compile a schedule IR function into LLVM IR, for
+ * The top level function to compile a Hercules IR function into LLVM IR for
  * execution on the CPU. We generate LLVM IR textually, since there are no good
  * LLVM bindings for Rust, and we are *not* writing any C++.
  */
-pub fn cpu_compile<W: Write>(
-    function: &SFunction,
-    manifest: &PartitionManifest,
+pub fn cpu_codegen<W: Write>(
+    function: &Function,
+    types: &Vec<Type>,
+    constants: &Vec<Constant>,
+    dynamic_constants: &Vec<DynamicConstant>,
+    reverse_postorder: &Vec<NodeID>,
+    typing: &Vec<TypeID>,
+    control_subgraph: &Subgraph,
+    bbs: &Vec<NodeID>,
     w: &mut W,
 ) -> Result<(), Error> {
-    // Calculate basic analyses over schedule IR.
-    let virt_reg_to_inst_id = sched_virt_reg_to_inst_id(function);
-    let dep_graph = sched_dependence_graph(function, &virt_reg_to_inst_id);
-    let svalue_types = sched_svalue_types(function);
-    let parallel_reduce_infos = sched_parallel_reduce_sections(function);
-
-    // Calculate the names of each block. For blocks that are the top or bottom
-    // blocks of sequential fork-joins, references outside the fork-join
-    // actually need to refer to the header block. This is a bit complicated to
-    // handle, and we use these names in several places, so pre-calculate the
-    // block names. Intuitively, if we are "inside" a sequential fork-join,
-    // references to the top or bottom blocks actually refer to those blocks,
-    // while if we are "outside" the sequential fork-join, references to both
-    // the top or bottom blocks actually refer to the loop header block.
-    let mut block_names = HashMap::new();
-    for (block_idx, block) in function.blocks.iter().enumerate() {
-        for fork_join_id in parallel_reduce_infos
-            .keys()
-            .map(|id| Some(*id))
-            .chain(once(None))
-        {
-            let block_id = BlockID::new(block_idx);
-            let possible_parent = block.kind.try_fork_join_id();
-            let mut walk = fork_join_id;
-
-            // Check if the location in the key is "inside" the location of the
-            // block.
-            let is_inside = if let Some(parent) = possible_parent {
-                loop {
-                    if let Some(step) = walk {
-                        if step == parent {
-                            // If we see the block's location, then the key
-                            // location is "inside".
-                            break true;
-                        } else {
-                            // Walk the parent until we find something
-                            // interesting.
-                            walk = parallel_reduce_infos[&step].parent_fork_join_id;
-                        }
-                    } else {
-                        // If we don't find the block, then the key location is
-                        // "outside" the block's parallel-reduce.
-                        break false;
-                    }
-                }
-            } else {
-                // Every location is "inside" the top level sequential section.
-                true
-            };
-
-            // Check if the parent is a vectorized fork-join.
-            let is_parent_vectorized = possible_parent
-                // Check if the parent fork-join has a vector width.
-                .map(|parent| parallel_reduce_infos[&parent].vector_width.is_some())
-                // Sequential blocks are not vectorized.
-                .unwrap_or(false);
-
-            // If we are inside the block's fork-join or the block's fork-join
-            // is vectorized, then refer to the blocks directly. Vectorized
-            // fork-joins have the same LLVM IR control flow as the schedule IR
-            // control flow.
-            if is_inside || is_parent_vectorized {
-                block_names.insert((block_id, fork_join_id), format!("bb_{}", block_idx));
-            } else {
-                block_names.insert(
-                    (block_id, fork_join_id),
-                    format!("fork_join_seq_header_{}", possible_parent.unwrap().idx()),
-                );
-            }
-        }
-    }
-
-    // Create context for emitting LLVM IR.
     let ctx = CPUContext {
         function,
-        manifest,
-        block: Cell::new((0, &function.blocks[0])),
-
-        virt_reg_to_inst_id,
-        dep_graph,
-        svalue_types,
-        parallel_reduce_infos,
-
-        block_names,
-
-        vector_width: Cell::new(None),
-        outside_def_used_in_vector: RefCell::new(HashSet::new()),
-        vectors_from_parallel: RefCell::new(HashSet::new()),
-        vector_reduce_associative_vars: RefCell::new(HashSet::new()),
-        vector_reduce_cycle: Cell::new(false),
+        types,
+        constants,
+        dynamic_constants,
+        reverse_postorder,
+        typing,
+        control_subgraph,
+        bbs,
     };
-    ctx.emit_function(w)?;
-
-    Ok(())
+    ctx.codegen_function(w)
 }
 
-/*
- * Top level structure to hold analysis info and cell-ed state.
- */
 struct CPUContext<'a> {
-    function: &'a SFunction,
-    manifest: &'a PartitionManifest,
-    block: Cell<(usize, &'a SBlock)>,
-
-    // Basic analyses over schedule IR.
-    virt_reg_to_inst_id: HashMap<usize, InstID>,
-    dep_graph: HashMap<InstID, Vec<InstID>>,
-    svalue_types: HashMap<SValue, SType>,
-    parallel_reduce_infos: HashMap<ForkJoinID, ParallelReduceInfo>,
-
-    // Calculate the names of each block up front. For blocks that are the top
-    // or bottom blocks of sequential fork-joins, references outside the fork-
-    // join actually need to refer to the header block. This is a bit
-    // complicated to handle, and we use these names in several places, so pre-
-    // calculate the block names. Intuitively, if we are "inside" a sequential
-    // fork-join, references to the top or bottom blocks actually refer to those
-    // blocks, while if we are "outside" the sequential fork-join, references to
-    // both the top or bottom blocks actually refer to the loop header block.
-    // Fully vectorized fork-joins are not considered sequential.
-    block_names: HashMap<(BlockID, Option<ForkJoinID>), String>,
+    function: &'a Function,
+    types: &'a Vec<Type>,
+    constants: &'a Vec<Constant>,
+    dynamic_constants: &'a Vec<DynamicConstant>,
+    reverse_postorder: &'a Vec<NodeID>,
+    typing: &'a Vec<TypeID>,
+    control_subgraph: &'a Subgraph,
+    bbs: &'a Vec<NodeID>,
+}
 
-    // Track whether we are currently in a vectorized parallel section - this
-    // affects how we lower types, for example.
-    vector_width: Cell<Option<usize>>,
-    // Track which virtual registers are defined outside the vectorized parallel
-    // section and used within it.
-    outside_def_used_in_vector: RefCell<HashSet<usize>>,
-    // Track which virtual registers are defined in the vectorized parallel
-    // section and used in the vectorized reduce section.
-    vectors_from_parallel: RefCell<HashSet<usize>>,
-    // Track which reduction variables (store their virtual register and
-    // variable number) are associative in the vectorized reduce section.
-    vector_reduce_associative_vars: RefCell<HashSet<(usize, usize)>>,
-    // track whether there are any non-associative reduction variables in a
-    // vectorized reduce section (which corresponds to whether we need to
-    // generate explicit control flow or not).
-    vector_reduce_cycle: Cell<bool>,
+#[derive(Default, Debug)]
+struct LLVMBlock {
+    phis: String,
+    body: String,
+    term: String,
 }
 
 impl<'a> CPUContext<'a> {
-    fn emit_function<W: Write>(&self, w: &mut W) -> Result<(), Error> {
-        // Emit the partition function signature.
-        write!(w, "define ")?;
-        if self.function.return_types.len() == 1 {
-            self.emit_type(&self.function.return_types[0], w)?;
-        } else {
-            // Functions with multiple return values return said values in a
-            // struct.
-            self.emit_type(
-                &SType::Product(self.function.return_types.clone().into()),
-                w,
-            )?;
-        }
-        write!(w, " @{}(", self.manifest.name)?;
-        (0..self.function.param_types.len())
-            .map(|param_idx| Some(SValue::VirtualRegister(param_idx)))
-            .intersperse(None)
-            .map(|token| -> Result<(), Error> {
-                match token {
-                    Some(param_svalue) => {
-                        self.emit_svalue(&param_svalue, true, w)?;
-                    }
-                    None => write!(w, ", ")?,
-                }
-                Ok(())
-            })
-            .collect::<Result<(), Error>>()?;
-        // Technically, this may fail if for some reason there's a parallel
-        // launch partition with no parameters. Blame LLVM for being
-        // unnecessarily strict about commas of all things...
-        for parallel_launch_dim in 0..self.manifest.device.num_parallel_launch_dims() {
-            write!(w, ", i64 %parallel_launch_{}_low", parallel_launch_dim)?;
-            write!(w, ", i64 %parallel_launch_{}_len", parallel_launch_dim)?;
-        }
-        write!(w, ") {{\n",)?;
-
-        // Emit the function body. Emit each block, one at a time.
-        for (block_idx, block) in self.function.blocks.iter().enumerate() {
-            self.block.set((block_idx, block));
-
-            // For "tops" of sequential fork-joins, we emit a special top block
-            // to be the loop header for the fork-join loop.
-            if let Some(fork_join_id) = block.kind.try_parallel()
-                && self.parallel_reduce_infos[&fork_join_id]
-                    .top_parallel_block
-                    .idx()
-                    == block_idx
-                && self.parallel_reduce_infos[&fork_join_id]
-                    .vector_width
-                    .is_none()
-            {
-                self.emit_fork_join_seq_header(fork_join_id, block_idx, w)?;
-            }
-
-            // Emit the header for the block.
-            write!(
-                w,
-                "{}:\n",
-                &self.block_names[&(BlockID::new(block_idx), block.kind.try_fork_join_id())]
-            )?;
-
-            // If this block is in a vectorized parallel section, set up the
-            // context for vector code generation.
-            if let Some(fork_join_id) = block.kind.try_parallel()
-                && let Some(width) = self.parallel_reduce_infos[&fork_join_id].vector_width
-            {
-                self.setup_vectorized_parallel_block(width, w)?;
-            }
-
-            // If this block is in a vectorized reduce section, set up either a
-            // post-parallel reduction loop or a vector reduction, depending on
-            // whether there's an associative schedule on each reduction
-            // variable.
-            if let Some(fork_join_id) = block.kind.try_reduce()
-                && let Some(width) = self.parallel_reduce_infos[&fork_join_id].vector_width
-            {
-                self.setup_vectorized_reduce_block(fork_join_id, width, w)?;
+    fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> {
+        // Dump the function signature.
+        write!(
+            w,
+            "define {} @{}(",
+            self.get_type(self.function.return_type),
+            self.function.name
+        )?;
+        let mut first_param = true;
+        // The first set of parameters are dynamic constants.
+        for idx in 0..self.function.num_dynamic_constants {
+            if first_param {
+                first_param = false;
+            } else {
+                write!(w, ", ")?;
             }
-
-            // For each basic block, emit instructions in that block. Emit using
-            // a worklist over the dependency graph.
-            let mut emitted = bitvec![u8, Lsb0; 0; block.insts.len()];
-            let mut worklist = VecDeque::from((0..block.insts.len()).collect::<Vec<_>>());
-            while let Some(inst_idx) = worklist.pop_front() {
-                let inst_id = InstID::new(block_idx, inst_idx);
-                let dependencies = &self.dep_graph[&inst_id];
-                let all_uses_emitted = dependencies
-                    .into_iter()
-                    // Check that all used instructions in this block...
-                    .filter(|inst_id| inst_id.idx_0() == block_idx)
-                    // were already emitted.
-                    .all(|inst_id| emitted[inst_id.idx_1()]);
-                // Phis don't need to wait for all of their uses to be emitted.
-                if block.insts[inst_idx].is_phi() || all_uses_emitted {
-                    self.emit_inst(
-                        block.virt_regs[inst_id.idx_1()].0,
-                        &block.insts[inst_idx],
-                        block.kind.try_fork_join_id(),
-                        w,
-                    )?;
-                    emitted.set(inst_id.idx_1(), true);
-                } else {
-                    worklist.push_back(inst_idx);
-                }
+            write!(w, "i64 %dc_p{}", idx)?;
+        }
+        // The second set of parameters are normal parameters.
+        for (idx, ty) in self.function.param_types.iter().enumerate() {
+            if first_param {
+                first_param = false;
+            } else {
+                write!(w, ", ")?;
             }
-
-            self.reset_cells();
+            write!(w, "{} %p{}", self.get_type(*ty), idx)?;
         }
-        write!(w, "}}\n",)?;
+        write!(w, ") {{\n")?;
 
-        Ok(())
-    }
+        let mut blocks: BTreeMap<_, _> = (0..self.function.nodes.len())
+            .filter(|idx| self.function.nodes[*idx].is_control())
+            .map(|idx| (NodeID::new(idx), LLVMBlock::default()))
+            .collect();
 
-    fn emit_type<W: Write>(&self, stype: &SType, w: &mut W) -> Result<(), Error> {
-        if let Some(width) = self.vector_width.get() {
-            write!(w, "<{} x ", width)?;
-        }
+        // Emit calculation of dynamic constants into the start block. Just
+        // calculate every dynamic constant, and let LLVM clean them up.
+        self.codegen_dynamic_constants(
+            blocks.get_mut(&NodeID::new(0)).unwrap(),
+            self.function.num_dynamic_constants,
+        )?;
 
-        match stype {
-            SType::Boolean => write!(w, "i1")?,
-            SType::Integer8 | SType::UnsignedInteger8 => write!(w, "i8")?,
-            SType::Integer16 | SType::UnsignedInteger16 => write!(w, "i16")?,
-            SType::Integer32 | SType::UnsignedInteger32 => write!(w, "i32")?,
-            SType::Integer64 | SType::UnsignedInteger64 => write!(w, "i64")?,
-            SType::Float32 => write!(w, "float")?,
-            SType::Float64 => write!(w, "double")?,
-            SType::Product(fields) => {
-                write!(w, "{{")?;
-                fields
+        // Emit data flow into basic blocks.
+        let mut worklist = VecDeque::from_iter(
+            self.reverse_postorder
+                .into_iter()
+                .filter(|id| !self.function.nodes[id.idx()].is_control()),
+        );
+        let mut visited = bitvec![u8, Lsb0; 0; self.function.nodes.len()];
+        while let Some(id) = worklist.pop_front() {
+            let node = &self.function.nodes[id.idx()];
+            if node.is_phi()
+                || node.is_reduce()
+                || get_uses(node)
+                    .as_ref()
                     .into_iter()
-                    .map(Some)
-                    .intersperse(None)
-                    .map(|token| -> Result<(), Error> {
-                        match token {
-                            Some(field_ty) => self.emit_type(field_ty, w)?,
-                            None => write!(w, ", ")?,
-                        }
-                        Ok(())
-                    })
-                    .collect::<Result<(), Error>>()?;
-                write!(w, "}}")?;
+                    .all(|u| self.function.nodes[u.idx()].is_control() || visited[u.idx()])
+            {
+                self.codegen_data_node(*id, &mut blocks)?;
+                visited.set(id.idx(), true);
+            } else {
+                worklist.push_back(id);
             }
-            SType::ArrayRef(_) => write!(w, "ptr")?,
         }
 
-        if self.vector_width.get().is_some() {
-            write!(w, ">")?;
+        // Emit control flow into basic blocks.
+        for id in (0..self.function.nodes.len()).map(NodeID::new) {
+            if !self.function.nodes[id.idx()].is_control() {
+                continue;
+            }
+            self.codegen_control_node(id, &mut blocks)?;
         }
 
-        Ok(())
-    }
-
-    fn emit_constant<W: Write>(&self, sconstant: &SConstant, w: &mut W) -> Result<(), Error> {
-        match sconstant {
-            SConstant::Boolean(val) => write!(w, "{}", val)?,
-            SConstant::Integer8(val) => write!(w, "{}", val)?,
-            SConstant::Integer16(val) => write!(w, "{}", val)?,
-            SConstant::Integer32(val) => write!(w, "{}", val)?,
-            SConstant::Integer64(val) => write!(w, "{}", val)?,
-            SConstant::UnsignedInteger8(val) => write!(w, "{}", val)?,
-            SConstant::UnsignedInteger16(val) => write!(w, "{}", val)?,
-            SConstant::UnsignedInteger32(val) => write!(w, "{}", val)?,
-            SConstant::UnsignedInteger64(val) => write!(w, "{}", val)?,
-            SConstant::Float32(val) => {
-                if val.fract() == 0.0 {
-                    write!(w, "{}.0", val)?
-                } else {
-                    write!(w, "{}", val)?
-                }
-            }
-            SConstant::Float64(val) => {
-                if val.fract() == 0.0 {
-                    write!(w, "{}.0", val)?
-                } else {
-                    write!(w, "{}", val)?
-                }
-            }
-            SConstant::Product(fields) => {
-                write!(w, "{{")?;
-                fields
-                    .into_iter()
-                    .map(Some)
-                    .intersperse(None)
-                    .map(|token| -> Result<(), Error> {
-                        match token {
-                            Some(field_cons) => {
-                                self.emit_type(&field_cons.get_type(), w)?;
-                                write!(w, " ")?;
-                                self.emit_constant(field_cons, w)?;
-                            }
-                            None => write!(w, ", ")?,
-                        }
-                        Ok(())
-                    })
-                    .collect::<Result<(), Error>>()?;
-                write!(w, "}}")?;
-            }
+        // Dump the emitted basic blocks.
+        for (id, block) in blocks {
+            write!(
+                w,
+                "{}:\n{}{}{}",
+                self.get_block_name(id),
+                block.phis,
+                block.body,
+                block.term
+            )?;
         }
 
+        write!(w, "}}\n")?;
         Ok(())
     }
 
-    fn emit_svalue<W: Write>(&self, svalue: &SValue, add_ty: bool, w: &mut W) -> Result<(), Error> {
-        if add_ty {
-            self.emit_type(&self.svalue_types[svalue], w)?;
-            write!(w, " ")?;
-        }
-        if self.vector_width.get().is_some()
-            && svalue
-                .try_virt_reg()
-                .map(|virt_reg| self.outside_def_used_in_vector.borrow().contains(&virt_reg))
-                .unwrap_or(false)
-        {
-            match svalue {
-                SValue::VirtualRegister(virt_reg) => {
-                    write!(w, "%vec_{}_v{}", self.block.get().0, virt_reg)?
-                }
-                SValue::Constant(_) => todo!(),
-            }
-        } else if svalue
-            .try_virt_reg()
-            .map(|virt_reg| self.vectors_from_parallel.borrow().contains(&virt_reg))
-            .unwrap_or(false)
-        {
-            match svalue {
-                SValue::VirtualRegister(virt_reg) => write!(w, "%extract_v{}", virt_reg)?,
-                SValue::Constant(_) => todo!(),
-            }
-        } else {
-            match svalue {
-                SValue::VirtualRegister(virt_reg) => write!(w, "%v{}", virt_reg)?,
-                SValue::Constant(cons) => self.emit_constant(cons, w)?,
-            }
+    /*
+     * While control nodes in Hercules IR are predecessor-centric (each takes a
+     * control input that defines the predecessor relationship), LLVM IR basic
+     * blocks are successor-centric (each branches to successor blocks with a
+     * branch instruction). This difference requires explicit translation.
+     */
+    fn codegen_control_node(
+        &self,
+        id: NodeID,
+        blocks: &mut BTreeMap<NodeID, LLVMBlock>,
+    ) -> Result<(), Error> {
+        match self.function.nodes[id.idx()] {
+            // Start, region, and projection control nodes all have exactly one
+            // successor and are otherwise simple.
+            Node::Start
+            | Node::Region { preds: _ }
+            | Node::Projection {
+                control: _,
+                selection: _,
+            } => {
+                let term = &mut blocks.get_mut(&id).unwrap().term;
+                let succ = self.control_subgraph.succs(id).next().unwrap();
+                write!(term, "  br label %{}\n", self.get_block_name(succ))?
+            }
+            // If nodes have two successors - examine the projections to
+            // determine which branch is which, and branch between them.
+            Node::If { control: _, cond } => {
+                let term = &mut blocks.get_mut(&id).unwrap().term;
+                let mut succs = self.control_subgraph.succs(id);
+                let succ1 = succs.next().unwrap();
+                let succ2 = succs.next().unwrap();
+                let succ1_is_true = self.function.nodes[succ1.idx()].try_projection(1).is_some();
+                write!(
+                    term,
+                    "  br {}, label %{}, label %{}\n",
+                    self.get_value(cond, true),
+                    self.get_block_name(if succ1_is_true { succ1 } else { succ2 }),
+                    self.get_block_name(if succ1_is_true { succ2 } else { succ1 }),
+                )?
+            }
+            Node::Return { control: _, data } => {
+                let term = &mut blocks.get_mut(&id).unwrap().term;
+                write!(term, "  ret {}\n", self.get_value(data, true))?
+            }
+            _ => panic!("PANIC: Can't lower {:?}.", self.function.nodes[id.idx()]),
         }
         Ok(())
     }
 
-    fn emit_inst<W: Write>(
+    /*
+     * Lower the data node `id` into LLVM IR text in the block `self.bbs` assigns it.
+     */
+    fn codegen_data_node(
         &self,
-        virt_reg: usize,
-        inst: &SInst,
-        location: Option<ForkJoinID>,
-        w: &mut W,
+        id: NodeID,
+        blocks: &mut BTreeMap<NodeID, LLVMBlock>,
     ) -> Result<(), Error> {
-        // Helper to emit the initial assignment to the destination virtual
-        // register, when applicable.
-        let self_svalue = SValue::VirtualRegister(virt_reg);
-        let emit_assign = |w: &mut W| -> Result<(), Error> { write!(w, "%v{} = ", virt_reg) };
-
-        write!(w, "  ")?;
-        match inst {
-            SInst::Phi { inputs } => {
-                emit_assign(w)?;
-                write!(w, "phi ")?;
-                self.emit_type(&self.svalue_types[&self_svalue], w)?;
-                write!(w, " ")?;
-                inputs
-                    .into_iter()
-                    .map(Some)
-                    .intersperse(None)
-                    .map(|token| match token {
-                        Some((pred_block_id, svalue)) => {
-                            write!(w, "[ ")?;
-                            self.emit_svalue(svalue, false, w)?;
-                            write!(w, ", %{} ]", &self.block_names[&(*pred_block_id, location)])?;
-                            Ok(())
-                        }
-                        None => write!(w, ", "),
-                    })
-                    .collect::<Result<(), Error>>()?;
-            }
-            SInst::ThreadID {
-                dimension,
-                fork_join,
-            } => {
-                emit_assign(w)?;
-                if let Some(width) = self.vector_width.get() {
-                    write!(w, "add <{} x i64> <", width)?;
-                    for idx in 0..width {
-                        if idx != 0 {
-                            write!(w, ", ")?;
-                        }
-                        write!(w, "i64 {}", idx)?;
+        match self.function.nodes[id.idx()] {
+            Node::Phi { control, ref data } => {
+                let phis = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().phis;
+                let preds = self.function.nodes[control.idx()].try_region().unwrap();
+                write!(
+                    phis,
+                    "  {} = phi {} ",
+                    self.get_value(id, false),
+                    self.get_type(self.typing[id.idx()])
+                )?;
+                for idx in 0..preds.len() {
+                    if idx != 0 {
+                        write!(phis, ", ")?;
                     }
-                    write!(w, ">, zeroinitializer")?;
-                } else {
-                    write!(w, "add i64 0, %thread_id_{}_{}", fork_join.idx(), dimension)?;
-                }
-            }
-            SInst::ReductionVariable { number } => {
-                write!(w, "; Already emitted reduction variable #{number}.")?;
-            }
-            SInst::Jump {
-                target,
-                parallel_entry: _,
-                reduce_exit,
-            } => {
-                if reduce_exit.is_some() && self.vector_reduce_cycle.get() {
-                    // If we're closing a non-vectorized reduction for a
-                    // vectorized parallel, jump back to the beginning of the
-                    // reduction, not the beginning of the parallel section.
-                    let self_block_idx = self.block.get().0;
                     write!(
-                        w,
-                        "br label %{}",
-                        &self.block_names[&(BlockID::new(self_block_idx), location)],
+                        phis,
+                        "[ {}, %{} ]",
+                        self.get_value(data[idx], false),
+                        self.get_block_name(preds[idx])
                     )?;
-                } else if reduce_exit.is_some() && self.vectors_from_parallel.borrow().is_empty() {
-                    // If we're closing a reduction and the parallel-reduce is
-                    // not vectorized, we need to branch back to the beginning
-                    // of the parallel-reduce.
-                    write!(
-                        w,
-                        "br label %fork_join_seq_header_{}",
-                        location.unwrap().idx(),
-                    )?;
-                } else {
-                    // If this is a normal jump (or is closing a reduction and
-                    // is vectorized, along with the parallel section), then
-                    // branch to the successor as expected.
-                    write!(w, "br label %{}", &self.block_names[&(*target, location)])?;
                 }
+                write!(phis, "\n")?;
             }
-            SInst::Branch {
-                cond,
-                false_target,
-                true_target,
-            } => {
-                // Branches aren't involved in any parallel-reduce shenanigans,
-                // so lowering them is straightforward.
-                write!(w, "br ")?;
-                self.emit_svalue(cond, true, w)?;
+            Node::Parameter { index } => {
+                let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body;
+                let ty = self.get_type(self.typing[id.idx()]);
                 write!(
-                    w,
-                    ", label %{}, label %{}",
-                    &self.block_names[&(*true_target, location)],
-                    &self.block_names[&(*false_target, location)],
+                    body,
+                    "  {} = bitcast {} %p{} to {}\n",
+                    self.get_value(id, false),
+                    ty,
+                    index,
+                    ty
                 )?;
             }
-            SInst::PartitionExit { data_outputs } => {
-                if data_outputs.len() == 0 {
-                    write!(w, "ret {{}} zeroinitializer")?;
-                } else if data_outputs.len() == 1 {
-                    write!(w, "ret ")?;
-                    self.emit_svalue(&data_outputs[0], true, w)?;
-                } else {
-                    let ret_ty = SType::Product(
-                        data_outputs
-                            .iter()
-                            .map(|svalue| self.svalue_types[svalue].clone())
-                            .collect(),
-                    );
-                    write!(w, "%v{}_0 = insertvalue ", virt_reg)?;
-                    self.emit_type(&ret_ty, w)?;
-                    write!(w, " undef, ")?;
-                    self.emit_svalue(&data_outputs[0], true, w)?;
-                    write!(w, ", 0\n")?;
-                    for idx in 1..data_outputs.len() {
-                        write!(w, "  %v{}_{} = insertvalue ", virt_reg, idx)?;
-                        self.emit_type(&ret_ty, w)?;
-                        write!(w, " %v{}_{}, ", virt_reg, idx - 1)?;
-                        self.emit_svalue(&data_outputs[idx], true, w)?;
-                        write!(w, ", {}\n", idx)?;
+            Node::Constant { id: cons_id } => {
+                let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body;
+                write!(body, "  {} = bitcast ", self.get_value(id, false))?;
+                match self.constants[cons_id.idx()] {
+                    Constant::Boolean(val) => write!(body, "i1 {} to i1\n", val)?,
+                    Constant::Integer8(val) => write!(body, "i8 {} to i8\n", val)?,
+                    Constant::Integer16(val) => write!(body, "i16 {} to i16\n", val)?,
+                    Constant::Integer32(val) => write!(body, "i32 {} to i32\n", val)?,
+                    Constant::Integer64(val) => write!(body, "i64 {} to i64\n", val)?,
+                    Constant::UnsignedInteger8(val) => write!(body, "i8 {} to i8\n", val)?,
+                    Constant::UnsignedInteger16(val) => write!(body, "i16 {} to i16\n", val)?,
+                    Constant::UnsignedInteger32(val) => write!(body, "i32 {} to i32\n", val)?,
+                    Constant::UnsignedInteger64(val) => write!(body, "i64 {} to i64\n", val)?,
+                    Constant::Float32(val) => {
+                        if val.fract() == 0.0 {
+                            write!(body, "float {}.0 to float\n", val)?
+                        } else {
+                            write!(body, "float {} to float\n", val)?
+                        }
+                    }
+                    Constant::Float64(val) => {
+                        if val.fract() == 0.0 {
+                            write!(body, "double {}.0 to double\n", val)?
+                        } else {
+                            write!(body, "double {} to double\n", val)?
+                        }
                     }
-                    write!(w, "  ret ")?;
-                    self.emit_type(&ret_ty, w)?;
-                    write!(w, " %v{}_{}", virt_reg, data_outputs.len() - 1)?;
+                    _ => panic!("PANIC: Can't dynamically allocate memory for an aggregate type within a CPU function."),
                 }
             }
-            SInst::Return { value } => {
-                write!(w, "ret ")?;
-                self.emit_svalue(value, true, w)?;
-            }
-            SInst::Unary { input, op } => {
-                emit_assign(w)?;
+            Node::DynamicConstant { id: dc_id } => {
+                let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body;
+                // Dynamic constants are all pre-calculated at the top of the
+                // function.
+                write!(
+                    body,
+                    "  {} = bitcast i64 %dc{} to i64\n",
+                    self.get_value(id, false),
+                    dc_id.idx()
+                )?
+            }
+            Node::Unary { op, input } => {
+                let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body;
                 match op {
-                    SUnaryOperator::Not => {
-                        write!(w, "xor ")?;
-                        self.emit_svalue(input, true, w)?;
-                        write!(w, ", -1")?;
-                    }
-                    SUnaryOperator::Neg => {
-                        if self.svalue_types[input].is_float() {
-                            write!(w, "fneg ")?;
-                            self.emit_svalue(input, true, w)?;
+                    UnaryOperator::Not => write!(
+                        body,
+                        "  {} = xor {}, -1\n",
+                        self.get_value(id, false),
+                        self.get_value(input, true)
+                    )?,
+                    UnaryOperator::Neg => {
+                        if self.types[self.typing[input.idx()].idx()].is_float() {
+                            write!(
+                                body,
+                                "  {} = fneg {}\n",
+                                self.get_value(id, false),
+                                self.get_value(input, true)
+                            )?
                         } else {
-                            write!(w, "mul ")?;
-                            self.emit_svalue(input, true, w)?;
-                            write!(w, ", -1")?;
+                            write!(
+                                body,
+                                "  {} = mul {}, -1\n",
+                                self.get_value(id, false),
+                                self.get_value(input, true)
+                            )?
                         }
                     }
-                    SUnaryOperator::Cast(dst_ty) => {
-                        let src_ty = &self.svalue_types[input];
-                        if src_ty.is_integer()
+                    UnaryOperator::Cast(dst_ty_id) => {
+                        let src_ty_id = self.typing[input.idx()];
+                        let dst_ty = &self.types[dst_ty_id.idx()];
+                        let src_ty = &self.types[src_ty_id.idx()];
+                        let opcode = if src_ty.is_integer()
                             && dst_ty.is_integer()
                             && src_ty.num_bits() > dst_ty.num_bits()
                         {
-                            write!(w, "trunc ")?;
+                            "trunc"
                         } else if src_ty.is_signed()
                             && dst_ty.is_integer()
                             && src_ty.num_bits() < dst_ty.num_bits()
                         {
-                            write!(w, "sext ")?;
+                            "sext"
                         } else if src_ty.is_integer()
                             && dst_ty.is_integer()
                             && src_ty.num_bits() < dst_ty.num_bits()
                         {
-                            write!(w, "zext ")?;
+                            "zext"
                         } else if src_ty.is_integer() && dst_ty.is_integer() {
                             // A no-op.
-                            write!(w, "bitcast ")?;
+                            "bitcast"
                         } else if src_ty.is_float()
                             && dst_ty.is_float()
                             && src_ty.num_bits() > dst_ty.num_bits()
                         {
-                            write!(w, "fptrunc ")?;
+                            "fptrunc"
                         } else if src_ty.is_float()
                             && dst_ty.is_float()
                             && src_ty.num_bits() < dst_ty.num_bits()
                         {
-                            write!(w, "fpext ")?;
+                            "fpext"
                         } else if src_ty.is_float() && dst_ty.is_float() {
                             // A no-op.
-                            write!(w, "bitcast ")?;
+                            "bitcast"
                         } else if src_ty.is_float() && dst_ty.is_signed() {
-                            write!(w, "fptosi ")?;
+                            "fptosi"
                         } else if src_ty.is_float() && dst_ty.is_integer() {
-                            write!(w, "fptoui ")?;
+                            "fptoui"
                         } else if src_ty.is_signed() && dst_ty.is_float() {
-                            write!(w, "sitofp ")?;
+                            "sitofp"
                         } else if src_ty.is_integer() && dst_ty.is_float() {
-                            write!(w, "uitofp ")?;
+                            "uitofp"
                         } else {
                             panic!("PANIC: Invalid cast type combination.");
-                        }
-                        self.emit_svalue(input, true, w)?;
-                        write!(w, " to ")?;
-                        self.emit_type(dst_ty, w)?;
+                        };
+                        write!(
+                            body,
+                            "  {} = {} {} to {}\n",
+                            self.get_value(id, false),
+                            opcode,
+                            self.get_value(input, true),
+                            self.get_type(dst_ty_id),
+                        )?
                     }
                 }
             }
-            SInst::Binary { left, right, op } => {
-                // If we're in a vectorized reduce block and this binary
-                // operation is reducing over an associative reduction variable,
-                // then we need to emit a LLVM vector reduce intrinsic.
-                // Otherwise lower into a normal LLVM binary op.
-                let try_associative_reduction = |sval: &SValue| {
-                    sval.try_virt_reg()
-                        .map(|virt_reg| {
-                            self.vector_reduce_associative_vars
-                                .borrow()
-                                .iter()
-                                .filter(|(red_virt_reg, _)| *red_virt_reg == virt_reg)
-                                .map(|(red_virt_reg, red_num)| (*red_virt_reg, *red_num))
-                                .next()
-                        })
-                        .flatten()
-                };
-                if let Some((red_virt_reg, red_num)) =
-                    try_associative_reduction(left).or(try_associative_reduction(right))
-                {
-                    let left_virt_reg = left
-                        .try_virt_reg()
-                        .expect("PANIC: Associative reduction can't involve constants.");
-                    let right_virt_reg = right
-                        .try_virt_reg()
-                        .expect("PANIC: Associative reduction can't involve constants.");
-                    let vector_virt_reg = if left_virt_reg != red_virt_reg {
-                        left_virt_reg
-                    } else if right_virt_reg != red_virt_reg {
-                        right_virt_reg
-                    } else {
-                        panic!("PANIC: Associative reduction can't use the reduction variable more than once.");
-                    };
-                    let info = &self.parallel_reduce_infos[&location.unwrap()];
-                    write!(w, "%v{} = call reassoc ", red_virt_reg)?;
-                    self.emit_type(&self.svalue_types[&self_svalue], w)?;
-                    let op = op.get_llvm_op(&self.svalue_types[left]);
-                    write!(w, " @llvm.vector.reduce.{}", op)?;
-                    let width = info.vector_width.unwrap();
-                    self.emit_reduce_suffix(width, &self.svalue_types[&self_svalue], w)?;
-                    write!(w, "(")?;
-                    self.emit_svalue(&info.reduce_inits[red_num], true, w)?;
-                    write!(w, ", ")?;
-                    self.vector_width.set(Some(width));
-                    let old_vectors_from_parallel = self.vectors_from_parallel.take();
-                    self.emit_svalue(&SValue::VirtualRegister(vector_virt_reg), true, w)?;
-                    self.vector_width.set(None);
-                    self.vectors_from_parallel
-                        .replace(old_vectors_from_parallel);
-                    write!(w, ")")?;
-                } else {
-                    emit_assign(w)?;
-                    let op = op.get_llvm_op(&self.svalue_types[left]);
-                    write!(w, "{} ", op)?;
-                    self.emit_svalue(left, true, w)?;
-                    write!(w, ", ")?;
-                    self.emit_svalue(right, false, w)?;
+            Node::Binary { op, left, right } => {
+                enum OpTy {
+                    Float,
+                    Unsigned,
+                    Signed,
                 }
-            }
-            SInst::Ternary {
+
+                let left_ty = &self.types[self.typing[left.idx()].idx()];
+                let op_ty = if left_ty.is_float() {
+                    OpTy::Float
+                } else if left_ty.is_unsigned() {
+                    OpTy::Unsigned
+                } else {
+                    OpTy::Signed
+                };
+
+                let opcode = match (op, op_ty) {
+                    (BinaryOperator::Add, OpTy::Float) => "fadd",
+                    (BinaryOperator::Add, _) => "add",
+                    (BinaryOperator::Sub, OpTy::Float) => "fsub",
+                    (BinaryOperator::Sub, _) => "sub",
+                    (BinaryOperator::Mul, OpTy::Float) => "fmul",
+                    (BinaryOperator::Mul, _) => "mul",
+                    (BinaryOperator::Div, OpTy::Float) => "fdiv",
+                    (BinaryOperator::Div, OpTy::Unsigned) => "udiv",
+                    (BinaryOperator::Div, OpTy::Signed) => "sdiv",
+                    (BinaryOperator::Rem, OpTy::Float) => "frem",
+                    (BinaryOperator::Rem, OpTy::Unsigned) => "urem",
+                    (BinaryOperator::Rem, OpTy::Signed) => "srem",
+                    (BinaryOperator::LT, OpTy::Float) => "fcmp olt",
+                    (BinaryOperator::LT, OpTy::Unsigned) => "icmp ult",
+                    (BinaryOperator::LT, OpTy::Signed) => "icmp slt",
+                    (BinaryOperator::LTE, OpTy::Float) => "fcmp ole",
+                    (BinaryOperator::LTE, OpTy::Unsigned) => "icmp ule",
+                    (BinaryOperator::LTE, OpTy::Signed) => "icmp sle",
+                    (BinaryOperator::GT, OpTy::Float) => "fcmp ogt",
+                    (BinaryOperator::GT, OpTy::Unsigned) => "icmp ugt",
+                    (BinaryOperator::GT, OpTy::Signed) => "icmp sgt",
+                    (BinaryOperator::GTE, OpTy::Float) => "fcmp oge",
+                    (BinaryOperator::GTE, OpTy::Unsigned) => "icmp uge",
+                    (BinaryOperator::GTE, OpTy::Signed) => "icmp sge",
+                    (BinaryOperator::EQ, OpTy::Float) => "fcmp oeq",
+                    (BinaryOperator::EQ, _) => "icmp eq",
+                    (BinaryOperator::NE, OpTy::Float) => "fcmp one",
+                    (BinaryOperator::NE, _) => "icmp ne",
+                    (BinaryOperator::Or, _) => "or",
+                    (BinaryOperator::And, _) => "and",
+                    (BinaryOperator::Xor, _) => "xor",
+                    (BinaryOperator::LSh, _) => "shl",
+                    (BinaryOperator::RSh, OpTy::Unsigned) => "lshr",
+                    (BinaryOperator::RSh, _) => "ashr",
+                };
+
+                let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body;
+                write!(
+                    body,
+                    "  {} = {} {}, {}\n",
+                    self.get_value(id, false),
+                    opcode,
+                    self.get_value(left, true),
+                    self.get_value(right, false),
+                )?
+            }
+            Node::Ternary {
+                op,
                 first,
                 second,
                 third,
-                op,
             } => {
-                emit_assign(w)?;
+                let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body;
                 match op {
-                    STernaryOperator::Select => {
-                        write!(w, "select ")?;
-                        self.emit_svalue(first, true, w)?;
-                        write!(w, ", ")?;
-                        self.emit_svalue(second, true, w)?;
-                        write!(w, ", ")?;
-                        self.emit_svalue(third, true, w)?;
-                    }
+                    TernaryOperator::Select => write!(
+                        body,
+                        "  {} = select {}, {}, {}\n",
+                        self.get_value(id, false),
+                        self.get_value(first, true),
+                        self.get_value(second, true),
+                        self.get_value(third, true)
+                    )?,
                 }
             }
-            SInst::IntrinsicCall { intrinsic, args } => {
-                emit_assign(w)?;
-                write!(w, "call ")?;
-                self.emit_type(&self.svalue_types[&self_svalue], w)?;
+            Node::IntrinsicCall {
+                intrinsic,
+                ref args,
+            } => {
+                let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body;
                 write!(
-                    w,
-                    " @llvm.{}.{}(",
-                    // TODO: make lower case names conform to LLVM expectations.
-                    intrinsic.lower_case_name(),
-                    Self::intrinsic_type_str(&self.svalue_types[&self_svalue])
+                    body,
+                    "  {} = call {} {}(",
+                    self.get_value(id, false),
+                    self.get_type(self.typing[id.idx()]),
+                    convert_intrinsic(&intrinsic, &self.types[self.typing[id.idx()].idx()]),
                 )?;
-                self.emit_svalue(&args[0], true, w)?;
-                for idx in 1..args.len() {
-                    write!(w, ", ")?;
-                    self.emit_svalue(&args[idx], true, w)?;
-                }
-                write!(w, ")")?;
-            }
-            SInst::ProductExtract { product, indices } => {
-                emit_assign(w)?;
-                write!(w, "extractvalue ")?;
-                self.emit_svalue(product, true, w)?;
-                for index in indices {
-                    write!(w, ", {}", index)?;
-                }
-            }
-            SInst::ProductInsert {
-                product,
-                data,
-                indices,
-            } => {
-                emit_assign(w)?;
-                write!(w, "insertvalue ")?;
-                self.emit_svalue(product, true, w)?;
-                write!(w, ", ")?;
-                self.emit_svalue(data, true, w)?;
-                for index in indices {
-                    write!(w, ", {}", index)?;
+                for idx in 0..args.len() {
+                    if idx != 0 {
+                        write!(body, ", ")?;
+                    }
+                    write!(body, "{}", self.get_value(args[idx], true))?;
                 }
+                write!(body, ")\n")?
             }
-            SInst::ArrayLoad {
-                array,
-                position,
-                bounds,
+            Node::Read {
+                collect,
+                ref indices,
             } => {
-                self.emit_linear_index_calc(virt_reg, position, bounds, w)?;
-                write!(w, "%load_ptr_{} = getelementptr ", virt_reg)?;
-                let old_width = self.vector_width.take();
-                self.emit_type(&self.svalue_types[&self_svalue], w)?;
-                self.vector_width.set(old_width);
-                write!(w, ", ")?;
-                self.emit_svalue(array, true, w)?;
-                write!(w, ", ")?;
-                self.emit_type(&self.svalue_types[&position[0]], w)?;
-                write!(w, " %calc_linear_idx_{}\n  ", virt_reg)?;
-                emit_assign(w)?;
-                if let Some(width) = self.vector_width.get() {
-                    write!(w, "call ")?;
-                    self.emit_type(&self.svalue_types[&self_svalue], w)?;
-                    write!(w, " @llvm.masked.gather")?;
-                    self.emit_gather_scatter_suffix(width, &self.svalue_types[&self_svalue], w)?;
-                    write!(w, "(")?;
-                    self.emit_type(&self.svalue_types[array], w)?;
-                    write!(w, " %load_ptr_{}, i32 8, <{} x i1> <", virt_reg, width)?;
-                    for idx in 0..width {
-                        if idx != 0 {
-                            write!(w, ", ")?;
-                        }
-                        write!(w, "i1 true")?;
-                    }
-                    write!(w, ">, ")?;
-                    self.emit_type(&self.svalue_types[&self_svalue], w)?;
-                    write!(w, " undef)")?;
+                let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body;
+                let collect_name = self.get_value(collect, false);
+                let collect_ty = self.typing[collect.idx()];
+                let index_ptr_name =
+                    self.codegen_index_math(&collect_name, collect_ty, indices, body)?;
+                let self_ty = self.typing[id.idx()];
+                if self.types[self_ty.idx()].is_primitive() {
+                    // If this read reaches a primitive type, actually perform a
+                    // load.
+                    write!(
+                        body,
+                        "  {} = load {}, ptr {}\n",
+                        self.get_value(id, false),
+                        self.get_type(self_ty),
+                        index_ptr_name
+                    )?;
                 } else {
-                    write!(w, "load ")?;
-                    self.emit_type(&self.svalue_types[&self_svalue], w)?;
-                    write!(w, ", ptr %load_ptr_{}", virt_reg)?;
+                    // If this read doesn't reach a primitive type, just return
+                    // the calculated offset pointer for the sub-collection.
+                    write!(
+                        body,
+                        "  {} = bitcast ptr {} to ptr\n",
+                        self.get_value(id, false),
+                        index_ptr_name
+                    )?;
                 }
             }
-            SInst::ArrayStore {
-                array,
-                value,
-                position,
-                bounds,
+            Node::Write {
+                collect,
+                data,
+                ref indices,
             } => {
-                self.emit_linear_index_calc(virt_reg, position, bounds, w)?;
-                write!(w, "%store_ptr_{} = getelementptr ", virt_reg)?;
-                let old_width = self.vector_width.take();
-                self.emit_type(&self.svalue_types[value], w)?;
-                self.vector_width.set(old_width);
-                write!(w, ", ")?;
-                self.emit_svalue(array, true, w)?;
-                write!(w, ", ")?;
-                self.emit_type(&self.svalue_types[&position[0]], w)?;
-                write!(w, " %calc_linear_idx_{}\n  ", virt_reg)?;
-                if let Some(width) = self.vector_width.get() {
-                    write!(w, "call ")?;
-                    self.emit_type(&self.svalue_types[&self_svalue], w)?;
-                    write!(w, " @llvm.masked.scatter")?;
-                    self.emit_gather_scatter_suffix(width, &self.svalue_types[&self_svalue], w)?;
-                    write!(w, "(")?;
-                    self.emit_svalue(array, true, w)?;
-                    write!(w, ", ")?;
-                    self.emit_type(&self.svalue_types[array], w)?;
-                    write!(w, " %store_ptr_{}, i32 8, <{} x i1> <", virt_reg, width)?;
-                    for idx in 0..width {
-                        if idx != 0 {
-                            write!(w, ", ")?;
-                        }
-                        write!(w, "i1 true")?;
-                    }
-                    write!(w, ">)")?;
+                let body = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap().body;
+                let collect_name = self.get_value(collect, false);
+                let collect_ty = self.typing[collect.idx()];
+                let index_ptr_name =
+                    self.codegen_index_math(&collect_name, collect_ty, indices, body)?;
+                let data_ty = self.typing[data.idx()];
+                if self.types[data_ty.idx()].is_primitive() {
+                    // If the data item being written is a primitive type,
+                    // perform a single store of the data value.
+                    write!(
+                        body,
+                        "  store {}, ptr {}\n",
+                        self.get_value(data, true),
+                        index_ptr_name
+                    )?;
                 } else {
-                    write!(w, "store ")?;
-                    self.emit_svalue(value, true, w)?;
-                    write!(w, ", ptr %store_ptr_{}", virt_reg)?;
+                    // If the data item being written is not a primitive type,
+                    // then perform a memcpy from the data collection to the
+                    // destination collection.
+                    let data_size = self.codegen_type_size(data_ty, body)?;
+                    write!(
+                        body,
+                        "  call void @llvm.memcpy.p0.p0.i64(ptr {}, {}, i64 {}, i1 false)\n",
+                        index_ptr_name,
+                        self.get_value(data, true),
+                        data_size
+                    )?;
                 }
+                // Both branches wrote through a pointer derived from `collect`,
+                // so this node's own value must be defined on both paths; it
+                // is simply the destination collection's pointer.
+                let write_name = self.get_value(id, false);
+                let collect_val = self.get_value(collect, true);
+                write!(body, "  {} = bitcast {} to ptr\n", write_name, collect_val)?;
             }
+            _ => panic!("PANIC: Can't lower {:?}.", self.function.nodes[id.idx()]),
         }
-        write!(w, "\n")?;
-
         Ok(())
     }
 
     /*
-     * Implement the index math to convert a multi-dimensional position to a
-     * linear position inside an array.
+     * Calculate all of the dynamic constants upfront.
      */
-    fn emit_linear_index_calc<W: Write>(
+    fn codegen_dynamic_constants(
         &self,
-        virt_reg: usize,
-        position: &[SValue],
-        bounds: &[SValue],
-        w: &mut W,
+        block: &mut LLVMBlock,
+        num_dc_params: u32,
     ) -> Result<(), Error> {
-        assert_eq!(position.len(), bounds.len());
-
-        if position.len() == 1 {
-            write!(w, "%calc_linear_idx_{} = add ", virt_reg)?;
-            self.emit_svalue(&position[0], true, w)?;
-            write!(w, ", zeroinitializer\n  ")?;
-        } else if position.len() == 2 {
-            write!(w, "%calc_linear_idx_{}_0 = mul ", virt_reg)?;
-            self.emit_svalue(&position[0], true, w)?;
-            write!(w, ", ")?;
-            self.emit_svalue(&bounds[1], false, w)?;
-            write!(w, "\n  %calc_linear_idx_{} = add ", virt_reg)?;
-            self.emit_svalue(&position[1], true, w)?;
-            write!(w, ", %calc_linear_idx_{}_0", virt_reg)?;
-            write!(w, "\n  ")?;
-        } else {
-            todo!("TODO: Handle the 3 or more dimensional array case.")
+        let body = &mut block.body;
+        for dc in dynamic_constants_bottom_up(&self.dynamic_constants) {
+            match self.dynamic_constants[dc.idx()] {
+                DynamicConstant::Constant(val) => {
+                    write!(body, "  %dc{} = bitcast i64 {} to i64\n", dc.idx(), val)?
+                }
+                DynamicConstant::Parameter(idx) => {
+                    if idx < num_dc_params as usize {
+                        write!(
+                            body,
+                            "  %dc{} = bitcast i64 %dc_p{} to i64\n",
+                            dc.idx(),
+                            idx
+                        )?
+                    } else {
+                        write!(body, "  %dc{} = bitcast i64 0 to i64\n", dc.idx())?
+                    }
+                }
+                DynamicConstant::Add(left, right) => write!(
+                    body,
+                    "  %dc{} = add i64%dc{},%dc{}\n",
+                    dc.idx(),
+                    left.idx(),
+                    right.idx()
+                )?,
+                DynamicConstant::Sub(left, right) => write!(
+                    body,
+                    "  %dc{} = sub i64%dc{},%dc{}\n",
+                    dc.idx(),
+                    left.idx(),
+                    right.idx()
+                )?,
+                DynamicConstant::Mul(left, right) => write!(
+                    body,
+                    "  %dc{} = mul i64%dc{},%dc{}\n",
+                    dc.idx(),
+                    left.idx(),
+                    right.idx()
+                )?,
+                DynamicConstant::Div(left, right) => write!(
+                    body,
+                    "  %dc{} = udiv i64%dc{},%dc{}\n",
+                    dc.idx(),
+                    left.idx(),
+                    right.idx()
+                )?,
+                DynamicConstant::Rem(left, right) => write!(
+                    body,
+                    "  %dc{} = urem i64%dc{},%dc{}\n",
+                    dc.idx(),
+                    left.idx(),
+                    right.idx()
+                )?,
+            }
         }
-
         Ok(())
     }
 
     /*
-     * LLVM intrinsics are a pain to emit textually...
+     * Emit logic to index into a collection.
      */
-    fn intrinsic_type_str(elem_ty: &SType) -> &'static str {
-        // We can't just use our previous routines for emitting types, because
-        // only inside intrinsics does LLVM use "f32" and "f64" properly!
-        match elem_ty {
-            SType::Boolean => "i1",
-            SType::Integer8 | SType::UnsignedInteger8 => "i8",
-            SType::Integer16 | SType::UnsignedInteger16 => "i16",
-            SType::Integer32 | SType::UnsignedInteger32 => "i32",
-            SType::Integer64 | SType::UnsignedInteger64 => "i64",
-            SType::Float32 => "f32",
-            SType::Float64 => "f64",
-            _ => panic!(),
-        }
-    }
-
-    fn emit_reduce_suffix<W: Write>(
+    fn codegen_index_math(
         &self,
-        width: usize,
-        elem_ty: &SType,
-        w: &mut W,
-    ) -> Result<(), Error> {
-        write!(w, ".v{}{}", width, Self::intrinsic_type_str(elem_ty))?;
-        Ok(())
-    }
+        collect_name: &str,
+        collect_ty: TypeID,
+        indices: &[Index],
+        body: &mut String,
+    ) -> Result<String, Error> {
+        let mut acc_ptr = collect_name.to_string();
+        for index in indices {
+            match index {
+                Index::Field(idx) => {
+                    let Type::Product(ref fields) = self.types[collect_ty.idx()] else {
+                        panic!()
+                    };
 
-    fn emit_gather_scatter_suffix<W: Write>(
-        &self,
-        width: usize,
-        elem_ty: &SType,
-        w: &mut W,
-    ) -> Result<(), Error> {
-        write!(
-            w,
-            ".v{}{}.v{}p0",
-            width,
-            Self::intrinsic_type_str(elem_ty),
-            width
-        )?;
-        Ok(())
+                    // Get the offset of the field at index `idx` by calculating
+                    // the product's size up to field `idx`, then offsetting the
+                    // base pointer by that amount.
+                    let mut acc_offset = "0".to_string();
+                    for field in &fields[..*idx] {
+                        let field_align = get_type_alignment(&self.types, *field);
+                        let field = self.codegen_type_size(*field, body)?;
+                        acc_offset = Self::round_up_to(&acc_offset, field_align, body)?;
+                        acc_offset = Self::append(&acc_offset, &field, body)?;
+                    }
+                    acc_offset = Self::round_up_to(
+                        &acc_offset,
+                        get_type_alignment(&self.types, fields[*idx]),
+                        body,
+                    )?;
+                    acc_ptr = Self::gep(collect_name, &acc_offset, body)?;
+                }
+                Index::Variant(_) => {
+                    // The tag of a summation is at the end of the summation, so
+                    // the variant pointer is just the base pointer. Do nothing.
+                }
+                Index::Position(ref pos) => {
+                    let Type::Array(elem, ref dims) = self.types[collect_ty.idx()] else {
+                        panic!()
+                    };
+
+                    // The offset of the position into an array is:
+                    //
+                    //     ((0 * s1 + p1) * s2 + p2) * s3 + p3 ...
+                    let elem_size = self.codegen_type_size(elem, body)?;
+                    let mut acc_offset = "0".to_string();
+                    for (p, s) in zip(pos, dims) {
+                        let p = self.get_value(*p, false);
+                        let s = format!("%dc{}", s.idx());
+                        acc_offset = Self::multiply(&acc_offset, &s, body)?;
+                        acc_offset = Self::append(&acc_offset, &p, body)?;
+                    }
+
+                    // Convert offset in # elements -> # bytes.
+                    acc_offset = Self::multiply(&acc_offset, &elem_size, body)?;
+                    acc_ptr = Self::gep(collect_name, &acc_offset, body)?;
+                }
+            }
+        }
+        Ok(acc_ptr)
     }
 
     /*
-     * Emit the loop header implementing a sequential fork-join. For historical
-     * reasons, "sequential" fork-joins are just fork-joins that are lowered to
-     * LLVM level loops. This includes fork-joins that end up getting
-     * parallelized across threads via low/high bounds.
+     * Emit logic to calculate the size of a type. This needs to be emitted as
+     * IR since the size of an array may depend on dynamic constants.
      */
-    fn emit_fork_join_seq_header<W: Write>(
-        &self,
-        fork_join_id: ForkJoinID,
-        block_idx: usize,
-        w: &mut W,
-    ) -> Result<(), Error> {
-        let info = &self.parallel_reduce_infos[&fork_join_id];
-        let entry_name = &self.block_names[&(info.predecessor, Some(fork_join_id))];
-        let loop_name = &self.block_names[&(info.reduce_block, Some(fork_join_id))];
-        let parallel_launch = self.manifest.device.num_parallel_launch_dims() > 0 && info.top_level;
-
-        // Start the header of the loop.
-        write!(w, "fork_join_seq_header_{}:\n", fork_join_id.idx())?;
-
-        // Emit the phis for the linear loop index variable and the reduction
-        // variables.
-        write!(
-            w,
-            "  %linear_{} = phi i64 [ 0, %{} ], [ %linear_{}_inc, %{} ]\n",
-            block_idx, entry_name, block_idx, loop_name,
-        )?;
-        for (var_num, virt_reg) in info.reduction_variables.iter() {
-            write!(w, "  %v{} = phi ", virt_reg)?;
-            self.emit_type(&self.svalue_types[&SValue::VirtualRegister(*virt_reg)], w)?;
-            write!(w, " [ ")?;
-            self.emit_svalue(&info.reduce_inits[*var_num], false, w)?;
-            write!(w, ", %{} ], [ ", entry_name)?;
-            self.emit_svalue(&info.reduce_reducts[*var_num], false, w)?;
-            write!(w, ", %{} ]\n", loop_name)?;
-        }
+    fn codegen_type_size(&self, ty: TypeID, body: &mut String) -> Result<String, Error> {
+        match self.types[ty.idx()] {
+            Type::Control => panic!(),
+            Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => Ok("1".to_string()),
+            Type::Integer16 | Type::UnsignedInteger16 => Ok("2".to_string()),
+            Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => Ok("4".to_string()),
+            Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => Ok("8".to_string()),
+            Type::Product(ref fields) => {
+                let fields_align = fields
+                    .into_iter()
+                    .map(|id| get_type_alignment(&self.types, *id));
+                let fields: Result<Vec<String>, Error> = fields
+                    .into_iter()
+                    .map(|id| self.codegen_type_size(*id, body))
+                    .collect();
+
+                // Emit LLVM IR to round up to the alignment of the next field,
+                // and then add the size of that field. At the end, round up to
+                // the alignment of the whole struct.
+                let mut acc_size = "0".to_string();
+                for (field_align, field) in zip(fields_align, fields?) {
+                    acc_size = Self::round_up_to(&acc_size, field_align, body)?;
+                    acc_size = Self::append(&acc_size, &field, body)?;
+                }
+                Self::round_up_to(&acc_size, get_type_alignment(&self.types, ty), body)
+            }
+            Type::Summation(ref variants) => {
+                let variants: Result<Vec<String>, Error> = variants
+                    .into_iter()
+                    .map(|id| self.codegen_type_size(*id, body))
+                    .collect();
+
+                // The size of a summation is the size of the largest field,
+                // plus 1 byte and alignment for the discriminant.
+                let mut acc_size = "0".to_string();
+                for variant in variants? {
+                    acc_size = Self::max(&acc_size, &variant, body)?;
+                }
 
-        // Calculate the loop bounds.
-        if info.thread_counts.len() == 1 {
-            write!(w, "  %bound_{} = add i64 0, ", block_idx)?;
-            if parallel_launch {
-                write!(w, "%parallel_launch_0_len")?;
-            } else {
-                self.emit_svalue(&info.thread_counts[0], false, w)?;
+                // No alignment is necessary for the 1 byte discriminant.
+                acc_size = Self::append(&acc_size, "1", body)?;
+                Self::round_up_to(&acc_size, get_type_alignment(&self.types, ty), body)
             }
-            write!(w, "\n")?;
-        } else if info.thread_counts.len() == 2 {
-            write!(w, "  %bound_{} = mul ", block_idx)?;
-            if parallel_launch {
-                write!(w, "i64 %parallel_launch_0_len, %parallel_launch_1_len")?;
-            } else {
-                self.emit_svalue(&info.thread_counts[0], true, w)?;
-                write!(w, ", ")?;
-                self.emit_svalue(&info.thread_counts[1], false, w)?;
+            Type::Array(elem, ref bounds) => {
+                // The size of an array is the size of the element multiplied by
+                // the dynamic constant bounds.
+                let mut acc_size = self.codegen_type_size(elem, body)?;
+                for dc in bounds {
+                    acc_size = Self::multiply(&acc_size, &format!("%dc{}", dc.idx()), body)?;
+                }
+                Ok(acc_size)
             }
-            write!(w, "\n")?;
-        } else {
-            todo!("TODO: Handle the 3 or more dimensional fork-join case.")
         }
+    }
 
-        // Calculate the multi-dimensional thread indices.
-        if info.thread_counts.len() == 1 && parallel_launch {
-            write!(
-                w,
-                "  %thread_id_{}_0 = add i64 %parallel_launch_0_low, %linear_{}\n",
-                fork_join_id.idx(),
-                block_idx
-            )?;
-        } else if info.thread_counts.len() == 1 {
-            write!(
-                w,
-                "  %thread_id_{}_0 = add i64 0, %linear_{}\n",
-                fork_join_id.idx(),
-                block_idx
-            )?;
-        } else if info.thread_counts.len() == 2 && parallel_launch {
-            write!(
-                w,
-                "  %unshifted_id_{}_0 = udiv i64 %linear_{}, %parallel_launch_1_len\n",
-                fork_join_id.idx(),
-                block_idx
-            )?;
-            write!(
-                w,
-                "  %unshifted_id_{}_1 = urem i64 %linear_{}, %parallel_launch_1_len\n",
-                fork_join_id.idx(),
-                block_idx
-            )?;
-            write!(
-                w,
-                "  %thread_id_{}_0 = add i64 %unshifted_id_{}_0, %parallel_launch_0_low\n",
-                fork_join_id.idx(),
-                fork_join_id.idx(),
-            )?;
-            write!(
-                w,
-                "  %thread_id_{}_1 = add i64 %unshifted_id_{}_1, %parallel_launch_1_low\n",
-                fork_join_id.idx(),
-                fork_join_id.idx(),
-            )?;
-        } else if info.thread_counts.len() == 2 {
-            write!(
-                w,
-                "  %thread_id_{}_0 = udiv i64 %linear_{}, ",
-                fork_join_id.idx(),
-                block_idx
-            )?;
-            self.emit_svalue(&info.thread_counts[1], false, w)?;
-            write!(w, "\n")?;
-            write!(
-                w,
-                "  %thread_id_{}_1 = urem i64 %linear_{}, ",
-                fork_join_id.idx(),
-                block_idx
-            )?;
-            self.emit_svalue(&info.thread_counts[1], false, w)?;
-            write!(w, "\n")?;
+    fn get_value(&self, id: NodeID, ty: bool) -> String {
+        if ty {
+            format!("{} %v{}", self.get_type(self.typing[id.idx()]), id.idx())
         } else {
-            todo!("TODO: Handle the 3 or more dimensional fork-join case.")
+            format!("%v{}", id.idx())
         }
+    }
 
-        // Increment the linear index.
-        write!(
-            w,
-            "  %linear_{}_inc = add i64 %linear_{}, 1\n",
-            block_idx, block_idx
-        )?;
+    fn get_block_name(&self, id: NodeID) -> String {
+        format!("bb_{}", id.idx())
+    }
+
+    fn get_type(&self, id: TypeID) -> &'static str {
+        convert_type(&self.types[id.idx()])
+    }
 
-        // Emit the branch.
+    // Use the trick that given a number `x` and a power of two `m`, we can
+    // compute rounding up `x` to the next multiple of
+    // `m` via:
+    //
+    //     (x + m - 1) & -m
+    //
+    // Which is equivalent to the following LLVM IR (`m` is a constant):
+    //
+    //     %1 = add i64 %x, (m-1)
+    //     %2 = and i64 %1, -m
+    fn round_up_to(x: &str, m: usize, body: &mut String) -> Result<String, Error> {
+        let init_body_len = Self::gen_filler_id();
         write!(
-            w,
-            "  %cond_{} = icmp ult i64 %linear_{}, %bound_{}\n",
-            block_idx, block_idx, block_idx
+            body,
+            "  %round_up_to.{} = add i64 {}, {}\n",
+            init_body_len,
+            x,
+            m - 1
         )?;
-        let top_name = &self.block_names[&(BlockID::new(block_idx), Some(fork_join_id))];
-        let succ_name = &self.block_names[&(info.successor, Some(fork_join_id))];
+        let name = format!("%round_up_to.{}", Self::gen_filler_id());
         write!(
-            w,
-            "  br i1 %cond_{}, label %{}, label %{}\n",
-            block_idx, top_name, succ_name
+            body,
+            "  {} = and i64 %round_up_to.{}, -{}\n",
+            name, init_body_len, m
         )?;
-
-        Ok(())
+        Ok(name)
     }
 
-    /*
-     * Calculate and emit block-level info for vectorized parallel blocks.
-     */
-    fn setup_vectorized_parallel_block<W: Write>(
-        &self,
-        width: usize,
-        w: &mut W,
-    ) -> Result<(), Error> {
-        let (block_idx, block) = self.block.get();
-
-        // Get the uses of virtual registers defined outside the
-        // vectorized region.
-        let mut outside_def_used_in_vector = HashSet::new();
-        for inst in block.insts.iter() {
-            for virt_reg in sched_get_uses(inst).filter_map(|svalue| svalue.try_virt_reg()) {
-                let outside = match self.virt_reg_to_inst_id.get(&virt_reg) {
-                    Some(use_inst_id) => {
-                        block.kind != self.function.blocks[use_inst_id.idx_0()].kind
-                    }
-                    // Parameters are always defined outside the vectorized
-                    // region as scalars.
-                    None => true,
-                };
-                if outside {
-                    outside_def_used_in_vector.insert(virt_reg);
-                }
-            }
-        }
-
-        // Broadcast scalar values into vector values. The vector
-        // register produced needs to be indexed in name by the block
-        // index. This is because we may end up using the same value in
-        // multiple vectorized blocks, and we can't have those
-        // vectorized scalars have the same name.
-        for outside_virt_reg in outside_def_used_in_vector.iter() {
-            write!(
-                w,
-                "  %vec1_{}_v{} = insertelement <1 x ",
-                block_idx, outside_virt_reg
-            )?;
-            let elem_ty = &self.svalue_types[&SValue::VirtualRegister(*outside_virt_reg)];
-            self.emit_type(elem_ty, w)?;
-            write!(w, "> undef, ")?;
-            self.emit_type(elem_ty, w)?;
-            write!(w, " %v{}, i32 0\n", outside_virt_reg)?;
-            write!(
-                w,
-                "  %vec_{}_v{} = shufflevector <1 x ",
-                block_idx, outside_virt_reg
-            )?;
-            self.emit_type(elem_ty, w)?;
-            write!(w, "> %vec1_{}_v{}, <1 x ", block_idx, outside_virt_reg)?;
-            self.emit_type(elem_ty, w)?;
-            write!(w, "> undef, <{} x i32> zeroinitializer\n", width)?;
-        }
-
-        // Set the cell values in the context.
-        self.vector_width.set(Some(width));
-        self.outside_def_used_in_vector
-            .replace(outside_def_used_in_vector);
-
-        Ok(())
+    // Emit LLVM IR to add the size of the next field.
+    fn append(x: &str, f: &str, body: &mut String) -> Result<String, Error> {
+        let name = format!("%append.{}", Self::gen_filler_id());
+        write!(body, "  {} = add i64 {}, {}\n", name, x, f)?;
+        Ok(name)
     }
 
-    /*
-     * Calculate and emit block-level info for vectorized reduce blocks.
-     */
-    fn setup_vectorized_reduce_block<W: Write>(
-        &self,
-        fork_join_id: ForkJoinID,
-        width: usize,
-        w: &mut W,
-    ) -> Result<(), Error> {
-        let (block_idx, block) = self.block.get();
-
-        // Get uses of vector values defined in the parallel region.
-        let mut vectors_from_parallel = HashSet::new();
-        for inst in block.insts.iter() {
-            for virt_reg in sched_get_uses(inst).filter_map(|svalue| svalue.try_virt_reg()) {
-                if let Some(inst_id) = self.virt_reg_to_inst_id.get(&virt_reg)
-                    && self.function.blocks[inst_id.idx_0()].kind
-                        == SBlockKind::Parallel(fork_join_id)
-                {
-                    vectors_from_parallel.insert(virt_reg);
-                }
-            }
-        }
-
-        // Each reduction may be representable by an LLVM reduction intrinsic.
-        // If every reduction in this reduce block is, then we don't need to
-        // generate an explicit loop. If any one reduction isn't representable
-        // as a single intrinsic, then we need to generate an explicit loop. The
-        // explicit loop calculates the reduction for all reductions that can't
-        // be represented by intrinsics, while intrinsics are still used to
-        // calculate reductions that can be represented by them. Currently, the
-        // "associative" schedule captures this info per reduction variable.
-        let all_intrinsic_representable = block
-            .insts
-            .iter()
-            .enumerate()
-            .filter(|(_, inst)| inst.is_reduction_variable())
-            .all(|(inst_idx, _)| block.schedules[&inst_idx].contains(&SSchedule::Associative));
-        if !all_intrinsic_representable {
-            let info = &self.parallel_reduce_infos[&fork_join_id];
-            let entry_name = &self.block_names[&(info.bottom_parallel_block, Some(fork_join_id))];
-            let self_name = &self.block_names[&(info.reduce_block, Some(fork_join_id))];
-            let succ_name = &self.block_names[&(info.successor, Some(fork_join_id))];
-
-            // Emit a loop header for the reduce.
-            write!(
-                w,
-                "  %linear_{} = phi i64 [ 0, %{} ], [ %linear_{}_inc, %{}_reduce_body ]\n",
-                block_idx, entry_name, block_idx, self_name,
-            )?;
-            // Emit phis for reduction variables here, since they need to be
-            // above everything emitted below.
-            for (var_num, virt_reg) in info.reduction_variables.iter() {
-                // Only emit phis for reduction variables that aren't
-                // implemented in intrinsics.
-                if !block.schedules[&self.virt_reg_to_inst_id[virt_reg].idx_1()]
-                    .contains(&SSchedule::Associative)
-                {
-                    write!(w, "  %v{} = phi ", virt_reg)?;
-                    self.emit_type(&self.svalue_types[&SValue::VirtualRegister(*virt_reg)], w)?;
-                    write!(w, " [ ")?;
-                    self.emit_svalue(&info.reduce_inits[*var_num], false, w)?;
-                    write!(w, ", %{} ], [ ", entry_name)?;
-                    self.emit_svalue(&info.reduce_reducts[*var_num], false, w)?;
-                    write!(w, ", %{}_reduce_body ]\n", self_name)?;
-                }
-            }
-            write!(
-                w,
-                "  %linear_{}_inc = add i64 %linear_{}, 1\n",
-                block_idx, block_idx
-            )?;
-            // The loop bound is the constant vector width.
-            write!(
-                w,
-                "  %cond_{} = icmp ult i64 %linear_{}, {}\n",
-                block_idx, block_idx, width
-            )?;
-            // Branch to the reduce loop body.
-            write!(
-                w,
-                "  br i1 %cond_{}, label %{}_reduce_body, label %{}\n",
-                block_idx, self_name, succ_name
-            )?;
-            // The rest of the reduce block gets put into a "body" block.
-            write!(w, "{}_reduce_body:\n", self_name)?;
-            // Extract the needed element from the used parallel vectors.
-            self.vector_width.set(Some(width));
-            for virt_reg in vectors_from_parallel.iter() {
-                write!(w, "  %extract_v{} = extractelement ", virt_reg)?;
-                self.emit_svalue(&SValue::VirtualRegister(*virt_reg), true, w)?;
-                write!(w, ", i64 %linear_{}\n", block_idx)?;
-            }
-            self.vector_width.set(None);
-
-            // Signal that the terminator needs to be a conditional branch to
-            // close the loop.
-            self.vector_reduce_cycle.set(true);
-        }
+    // Emit LLVM IR to get the maximum of two sizes.
+    fn max(a: &str, b: &str, body: &mut String) -> Result<String, Error> {
+        let name = format!("%max.{}", Self::gen_filler_id());
+        write!(
+            body,
+            "  {} = call i64 @llvm.umax.i64(i64 {}, i64 {})\n",
+            name, a, b
+        )?;
+        Ok(name)
+    }
 
-        let vector_reduce_associative_vars = block
-            .insts
-            .iter()
-            .enumerate()
-            .filter_map(|(inst_idx, inst)| {
-                inst.try_reduction_variable()
-                    .map(|num| (inst_idx, block.virt_regs[inst_idx].0, num))
-            })
-            .filter(|(inst_idx, _, _)| block.schedules[&inst_idx].contains(&SSchedule::Associative))
-            .map(|(_, virt_reg, num)| (virt_reg, num))
-            .collect();
+    // Emit LLVM IR to multiply two sizes.
+    fn multiply(a: &str, b: &str, body: &mut String) -> Result<String, Error> {
+        let name = format!("%multiply.{}", Self::gen_filler_id());
+        write!(body, "  {} = mul i64 {}, {}\n", name, a, b)?;
+        Ok(name)
+    }
 
-        self.vectors_from_parallel.replace(vectors_from_parallel);
-        self.vector_reduce_associative_vars
-            .replace(vector_reduce_associative_vars);
+    // Emit LLVM IR to gep a pointer from a byte size.
+    fn gep(ptr: &str, size: &str, body: &mut String) -> Result<String, Error> {
+        let name = format!("%gep.{}", Self::gen_filler_id());
+        write!(
+            body,
+            "  {} = getelementptr i8, ptr {}, i64 {}\n",
+            name, ptr, size
+        )?;
+        Ok(name)
+    }
 
-        Ok(())
+    fn gen_filler_id() -> usize {
+        NUM_FILLER_REGS.fetch_add(1, Ordering::Relaxed)
     }
+}
 
-    /*
-     * Reset the cells storing block specific context configuration.
-     */
-    pub fn reset_cells(&self) {
-        self.vector_width.take();
-        self.outside_def_used_in_vector.take();
-        self.vectors_from_parallel.take();
-        self.vector_reduce_associative_vars.take();
-        self.vector_reduce_cycle.take();
+fn convert_type(ty: &Type) -> &'static str {
+    match ty {
+        Type::Boolean => "i1",
+        Type::Integer8 | Type::UnsignedInteger8 => "i8",
+        Type::Integer16 | Type::UnsignedInteger16 => "i16",
+        Type::Integer32 | Type::UnsignedInteger32 => "i32",
+        Type::Integer64 | Type::UnsignedInteger64 => "i64",
+        Type::Float32 => "float",
+        Type::Float64 => "double",
+        Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "ptr",
+        _ => panic!(),
     }
 }
 
-impl SBinaryOperator {
-    fn get_llvm_op(&self, left_ty: &SType) -> &'static str {
-        enum OpTy {
-            Float,
-            Unsigned,
-            Signed,
-        }
+fn convert_intrinsic(intrinsic: &Intrinsic, ty: &Type) -> String {
+    let intrinsic = match intrinsic {
+        Intrinsic::Abs => "abs",
+        Intrinsic::ACos => "acos",
+        Intrinsic::ASin => "asin",
+        Intrinsic::ATan => "atan",
+        Intrinsic::ATan2 => "atan2",
+        Intrinsic::Ceil => "ceil",
+        Intrinsic::Cos => "cos",
+        Intrinsic::Cosh => "cosh",
+        Intrinsic::Exp => "exp",
+        Intrinsic::Exp2 => "exp2",
+        Intrinsic::Floor => "floor",
+        Intrinsic::Ln => "log",
+        Intrinsic::Log10 => "log10",
+        Intrinsic::Log2 => "log2",
+        Intrinsic::Pow => "pow",
+        Intrinsic::Powf => "pow",
+        Intrinsic::Powi => "powi",
+        Intrinsic::Round => "round",
+        Intrinsic::Sin => "sin",
+        Intrinsic::Sinh => "sinh",
+        Intrinsic::Sqrt => "sqrt",
+        Intrinsic::Tan => "tan",
+        Intrinsic::Tanh => "tanh",
+        _ => panic!(),
+    };
 
-        let op_ty = if left_ty.is_float() {
-            OpTy::Float
-        } else if left_ty.is_unsigned() {
-            OpTy::Unsigned
-        } else {
-            OpTy::Signed
-        };
+    // We can't just use our previous routines for emitting types, because only
+    // inside intrinsics does LLVM use "f32" and "f64" properly!
+    let ty = match ty {
+        Type::Boolean => "i1",
+        Type::Integer8 | Type::UnsignedInteger8 => "i8",
+        Type::Integer16 | Type::UnsignedInteger16 => "i16",
+        Type::Integer32 | Type::UnsignedInteger32 => "i32",
+        Type::Integer64 | Type::UnsignedInteger64 => "i64",
+        Type::Float32 => "f32",
+        Type::Float64 => "f64",
+        _ => panic!(),
+    };
 
-        match (self, op_ty) {
-            (SBinaryOperator::Add, OpTy::Float) => "fadd",
-            (SBinaryOperator::Add, _) => "add",
-            (SBinaryOperator::Sub, OpTy::Float) => "fsub",
-            (SBinaryOperator::Sub, _) => "sub",
-            (SBinaryOperator::Mul, OpTy::Float) => "fmul",
-            (SBinaryOperator::Mul, _) => "mul",
-            (SBinaryOperator::Div, OpTy::Float) => "fdiv",
-            (SBinaryOperator::Div, OpTy::Unsigned) => "udiv",
-            (SBinaryOperator::Div, OpTy::Signed) => "sdiv",
-            (SBinaryOperator::Rem, OpTy::Float) => "frem",
-            (SBinaryOperator::Rem, OpTy::Unsigned) => "urem",
-            (SBinaryOperator::Rem, OpTy::Signed) => "srem",
-            (SBinaryOperator::LT, OpTy::Float) => "fcmp olt",
-            (SBinaryOperator::LT, OpTy::Unsigned) => "icmp ult",
-            (SBinaryOperator::LT, OpTy::Signed) => "icmp slt",
-            (SBinaryOperator::LTE, OpTy::Float) => "fcmp ole",
-            (SBinaryOperator::LTE, OpTy::Unsigned) => "icmp ule",
-            (SBinaryOperator::LTE, OpTy::Signed) => "icmp sle",
-            (SBinaryOperator::GT, OpTy::Float) => "fcmp ogt",
-            (SBinaryOperator::GT, OpTy::Unsigned) => "icmp ugt",
-            (SBinaryOperator::GT, OpTy::Signed) => "icmp sgt",
-            (SBinaryOperator::GTE, OpTy::Float) => "fcmp oge",
-            (SBinaryOperator::GTE, OpTy::Unsigned) => "icmp uge",
-            (SBinaryOperator::GTE, OpTy::Signed) => "icmp sge",
-            (SBinaryOperator::EQ, OpTy::Float) => "fcmp oeq",
-            (SBinaryOperator::EQ, _) => "icmp eq",
-            (SBinaryOperator::NE, OpTy::Float) => "fcmp one",
-            (SBinaryOperator::NE, _) => "icmp ne",
-            (SBinaryOperator::Or, _) => "or",
-            (SBinaryOperator::And, _) => "and",
-            (SBinaryOperator::Xor, _) => "xor",
-            (SBinaryOperator::LSh, _) => "lsh",
-            (SBinaryOperator::RSh, OpTy::Unsigned) => "lshr",
-            (SBinaryOperator::RSh, _) => "ashr",
-        }
-    }
+    format!("@llvm.{}.{}", intrinsic, ty)
 }
diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs
index 88171d33923fc773bd8ba8377e6eba80986612be..9013eff7fd2c310388d9869050b38d235a4e205c 100644
--- a/hercules_cg/src/lib.rs
+++ b/hercules_cg/src/lib.rs
@@ -1,15 +1,9 @@
-#![feature(let_chains, iter_intersperse, map_try_insert)]
+#![feature(if_let_guard, let_chains)]
 
 pub mod cpu;
-pub mod manifest;
-pub mod sched_dot;
-pub mod sched_gen;
-pub mod sched_ir;
-pub mod sched_schedule;
+pub mod mem;
+pub mod rt;
 
 pub use crate::cpu::*;
-pub use crate::manifest::*;
-pub use crate::sched_dot::*;
-pub use crate::sched_gen::*;
-pub use crate::sched_ir::*;
-pub use crate::sched_schedule::*;
+pub use crate::mem::*;
+pub use crate::rt::*;
diff --git a/hercules_cg/src/manifest.rs b/hercules_cg/src/manifest.rs
deleted file mode 100644
index d9224c05b93435ec33c67ac261f70c47e9f3622b..0000000000000000000000000000000000000000
--- a/hercules_cg/src/manifest.rs
+++ /dev/null
@@ -1,180 +0,0 @@
-extern crate serde;
-
-extern crate hercules_ir;
-
-use std::collections::BTreeSet;
-use std::iter::once;
-
-use self::serde::Deserialize;
-use self::serde::Serialize;
-
-use self::hercules_ir::*;
-
-use crate::*;
-
-/*
- * A manifest stores metadata about a Hercules function. This metadata is used
- * by the runtime to actually call a Hercules function.
- */
-#[derive(Debug, Clone, Hash, Serialize, Deserialize)]
-pub struct Manifest {
-    // The signature of each Hercules function is represented in terms of
-    // STypes, since this is the lowest level type representation that Hercules
-    // constructs before reaching target-specific backends.
-    pub param_types: Vec<(SType, ParameterKind)>,
-    pub return_type: SType,
-
-    // The dynamic constants (potentially) used in this Hercules function.
-    pub dynamic_constants: Vec<DynamicConstant>,
-    // The dimensions for array constants defined and used in this Hercules
-    // function.
-    pub array_constants: Vec<Box<[DynamicConstantID]>>,
-
-    // The partitions that make up this Hercules function.
-    pub partitions: Vec<PartitionManifest>,
-}
-
-#[derive(Debug, Clone, Hash, Serialize, Deserialize)]
-pub struct PartitionManifest {
-    // Each partition has one corresponding SFunction.
-    pub name: SFunctionName,
-    // Record the type and kind of each parameter.
-    pub parameters: Vec<(SType, ParameterKind)>,
-    // Record the type and kind of each return value.
-    pub returns: Vec<(SType, ReturnKind)>,
-    // Record the list of possible successors from this partition.
-    pub successors: Vec<PartitionID>,
-    // Device specific parts of the manifest. Represents details of calling
-    // partition functions not present in the schedule IR type information
-    // (since schedule IR is target independent).
-    pub device: DeviceManifest,
-}
-
-#[derive(Debug, Clone, Hash, Serialize, Deserialize, PartialEq, Eq)]
-pub enum ParameterKind {
-    // A parameter corresponding to a parameter of the Hercules function.
-    HerculesParameter(usize),
-    // A parameter corresponding to some data defined in some other partition.
-    DataInput(NodeID),
-    // A parameter corresponding to a dynamic constant input to the Hercules
-    // function.
-    DynamicConstant(usize),
-    // A parameter corresponding to an array constant used in the partition.
-    ArrayConstant(ArrayID),
-}
-
-#[derive(Debug, Clone, Hash, Serialize, Deserialize)]
-pub enum ReturnKind {
-    // A return value corresponding to the return value of the Hercules
-    // function.
-    HerculesReturn,
-    // A return value corresponding to some data used in some other partition.
-    DataOutput(NodeID),
-    // An integer specifying which partition should be executed next, if this
-    // partition has multiple successors.
-    NextPartition,
-}
-
-#[derive(Debug, Clone, Hash, Serialize, Deserialize)]
-pub enum DeviceManifest {
-    CPU {
-        // If there's a top level fork-join that we parallel launch, specify,
-        // for each thread dimension, how many tiles we want to spawn, and
-        // the thread count. The thread count is a dynamic constant.
-        parallel_launch: Box<[(usize, DynamicConstantID)]>,
-    },
-    GPU,
-    Call {
-        // This is a Hercules function name, not a schedule IR function name.
-        callee: String,
-    },
-}
-
-impl Manifest {
-    pub fn all_visible_types(&self) -> impl Iterator<Item = SType> + '_ {
-        self.param_types
-            // Include the Hercules function parameter types.
-            .iter()
-            .map(|(ty, _)| ty.clone())
-            // Include the Hercules function return type.
-            .chain(once(self.return_type.clone()))
-            // Include the partition parameter types.
-            .chain(
-                self.partitions
-                    .iter()
-                    .map(|partition| partition.parameters.iter().map(|(ty, _)| ty.clone()))
-                    .flatten(),
-            )
-            // Include the partition return types.
-            .chain(
-                self.partitions
-                    .iter()
-                    .map(|partition| partition.returns.iter().map(|(ty, _)| ty.clone()))
-                    .flatten(),
-            )
-            // Include the product types formed by the partition return types,
-            // since multiple return values are returned inside a struct.
-            .chain(self.partitions.iter().map(|partition| {
-                SType::Product(partition.returns.iter().map(|(ty, _)| ty.clone()).collect())
-            }))
-    }
-
-    pub fn transitive_closure_type_set(type_set: BTreeSet<SType>) -> BTreeSet<SType> {
-        let mut closure = BTreeSet::new();
-        let mut workset: BTreeSet<&SType> = type_set.iter().collect();
-
-        while let Some(ty) = workset.pop_last() {
-            match ty {
-                SType::Product(fields) => workset.extend(fields),
-                SType::ArrayRef(elem) => {
-                    workset.insert(elem);
-                }
-                _ => {}
-            }
-            closure.insert(ty.clone());
-        }
-
-        closure
-    }
-}
-
-impl PartitionManifest {
-    pub fn data_inputs(&self) -> impl Iterator<Item = (NodeID, &SType)> + '_ {
-        self.parameters.iter().filter_map(|(stype, param_kind)| {
-            if let ParameterKind::DataInput(id) = param_kind {
-                Some((*id, stype))
-            } else {
-                None
-            }
-        })
-    }
-
-    pub fn data_outputs(&self) -> impl Iterator<Item = (NodeID, &SType)> + '_ {
-        self.returns.iter().filter_map(|(stype, return_kind)| {
-            if let ReturnKind::DataOutput(id) = return_kind {
-                Some((*id, stype))
-            } else {
-                None
-            }
-        })
-    }
-}
-
-impl DeviceManifest {
-    pub fn cpu() -> Self {
-        DeviceManifest::CPU {
-            parallel_launch: Box::new([]),
-        }
-    }
-
-    pub fn gpu() -> Self {
-        DeviceManifest::GPU
-    }
-
-    pub fn num_parallel_launch_dims(&self) -> usize {
-        match self {
-            DeviceManifest::CPU { parallel_launch } => parallel_launch.len(),
-            _ => panic!(),
-        }
-    }
-}
diff --git a/hercules_cg/src/mem.rs b/hercules_cg/src/mem.rs
new file mode 100644
index 0000000000000000000000000000000000000000..0c053455953937c9ff4955cc94b6482e0bb59373
--- /dev/null
+++ b/hercules_cg/src/mem.rs
@@ -0,0 +1,291 @@
+extern crate bitvec;
+extern crate hercules_ir;
+
+use std::collections::{BTreeMap, BTreeSet};
+
+use self::bitvec::prelude::*;
+
+use self::hercules_ir::*;
+
+#[derive(Debug)]
+pub struct MemoryObjects {
+    node_id_to_memory_objects: Vec<Vec<usize>>,
+    memory_object_to_origin: Vec<NodeID>,
+    parameter_index_to_memory_object: Vec<Option<usize>>,
+    possibly_returned_memory_objects: Vec<usize>,
+}
+
+impl MemoryObjects {
+    pub fn memory_objects(&self, id: NodeID) -> &Vec<usize> {
+        &self.node_id_to_memory_objects[id.idx()]
+    }
+
+    pub fn origin(&self, memory_object: usize) -> NodeID {
+        self.memory_object_to_origin[memory_object]
+    }
+
+    pub fn memory_object_of_parameter(&self, parameter: usize) -> Option<usize> {
+        self.parameter_index_to_memory_object[parameter]
+    }
+
+    pub fn returned_memory_objects(&self) -> &Vec<usize> {
+        &self.possibly_returned_memory_objects
+    }
+
+    pub fn num_memory_objects(&self) -> usize {
+        self.memory_object_to_origin.len()
+    }
+}
+
+#[derive(Debug)]
+pub struct MemoryObjectsMutability {
+    func_to_memory_object_to_mutable: Vec<BitVec<u8, Lsb0>>,
+}
+
+impl MemoryObjectsMutability {
+    pub fn is_mutable(&self, id: FunctionID, memory_object: usize) -> bool {
+        self.func_to_memory_object_to_mutable[id.idx()][memory_object]
+    }
+}
+
+/*
+ * Each node is assigned a set of memory objects output by the node. This
+ * is just a set of memory object IDs (usize).
+ */
+#[derive(PartialEq, Eq, Clone, Debug)]
+struct MemoryObjectLattice {
+    objs: BTreeSet<usize>,
+}
+
+impl Semilattice for MemoryObjectLattice {
+    fn meet(a: &Self, b: &Self) -> Self {
+        MemoryObjectLattice {
+            objs: a.objs.union(&b.objs).map(|x| *x).collect(),
+        }
+    }
+
+    fn top() -> Self {
+        MemoryObjectLattice {
+            objs: BTreeSet::new(),
+        }
+    }
+
+    fn bottom() -> Self {
+        // Technically, this lattice is unbounded - technically technically, the
+        // lattice is bounded by the number of memory objects in a given
+        // instance, but incorporating this information is not possible in our
+        // Semilattice interface. Luckily bottom() isn't necessary if we never
+        // call it, which we don't here.
+        panic!()
+    }
+}
+
+/*
+ * Top level function to analyze memory objects in a Hercules function. These
+ * are distinct collections (products, summations, arrays) that are used in a
+ * function where we try to disambiguate a string of values produced in the
+ * immutable value semantics of Hercules IR into a smaller number of distinct
+ * memory objects that can be modified in-place.
+ */
+pub fn memory_objects(
+    function: &Function,
+    types: &Vec<Type>,
+    reverse_postorder: &Vec<NodeID>,
+    typing: &Vec<TypeID>,
+) -> MemoryObjects {
+    // Find memory objects originating at parameters, constants, calls, or
+    // undefs.
+    let memory_object_to_origin: Vec<_> = function
+        .nodes
+        .iter()
+        .enumerate()
+        .filter(|(idx, node)| {
+            (node.is_parameter() || node.is_constant() || node.is_call() || node.is_undef())
+                && !types[typing[*idx].idx()].is_primitive()
+        })
+        .map(|(idx, _)| NodeID::new(idx))
+        .collect();
+    let node_id_to_originating_memory_obj: BTreeMap<_, _> = memory_object_to_origin
+        .iter()
+        .enumerate()
+        .map(|(idx, id)| (*id, idx))
+        .collect();
+
+    // Map parameter index to memory object, if applicable. Panic if two
+    // parameter nodes with the same index are found - those really should get
+    // removed by GVN!
+    let mut parameter_index_to_memory_object = vec![None; function.param_types.len()];
+    for (memory_object, origin) in memory_object_to_origin.iter().enumerate() {
+        if let Some(param) = function.nodes[origin.idx()].try_parameter() {
+            assert!(
+                parameter_index_to_memory_object[param].is_none(),
+                "PANIC: Found multiple parameter nodes with the same index."
+            );
+            parameter_index_to_memory_object[param] = Some(memory_object);
+        }
+    }
+
+    // Run dataflow analysis to figure out which memory objects each data node
+    // may be. Note that there's a strict subset of data nodes that can be
+    // assigned memory objects:
+    //
+    // - Phi: selects between memory objects in SSA form, may be assigned
+    //   multiple possible memory objects.
+    // - Reduce: reduces over a memory object, similar to phis.
+    // - Parameter: may originate a memory object.
+    // - Constant: may originate a memory object.
+    // - Call: may originate a memory object - if it doesn't originate a memory
+    //   object, it doesn't become one based on arguments, as arguments are
+    //   passed to callee.
+    // - Read: may extract a smaller memory object from input - this is
+    //   considered to be the same memory object as the input, as no copy takes
+    //   place.
+    // - Write: updates a memory object.
+    // - Undef: may originate a dummy memory object.
+    //
+    // Some notable omissions are:
+    //
+    // - Return: doesn't technically "output" a memory object, but may consume
+    //   one. As in the logic with calls not returning a memory object, returns
+    //   are not assigned memory objects.
+    // - Ternary (select): selecting over memory objects is a gray area
+    //   currently. Bail if we see a select over memory objects.
+    assert!(!function.nodes.iter().enumerate().any(|(idx, node)| node
+        .try_ternary(TernaryOperator::Select)
+        .is_some()
+        && !types[typing[idx].idx()].is_primitive()));
+    let lattice = forward_dataflow(function, reverse_postorder, |inputs, id| {
+        match function.nodes[id.idx()] {
+            Node::Phi {
+                control: _,
+                data: _,
+            }
+            | Node::Reduce {
+                control: _,
+                init: _,
+                reduct: _,
+            } => inputs
+                .into_iter()
+                .fold(MemoryObjectLattice::top(), |acc, input| {
+                    MemoryObjectLattice::meet(&acc, input)
+                }),
+            Node::Parameter { index: _ }
+            | Node::Constant { id: _ }
+            | Node::Call {
+                control: _,
+                function: _,
+                dynamic_constants: _,
+                args: _,
+            }
+            | Node::Undef { ty: _ }
+                if let Some(obj) = node_id_to_originating_memory_obj.get(&id) =>
+            {
+                MemoryObjectLattice {
+                    objs: [*obj].iter().map(|x| *x).collect(),
+                }
+            }
+            Node::Read {
+                collect: _,
+                indices: _,
+            }
+            | Node::Write {
+                collect: _,
+                data: _,
+                indices: _,
+            } => inputs[0].clone(),
+            _ => MemoryObjectLattice::top(),
+        }
+    });
+
+    // Look at the memory objects the data input to each return could be.
+    let mut possibly_returned_memory_objects = BTreeSet::new();
+    for node in function.nodes.iter() {
+        if let Node::Return { control: _, data } = node {
+            possibly_returned_memory_objects = possibly_returned_memory_objects
+                .union(&lattice[data.idx()].objs)
+                .map(|x| *x)
+                .collect();
+        }
+    }
+    let possibly_returned_memory_objects = possibly_returned_memory_objects.into_iter().collect();
+
+    let node_id_to_memory_objects = lattice
+        .into_iter()
+        .map(|lattice| lattice.objs.into_iter().collect())
+        .collect();
+    MemoryObjects {
+        node_id_to_memory_objects,
+        memory_object_to_origin,
+        parameter_index_to_memory_object,
+        possibly_returned_memory_objects,
+    }
+}
+
+/*
+ * Determine if each memory object in each function is mutated or not.
+ */
+pub fn memory_objects_mutability(
+    module: &Module,
+    callgraph: &CallGraph,
+    memory_objects: &Vec<MemoryObjects>,
+) -> MemoryObjectsMutability {
+    let mut mutated: Vec<_> = memory_objects
+        .iter()
+        .map(|memory_objects| bitvec![u8, Lsb0; 0; memory_objects.num_memory_objects()])
+        .collect();
+    let topo = callgraph.topo();
+
+    for func_id in topo {
+        // A memory object is mutated when:
+        // 1. The object is the subject of a write node.
+        // 2. The object is passed as argument to a function that mutates it.
+        for (idx, node) in module.functions[func_id.idx()].nodes.iter().enumerate() {
+            if node.is_write() {
+                // Every memory object that the write itself corresponds to is
+                // mutable in this function.
+                for memory_object in memory_objects[func_id.idx()].memory_objects(NodeID::new(idx))
+                {
+                    mutated[func_id.idx()].set(*memory_object, true);
+                }
+            } else if let Some((_, callee_id, _, args)) = node.try_call() {
+                for (param_idx, arg) in args.into_iter().enumerate() {
+                    // If this parameter corresponds to a memory object and it's
+                    // mutable in the callee...
+                    if let Some(param_callee_memory_object) =
+                        memory_objects[callee_id.idx()].memory_object_of_parameter(param_idx)
+                        && mutated[callee_id.idx()][param_callee_memory_object]
+                    {
+                        // Then every memory object corresponding to the
+                        // argument node in this function is mutable.
+                        for memory_object in memory_objects[func_id.idx()].memory_objects(*arg) {
+                            mutated[func_id.idx()].set(*memory_object, true);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    MemoryObjectsMutability {
+        func_to_memory_object_to_mutable: mutated,
+    }
+}
+
+/*
+ * The alignment of a type does not depend on dynamic constants.
+ */
+pub fn get_type_alignment(types: &Vec<Type>, ty: TypeID) -> usize {
+    match types[ty.idx()] {
+        Type::Control => panic!(),
+        Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1,
+        Type::Integer16 | Type::UnsignedInteger16 => 2,
+        Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4,
+        Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8,
+        Type::Product(ref members) | Type::Summation(ref members) => members
+            .into_iter()
+            .map(|id| get_type_alignment(types, *id))
+            .max()
+            .unwrap_or(1),
+        Type::Array(elem, _) => get_type_alignment(types, elem),
+    }
+}
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
new file mode 100644
index 0000000000000000000000000000000000000000..ddbc8f539e711d51c9d1005995068c6c2ee70eb8
--- /dev/null
+++ b/hercules_cg/src/rt.rs
@@ -0,0 +1,629 @@
+extern crate bitvec;
+extern crate hercules_ir;
+
+use std::collections::{BTreeMap, VecDeque};
+use std::fmt::{Error, Write};
+use std::iter::{zip, FromIterator};
+
+use self::bitvec::prelude::*;
+
+use self::hercules_ir::*;
+
+use crate::*;
+
+/*
+ * Entry Hercules functions are lowered to async Rust code to achieve easy task
+ * level parallelism. This Rust is generated textually, and is included via a
+ * procedural macro in the user's Rust code.
+ */
+pub fn rt_codegen<W: Write>(
+    func_id: FunctionID,
+    module: &Module,
+    reverse_postorder: &Vec<NodeID>,
+    typing: &Vec<TypeID>,
+    control_subgraph: &Subgraph,
+    bbs: &Vec<NodeID>,
+    callgraph: &CallGraph,
+    memory_objects: &Vec<MemoryObjects>,
+    memory_objects_mutability: &MemoryObjectsMutability,
+    w: &mut W,
+) -> Result<(), Error> {
+    let ctx = RTContext {
+        func_id,
+        module,
+        reverse_postorder,
+        typing,
+        control_subgraph,
+        bbs,
+        callgraph,
+        memory_objects,
+        _memory_objects_mutability: memory_objects_mutability,
+    };
+    ctx.codegen_function(w)
+}
+
+struct RTContext<'a> {
+    func_id: FunctionID,
+    module: &'a Module,
+    reverse_postorder: &'a Vec<NodeID>,
+    typing: &'a Vec<TypeID>,
+    control_subgraph: &'a Subgraph,
+    bbs: &'a Vec<NodeID>,
+    callgraph: &'a CallGraph,
+    memory_objects: &'a Vec<MemoryObjects>,
+    // TODO: use once memory objects are passed in a custom type where this
+    // actually matters.
+    _memory_objects_mutability: &'a MemoryObjectsMutability,
+}
+
+impl<'a> RTContext<'a> {
+    fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> {
+        let func = &self.get_func();
+
+        // Dump the function signature.
+        write!(
+            w,
+            "#[allow(unused_variables,unused_mut)]async fn {}(",
+            func.name
+        )?;
+        let mut first_param = true;
+        // The first set of parameters are dynamic constants.
+        for idx in 0..func.num_dynamic_constants {
+            if first_param {
+                first_param = false;
+            } else {
+                write!(w, ", ")?;
+            }
+            write!(w, "dc_p{}: u64", idx)?;
+        }
+        // The second set of parameters are normal parameters.
+        for idx in 0..func.param_types.len() {
+            if first_param {
+                first_param = false;
+            } else {
+                write!(w, ", ")?;
+            }
+            if !self.module.types[func.param_types[idx].idx()].is_primitive() {
+                write!(w, "mut ")?;
+            }
+            write!(
+                w,
+                "p_i{}: {}",
+                idx,
+                self.get_type_interface(func.param_types[idx])
+            )?;
+        }
+        write!(w, ") -> {} {{\n", self.get_type_interface(func.return_type))?;
+
+        // Copy the "interface" parameters to "non-interface" parameters.
+        // The purpose of this is to convert memory objects from a Box<[u8]>
+        // type to a *mut u8 type. This name copying is done so that we can
+        // easily construct memory objects just after this by moving the
+        // "interface" parameters.
+        for (idx, ty) in func.param_types.iter().enumerate() {
+            if self.module.types[ty.idx()].is_primitive() {
+                write!(w, "    let p{} = p_i{};\n", idx, idx)?;
+            } else {
+                write!(
+                    w,
+                    "    let p{} = ::std::boxed::Box::as_mut_ptr(&mut p_i{}) as *mut u8;\n",
+                    idx, idx
+                )?;
+            }
+        }
+
+        // Collect the boxes representing ownership over memory objects for this
+        // function. The actual emitted computation is done entirely using
+        // pointers, so these get emitted to hold onto ownership over the
+        // underlying memory and to automatically clean them up when this
+        // function returns. Memory objects are inside Options, since their
+        // ownership may get passed to other called RT functions. If this
+        // function returns a memory object, then at the very end, right before
+        // the return, the to-be-returned pointer is compared against the owned
+        // memory objects - it should match exactly one of those objects, and
+        // that box is what's actually returned.
+        let mem_obj_ty = "::core::option::Option<::std::boxed::Box<[u8]>>";
+        for memory_object in 0..self.memory_objects[self.func_id.idx()].num_memory_objects() {
+            let origin = self.memory_objects[self.func_id.idx()].origin(memory_object);
+            match func.nodes[origin.idx()] {
+                Node::Parameter { index } => write!(
+                    w,
+                    "    let mut mem_obj{}: {} = Some(p_i{});\n",
+                    memory_object, mem_obj_ty, index
+                )?,
+                Node::Constant { id: _ } => {
+                    let size = self.codegen_type_size(self.typing[origin.idx()]);
+                    write!(
+                        w,
+                        "    let mut mem_obj{}: {} = Some((0..{}).map(|_| 0u8).collect());\n",
+                        memory_object, mem_obj_ty, size
+                    )?
+                }
+                Node::Call {
+                    control: _,
+                    function: _,
+                    dynamic_constants: _,
+                    args: _,
+                }
+                | Node::Undef { ty: _ } => write!(
+                    w,
+                    "    let mut mem_obj{}: {} = None;\n",
+                    memory_object, mem_obj_ty,
+                )?,
+                _ => panic!(),
+            }
+        }
+
+        // Dump signatures for called CPU functions.
+        write!(w, "    extern \"C\" {{\n")?;
+        for callee in self.callgraph.get_callees(self.func_id) {
+            let callee = &self.module.functions[callee.idx()];
+            write!(w, "        fn {}(", callee.name)?;
+            let mut first_param = true;
+            for idx in 0..callee.num_dynamic_constants {
+                if first_param {
+                    first_param = false;
+                } else {
+                    write!(w, ", ")?;
+                }
+                write!(w, "dc{}: u64", idx)?;
+            }
+            for (idx, ty) in callee.param_types.iter().enumerate() {
+                if first_param {
+                    first_param = false;
+                } else {
+                    write!(w, ", ")?;
+                }
+                write!(w, "p{}: {}", idx, self.get_type(*ty))?;
+            }
+            write!(w, ") -> {};\n", self.get_type(callee.return_type))?;
+        }
+        write!(w, "    }}\n")?;
+
+        // Declare intermediary variables for every value.
+        for idx in 0..func.nodes.len() {
+            if func.nodes[idx].is_control() {
+                continue;
+            }
+            write!(
+                w,
+                "    let mut node_{}: {} = {};\n",
+                idx,
+                self.get_type(self.typing[idx]),
+                if self.module.types[self.typing[idx].idx()].is_integer() {
+                    "0"
+                } else if self.module.types[self.typing[idx].idx()].is_float() {
+                    "0.0"
+                } else {
+                    "::core::ptr::null::<u8>() as _"
+                }
+            )?;
+        }
+
+        // The core executor is a Rust loop. We literally run a "control token"
+        // as described in the original sea of nodes paper through the basic
+        // blocks to drive execution.
+        write!(
+            w,
+            "    let mut control_token: i8 = 0;\n    loop {{\n        match control_token {{\n",
+        )?;
+
+        let mut blocks: BTreeMap<_, _> = (0..func.nodes.len())
+            .filter(|idx| func.nodes[*idx].is_control())
+            .map(|idx| (NodeID::new(idx), String::new()))
+            .collect();
+
+        // Emit data flow into basic blocks.
+        let mut worklist = VecDeque::from_iter(
+            self.reverse_postorder
+                .into_iter()
+                .filter(|id| !func.nodes[id.idx()].is_control()),
+        );
+        let mut visited = bitvec![u8, Lsb0; 0; func.nodes.len()];
+        while let Some(id) = worklist.pop_front() {
+            let node = &func.nodes[id.idx()];
+            if node.is_phi()
+                || node.is_reduce()
+                || get_uses(node)
+                    .as_ref()
+                    .into_iter()
+                    .all(|u| func.nodes[u.idx()].is_control() || visited[u.idx()])
+            {
+                self.codegen_data_node(*id, &mut blocks)?;
+                visited.set(id.idx(), true);
+            } else {
+                worklist.push_back(id);
+            }
+        }
+
+        // Emit control flow into basic blocks.
+        for id in (0..func.nodes.len()).map(NodeID::new) {
+            if !func.nodes[id.idx()].is_control() {
+                continue;
+            }
+            self.codegen_control_node(id, &mut blocks)?;
+        }
+
+        // Dump the emitted basic blocks.
+        for (id, block) in blocks {
+            write!(
+                w,
+                "            {} => {{\n{}            }}\n",
+                id.idx(),
+                block
+            )?;
+        }
+
+        // Close the match, loop, and function.
+        write!(w, "            _ => panic!()\n        }}\n    }}\n}}\n")?;
+        Ok(())
+    }
+
+    /*
+     * While control nodes in Hercules IR are predecessor-centric (each takes a
+     * control input that defines the predecessor relationship), the Rust loop
+     * we generate is successor-centric. This difference requires explicit
+     * translation.
+     */
+    fn codegen_control_node(
+        &self,
+        id: NodeID,
+        blocks: &mut BTreeMap<NodeID, String>,
+    ) -> Result<(), Error> {
+        let func = &self.get_func();
+        match func.nodes[id.idx()] {
+            // Start, region, and projection control nodes all have exactly one
+            // successor and are otherwise simple.
+            Node::Start
+            | Node::Region { preds: _ }
+            | Node::Projection {
+                control: _,
+                selection: _,
+            } => {
+                let block = &mut blocks.get_mut(&id).unwrap();
+                let succ = self.control_subgraph.succs(id).next().unwrap();
+                write!(block, "                control_token = {};\n", succ.idx())?
+            }
+            // If nodes have two successors - examine the projections to
+            // determine which branch is which, and branch between them.
+            Node::If { control: _, cond } => {
+                let block = &mut blocks.get_mut(&id).unwrap();
+                let mut succs = self.control_subgraph.succs(id);
+                let succ1 = succs.next().unwrap();
+                let succ2 = succs.next().unwrap();
+                let succ1_is_true = func.nodes[succ1.idx()].try_projection(1).is_some();
+                write!(
+                    block,
+                    "                control_token = if {} {{ {} }} else {{ {} }};\n",
+                    self.get_value(cond),
+                    if succ1_is_true { succ1 } else { succ2 }.idx(),
+                    if succ1_is_true { succ2 } else { succ1 }.idx(),
+                )?
+            }
+            Node::Return { control: _, data } => {
+                let block = &mut blocks.get_mut(&id).unwrap();
+                let memory_objects = self.memory_objects[self.func_id.idx()].memory_objects(data);
+                if memory_objects.is_empty() {
+                    write!(block, "                return {};\n", self.get_value(data))?
+                } else {
+                    // If the value to return is a memory object, figure out
+                    // which memory object it actually is at runtime and return
+                    // that box.
+                    for memory_object in memory_objects {
+                        write!(block, "                if let Some(mut mem_obj) = mem_obj{} && ::std::boxed::Box::as_mut_ptr(&mut mem_obj) as *mut u8 == {} {{\n", memory_object, self.get_value(data))?;
+                        write!(block, "                    return mem_obj;\n")?;
+                        write!(block, "                }}\n")?;
+                    }
+                    write!(block, "                panic!(\"HERCULES PANIC: Pointer to be returned doesn't match any known memory objects.\");\n")?
+                }
+            }
+            _ => panic!("PANIC: Can't lower {:?}.", func.nodes[id.idx()]),
+        }
+        Ok(())
+    }
+
+    /*
+     * Lower a data node in Hercules IR into Rust statements appended to its block.
+     */
+    fn codegen_data_node(
+        &self,
+        id: NodeID,
+        blocks: &mut BTreeMap<NodeID, String>,
+    ) -> Result<(), Error> {
+        let func = &self.get_func();
+        match func.nodes[id.idx()] {
+            Node::Parameter { index } => {
+                let block = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap();
+                write!(
+                    block,
+                    "                {} = p{};\n",
+                    self.get_value(id),
+                    index
+                )?
+            }
+            Node::Constant { id: cons_id } => {
+                let block = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap();
+                write!(block, "                {} = ", self.get_value(id))?;
+                match self.module.constants[cons_id.idx()] {
+                    Constant::Boolean(val) => write!(block, "{}", val)?, // Rust `bool` literals take no type suffix.
+                    Constant::Integer8(val) => write!(block, "{}i8", val)?,
+                    Constant::Integer16(val) => write!(block, "{}i16", val)?,
+                    Constant::Integer32(val) => write!(block, "{}i32", val)?,
+                    Constant::Integer64(val) => write!(block, "{}i64", val)?,
+                    Constant::UnsignedInteger8(val) => write!(block, "{}u8", val)?,
+                    Constant::UnsignedInteger16(val) => write!(block, "{}u16", val)?,
+                    Constant::UnsignedInteger32(val) => write!(block, "{}u32", val)?,
+                    Constant::UnsignedInteger64(val) => write!(block, "{}u64", val)?,
+                    Constant::Float32(val) => write!(block, "{}f32", val)?,
+                    Constant::Float64(val) => write!(block, "{}f64", val)?,
+                    Constant::Product(_, _) | Constant::Summation(_, _, _) | Constant::Array(_) => {
+                        let memory_objects =
+                            self.memory_objects[self.func_id.idx()].memory_objects(id);
+                        assert_eq!(memory_objects.len(), 1);
+                        let memory_object = memory_objects[0];
+                        write!(
+                            block,
+                            "::std::boxed::Box::as_mut_ptr(mem_obj{}.as_mut().unwrap()) as *mut u8",
+                            memory_object
+                        )?
+                    }
+                }
+                write!(block, ";\n")?
+            }
+            Node::Call {
+                control: _,
+                function: callee_id,
+                ref dynamic_constants,
+                ref args,
+            } => {
+                let block = &mut blocks.get_mut(&self.bbs[id.idx()]).unwrap();
+                write!(
+                    block,
+                    "                {} = unsafe {{ {}(",
+                    self.get_value(id),
+                    self.module.functions[callee_id.idx()].name
+                )?;
+                for dc in dynamic_constants {
+                    self.codegen_dynamic_constant(*dc, block)?;
+                    write!(block, ", ")?;
+                }
+                for arg in args {
+                    write!(block, "{}, ", self.get_value(*arg))?;
+                }
+                write!(block, ") }};\n")?;
+
+                // When a CPU function is called that returns a memory object,
+                // that memory object must have come from one of its parameters.
+                // Dynamically figure out which one it came from, so that we can
+                // move it to the slot of the output memory object.
+                let call_memory_objects =
+                    self.memory_objects[self.func_id.idx()].memory_objects(id);
+                if !call_memory_objects.is_empty() {
+                    assert_eq!(call_memory_objects.len(), 1);
+                    let call_memory_object = call_memory_objects[0];
+
+                    let callee_returned_memory_objects =
+                        self.memory_objects[callee_id.idx()].returned_memory_objects();
+                    let possible_params: Vec<_> = (0..self.module.functions[callee_id.idx()]
+                        .param_types
+                        .len())
+                        .filter(|idx| {
+                            let memory_object_of_param = self.memory_objects[callee_id.idx()]
+                                .memory_object_of_parameter(*idx);
+                            // Look at parameters that could be the source of
+                            // the memory object returned by the function.
+                            memory_object_of_param
+                                .map(|memory_object_of_param| {
+                                    callee_returned_memory_objects.contains(&memory_object_of_param)
+                                })
+                                .unwrap_or(false)
+                        })
+                        .collect();
+                    let arg_memory_objects = args
+                        .into_iter()
+                        .enumerate()
+                        .filter(|(idx, _)| possible_params.contains(idx))
+                        .map(|(_, arg)| {
+                            self.memory_objects[self.func_id.idx()]
+                                .memory_objects(*arg)
+                                .into_iter()
+                        })
+                        .flatten();
+
+                    // Dynamically check which of the memory objects
+                    // corresponding to arguments to the call was returned by
+                    // the call. Move that memory object into the memory object
+                    // of the call.
+                    let mut first_obj = true;
+                    for arg_memory_object in arg_memory_objects {
+                        write!(block, "                ")?;
+                        if first_obj {
+                            first_obj = false;
+                        } else {
+                            write!(block, "else ")?;
+                        }
+                        write!(block, "if let Some(mem_obj) = mem_obj{}.as_mut() && ::std::boxed::Box::as_mut_ptr(mem_obj) as *mut u8 == {} {{\n", arg_memory_object, self.get_value(id))?;
+                        write!(
+                            block,
+                            "                    mem_obj{} = mem_obj{}.take();\n",
+                            call_memory_object, arg_memory_object
+                        )?;
+                        write!(block, "                }}\n")?;
+                    }
+                    write!(block, "                else {{\n")?;
+                    write!(block, "                    panic!(\"HERCULES PANIC: Pointer returned from called function doesn't match any known memory objects.\");\n")?;
+                    write!(block, "                }}\n")?;
+                }
+            }
+            _ => panic!("PANIC: Can't lower {:?}.", func.nodes[id.idx()]),
+        }
+        Ok(())
+    }
+
+    /*
+     * Lower a dynamic constant in Hercules IR into a Rust expression written to `w`.
+     */
+    fn codegen_dynamic_constant<W: Write>(
+        &self,
+        id: DynamicConstantID,
+        w: &mut W,
+    ) -> Result<(), Error> {
+        match self.module.dynamic_constants[id.idx()] {
+            DynamicConstant::Constant(val) => write!(w, "{}", val)?,
+            DynamicConstant::Parameter(idx) => write!(w, "dc_p{}", idx)?,
+            DynamicConstant::Add(left, right) => {
+                write!(w, "(")?;
+                self.codegen_dynamic_constant(left, w)?;
+                write!(w, "+")?;
+                self.codegen_dynamic_constant(right, w)?;
+                write!(w, ")")?;
+            }
+            DynamicConstant::Sub(left, right) => {
+                write!(w, "(")?;
+                self.codegen_dynamic_constant(left, w)?;
+                write!(w, "-")?;
+                self.codegen_dynamic_constant(right, w)?;
+                write!(w, ")")?;
+            }
+            DynamicConstant::Mul(left, right) => {
+                write!(w, "(")?;
+                self.codegen_dynamic_constant(left, w)?;
+                write!(w, "*")?;
+                self.codegen_dynamic_constant(right, w)?;
+                write!(w, ")")?;
+            }
+            DynamicConstant::Div(left, right) => {
+                write!(w, "(")?;
+                self.codegen_dynamic_constant(left, w)?;
+                write!(w, "/")?;
+                self.codegen_dynamic_constant(right, w)?;
+                write!(w, ")")?;
+            }
+            DynamicConstant::Rem(left, right) => {
+                write!(w, "(")?;
+                self.codegen_dynamic_constant(left, w)?;
+                write!(w, "%")?;
+                self.codegen_dynamic_constant(right, w)?;
+                write!(w, ")")?;
+            }
+        }
+        Ok(())
+    }
+
+    /*
+     * Lower the size of a type into a Rust expression.
+     */
+    fn codegen_type_size(&self, ty: TypeID) -> String {
+        match self.module.types[ty.idx()] {
+            Type::Control => panic!(),
+            Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => "1".to_string(),
+            Type::Integer16 | Type::UnsignedInteger16 => "2".to_string(),
+            Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => "4".to_string(),
+            Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => "8".to_string(),
+            Type::Product(ref fields) => {
+                let fields_align = fields
+                    .into_iter()
+                    .map(|id| get_type_alignment(&self.module.types, *id));
+                let fields: Vec<String> = fields
+                    .into_iter()
+                    .map(|id| self.codegen_type_size(*id))
+                    .collect();
+
+                // Emit a Rust expression that rounds up to the alignment of
+                // the next field, and then adds the size of that field. At the
+                // end, round up to the alignment of the whole struct.
+                let mut acc_size = "0".to_string();
+                for (field_align, field) in zip(fields_align, fields) {
+                    acc_size = format!(
+                        "(({} + {}) & !{})",
+                        acc_size,
+                        field_align - 1,
+                        field_align - 1
+                    );
+                    acc_size = format!("({} + {})", acc_size, field);
+                }
+                let total_align = get_type_alignment(&self.module.types, ty);
+                format!(
+                    "(({} + {}) & !{})",
+                    acc_size,
+                    total_align - 1,
+                    total_align - 1
+                )
+            }
+            Type::Summation(ref variants) => {
+                let variants = variants.into_iter().map(|id| self.codegen_type_size(*id));
+
+                // The size of a summation is the size of the largest variant,
+                // plus 1 byte for the discriminant, rounded up to its alignment.
+                let mut acc_size = "0".to_string();
+                for variant in variants {
+                    acc_size = format!("::core::cmp::max({}, {})", acc_size, variant);
+                }
+
+                // No alignment is necessary for the 1 byte discriminant.
+                let total_align = get_type_alignment(&self.module.types, ty);
+                format!(
+                    "(({} + 1 + {}) & !{})",
+                    acc_size,
+                    total_align - 1,
+                    total_align - 1
+                )
+            }
+            Type::Array(elem, ref bounds) => {
+                // The size of an array is the size of the element multiplied by
+                // the dynamic constant bounds.
+                let mut acc_size = self.codegen_type_size(elem);
+                for dc in bounds {
+                    acc_size = format!("{} * ", acc_size);
+                    self.codegen_dynamic_constant(*dc, &mut acc_size).unwrap();
+                }
+                format!("({})", acc_size)
+            }
+        }
+    }
+
+    fn get_func(&self) -> &Function {
+        &self.module.functions[self.func_id.idx()]
+    }
+
+    fn get_value(&self, id: NodeID) -> String {
+        format!("node_{}", id.idx())
+    }
+
+    fn get_type(&self, id: TypeID) -> &'static str {
+        convert_type(&self.module.types[id.idx()])
+    }
+
+    fn get_type_interface(&self, id: TypeID) -> &'static str {
+        convert_type_interface(&self.module.types[id.idx()])
+    }
+}
+
+fn convert_type(ty: &Type) -> &'static str {
+    match ty {
+        Type::Boolean => "bool",
+        Type::Integer8 => "i8",
+        Type::Integer16 => "i16",
+        Type::Integer32 => "i32",
+        Type::Integer64 => "i64",
+        Type::UnsignedInteger8 => "u8",
+        Type::UnsignedInteger16 => "u16",
+        Type::UnsignedInteger32 => "u32",
+        Type::UnsignedInteger64 => "u64",
+        Type::Float32 => "f32",
+        Type::Float64 => "f64",
+        Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "*mut u8",
+        _ => panic!(),
+    }
+}
+
+/*
+ * Collection types are passed to / returned from runtime functions through a
+ * wrapper type (`Box<[u8]>`) for ownership tracking reasons.
+ */
+fn convert_type_interface(ty: &Type) -> &'static str {
+    match ty {
+        Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "Box<[u8]>",
+        _ => convert_type(ty),
+    }
+}
diff --git a/hercules_cg/src/sched_dot.rs b/hercules_cg/src/sched_dot.rs
deleted file mode 100644
index f044618931f0ffe45aeb6da1ef2798f4bc7bdfe6..0000000000000000000000000000000000000000
--- a/hercules_cg/src/sched_dot.rs
+++ /dev/null
@@ -1,174 +0,0 @@
-extern crate bitvec;
-extern crate hercules_ir;
-extern crate rand;
-
-use std::collections::{HashMap, VecDeque};
-use std::env::temp_dir;
-use std::fmt::Write;
-use std::fs::File;
-use std::io::Write as _;
-use std::process::Command;
-
-use self::bitvec::prelude::*;
-use self::rand::Rng;
-
-use self::hercules_ir::*;
-
-use crate::*;
-
-/*
- * Top level function to compute a dot graph for a schedule IR module, and
- * immediately render it using xdot.
- */
-pub fn xdot_sched_module(module: &SModule) {
-    let mut tmp_path = temp_dir();
-    let mut rng = rand::thread_rng();
-    let num: u64 = rng.gen();
-    tmp_path.push(format!("sched_dot_{}.dot", num));
-    let mut file = File::create(tmp_path.clone()).expect("PANIC: Unable to open output file.");
-    let mut contents = String::new();
-    write_dot(&module, &mut contents).expect("PANIC: Unable to generate output file contents.");
-    file.write_all(contents.as_bytes())
-        .expect("PANIC: Unable to write output file contents.");
-    Command::new("xdot")
-        .args([tmp_path])
-        .output()
-        .expect("PANIC: Couldn't execute xdot. Is xdot installed?");
-}
-
-/*
- * Top level function to write a schedule IR module out as a dot graph.
- */
-pub fn write_dot<W: Write>(module: &SModule, w: &mut W) -> std::fmt::Result {
-    write_digraph_header(w)?;
-
-    for (function_name, function) in module.functions.iter() {
-        // Schedule the SFunction to form a linear ordering of instructions.
-        let virt_reg_to_inst_id = sched_virt_reg_to_inst_id(function);
-        let dep_graph = sched_dependence_graph(function, &virt_reg_to_inst_id);
-        let mut block_to_inst_list = (0..function.blocks.len())
-            .map(|block_idx| (block_idx, vec![]))
-            .collect::<HashMap<usize, Vec<(&SInst, usize, Option<&Vec<SSchedule>>)>>>();
-        for (block_idx, block) in function.blocks.iter().enumerate() {
-            let mut emitted = bitvec![u8, Lsb0; 0; block.insts.len()];
-            let mut worklist = VecDeque::from((0..block.insts.len()).collect::<Vec<_>>());
-            while let Some(inst_idx) = worklist.pop_front() {
-                let inst_id = InstID::new(block_idx, inst_idx);
-                let dependencies = &dep_graph[&inst_id];
-                let all_uses_emitted = dependencies
-                    .into_iter()
-                    // Check that all used instructions in this block...
-                    .filter(|inst_id| inst_id.idx_0() == block_idx)
-                    // were already emitted.
-                    .all(|inst_id| emitted[inst_id.idx_1()]);
-                // Phis don't need to wait for all of their uses to be added.
-                if block.insts[inst_idx].is_phi() || all_uses_emitted {
-                    block_to_inst_list.get_mut(&block_idx).unwrap().push((
-                        &block.insts[inst_idx],
-                        block.virt_regs[inst_idx].0,
-                        block.schedules.get(&inst_idx),
-                    ));
-                    emitted.set(inst_id.idx_1(), true);
-                } else {
-                    worklist.push_back(inst_idx);
-                }
-            }
-        }
-
-        // A SFunction is a subgraph.
-        write_subgraph_header(function_name, w)?;
-
-        // Each SBlock is a record node.
-        for (block_idx, block) in function.blocks.iter().enumerate() {
-            // Emit the instructions in scheduled order.
-            write_block(function_name, block_idx, &block_to_inst_list[&block_idx], w)?;
-
-            // Add control edges.
-            for succ in block.successors().as_ref() {
-                write_control_edge(function_name, block_idx, succ.idx(), w)?;
-            }
-        }
-
-        write_graph_footer(w)?;
-    }
-
-    write_graph_footer(w)?;
-    Ok(())
-}
-
-fn write_digraph_header<W: Write>(w: &mut W) -> std::fmt::Result {
-    write!(w, "digraph \"Module\" {{\n")?;
-    write!(w, "compound=true\n")?;
-    Ok(())
-}
-
-fn write_subgraph_header<W: Write>(function_name: &SFunctionName, w: &mut W) -> std::fmt::Result {
-    write!(w, "subgraph {} {{\n", function_name)?;
-    write!(w, "label=\"{}\"\n", function_name)?;
-    write!(w, "bgcolor=ivory4\n")?;
-    write!(w, "cluster=true\n")?;
-    Ok(())
-}
-
-fn write_graph_footer<W: Write>(w: &mut W) -> std::fmt::Result {
-    write!(w, "}}\n")?;
-    Ok(())
-}
-
-fn write_block<W: Write>(
-    function_name: &SFunctionName,
-    block_idx: usize,
-    insts: &[(&SInst, usize, Option<&Vec<SSchedule>>)],
-    w: &mut W,
-) -> std::fmt::Result {
-    write!(w, "{}_{} [label=\"{{", function_name, block_idx,)?;
-    for token in insts.into_iter().map(|token| Some(token)).intersperse(None) {
-        match token {
-            Some((inst, virt_reg, schedules)) => {
-                write!(w, "%{} = {}(", virt_reg, inst.upper_case_name())?;
-                for token in sched_get_uses(inst).map(|u| Some(u)).intersperse(None) {
-                    match token {
-                        Some(SValue::VirtualRegister(use_virt_reg)) => {
-                            write!(w, "%{}", use_virt_reg)?
-                        }
-                        Some(SValue::Constant(scons)) => write!(w, "{:?}", scons)?,
-                        None => write!(w, ", ")?,
-                    }
-                }
-                write!(w, ")")?;
-                if let Some(schedules) = schedules
-                    && !schedules.is_empty()
-                {
-                    write!(w, " [")?;
-                    for token in schedules.into_iter().map(|s| Some(s)).intersperse(None) {
-                        match token {
-                            Some(schedule) => write!(w, "{:?}", schedule)?,
-                            None => write!(w, ", ")?,
-                        }
-                    }
-                    write!(w, "]")?;
-                }
-            }
-            None => write!(w, " | ")?,
-        }
-    }
-    write!(
-        w,
-        "}}\", shape = \"Mrecord\", style = \"filled\", fillcolor = \"lightblue\"];\n"
-    )?;
-    Ok(())
-}
-
-fn write_control_edge<W: Write>(
-    function_name: &SFunctionName,
-    src: usize,
-    dst: usize,
-    w: &mut W,
-) -> std::fmt::Result {
-    write!(
-        w,
-        "{}_{} -> {}_{} [color=\"black\"];\n",
-        function_name, src, function_name, dst
-    )?;
-    Ok(())
-}
diff --git a/hercules_cg/src/sched_gen.rs b/hercules_cg/src/sched_gen.rs
deleted file mode 100644
index 70a898b6c259b8bf7543b5ff4d1202132c71cf9b..0000000000000000000000000000000000000000
--- a/hercules_cg/src/sched_gen.rs
+++ /dev/null
@@ -1,1461 +0,0 @@
-extern crate bitvec;
-
-extern crate hercules_ir;
-
-use std::cell::Cell;
-use std::collections::{HashMap, VecDeque};
-use std::iter::zip;
-use std::mem::{swap, take};
-
-use self::bitvec::prelude::*;
-
-use self::hercules_ir::*;
-
-use crate::*;
-
-pub fn sched_compile(
-    module: &Module,
-    def_uses: &Vec<ImmutableDefUseMap>,
-    typing: &ModuleTyping,
-    control_subgraphs: &Vec<Subgraph>,
-    fork_join_maps: &Vec<HashMap<NodeID, NodeID>>,
-    fork_join_nests: &Vec<HashMap<NodeID, Vec<NodeID>>>,
-    antideps: &Vec<Vec<(NodeID, NodeID)>>,
-    bbs: &Vec<Vec<NodeID>>,
-    plans: &Vec<Plan>,
-) -> SModule {
-    let stypes = convert_to_sched_ir_types(&module.types);
-    let sconstants = convert_to_sched_ir_constants(&module.constants);
-    let function_names: HashMap<FunctionID, String> = module
-        .functions
-        .iter()
-        .enumerate()
-        .map(|(idx, function)| (FunctionID::new(idx), function.name.clone()))
-        .collect();
-
-    let mut functions = HashMap::new();
-    let mut manifests = HashMap::new();
-    for idx in 0..module.functions.len() {
-        let (sfunctions, manifest) = FunctionContext::new(
-            &module.functions[idx],
-            &module.types,
-            &module.constants,
-            &module.dynamic_constants,
-            &def_uses[idx],
-            &typing[idx],
-            &control_subgraphs[idx],
-            &fork_join_maps[idx],
-            &fork_join_nests[idx],
-            &antideps[idx],
-            &bbs[idx],
-            &plans[idx],
-            &stypes,
-            &sconstants,
-            &function_names,
-        )
-        .compile_function();
-
-        functions.extend(sfunctions.into_iter());
-        manifests.insert(module.functions[idx].name.clone(), manifest);
-    }
-
-    SModule {
-        functions,
-        manifests,
-    }
-}
-
-fn convert_to_sched_ir_types(types: &Vec<Type>) -> Vec<Option<SType>> {
-    let mut stypes = vec![None; types.len()];
-
-    for id in types_bottom_up(types) {
-        stypes[id.idx()] = match &types[id.idx()] {
-            Type::Control => None,
-            Type::Boolean => Some(SType::Boolean),
-            Type::Integer8 => Some(SType::Integer8),
-            Type::Integer16 => Some(SType::Integer16),
-            Type::Integer32 => Some(SType::Integer32),
-            Type::Integer64 => Some(SType::Integer64),
-            Type::UnsignedInteger8 => Some(SType::UnsignedInteger8),
-            Type::UnsignedInteger16 => Some(SType::UnsignedInteger16),
-            Type::UnsignedInteger32 => Some(SType::UnsignedInteger32),
-            Type::UnsignedInteger64 => Some(SType::UnsignedInteger64),
-            Type::Float32 => Some(SType::Float32),
-            Type::Float64 => Some(SType::Float64),
-            Type::Product(fields) => {
-                let mut typs = vec![];
-                let mut res_none = false;
-                for id in fields {
-                    if types[id.idx()].is_array() {
-                        res_none = true;
-                        break;
-                    } else {
-                        match &stypes[id.idx()] {
-                            None => {
-                                res_none = true;
-                                break;
-                            }
-                            Some(t) => typs.push(t.clone()),
-                        }
-                    }
-                }
-                if res_none {
-                    None
-                } else {
-                    Some(SType::Product(typs.into()))
-                }
-            }
-            Type::Summation(_) => todo!(),
-            Type::Array(elem_ty, _) => match &stypes[elem_ty.idx()] {
-                None => None,
-                Some(t) => Some(SType::ArrayRef(Box::new(t.clone()))),
-            },
-        };
-    }
-
-    stypes
-}
-
-fn convert_to_sched_ir_constants(constants: &Vec<Constant>) -> Vec<Option<SConstant>> {
-    let mut sconstants = vec![None; constants.len()];
-
-    for id in constants_bottom_up(constants) {
-        sconstants[id.idx()] = match &constants[id.idx()] {
-            Constant::Boolean(val) => Some(SConstant::Boolean(*val)),
-            Constant::Integer8(val) => Some(SConstant::Integer8(*val)),
-            Constant::Integer16(val) => Some(SConstant::Integer16(*val)),
-            Constant::Integer32(val) => Some(SConstant::Integer32(*val)),
-            Constant::Integer64(val) => Some(SConstant::Integer64(*val)),
-            Constant::UnsignedInteger8(val) => Some(SConstant::UnsignedInteger8(*val)),
-            Constant::UnsignedInteger16(val) => Some(SConstant::UnsignedInteger16(*val)),
-            Constant::UnsignedInteger32(val) => Some(SConstant::UnsignedInteger32(*val)),
-            Constant::UnsignedInteger64(val) => Some(SConstant::UnsignedInteger64(*val)),
-            Constant::Float32(val) => Some(SConstant::Float32(*val)),
-            Constant::Float64(val) => Some(SConstant::Float64(*val)),
-            Constant::Product(_, fields) => {
-                let mut consts = vec![];
-                let mut res_none = false;
-                for id in fields {
-                    if constants[id.idx()].is_array() {
-                        res_none = true;
-                        break;
-                    } else {
-                        match &sconstants[id.idx()] {
-                            None => {
-                                res_none = true;
-                                break;
-                            }
-                            Some(c) => consts.push(c.clone()),
-                        }
-                    }
-                }
-                if res_none {
-                    None
-                } else {
-                    Some(SConstant::Product(consts.into()))
-                }
-            }
-            Constant::Summation(_, _, _) => todo!(),
-            // Array constants are never generated inline schedule IR.
-            Constant::Array(_) => None,
-        };
-    }
-
-    sconstants
-}
-
-/*
- * Converts one Hercules function to N schedule IR functions, where N is the
- * number of partitions in the Hercules function.
- */
-struct FunctionContext<'a> {
-    function: &'a Function,
-    types: &'a Vec<Type>,
-    constants: &'a Vec<Constant>,
-    dynamic_constants: &'a Vec<DynamicConstant>,
-    def_use: &'a ImmutableDefUseMap,
-    typing: &'a Vec<TypeID>,
-    control_subgraph: &'a Subgraph,
-    fork_join_map: &'a HashMap<NodeID, NodeID>,
-    fork_join_nest: &'a HashMap<NodeID, Vec<NodeID>>,
-    antideps: &'a Vec<(NodeID, NodeID)>,
-    bbs: &'a Vec<NodeID>,
-    plan: &'a Plan,
-    stypes: &'a Vec<Option<SType>>,
-    sconstants: &'a Vec<Option<SConstant>>,
-    function_names: &'a HashMap<FunctionID, String>,
-
-    top_nodes: Vec<NodeID>,
-    partition_graph: Subgraph,
-    inverted_partition_map: Vec<Vec<NodeID>>,
-    data_inputs: Vec<Vec<NodeID>>,
-    data_outputs: Vec<Vec<NodeID>>,
-
-    num_virtual_registers: Vec<Cell<usize>>,
-}
-
-impl<'a> FunctionContext<'a> {
-    fn new(
-        function: &'a Function,
-        types: &'a Vec<Type>,
-        constants: &'a Vec<Constant>,
-        dynamic_constants: &'a Vec<DynamicConstant>,
-        def_use: &'a ImmutableDefUseMap,
-        typing: &'a Vec<TypeID>,
-        control_subgraph: &'a Subgraph,
-        fork_join_map: &'a HashMap<NodeID, NodeID>,
-        fork_join_nest: &'a HashMap<NodeID, Vec<NodeID>>,
-        antideps: &'a Vec<(NodeID, NodeID)>,
-        bbs: &'a Vec<NodeID>,
-        plan: &'a Plan,
-        stypes: &'a Vec<Option<SType>>,
-        sconstants: &'a Vec<Option<SConstant>>,
-        function_names: &'a HashMap<FunctionID, String>,
-    ) -> Self {
-        let inverted_partition_map = plan.invert_partition_map();
-        let top_nodes = plan.compute_top_nodes(function, control_subgraph, &inverted_partition_map);
-        let partition_graph = partition_graph(function, def_use, plan);
-        let data_inputs = plan.compute_data_inputs(function);
-        let data_outputs = plan.compute_data_outputs(function, def_use);
-
-        let num_virtual_registers = vec![Cell::new(0); plan.num_partitions];
-
-        FunctionContext {
-            function,
-            types,
-            constants,
-            dynamic_constants,
-            def_use,
-            typing,
-            control_subgraph,
-            fork_join_map,
-            fork_join_nest,
-            antideps,
-            bbs,
-            plan,
-            stypes,
-            sconstants,
-            function_names,
-
-            top_nodes,
-            partition_graph,
-            inverted_partition_map,
-            data_inputs,
-            data_outputs,
-
-            num_virtual_registers,
-        }
-    }
-
-    /*
-     * Top level function to compile a Hercules IR function into simple IR
-     * functions.
-     */
-    fn compile_function(&self) -> (HashMap<SFunctionName, SFunction>, Manifest) {
-        let (mut manifest, array_node_to_array_id) = self.compute_manifest();
-
-        manifest
-            .partitions
-            .iter()
-            .enumerate()
-            .for_each(|(idx, partition_manifest)| {
-                self.num_virtual_registers[idx].set(partition_manifest.parameters.len())
-            });
-
-        let partition_functions = (0..self.plan.num_partitions)
-            .map(|partition_idx| {
-                let name = self.get_sfunction_name(partition_idx);
-                let sfunction =
-                    self.compile_partition(partition_idx, &manifest, &array_node_to_array_id);
-                self.update_manifest(&mut manifest.partitions[partition_idx], &sfunction);
-                (name, sfunction)
-            })
-            .collect();
-
-        (partition_functions, manifest)
-    }
-
-    /*
-     * Compute the manifest for a Hercules function. This includes all of the
-     * partition signature information.
-     */
-    fn compute_manifest(&self) -> (Manifest, HashMap<NodeID, ArrayID>) {
-        // The manifest needs to contain metadata for allocating arrays.
-        let dynamic_constants = self.dynamic_constants.clone();
-        let array_constants = self
-            .function
-            .nodes
-            .iter()
-            .filter_map(|node| {
-                if let Some(cons) = node.try_constant()
-                    && let Some(ty) = self.constants[cons.idx()].try_array_type()
-                {
-                    let extents = self.types[ty.idx()]
-                        .try_extents()
-                        .expect("PANIC: Type of array constant is not an array type.");
-                    Some(extents.into_iter().map(|id| *id).collect())
-                } else {
-                    None
-                }
-            })
-            .collect();
-
-        // Assign each array constant a unique ID for noting which ones to pass
-        // to what partition functions.
-        let array_node_to_array_id = (0..self.function.nodes.len())
-            .filter(|node_idx| {
-                if let Some(cons) = self.function.nodes[*node_idx].try_constant() {
-                    self.constants[cons.idx()].is_array()
-                } else {
-                    false
-                }
-            })
-            .enumerate()
-            .map(|(idx, node_idx)| (NodeID::new(node_idx), ArrayID::new(idx)))
-            .collect::<HashMap<NodeID, ArrayID>>();
-
-        let partitions = (0..self.plan.num_partitions)
-            .map(|partition_idx| {
-                let partition = &self.inverted_partition_map[partition_idx];
-                let name = self.get_sfunction_name(partition_idx);
-                let mut parameters = vec![];
-                let mut returns = vec![];
-
-                // Compute the signature of each partitions' schedule IR
-                // function, which has the following structure:
-                // 1. If the partition is the entry partition, the first
-                //    parameters are the parameters to the Hercules function. If
-                //    not, then the first parameters are all of the data inputs
-                //    to the partition. Note that parameter nodes are always in
-                //    the partition of the start node (the entry partition), so
-                //    function parameters used in other partitions are treated
-                //    as an inter-partition data dependence.
-                if partition_idx == 0 {
-                    parameters.extend(self.function.param_types.iter().enumerate().map(
-                        |(param_idx, ty_id)| {
-                            (
-                                self.stypes[ty_id.idx()].clone().unwrap(),
-                                ParameterKind::HerculesParameter(param_idx),
-                            )
-                        },
-                    ));
-                } else {
-                    parameters.extend(self.data_inputs[partition_idx].iter().map(|node_id| {
-                        (
-                            self.stypes[self.typing[node_id.idx()].idx()]
-                                .clone()
-                                .unwrap(),
-                            ParameterKind::DataInput(*node_id),
-                        )
-                    }))
-                }
-
-                // 2. The second set of parameters are references to zero-ed
-                //    memories for implementing array constants. Implicit array
-                //    cloning is, for now, forbidden. Array constants are
-                //    rematerialized into each partition that uses the constant,
-                //    so look over all of the uses of all the nodes in the
-                //    partition, not all of the nodes in the partition.
-                parameters.extend(
-                    partition
-                        .iter()
-                        .map(|node_id| {
-                            get_uses(&self.function.nodes[node_id.idx()])
-                                .as_ref()
-                                .iter()
-                                .filter_map(|use_id| {
-                                    if let Some(array_id) = array_node_to_array_id.get(use_id) {
-                                        Some((
-                                            self.stypes[self.typing[use_id.idx()].idx()]
-                                                .clone()
-                                                .unwrap(),
-                                            ParameterKind::ArrayConstant(*array_id),
-                                        ))
-                                    } else {
-                                        None
-                                    }
-                                })
-                                .collect::<Vec<_>>()
-                        })
-                        .flatten(),
-                );
-
-                // 3. The third set of parameters are the dynamic constants
-                //    passed to the overall function.
-                parameters.extend((0..self.function.num_dynamic_constants).map(|idx| {
-                    (
-                        SType::UnsignedInteger64,
-                        ParameterKind::DynamicConstant(idx as usize),
-                    )
-                }));
-
-                // Note that many partitions will be given unused parameters
-                // (mainly dynamic constants). These will be removed during the
-                // small amount of optimization done on simple IR.
-
-                // Simple IR functions may return multiple values (this is to
-                // avoid needing to pack / un-pack product types). The return
-                // value of an exit partition is the return value of the
-                // Hercules function. The return values of non-exit partitions
-                // are the data outputs of the partition, possibly plus an
-                // integer specifying what partition should be executed next, if
-                // there are multiple successor partitions. A valid partitioning
-                // will only contain partitions with either a branch to another
-                // partition xor a return node.
-                let successors = self
-                    .partition_graph
-                    .succs(NodeID::new(partition_idx))
-                    .map(|node_id| PartitionID::new(node_id.idx()))
-                    .collect::<Vec<PartitionID>>();
-                if partition
-                    .iter()
-                    .any(|node_id| self.function.nodes[node_id.idx()].is_return())
-                {
-                    assert_eq!(successors.len(), 0);
-                    returns.push((
-                        self.stypes[self.function.return_type.idx()]
-                            .clone()
-                            .unwrap(),
-                        ReturnKind::HerculesReturn,
-                    ));
-                } else {
-                    assert!(successors.len() > 0);
-                    returns.extend(self.data_outputs[partition_idx].iter().map(|node_id| {
-                        (
-                            self.stypes[self.typing[node_id.idx()].idx()]
-                                .clone()
-                                .unwrap(),
-                            ReturnKind::DataOutput(*node_id),
-                        )
-                    }));
-                    if successors.len() > 1 {
-                        returns.push((SType::Integer8, ReturnKind::NextPartition));
-                    }
-                }
-
-                let device = match self.plan.partition_devices[partition_idx] {
-                    Device::CPU => DeviceManifest::cpu(),
-                    Device::GPU => DeviceManifest::gpu(),
-                    Device::AsyncRust => todo!(),
-                };
-
-                PartitionManifest {
-                    name,
-                    parameters,
-                    returns,
-                    successors,
-                    device,
-                }
-            })
-            .collect();
-
-        // The parameters for the overall Hercules function is computed in a
-        // similar fashion as for the individual partition functions.
-        let mut param_types = vec![];
-        param_types.extend(self.function.param_types.iter().enumerate().map(
-            |(param_idx, ty_id)| {
-                (
-                    self.stypes[ty_id.idx()].clone().unwrap(),
-                    ParameterKind::HerculesParameter(param_idx),
-                )
-            },
-        ));
-        param_types.extend(array_node_to_array_id.iter().map(|(node_id, array_id)| {
-            (
-                self.stypes[self.typing[node_id.idx()].idx()]
-                    .clone()
-                    .unwrap(),
-                ParameterKind::ArrayConstant(*array_id),
-            )
-        }));
-        param_types.extend((0..self.function.num_dynamic_constants).map(|idx| {
-            (
-                SType::UnsignedInteger64,
-                ParameterKind::DynamicConstant(idx as usize),
-            )
-        }));
-
-        // The return type is just the schedule IR type corresponding to the
-        // Hercules function's return type.
-        let return_type = self.stypes[self.function.return_type.idx()]
-            .clone()
-            .unwrap();
-
-        let manifest = Manifest {
-            param_types,
-            return_type,
-            dynamic_constants,
-            array_constants,
-            partitions,
-        };
-        (manifest, array_node_to_array_id)
-    }
-
-    /*
-     * Compile a partition into an SFunction.
-     */
-    fn compile_partition(
-        &self,
-        partition_idx: usize,
-        manifest: &Manifest,
-        array_node_to_array_id: &HashMap<NodeID, ArrayID>,
-    ) -> SFunction {
-        let partition = &self.inverted_partition_map[partition_idx];
-        let mut blocks = vec![];
-
-        // First, create basic blocks inside the SFunction corresponding to the
-        // control nodes in the partition. If this isn't the entry partition
-        // (partition #0), add an entry block, since the first basic block in a
-        // partition may have a predecessor inside the partition.
-        let mut control_id_to_block_id = HashMap::new();
-        let mut fork_node_id_to_fork_join_id = HashMap::new();
-        if partition_idx != 0 {
-            // Create an explicit entry block, if one is already created via the
-            // Start node.
-            blocks.push(SBlock::default());
-        }
-        for node in partition {
-            if self.function.nodes[node.idx()].is_control() {
-                control_id_to_block_id.insert(*node, BlockID::new(blocks.len()));
-                let mut block = SBlock::default();
-                if let Some(imm_fork) = self.fork_join_nest[node].get(0) {
-                    let new_id = ForkJoinID::new(fork_node_id_to_fork_join_id.len());
-                    let fork_join_id = *fork_node_id_to_fork_join_id
-                        .entry(*imm_fork)
-                        .or_insert(new_id);
-                    block.kind = if self.function.nodes[node.idx()].is_join() {
-                        SBlockKind::Reduce(fork_join_id)
-                    } else {
-                        SBlockKind::Parallel(fork_join_id)
-                    };
-                }
-                blocks.push(block);
-            }
-        }
-
-        // Second, assign every data node a SValue. This map incorporates info
-        // from the manifest to make using SFunction parameters easy.
-        let mut data_id_to_svalue = manifest.partitions[partition_idx]
-            .parameters
-            .iter()
-            .enumerate()
-            .filter_map(|(idx, (_, kind))| match kind {
-                // Assign SValues to nodes defined outside the partition and
-                // passed in via SFunction parameters.
-                ParameterKind::DataInput(node_id) => Some((*node_id, SValue::VirtualRegister(idx))),
-                _ => None,
-            })
-            .chain(
-                // Assign SValues for nodes inside the partition.
-                partition
-                    .iter()
-                    .filter(|node_id| !self.function.nodes[node_id.idx()].is_control())
-                    .filter_map(|data_id| {
-                        let value = match self.function.nodes[data_id.idx()] {
-                            // Phis in a block with no predecessors inside the
-                            // current partition don't get lowered to phis in
-                            // schedule IR - they get lowered to partition
-                            // parameters. Phis with some predecessors in the
-                            // same partition and some in a different partition
-                            // get lowered to a combination of a SFunction
-                            // parameter and a phi instruction, and uses of the
-                            // phi node should become uses of the phi
-                            // instruction.
-                            Node::Phi { control, data: _ }
-                                if self.control_subgraph.preds(control).all(|pred| {
-                                    self.plan.partitions[pred.idx()]
-                                        != self.plan.partitions[control.idx()]
-                                }) =>
-                            {
-                                // If the phi just gets lowered to a parameter,
-                                // it got added above when adding the virtual
-                                // registers for the SFunction parameters.
-                                return None;
-                            }
-                            // Figure out which virtual constant in the
-                            // signature of the current SFunction corresponds to
-                            // a particular Hercules parameter.
-                            Node::Parameter { index } => SValue::VirtualRegister(
-                                manifest.partitions[partition_idx]
-                                    .parameters
-                                    .iter()
-                                    .position(|(_, kind)| {
-                                        *kind == ParameterKind::HerculesParameter(index)
-                                    })
-                                    .unwrap(),
-                            ),
-                            // Wait to assign SValues to constants. We assign
-                            // SValues to constants in user partitions, not in
-                            // the partition the constant node happens to be in.
-                            Node::Constant { id: _ } => {
-                                return None;
-                            }
-                            // Dynamic constant nodes get generated upfront,
-                            // since they may or may not need a virtual register
-                            // freshly allocated for them. The math necessary
-                            // for them gets put in the block corresponding to
-                            // the control node the DynamicConstant node was
-                            // scheduled to.
-                            Node::DynamicConstant { id } => {
-                                let block_id = control_id_to_block_id[&self.bbs[data_id.idx()]];
-                                self.compile_dynamic_constant(
-                                    id,
-                                    &mut blocks[block_id.idx()],
-                                    partition_idx,
-                                    manifest,
-                                )
-                            }
-                            // Wait to assign SValues to array writes.
-                            Node::Write {
-                                collect: _,
-                                data: _,
-                                indices: _,
-                            } if self.types[self.typing[data_id.idx()].idx()].is_array() => {
-                                return None
-                            }
-                            _ => SValue::VirtualRegister(self.make_virt_reg(partition_idx)),
-                        };
-                        Some((*data_id, value))
-                    }),
-            )
-            .chain(
-                // Assign SValues for constants used by nodes in the partition.
-                partition
-                    .iter()
-                    .map(|node_id| {
-                        get_uses(&self.function.nodes[node_id.idx()])
-                            .as_ref()
-                            .iter()
-                            .filter_map(|use_id| {
-                                if let Node::Constant { id } = self.function.nodes[use_id.idx()] {
-                                    // Array constants map to the parameter the
-                                    // array memory is passed in through - all
-                                    // other constants are represented inline in
-                                    // an SValue.
-                                    let svalue = if let Some(array_id) =
-                                        array_node_to_array_id.get(use_id)
-                                    {
-                                        SValue::VirtualRegister(
-                                            manifest.partitions[partition_idx]
-                                                .parameters
-                                                .iter()
-                                                .position(|(_, kind)| {
-                                                    *kind == ParameterKind::ArrayConstant(*array_id)
-                                                })
-                                                .unwrap(),
-                                        )
-                                    } else {
-                                        SValue::Constant(self.sconstants[id.idx()].clone().unwrap())
-                                    };
-                                    Some((*use_id, svalue))
-                                } else {
-                                    None
-                                }
-                            })
-                            .collect::<Vec<_>>()
-                    })
-                    .flatten(),
-            )
-            .collect::<HashMap<_, _>>();
-
-        // Next, assign all the array write nodes. Array write nodes are
-        // recursively assigned the SValue of their `collect` input.
-        let mut worklist = partition
-            .iter()
-            .filter(|id| {
-                self.function.nodes[id.idx()].is_write() && !data_id_to_svalue.contains_key(id)
-            })
-            .map(|id| *id)
-            .collect::<VecDeque<_>>();
-        while let Some(id) = worklist.pop_front() {
-            let pred = match self.function.nodes[id.idx()] {
-                Node::Write {
-                    data: _,
-                    indices: _,
-                    collect,
-                } => collect,
-                _ => panic!("PANIC: Filtered out write nodes, but found a different node kind."),
-            };
-            if let Some(svalue) = data_id_to_svalue.get(&pred) {
-                data_id_to_svalue.insert(id, svalue.clone());
-            } else {
-                worklist.push_front(id);
-            }
-        }
-
-        // Third, generate code for every node in the partition. Iterates
-        // through a worklist of nodes in the partition. For non-phi and non-
-        // reduce nodes, only emit once all data uses are emitted. In addition,
-        // consider additional anti-dependence edges from read to write nodes.
-        // Def-use and anti-dependence edges are the only ordering we guarantee
-        // in schedule IR basic blocks, and it's up to device-specific backends
-        // to perform instruction scheduling.
-        let mut visited = bitvec![u8, Lsb0; 0; self.function.nodes.len()];
-        let mut worklist = partition.iter().map(|id| *id).collect::<VecDeque<_>>();
-        while let Some(id) = worklist.pop_front() {
-            if self.function.nodes[id.idx()].is_phi()
-                || self.function.nodes[id.idx()].is_reduce()
-                || get_uses(&self.function.nodes[id.idx()])
-                    .as_ref()
-                    .into_iter()
-                    // If this node isn't a phi or reduce, we need to check that
-                    // all uses, as well as all reads we anti-depend with, have
-                    // been emitted.
-                    .chain(self.antideps.iter().filter_map(|(read, write)| {
-                        if id == *write {
-                            Some(read)
-                        } else {
-                            None
-                        }
-                    }))
-                    // Only data dependencies within this partition need to have
-                    // already been visited.
-                    .all(|use_id| {
-                        self.plan.partitions[use_id.idx()] != PartitionID::new(partition_idx)
-                            || self.function.nodes[use_id.idx()].is_control()
-                            || visited[use_id.idx()]
-                    })
-            {
-                // Once all of the data dependencies for this node are emitted,
-                // this node can be emitted.
-                self.compile_node(
-                    id,
-                    &control_id_to_block_id,
-                    &data_id_to_svalue,
-                    &fork_node_id_to_fork_join_id,
-                    &mut blocks,
-                    partition_idx,
-                    manifest,
-                );
-                visited.set(id.idx(), true);
-            } else {
-                // Skip emitting node if it's not a phi or reduce node and if
-                // its data uses are not emitted yet.
-                worklist.push_back(id);
-            }
-        }
-
-        // Fourth, add the jump from the explicit entry block to the top node's
-        // block in the partition.
-        if partition_idx != 0 {
-            // Explicitly jump to the block corresponding to the top of the
-            // partition. That block may be a parallel block, but it's not a
-            // reduce block.
-            let top_node = self.top_nodes[partition_idx];
-            let top_block = control_id_to_block_id[&top_node];
-            let parallel_entry = if self.function.nodes[top_node.idx()].is_fork() {
-                self.copy_schedules(top_node, &mut blocks[0]);
-                Some(self.compile_parallel_entry(
-                    top_node,
-                    &data_id_to_svalue,
-                    &mut blocks[0],
-                    partition_idx,
-                    manifest,
-                ))
-            } else {
-                None
-            };
-            blocks[0].insts.push(SInst::Jump {
-                target: top_block,
-                parallel_entry,
-                reduce_exit: None,
-            });
-            blocks[0]
-                .virt_regs
-                .push((self.make_virt_reg(partition_idx), SType::Boolean));
-        }
-
-        // Fifth, make sure every block's schedules map is "filled".
-        for block in blocks.iter_mut() {
-            for inst_idx in 0..block.insts.len() {
-                let _ = block.schedules.try_insert(inst_idx, vec![]);
-            }
-        }
-
-        SFunction {
-            blocks,
-            param_types: manifest.partitions[partition_idx]
-                .parameters
-                .iter()
-                .map(|(sty, _)| sty.clone())
-                .collect(),
-            return_types: manifest.partitions[partition_idx]
-                .returns
-                .iter()
-                .map(|(sty, _)| sty.clone())
-                .collect(),
-        }
-    }
-
-    fn compile_node(
-        &self,
-        id: NodeID,
-        control_id_to_block_id: &HashMap<NodeID, BlockID>,
-        data_id_to_svalue: &HashMap<NodeID, SValue>,
-        fork_node_id_to_fork_join_id: &HashMap<NodeID, ForkJoinID>,
-        blocks: &mut Vec<SBlock>,
-        partition_idx: usize,
-        manifest: &Manifest,
-    ) {
-        let bb = self.bbs[id.idx()];
-        let block_id = Cell::new(control_id_to_block_id[&bb]);
-        let mut block = take(&mut blocks[block_id.get().idx()]);
-
-        // Uses of reduce nodes inside their corresponding reduce block need to
-        // refer to the reduction variable instruction, not the output of the
-        // reduce block.
-        let get_svalue = |id: NodeID| data_id_to_svalue[&id].clone();
-        let self_virt_reg = || get_svalue(id).try_virt_reg().unwrap();
-
-        // Helper function to lower a jump to a particular control node.
-        let lower_jmp = |dst: NodeID, block: &mut SBlock| {
-            if let Some(block_id) = control_id_to_block_id.get(&dst) {
-                // The successor block is in this partition. Add extra info to
-                // the jump if we're jumping into a parallel section or out of a
-                // reduce section. Note that both of those may be true at once.
-                let parallel_entry = if self.function.nodes[dst.idx()].is_fork() {
-                    self.copy_schedules(dst, block);
-                    Some(self.compile_parallel_entry(
-                        dst,
-                        data_id_to_svalue,
-                        block,
-                        partition_idx,
-                        manifest,
-                    ))
-                } else {
-                    None
-                };
-                let reduce_exit = if self.function.nodes[id.idx()].is_join() {
-                    Some(self.compile_reduce_exit(id, data_id_to_svalue))
-                } else {
-                    None
-                };
-                block.insts.push(SInst::Jump {
-                    target: *block_id,
-                    parallel_entry,
-                    reduce_exit,
-                });
-            } else {
-                assert_ne!(
-                    self.plan.partitions[id.idx()],
-                    self.plan.partitions[dst.idx()]
-                );
-
-                // The successor block is in a different partition.
-                let next_partition = self.plan.partitions[dst.idx()];
-                let data_outputs = manifest.partitions[partition_idx]
-                    .returns
-                    .iter()
-                    .map(|(_, kind)| match kind {
-                        ReturnKind::DataOutput(id) => get_svalue(*id).clone(),
-                        ReturnKind::HerculesReturn => panic!("PANIC: Partition can't contain a HerculesReturn kind of return value when it jumps to another partition."),
-                        ReturnKind::NextPartition => SValue::Constant(SConstant::Integer8(next_partition.idx() as i8)),
-                    })
-                    .collect();
-
-                block.insts.push(SInst::PartitionExit { data_outputs });
-            }
-            block
-                .virt_regs
-                .push((self.make_virt_reg(partition_idx), SType::Boolean));
-        };
-
-        // Helper function to generate the dynamic constant math to compute the
-        // bounds of an array type of a node.
-        let lower_extents = |id: NodeID, block: &mut SBlock| {
-            self.types[self.typing[id.idx()].idx()]
-                .try_extents()
-                .unwrap()
-                .iter()
-                .map(|dc| self.compile_dynamic_constant(*dc, block, partition_idx, manifest))
-                .collect()
-        };
-
-        // Emit schedule IR instructions corresponding to this Hercules IR node.
-        match self.function.nodes[id.idx()] {
-            // Forks are super simple to lower here. Since what's sequential /
-            // parallel / reducing is encoded in basic block kinds, and entry /
-            // exits are handled in `lower_jmp`, we just need to add a jump like
-            // any other control block with one successor.
-            Node::Start
-            | Node::Region { preds: _ }
-            | Node::Projection {
-                control: _,
-                selection: _,
-            }
-            | Node::Fork {
-                control: _,
-                factors: _,
-            } => {
-                let mut succs = self.control_subgraph.succs(id);
-                assert_eq!(succs.len(), 1);
-                let succ = succs.next().unwrap();
-                lower_jmp(succ, &mut block);
-            }
-            Node::Join { control: _ } => {
-                let mut succs = self.control_subgraph.succs(id);
-                assert_eq!(succs.len(), 1);
-                let succ = succs.next().unwrap();
-                if self.plan.partitions[id.idx()] != self.plan.partitions[succ.idx()] {
-                    // If the successor is in another partition, we need to add
-                    // a sequential block to hold the PartitionExit. Add a jump,
-                    // with reduce exit metadata, to the reduce block.
-                    let exit_block_id = BlockID::new(blocks.len());
-                    let reduce_exit = self.compile_reduce_exit(id, data_id_to_svalue);
-                    block.insts.push(SInst::Jump {
-                        target: exit_block_id,
-                        parallel_entry: None,
-                        reduce_exit: Some(reduce_exit),
-                    });
-                    block
-                        .virt_regs
-                        .push((self.make_virt_reg(partition_idx), SType::Boolean));
-                    // The exit block contains just a PartitionExit instruction.
-                    let mut exit_block = SBlock::default();
-                    // `lower_jmp` depends on `block_id`, so temporarily update.
-                    let old_block_id = block_id.get();
-                    block_id.set(exit_block_id);
-                    lower_jmp(succ, &mut exit_block);
-                    block_id.set(old_block_id);
-                    blocks.push(exit_block);
-                } else {
-                    // Otherwise, lower the jump as normal.
-                    lower_jmp(succ, &mut block);
-                }
-            }
-            Node::If { control: _, cond } => {
-                let mut succs = self.control_subgraph.succs(id);
-                let mut proj1 = succs.next().unwrap();
-                let mut proj2 = succs.next().unwrap();
-                assert_eq!(succs.next(), None);
-                if self.function.nodes[proj1.idx()].try_proj().unwrap().1 == 1 {
-                    swap(&mut proj1, &mut proj2);
-                }
-                block.insts.push(SInst::Branch {
-                    cond: get_svalue(cond).clone(),
-                    false_target: control_id_to_block_id[&self.bbs[proj1.idx()]],
-                    true_target: control_id_to_block_id[&self.bbs[proj2.idx()]],
-                });
-                block
-                    .virt_regs
-                    .push((self.make_virt_reg(partition_idx), SType::Boolean));
-            }
-            Node::Return { control: _, data } => {
-                block.insts.push(SInst::Return {
-                    value: get_svalue(data).clone(),
-                });
-                block
-                    .virt_regs
-                    .push((self.make_virt_reg(partition_idx), SType::Boolean));
-            }
-
-            Node::Phi { control, ref data } => {
-                let control_uses = get_uses(&self.function.nodes[control.idx()]);
-                let mut found_in_partition_predecessor = false;
-                let mut found_out_of_partition_predecessor = false;
-                let inputs = zip(control_uses.as_ref().iter(), data.iter())
-                    .filter_map(|(control_use, data_id)| {
-                        if let Some(block_id) = control_id_to_block_id.get(control_use) {
-                            // If any of the predecessors are in this partition,
-                            // we actually generate a phi instruction.
-                            // Otherwise, we just need to refer to the parameter
-                            // of the SFunction corresponding to this phi.
-                            found_in_partition_predecessor = true;
-                            Some((*block_id, get_svalue(*data_id).clone()))
-                        } else if let Some(param_idx) = manifest.partitions[partition_idx]
-                            .parameters
-                            .iter()
-                            .position(|(_, kind)| *kind == ParameterKind::DataInput(id))
-                        {
-                            // This input to the phi is corresponds to all of
-                            // the inputs from control locations outside this
-                            // partition. This does *not* include constant nodes
-                            // in other partitions - those get propagated (see
-                            // below). Don't add multiple inputs for block #0.
-                            if found_out_of_partition_predecessor {
-                                return None;
-                            }
-                            // This predecessor for the phi gets passed in
-                            // via a parameter set up for this phi.
-                            found_out_of_partition_predecessor = true;
-                            Some((BlockID::new(0), SValue::VirtualRegister(param_idx)))
-                        } else {
-                            // This input to the phi is a constant located
-                            // outside this partition these get propagated in
-                            // schedule IR.
-                            found_in_partition_predecessor = true;
-                            let svalue = get_svalue(*data_id).clone();
-                            Some((BlockID::new(0), svalue))
-                        }
-                    })
-                    .collect();
-
-                // If there's at least one predecessor inside this partition, we
-                // need to generate an actual phi instruction.
-                if found_in_partition_predecessor {
-                    block.insts.push(SInst::Phi { inputs });
-                    block.virt_regs.push((
-                        self_virt_reg(),
-                        self.stypes[self.typing[id.idx()].idx()].clone().unwrap(),
-                    ));
-                }
-            }
-
-            Node::ThreadID { control, dimension } => {
-                let fork_join = fork_node_id_to_fork_join_id[&control];
-                block.insts.push(SInst::ThreadID {
-                    dimension,
-                    fork_join,
-                });
-                block
-                    .virt_regs
-                    .push((self_virt_reg(), SType::UnsignedInteger64));
-            }
-            Node::Reduce {
-                control,
-                init: _,
-                reduct: _,
-            } => {
-                // Determine the reduction variable number based on the users of
-                // the join node.
-                let number = self
-                    .def_use
-                    .get_users(control)
-                    .iter()
-                    .filter(|user| self.function.nodes[user.idx()].is_reduce())
-                    .position(|user| *user == id)
-                    .unwrap();
-                self.copy_schedules(id, &mut block);
-                block.insts.push(SInst::ReductionVariable { number });
-                block.virt_regs.push((
-                    self_virt_reg(),
-                    self.stypes[self.typing[id.idx()].idx()].clone().unwrap(),
-                ));
-            }
-
-            Node::Unary { input, op } => {
-                block.insts.push(SInst::Unary {
-                    input: get_svalue(input).clone(),
-                    op: convert_unary_op(op, &self.stypes),
-                });
-                block.virt_regs.push((
-                    self_virt_reg(),
-                    self.stypes[self.typing[id.idx()].idx()].clone().unwrap(),
-                ));
-            }
-            Node::Binary { left, right, op } => {
-                block.insts.push(SInst::Binary {
-                    left: get_svalue(left).clone(),
-                    right: get_svalue(right).clone(),
-                    op: convert_binary_op(op),
-                });
-                block.virt_regs.push((
-                    self_virt_reg(),
-                    self.stypes[self.typing[id.idx()].idx()].clone().unwrap(),
-                ));
-            }
-            Node::Ternary {
-                first,
-                second,
-                third,
-                op,
-            } => {
-                block.insts.push(SInst::Ternary {
-                    first: get_svalue(first).clone(),
-                    second: get_svalue(second).clone(),
-                    third: get_svalue(third).clone(),
-                    op: convert_ternary_op(op),
-                });
-                block.virt_regs.push((
-                    self_virt_reg(),
-                    self.stypes[self.typing[id.idx()].idx()].clone().unwrap(),
-                ));
-            }
-            Node::IntrinsicCall {
-                intrinsic,
-                ref args,
-            } => {
-                let args = args.iter().map(|id| get_svalue(*id).clone()).collect();
-                block.insts.push(SInst::IntrinsicCall { intrinsic, args });
-                block.virt_regs.push((
-                    self_virt_reg(),
-                    self.stypes[self.typing[id.idx()].idx()].clone().unwrap(),
-                ));
-            }
-
-            Node::Read {
-                collect,
-                ref indices,
-            } => {
-                let mut collect_svalue = get_svalue(collect);
-                let mut prod_indices = &indices[..];
-
-                // We currently only support top-level arrays and products. The
-                // array and product portions become separate instructions.
-                // Since arrays are always root types, handle them first.
-                if let Some(position) = indices[0].try_position() {
-                    // If there's both an array load and a product extract, we
-                    // need to allocate an intermediary virtual register.
-                    let dst_virt_reg = if indices.len() > 1 {
-                        self.make_virt_reg(partition_idx)
-                    } else {
-                        self_virt_reg()
-                    };
-
-                    let position = position.iter().map(|id| get_svalue(*id)).collect();
-                    // Array loads need the dynamic constant bounds for indexing
-                    // math.
-                    let bounds = lower_extents(collect, &mut block);
-                    let load_ty = if let SType::ArrayRef(elem_ty) = self.stypes
-                        [self.typing[collect.idx()].idx()]
-                    .clone()
-                    .unwrap()
-                    {
-                        *elem_ty
-                    } else {
-                        panic!("PANIC: Type of collection isn't an array when an ArrayLoad use is generated.")
-                    };
-                    block.insts.push(SInst::ArrayLoad {
-                        array: collect_svalue,
-                        position,
-                        bounds,
-                    });
-                    block.virt_regs.push((dst_virt_reg, load_ty));
-
-                    // The product extract needs to extract from the product
-                    // loaded from the array.
-                    collect_svalue = SValue::VirtualRegister(dst_virt_reg);
-                    prod_indices = &indices[1..];
-                }
-
-                // Handle the product indices.
-                if prod_indices.len() > 0 {
-                    let indices = prod_indices
-                        .iter()
-                        .map(|index| index.try_field().unwrap())
-                        .collect();
-                    block.insts.push(SInst::ProductExtract {
-                        product: collect_svalue,
-                        indices,
-                    });
-                    block.virt_regs.push((
-                        self_virt_reg(),
-                        self.stypes[self.typing[id.idx()].idx()].clone().unwrap(),
-                    ));
-                }
-            }
-            Node::Write {
-                collect,
-                data,
-                ref indices,
-            } => {
-                // We currently only support top-level arrays and products.
-                // There are three cases that we handle separately:
-                // 1. Writing to an array. This just lowers to an ArrayStore.
-                // 2. Writing to a product inside an array. This lowers to an
-                //    ArrayLoad to get the initial product value, a
-                //    ProductInsert to update the product value, and an
-                //    ArrayStore to write the new product value into the array.
-                // 3. Writing to a product. This just lowers to a ProductInsert.
-
-                if let Some(position) = indices[0].try_position()
-                    && indices.len() == 1
-                {
-                    // Handle case #1.
-                    let position = position.iter().map(|id| get_svalue(*id)).collect();
-                    // Array stores need the dynamic constant bounds for
-                    // indexing math.
-                    let bounds = lower_extents(collect, &mut block);
-                    block.insts.push(SInst::ArrayStore {
-                        array: get_svalue(collect),
-                        value: get_svalue(data),
-                        position,
-                        bounds,
-                    });
-                    // Array stores don't produce a meaningful virtual register.
-                    block
-                        .virt_regs
-                        .push((self.make_virt_reg(partition_idx), SType::Boolean));
-                } else if let Some(position) = indices[0].try_position() {
-                    // Handle case #2.
-                    let position = position
-                        .iter()
-                        .map(|id| get_svalue(*id))
-                        .collect::<Box<[_]>>();
-                    let bounds = lower_extents(collect, &mut block);
-
-                    // Load the product.
-                    let load_virt_reg = self.make_virt_reg(partition_idx);
-                    let load_ty = if let SType::ArrayRef(elem_ty) = self.stypes
-                        [self.typing[collect.idx()].idx()]
-                    .clone()
-                    .unwrap()
-                    {
-                        *elem_ty
-                    } else {
-                        panic!("PANIC: Type of collection isn't an array when an ArrayLoad use is generated.")
-                    };
-                    block.insts.push(SInst::ArrayLoad {
-                        array: get_svalue(collect),
-                        position: position.clone(),
-                        bounds: bounds.clone(),
-                    });
-                    block.virt_regs.push((load_virt_reg, load_ty.clone()));
-
-                    // Update the product.
-                    let update_virt_reg = self.make_virt_reg(partition_idx);
-                    let indices = indices[1..]
-                        .iter()
-                        .map(|index| index.try_field().unwrap())
-                        .collect();
-                    block.insts.push(SInst::ProductInsert {
-                        product: SValue::VirtualRegister(load_virt_reg),
-                        data: get_svalue(data),
-                        indices,
-                    });
-                    block.virt_regs.push((update_virt_reg, load_ty));
-
-                    // Store the product.
-                    block.insts.push(SInst::ArrayStore {
-                        array: get_svalue(collect),
-                        value: SValue::VirtualRegister(update_virt_reg),
-                        position,
-                        bounds,
-                    });
-                    block
-                        .virt_regs
-                        .push((self.make_virt_reg(partition_idx), SType::Boolean));
-                } else {
-                    // Handle case #3.
-                    let indices = indices
-                        .iter()
-                        .map(|index| index.try_field().unwrap())
-                        .collect();
-                    block.insts.push(SInst::ProductInsert {
-                        product: get_svalue(collect),
-                        data: get_svalue(data),
-                        indices,
-                    });
-                    // Product insertions do produce a virtual register, since
-                    // they create a new product value.
-                    block.virt_regs.push((
-                        self_virt_reg(),
-                        self.stypes[self.typing[id.idx()].idx()].clone().unwrap(),
-                    ));
-                }
-            }
-
-            // There are a few nodes for which no code needs to get emitted.
-            _ => {}
-        }
-
-        blocks[block_id.get().idx()] = block;
-    }
-
-    /*
-     * Helper to copy over schedules.
-     */
-    fn copy_schedules(&self, src: NodeID, block: &mut SBlock) {
-        block.schedules.insert(
-            block.insts.len(),
-            self.plan.schedules[src.idx()]
-                .iter()
-                .map(|schedule| sched_make_schedule(schedule))
-                .collect(),
-        );
-    }
-
-    /*
-     * Compiles a reference to a dynamic constant into math to compute that
-     * dynamic constant. We need a mutable reference to some basic block, since
-     * we may need to generate math inline to compute the dynamic constant.
-     */
-    fn compile_dynamic_constant(
-        &self,
-        dc: DynamicConstantID,
-        block: &mut SBlock,
-        partition_idx: usize,
-        manifest: &Manifest,
-    ) -> SValue {
-        match self.dynamic_constants[dc.idx()] {
-            DynamicConstant::Constant(cons) => {
-                SValue::Constant(SConstant::UnsignedInteger64(cons as u64))
-            }
-            DynamicConstant::Parameter(idx) => SValue::VirtualRegister(
-                manifest.partitions[partition_idx]
-                    .parameters
-                    .iter()
-                    .position(|(_, kind)| *kind == ParameterKind::DynamicConstant(idx))
-                    .unwrap(),
-            ),
-
-            DynamicConstant::Add(left, right)
-            | DynamicConstant::Sub(left, right)
-            | DynamicConstant::Mul(left, right)
-            | DynamicConstant::Div(left, right)
-            | DynamicConstant::Rem(left, right) => {
-                let left = self.compile_dynamic_constant(left, block, partition_idx, manifest);
-                let right = self.compile_dynamic_constant(right, block, partition_idx, manifest);
-                let output_virt_reg = self.make_virt_reg(partition_idx);
-                block.insts.push(SInst::Binary {
-                    left,
-                    right,
-                    op: match self.dynamic_constants[dc.idx()] {
-                        DynamicConstant::Add(_, _) => SBinaryOperator::Add,
-                        DynamicConstant::Sub(_, _) => SBinaryOperator::Sub,
-                        DynamicConstant::Mul(_, _) => SBinaryOperator::Mul,
-                        DynamicConstant::Div(_, _) => SBinaryOperator::Div,
-                        DynamicConstant::Rem(_, _) => SBinaryOperator::Rem,
-                        _ => panic!(),
-                    },
-                });
-                block
-                    .virt_regs
-                    .push((output_virt_reg, SType::UnsignedInteger64));
-                SValue::VirtualRegister(output_virt_reg)
-            }
-        }
-    }
-
-    /*
-     * Makes a parallel entry for a jump to a fork.
-     */
-    fn compile_parallel_entry(
-        &self,
-        fork: NodeID,
-        data_id_to_svalue: &HashMap<NodeID, SValue>,
-        block: &mut SBlock,
-        partition_idx: usize,
-        manifest: &Manifest,
-    ) -> ParallelEntry {
-        let (_, factors) = self.function.nodes[fork.idx()].try_fork().unwrap();
-        let thread_counts = factors
-            .iter()
-            .map(|dc_id| self.compile_dynamic_constant(*dc_id, block, partition_idx, manifest))
-            .collect();
-        let reduce_inits = self
-            .def_use
-            .get_users(self.fork_join_map[&fork])
-            .iter()
-            .filter_map(|user| self.function.nodes[user.idx()].try_reduce())
-            .map(|(_, init, _)| data_id_to_svalue[&init].clone())
-            .collect();
-        ParallelEntry {
-            thread_counts,
-            reduce_inits,
-        }
-    }
-
-    /*
-     * Makes a reduce exit for a jump from a join.
-     */
-    fn compile_reduce_exit(
-        &self,
-        join: NodeID,
-        data_id_to_svalue: &HashMap<NodeID, SValue>,
-    ) -> ReduceExit {
-        let reduce_reducts = self
-            .def_use
-            .get_users(join)
-            .iter()
-            .filter(|user| self.function.nodes[user.idx()].is_reduce())
-            .map(|reduce| {
-                // The SValues that get passed to the reduce exit are the
-                // `reduct` input to the reduce node.
-                data_id_to_svalue[&get_uses(&self.function.nodes[reduce.idx()]).as_ref()[2]].clone()
-            })
-            .collect();
-        ReduceExit { reduce_reducts }
-    }
-
-    fn make_virt_reg(&self, partition_idx: usize) -> usize {
-        let virt_reg = self.num_virtual_registers[partition_idx].get();
-        self.num_virtual_registers[partition_idx].set(virt_reg + 1);
-        virt_reg
-    }
-
-    fn get_sfunction_name(&self, partition_idx: usize) -> SFunctionName {
-        format!("{}_{}", self.function.name, partition_idx)
-    }
-
-    /*
-     * There is some information we can only add to the manifest once we've
-     * computed the schedule IR.
-     */
-    fn update_manifest(&self, manifest: &mut PartitionManifest, function: &SFunction) {
-        let parallel_reduce_infos = sched_parallel_reduce_sections(function);
-
-        // Add parallel launch info for CPU partitions. This relies on checking
-        // schedules inside the generated schedule IR.
-        let partition_name = manifest.name.clone();
-        if let Some(tiles) = function.blocks[0].schedules[&0]
-            .iter()
-            .filter_map(|schedule| schedule.try_parallel_launch())
-            .next()
-            && parallel_reduce_infos
-                .into_iter()
-                .any(|(_, info)| info.top_level)
-            && let DeviceManifest::CPU { parallel_launch } = &mut manifest.device
-        {
-            let parallel_entry = function.blocks[0].insts[0].try_jump().unwrap().1.unwrap();
-            assert_eq!(tiles.len(), parallel_entry.thread_counts.len());
-            let top_level_fork_id = self
-                .fork_join_nest
-                .iter()
-                // Find control nodes in the fork join nesting whose only nest
-                // it itself (is a top level fork-join).
-                .filter(|(id, nest)| nest.len() == 1 && nest[0] == **id)
-                // Only consider forks in this partition.
-                .filter(|(id, _)| {
-                    self.get_sfunction_name(self.plan.partitions[id.idx()].idx()) == partition_name
-                })
-                .next()
-                .unwrap()
-                .0;
-            *parallel_launch = zip(
-                tiles.into_iter(),
-                self.function.nodes[top_level_fork_id.idx()]
-                    .try_fork()
-                    .unwrap()
-                    .1,
-            )
-            .map(|(num_chunks, count_dc_id)| (*num_chunks, *count_dc_id))
-            .collect();
-        }
-    }
-}
-
-fn convert_unary_op(op: UnaryOperator, simple_ir_types: &[Option<SType>]) -> SUnaryOperator {
-    match op {
-        UnaryOperator::Not => SUnaryOperator::Not,
-        UnaryOperator::Neg => SUnaryOperator::Neg,
-        UnaryOperator::Cast(ty) => SUnaryOperator::Cast(simple_ir_types[ty.idx()].clone().unwrap()),
-    }
-}
-
-fn convert_binary_op(op: BinaryOperator) -> SBinaryOperator {
-    match op {
-        BinaryOperator::Add => SBinaryOperator::Add,
-        BinaryOperator::Sub => SBinaryOperator::Sub,
-        BinaryOperator::Mul => SBinaryOperator::Mul,
-        BinaryOperator::Div => SBinaryOperator::Div,
-        BinaryOperator::Rem => SBinaryOperator::Rem,
-        BinaryOperator::LT => SBinaryOperator::LT,
-        BinaryOperator::LTE => SBinaryOperator::LTE,
-        BinaryOperator::GT => SBinaryOperator::GT,
-        BinaryOperator::GTE => SBinaryOperator::GTE,
-        BinaryOperator::EQ => SBinaryOperator::EQ,
-        BinaryOperator::NE => SBinaryOperator::NE,
-        BinaryOperator::Or => SBinaryOperator::Or,
-        BinaryOperator::And => SBinaryOperator::And,
-        BinaryOperator::Xor => SBinaryOperator::Xor,
-        BinaryOperator::LSh => SBinaryOperator::LSh,
-        BinaryOperator::RSh => SBinaryOperator::RSh,
-    }
-}
-
-fn convert_ternary_op(op: TernaryOperator) -> STernaryOperator {
-    match op {
-        TernaryOperator::Select => STernaryOperator::Select,
-    }
-}
diff --git a/hercules_cg/src/sched_ir.rs b/hercules_cg/src/sched_ir.rs
deleted file mode 100644
index acbc74d2e3101eec164f90c5632099c8f1fe8935..0000000000000000000000000000000000000000
--- a/hercules_cg/src/sched_ir.rs
+++ /dev/null
@@ -1,673 +0,0 @@
-extern crate ordered_float;
-extern crate serde;
-
-extern crate hercules_ir;
-
-use std::collections::HashMap;
-
-use self::serde::Deserialize;
-use self::serde::Serialize;
-
-use self::hercules_ir::*;
-
-use crate::*;
-
-/*
- * A schedule IR module is a list of functions and a description of each
- * Hercules function in terms of schedule IR functions (called the manifest).
- */
-#[derive(Debug, Default, Clone)]
-pub struct SModule {
-    // Refer to schedule IR functions by their name.
-    pub functions: HashMap<SFunctionName, SFunction>,
-    // Each Hercules function maps to a manifest.
-    pub manifests: HashMap<String, Manifest>,
-}
-
-/*
- * A schedule IR function consists of a CFG of basic blocks, each containing
- * instructions. Instructions can produce virtual register outputs, and SSA form
- * is guaranteed. SFunctions can have multiple parameters and return values -
- * many values may cross partition boundaries at once.
- *
- * Since SFunctions represent partitions, many SFunctions don't "return".
- * Instead, conceptually they "jump" to the next partition to run. SFunctions
- * that jump to another partition contain the "PartitionExit" instruction, while
- * SFunctions that return from the Hercules function contain the "Return"
- * instruction. An SFunction must contain either PartitionExits xor Returns.
- *
- * There are two special kinds of basic blocks for representing fork-joins:
- * parallel blocks and reduce blocks. Each parallel / reduce block is associated
- * with a unique ID per fork-join. A parallel block can contain a "ThreadID"
- * instruction, which gets the Nth thread ID. A reduce block can contain a
- * "Reduce" instruction, which gets the last value of the Mth reduction
- * variable. When jumping to a parallel block, a u64 must be provided per fork
- * dimension, specifying how many threads should spawn, and an initial value per
- * reduction variable must be provided. When jumping out of a reduce block, a
- * "new" value for each reduction variable must be provided.
- */
-#[derive(Debug, Default, Clone)]
-pub struct SFunction {
-    pub blocks: Vec<SBlock>,
-    pub param_types: Vec<SType>,
-    pub return_types: Vec<SType>,
-}
-
-impl SFunction {
-    pub fn get_inst(&self, id: InstID) -> &SInst {
-        &self.blocks[id.idx_0()].insts[id.idx_1()]
-    }
-
-    pub fn get_inst_mut(&mut self, id: InstID) -> &mut SInst {
-        &mut self.blocks[id.idx_0()].insts[id.idx_1()]
-    }
-}
-
-/*
- * Use a very simple representation for blocks, since modification is not a
- * priority. Unlike many IRs (say LLVM), the instructions in schedule IR blocks
- * aren't necessarily ordered, as different backends may have different
- * scheduling considerations. This means that, for example, each SBlock must
- * contain exactly one terminating instruction, but the position of that
- * instruction may not be at the end of the block. All that's required is that
- * defs precede uses, and that loads and stores to array references are ordered.
- */
-#[derive(Debug, Default, Clone)]
-pub struct SBlock {
-    pub insts: Vec<SInst>,
-    // The virtual registers created by each instruction. Technically, this will
-    // assign instructions like ArrayStores and Regions a virtual register,
-    // which doesn't make sense. These virtual registers are just ignored. Each
-    // virtual register has a certain schedule IR type. The type of virtual
-    // registers produced by certain instructions, like Jump or ArrayStore, is
-    // set to SType::Boolean, but it's not meaningful.
-    pub virt_regs: Vec<(usize, SType)>,
-    // Map from instruction index in the block to a list of schedules attached
-    // to that instruction.
-    pub schedules: HashMap<usize, Vec<SSchedule>>,
-    pub kind: SBlockKind,
-}
-
-impl SBlock {
-    pub fn successors(&self) -> BlockSuccessors {
-        self.insts
-            .iter()
-            .map(|inst| inst.block_successors())
-            .filter(|successors| *successors != BlockSuccessors::Zero)
-            .next()
-            .unwrap_or(BlockSuccessors::Zero)
-    }
-}
-
-#[derive(Debug, Default, Clone, PartialEq, Eq)]
-pub enum SBlockKind {
-    #[default]
-    Sequential,
-    Parallel(ForkJoinID),
-    Reduce(ForkJoinID),
-}
-
-impl SBlockKind {
-    pub fn try_parallel(&self) -> Option<ForkJoinID> {
-        if let SBlockKind::Parallel(id) = self {
-            Some(*id)
-        } else {
-            None
-        }
-    }
-
-    pub fn try_reduce(&self) -> Option<ForkJoinID> {
-        if let SBlockKind::Reduce(id) = self {
-            Some(*id)
-        } else {
-            None
-        }
-    }
-
-    pub fn try_fork_join_id(&self) -> Option<ForkJoinID> {
-        match self {
-            SBlockKind::Sequential => None,
-            SBlockKind::Parallel(id) | SBlockKind::Reduce(id) => Some(*id),
-        }
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum SSchedule {
-    ParallelLaunch(Box<[usize]>),
-    ParallelReduce,
-    Vectorizable(usize),
-    Associative,
-}
-
-impl SSchedule {
-    pub fn try_parallel_launch(&self) -> Option<&[usize]> {
-        if let SSchedule::ParallelLaunch(tiles) = self {
-            Some(tiles)
-        } else {
-            None
-        }
-    }
-
-    pub fn try_vectorizable(&self) -> Option<usize> {
-        if let SSchedule::Vectorizable(width) = self {
-            Some(*width)
-        } else {
-            None
-        }
-    }
-}
-
-pub fn sched_make_schedule(schedule: &Schedule) -> SSchedule {
-    match schedule {
-        Schedule::ParallelFork(tiles) => SSchedule::ParallelLaunch(tiles.clone()),
-        Schedule::ParallelReduce => SSchedule::ParallelReduce,
-        Schedule::Vectorizable(width) => SSchedule::Vectorizable(*width),
-        Schedule::Associative => SSchedule::Associative,
-    }
-}
-
-/*
- * Unlike Hercules IR, we can represent a reference to an array (so that we
- * don't need to use an array value in this IR). This is fine, since we're not
- * doing much analysis / optimization at this stage, and most platforms we want
- * to target have a similar model for working with arrays anyway. We still need
- * value product types, since the layout of these types may be platform
- * dependent.
- */
-#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub enum SType {
-    Boolean,
-    Integer8,
-    Integer16,
-    Integer32,
-    Integer64,
-    UnsignedInteger8,
-    UnsignedInteger16,
-    UnsignedInteger32,
-    UnsignedInteger64,
-    Float32,
-    Float64,
-    // Don't intern STypes to make developing the code generator easier.
-    Product(Box<[SType]>),
-    // Array types don't include their bounds, since dynamic constants are not
-    // an IR-level concept in schedule IR.
-    ArrayRef(Box<SType>),
-}
-
-impl SType {
-    pub fn is_float(&self) -> bool {
-        match self {
-            SType::Float32 | SType::Float64 => true,
-            _ => false,
-        }
-    }
-
-    pub fn is_unsigned(&self) -> bool {
-        match self {
-            SType::UnsignedInteger8
-            | SType::UnsignedInteger16
-            | SType::UnsignedInteger32
-            | SType::UnsignedInteger64 => true,
-            _ => false,
-        }
-    }
-
-    pub fn is_signed(&self) -> bool {
-        match self {
-            SType::Integer8 | SType::Integer16 | SType::Integer32 | SType::Integer64 => true,
-            _ => false,
-        }
-    }
-
-    pub fn is_integer(&self) -> bool {
-        self.is_unsigned() || self.is_signed() || *self == SType::Boolean
-    }
-
-    pub fn num_bits(&self) -> u8 {
-        match self {
-            SType::Boolean => 1,
-            SType::Integer8 | SType::UnsignedInteger8 => 8,
-            SType::Integer16 | SType::UnsignedInteger16 => 16,
-            SType::Integer32 | SType::UnsignedInteger32 | SType::Float32 => 32,
-            SType::Integer64 | SType::UnsignedInteger64 | SType::Float64 => 64,
-            _ => panic!(),
-        }
-    }
-
-    pub fn try_product(&self) -> Option<&[SType]> {
-        if let SType::Product(fields) = self {
-            Some(fields)
-        } else {
-            None
-        }
-    }
-}
-
-/*
- * Represents constants, except for array constants.
- */
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum SConstant {
-    Boolean(bool),
-    Integer8(i8),
-    Integer16(i16),
-    Integer32(i32),
-    Integer64(i64),
-    UnsignedInteger8(u8),
-    UnsignedInteger16(u16),
-    UnsignedInteger32(u32),
-    UnsignedInteger64(u64),
-    Float32(ordered_float::OrderedFloat<f32>),
-    Float64(ordered_float::OrderedFloat<f64>),
-    // Don't intern SConstants to make developing the code generator easier.
-    Product(Box<[SConstant]>),
-}
-
-impl SConstant {
-    pub fn get_type(&self) -> SType {
-        match self {
-            SConstant::Boolean(_) => SType::Boolean,
-            SConstant::Integer8(_) => SType::Integer8,
-            SConstant::Integer16(_) => SType::Integer16,
-            SConstant::Integer32(_) => SType::Integer32,
-            SConstant::Integer64(_) => SType::Integer64,
-            SConstant::UnsignedInteger8(_) => SType::UnsignedInteger8,
-            SConstant::UnsignedInteger16(_) => SType::UnsignedInteger16,
-            SConstant::UnsignedInteger32(_) => SType::UnsignedInteger32,
-            SConstant::UnsignedInteger64(_) => SType::UnsignedInteger64,
-            SConstant::Float32(_) => SType::Float32,
-            SConstant::Float64(_) => SType::Float64,
-            SConstant::Product(fields) => {
-                SType::Product(fields.into_iter().map(|field| field.get_type()).collect())
-            }
-        }
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum SValue {
-    Constant(SConstant),
-    VirtualRegister(usize),
-}
-
-impl SValue {
-    pub fn try_virt_reg(&self) -> Option<usize> {
-        if let SValue::VirtualRegister(vr) = self {
-            Some(*vr)
-        } else {
-            None
-        }
-    }
-
-    pub fn try_constant(&self) -> Option<&SConstant> {
-        if let SValue::Constant(cons) = self {
-            Some(cons)
-        } else {
-            None
-        }
-    }
-}
-
-/*
- * Typical instructions of a CFG + SSA IR, plus some instructions for
- * representing particular Hercules IR quirks.
- */
-#[derive(Debug, Clone)]
-pub enum SInst {
-    Phi {
-        inputs: Box<[(BlockID, SValue)]>,
-    },
-    ThreadID {
-        dimension: usize,
-        fork_join: ForkJoinID,
-    },
-    ReductionVariable {
-        number: usize,
-    },
-    Jump {
-        target: BlockID,
-        parallel_entry: Option<ParallelEntry>,
-        reduce_exit: Option<ReduceExit>,
-    },
-    Branch {
-        cond: SValue,
-        false_target: BlockID,
-        true_target: BlockID,
-    },
-    PartitionExit {
-        data_outputs: Box<[SValue]>,
-    },
-    Return {
-        value: SValue,
-    },
-    Unary {
-        input: SValue,
-        op: SUnaryOperator,
-    },
-    Binary {
-        left: SValue,
-        right: SValue,
-        op: SBinaryOperator,
-    },
-    Ternary {
-        first: SValue,
-        second: SValue,
-        third: SValue,
-        op: STernaryOperator,
-    },
-    IntrinsicCall {
-        intrinsic: Intrinsic,
-        args: Box<[SValue]>,
-    },
-    ProductExtract {
-        product: SValue,
-        indices: Box<[usize]>,
-    },
-    ProductInsert {
-        product: SValue,
-        data: SValue,
-        indices: Box<[usize]>,
-    },
-    ArrayLoad {
-        array: SValue,
-        position: Box<[SValue]>,
-        bounds: Box<[SValue]>,
-    },
-    ArrayStore {
-        array: SValue,
-        value: SValue,
-        position: Box<[SValue]>,
-        bounds: Box<[SValue]>,
-    },
-}
-
-impl SInst {
-    pub fn is_reduction_variable(&self) -> bool {
-        if let SInst::ReductionVariable { number: _ } = self {
-            true
-        } else {
-            false
-        }
-    }
-
-    pub fn is_phi(&self) -> bool {
-        if let SInst::Phi { inputs: _ } = self {
-            true
-        } else {
-            false
-        }
-    }
-
-    pub fn is_jump(&self) -> bool {
-        if let SInst::Jump {
-            target: _,
-            parallel_entry: _,
-            reduce_exit: _,
-        } = self
-        {
-            true
-        } else {
-            false
-        }
-    }
-
-    pub fn is_branch(&self) -> bool {
-        if let SInst::Branch {
-            cond: _,
-            false_target: _,
-            true_target: _,
-        } = self
-        {
-            true
-        } else {
-            false
-        }
-    }
-
-    pub fn is_partition_exit(&self) -> bool {
-        if let SInst::PartitionExit { data_outputs: _ } = self {
-            true
-        } else {
-            false
-        }
-    }
-
-    pub fn is_return(&self) -> bool {
-        if let SInst::Return { value: _ } = self {
-            true
-        } else {
-            false
-        }
-    }
-
-    pub fn is_terminator(&self) -> bool {
-        self.is_jump() || self.is_branch() || self.is_partition_exit() || self.is_return()
-    }
-
-    pub fn try_thread_id(&self) -> Option<(usize, ForkJoinID)> {
-        if let SInst::ThreadID {
-            dimension,
-            fork_join,
-        } = self
-        {
-            Some((*dimension, *fork_join))
-        } else {
-            None
-        }
-    }
-
-    pub fn try_reduction_variable(&self) -> Option<usize> {
-        if let SInst::ReductionVariable { number } = self {
-            Some(*number)
-        } else {
-            None
-        }
-    }
-
-    pub fn try_jump(&self) -> Option<(BlockID, Option<&ParallelEntry>, Option<&ReduceExit>)> {
-        if let SInst::Jump {
-            target,
-            parallel_entry,
-            reduce_exit,
-        } = self
-        {
-            Some((*target, parallel_entry.as_ref(), reduce_exit.as_ref()))
-        } else {
-            None
-        }
-    }
-
-    pub fn block_successors(&self) -> BlockSuccessors {
-        match self {
-            SInst::Jump {
-                target,
-                parallel_entry: _,
-                reduce_exit: _,
-            } => BlockSuccessors::One([*target]),
-            SInst::Branch {
-                cond: _,
-                false_target,
-                true_target,
-            } => BlockSuccessors::Two([*false_target, *true_target]),
-            _ => BlockSuccessors::Zero,
-        }
-    }
-
-    pub fn upper_case_name(&self) -> &'static str {
-        match self {
-            SInst::Phi { inputs: _ } => "Phi",
-            SInst::ThreadID {
-                dimension: _,
-                fork_join: _,
-            } => "ThreadID",
-            SInst::ReductionVariable { number: _ } => "ReductionVariable",
-            SInst::Jump {
-                target: _,
-                parallel_entry: _,
-                reduce_exit: _,
-            } => "Jump",
-            SInst::Branch {
-                cond: _,
-                false_target: _,
-                true_target: _,
-            } => "Branch",
-            SInst::PartitionExit { data_outputs: _ } => "PartitionExit",
-            SInst::Return { value: _ } => "Return",
-            SInst::Unary { input: _, op } => op.upper_case_name(),
-            SInst::Binary {
-                left: _,
-                right: _,
-                op,
-            } => op.upper_case_name(),
-            SInst::Ternary {
-                first: _,
-                second: _,
-                third: _,
-                op,
-            } => op.upper_case_name(),
-            SInst::IntrinsicCall { intrinsic, args: _ } => intrinsic.upper_case_name(),
-            SInst::ProductExtract {
-                product: _,
-                indices: _,
-            } => "ProductExtract",
-            SInst::ProductInsert {
-                product: _,
-                data: _,
-                indices: _,
-            } => "ProductInsert",
-            SInst::ArrayLoad {
-                array: _,
-                position: _,
-                bounds: _,
-            } => "ArrayLoad",
-            SInst::ArrayStore {
-                array: _,
-                value: _,
-                position: _,
-                bounds: _,
-            } => "ArrayStore",
-        }
-    }
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub enum BlockSuccessors {
-    Zero,
-    One([BlockID; 1]),
-    Two([BlockID; 2]),
-}
-
-impl AsRef<[BlockID]> for BlockSuccessors {
-    fn as_ref(&self) -> &[BlockID] {
-        match self {
-            BlockSuccessors::Zero => &[],
-            BlockSuccessors::One(x) => x,
-            BlockSuccessors::Two(x) => x,
-        }
-    }
-}
-
-/*
- * On entering a parallel section, we need to specify how many threads to spawn
- * and what the initial values of the reduction variables are.
- */
-#[derive(Debug, Clone)]
-pub struct ParallelEntry {
-    pub thread_counts: Box<[SValue]>,
-    pub reduce_inits: Box<[SValue]>,
-}
-
-/*
- * On exiting a reduce section, we need to specify which instructions in the
- * reduce block correspond to what reduction variables. This also specifies
- * which values defined inside the reduce block can be used outside the block.
- */
-#[derive(Debug, Clone)]
-pub struct ReduceExit {
-    pub reduce_reducts: Box<[SValue]>,
-}
-
-/*
- * The operator types are mostly the same.
- */
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum SUnaryOperator {
-    Not,
-    Neg,
-    Cast(SType),
-}
-
-impl SUnaryOperator {
-    pub fn upper_case_name(&self) -> &'static str {
-        match self {
-            SUnaryOperator::Not => "Not",
-            SUnaryOperator::Neg => "Neg",
-            SUnaryOperator::Cast(_) => "Cast",
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum SBinaryOperator {
-    Add,
-    Sub,
-    Mul,
-    Div,
-    Rem,
-    LT,
-    LTE,
-    GT,
-    GTE,
-    EQ,
-    NE,
-    Or,
-    And,
-    Xor,
-    LSh,
-    RSh,
-}
-
-impl SBinaryOperator {
-    pub fn upper_case_name(&self) -> &'static str {
-        match self {
-            SBinaryOperator::Add => "Add",
-            SBinaryOperator::Sub => "Sub",
-            SBinaryOperator::Mul => "Mul",
-            SBinaryOperator::Div => "Div",
-            SBinaryOperator::Rem => "Rem",
-            SBinaryOperator::LT => "LT",
-            SBinaryOperator::LTE => "LTE",
-            SBinaryOperator::GT => "GT",
-            SBinaryOperator::GTE => "GTE",
-            SBinaryOperator::EQ => "EQ",
-            SBinaryOperator::NE => "NE",
-            SBinaryOperator::Or => "Or",
-            SBinaryOperator::And => "And",
-            SBinaryOperator::Xor => "Xor",
-            SBinaryOperator::LSh => "LSh",
-            SBinaryOperator::RSh => "RSh",
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum STernaryOperator {
-    Select,
-}
-
-impl STernaryOperator {
-    pub fn upper_case_name(&self) -> &'static str {
-        match self {
-            STernaryOperator::Select => "Select",
-        }
-    }
-}
-
-pub type SFunctionName = String;
-
-define_id_type!(ArrayID);
-define_id_type!(BlockID);
-define_id_type!(ForkJoinID);
-
-define_dual_id_type!(InstID);
diff --git a/hercules_cg/src/sched_schedule.rs b/hercules_cg/src/sched_schedule.rs
deleted file mode 100644
index 4ba407f0e316e6611e311ff095bff6121a922c5a..0000000000000000000000000000000000000000
--- a/hercules_cg/src/sched_schedule.rs
+++ /dev/null
@@ -1,465 +0,0 @@
-extern crate hercules_ir;
-
-use std::collections::{HashMap, HashSet, VecDeque};
-use std::iter::{empty, once, zip};
-
-use self::hercules_ir::*;
-
-use crate::*;
-
-/*
- * Iterate over the uses of a instruction.
- */
-pub fn sched_get_uses(inst: &SInst) -> Box<dyn Iterator<Item = &SValue> + '_> {
-    match inst {
-        SInst::Phi { inputs } => Box::new(inputs.iter().map(|(_, svalue)| svalue)),
-        SInst::ThreadID {
-            dimension: _,
-            fork_join: _,
-        } => Box::new(empty()),
-        SInst::ReductionVariable { number: _ } => Box::new(empty()),
-        SInst::Jump {
-            target: _,
-            parallel_entry,
-            reduce_exit,
-        } => {
-            let first = parallel_entry.as_ref().map(|parallel_entry| {
-                parallel_entry
-                    .thread_counts
-                    .iter()
-                    .chain(parallel_entry.reduce_inits.iter())
-            });
-            let second = reduce_exit
-                .as_ref()
-                .map(|reduce_exit| reduce_exit.reduce_reducts.iter());
-            match (first, second) {
-                (Some(first), Some(second)) => Box::new(first.chain(second)),
-                (Some(first), None) => Box::new(first),
-                (None, Some(second)) => Box::new(second),
-                (None, None) => Box::new(empty()),
-            }
-        }
-        SInst::Branch {
-            cond,
-            false_target: _,
-            true_target: _,
-        } => Box::new(once(cond)),
-        SInst::PartitionExit { data_outputs } => Box::new(data_outputs.iter()),
-        SInst::Return { value } => Box::new(once(value)),
-        SInst::Unary { input, op: _ } => Box::new(once(input)),
-        SInst::Binary { left, right, op: _ } => Box::new(once(left).chain(once(right))),
-        SInst::Ternary {
-            first,
-            second,
-            third,
-            op: _,
-        } => Box::new(once(first).chain(once(second)).chain(once(third))),
-        SInst::IntrinsicCall { intrinsic: _, args } => Box::new(args.iter()),
-        SInst::ProductExtract {
-            product,
-            indices: _,
-        } => Box::new(once(product)),
-        SInst::ProductInsert {
-            product,
-            data,
-            indices: _,
-        } => Box::new(once(product).chain(once(data))),
-        SInst::ArrayLoad {
-            array,
-            position,
-            bounds,
-        } => Box::new(once(array).chain(position.iter()).chain(bounds.iter())),
-        SInst::ArrayStore {
-            array,
-            value,
-            position,
-            bounds,
-        } => Box::new(
-            once(array)
-                .chain(once(value))
-                .chain(position.iter())
-                .chain(bounds.iter()),
-        ),
-    }
-}
-
-/*
- * Map virtual registers to corresponding instruction IDs.
- */
-pub fn sched_virt_reg_to_inst_id(function: &SFunction) -> HashMap<usize, InstID> {
-    let mut virt_reg_to_inst_id = HashMap::new();
-    for block_idx in 0..function.blocks.len() {
-        let block = &function.blocks[block_idx];
-        for inst_idx in 0..block.insts.len() {
-            let virt_reg = block.virt_regs[inst_idx].0;
-            let inst_id = InstID::new(block_idx, inst_idx);
-            virt_reg_to_inst_id.insert(virt_reg, inst_id);
-        }
-    }
-    virt_reg_to_inst_id
-}
-
-/*
- * Build a dependency graph of instructions in an SFunction.
- */
-pub fn sched_dependence_graph(
-    function: &SFunction,
-    virt_reg_to_inst_id: &HashMap<usize, InstID>,
-) -> HashMap<InstID, Vec<InstID>> {
-    let mut dep_graph = HashMap::new();
-    for inst_id in virt_reg_to_inst_id.values() {
-        dep_graph.insert(*inst_id, vec![]);
-    }
-
-    // Process the dependencies in each block. This includes inter-block
-    // dependencies for normal def-use edges.
-    for block_idx in 0..function.blocks.len() {
-        let block = &function.blocks[block_idx];
-
-        // Add normal dependencies.
-        for inst_idx in 0..block.insts.len() {
-            let inst_id = InstID::new(block_idx, inst_idx);
-            let inst = &block.insts[inst_idx];
-            for use_sval in sched_get_uses(inst) {
-                if let SValue::VirtualRegister(virt_reg) = use_sval {
-                    // Uses of parameters don't correspond to any instruction we
-                    // need to depend on.
-                    if let Some(use_id) = virt_reg_to_inst_id.get(virt_reg) {
-                        let deps = dep_graph.get_mut(&inst_id).unwrap();
-                        if !deps.contains(use_id) {
-                            deps.push(*use_id);
-                        }
-                    }
-                }
-            }
-        }
-
-        // Phis should appear at the top of linear basic blocks.
-
-        // Add dependencies between the phis.
-        let mut last_phi = None;
-        for inst_idx in 0..block.insts.len() {
-            let inst_id = InstID::new(block_idx, inst_idx);
-            let inst = &block.insts[inst_idx];
-            if inst.is_phi() {
-                if let Some(last_phi) = last_phi {
-                    let deps = dep_graph.get_mut(&inst_id).unwrap();
-                    if !deps.contains(&last_phi) {
-                        deps.push(last_phi);
-                    }
-                }
-                last_phi = Some(inst_id);
-            }
-        }
-
-        // If there is at least one phi, add a dependency between the "last" phi
-        // and every non-phi instruction with no dependencies yet.
-        if let Some(last_phi) = last_phi {
-            for inst_idx in 0..block.insts.len() {
-                let inst_id = InstID::new(block_idx, inst_idx);
-                let inst = &block.insts[inst_idx];
-                if !inst.is_phi() {
-                    let deps = dep_graph.get_mut(&inst_id).unwrap();
-                    if deps.is_empty() {
-                        deps.push(last_phi);
-                    }
-                }
-            }
-        }
-
-        // Terminator instructions appear at the bottom of linear basic blocks.
-
-        // Find every non-terminator instruction with no users.
-        let mut no_user_insts = (0..block.insts.len())
-            .filter(|inst_idx| !block.insts[*inst_idx].is_terminator())
-            .map(|inst_idx| InstID::new(block_idx, inst_idx))
-            .collect::<HashSet<_>>();
-        for inst_idx in 0..block.insts.len() {
-            let inst_id = InstID::new(block_idx, inst_idx);
-            for dep in dep_graph[&inst_id].iter() {
-                no_user_insts.remove(dep);
-            }
-        }
-
-        // Add a dependency between each instruction with no users (previously)
-        // and each terminator instruction.
-        for inst_idx in 0..block.insts.len() {
-            let inst_id = InstID::new(block_idx, inst_idx);
-            let inst = &block.insts[inst_idx];
-            if inst.is_terminator() {
-                let deps = dep_graph.get_mut(&inst_id).unwrap();
-                for no_user_inst in no_user_insts.iter() {
-                    if !deps.contains(no_user_inst) {
-                        deps.push(*no_user_inst);
-                    }
-                }
-            }
-        }
-    }
-
-    dep_graph
-}
-
-/*
- * Assemble a map from SValue to SType.
- */
-pub fn sched_svalue_types(function: &SFunction) -> HashMap<SValue, SType> {
-    let mut result = HashMap::new();
-
-    // Add types of parameters.
-    for (param_idx, param_ty) in function.param_types.iter().enumerate() {
-        result.insert(SValue::VirtualRegister(param_idx), param_ty.clone());
-    }
-
-    // Add types of instructions and constants.
-    for block in function.blocks.iter() {
-        for (inst, (virt_reg, sty)) in zip(block.insts.iter(), block.virt_regs.iter()) {
-            // Add the type of the output of the instruction.
-            result.insert(SValue::VirtualRegister(*virt_reg), sty.clone());
-
-            // Find constants inthe uses of instructions.
-            for u in sched_get_uses(inst) {
-                if let SValue::Constant(cons) = u {
-                    result.insert(u.clone(), cons.get_type());
-                }
-            }
-        }
-    }
-
-    result
-}
-
-/*
- * Analysis information for one fork-join.
- */
-#[derive(Debug)]
-pub struct ParallelReduceInfo {
-    // The block that jumps into the parallel section.
-    pub predecessor: BlockID,
-    // The block that is jumped into after the reduce section.
-    pub successor: BlockID,
-
-    // The first parallel block in the parallel section.
-    pub top_parallel_block: BlockID,
-    // The parallel block that jumps to the reduce section.
-    pub bottom_parallel_block: BlockID,
-    // The single block in the reduce section.
-    pub reduce_block: BlockID,
-
-    // The thread count SValues used for this fork-join.
-    pub thread_counts: Box<[SValue]>,
-    // The initial SValues for the reduction variables.
-    pub reduce_inits: Box<[SValue]>,
-    // The reduct SValues for the reduction variables.
-    pub reduce_reducts: Box<[SValue]>,
-
-    // Map from thread ID dimension to virtual registers of corresponding thread
-    // ID instructions.
-    pub thread_ids: HashMap<usize, Vec<usize>>,
-    // Map from reduction variable number to virtual register of the
-    // corresponding reduction variable instruction.
-    pub reduction_variables: HashMap<usize, usize>,
-
-    // If this parallel-reduce section is inside another parallel-reduce, store
-    // the parent's ForkJoinID. Parallel-reduce sections in an SFunction form a
-    // forest.
-    pub parent_fork_join_id: Option<ForkJoinID>,
-
-    // Information about how this fork-join should be scheduled. Collecting this
-    // info here just makes writing the backends more convenient.
-    pub vector_width: Option<usize>,
-    // For each reduction variable, track if its associative or parallel
-    // individually.
-    pub associative_reduce: HashMap<usize, bool>,
-    pub parallel_reduce: HashMap<usize, bool>,
-    // Track if this is a "top-level" parallel-reduce. That is, the parallel-
-    // reduce is the "only thing" inside this partition function. Only these
-    // parallel-reduces can be parallelized on the CPU, even if this parallel-
-    // reduce has a parallel schedule on the entry jump.
-    pub top_level: bool,
-}
-
-/*
- * Analyze parallel-reduce sections to make lowering them easier. Returns a map
- * from ForkJoinID to information about that parallel-reduce section.
- */
-pub fn sched_parallel_reduce_sections(
-    function: &SFunction,
-) -> HashMap<ForkJoinID, ParallelReduceInfo> {
-    let mut result = HashMap::new();
-
-    for (block_idx, block) in function.blocks.iter().enumerate() {
-        // Start by identifying a jump into a parallel section.
-        for (inst_idx, inst) in block.insts.iter().enumerate() {
-            if let SInst::Jump {
-                target,
-                parallel_entry,
-                reduce_exit: _,
-            } = inst
-                && let Some(parallel_entry) = parallel_entry
-            {
-                let predecessor = BlockID::new(block_idx);
-                let ParallelEntry {
-                    thread_counts,
-                    reduce_inits,
-                } = parallel_entry.clone();
-                let vector_width = block.schedules[&inst_idx]
-                    .iter()
-                    .filter_map(|schedule| schedule.try_vectorizable())
-                    .next();
-
-                // The jump target is the top of the parallel section. Get the
-                // fork-join ID from that block.
-                let top_parallel_block = *target;
-                let fork_join_id = function.blocks[top_parallel_block.idx()]
-                    .kind
-                    .try_parallel()
-                    .unwrap();
-
-                // Traverse the blocks until finding a jump to the corresponding
-                // reduce block.
-                let mut queue = VecDeque::from(vec![top_parallel_block]);
-                let mut visited = HashSet::new();
-                visited.insert(top_parallel_block);
-                let mut bfs_dest = None;
-                while let Some(bfs) = queue.pop_front() {
-                    for succ in function.blocks[bfs.idx()].successors().as_ref() {
-                        if let Some(reduce_fork_join_id) =
-                            function.blocks[succ.idx()].kind.try_reduce()
-                            && reduce_fork_join_id == fork_join_id
-                        {
-                            bfs_dest = Some((bfs, *succ));
-                        } else if !visited.contains(succ) {
-                            queue.push_back(*succ);
-                            visited.insert(*succ);
-                        }
-                    }
-                }
-                let (bottom_parallel_block, reduce_block) = bfs_dest.unwrap();
-
-                // Find the jump out of the reduce block.
-                let (successor, _, reduce_exit) = function.blocks[reduce_block.idx()]
-                    .insts
-                    .iter()
-                    .filter_map(|inst| inst.try_jump())
-                    .next()
-                    .unwrap();
-                let reduce_reducts = reduce_exit.unwrap().reduce_reducts.clone();
-
-                // Find the thread ID instructions.
-                let mut thread_ids = (0..thread_counts.len())
-                    .map(|dim| (dim, vec![]))
-                    .collect::<HashMap<usize, Vec<usize>>>();
-                for parallel_block in visited {
-                    for (inst, (virt_reg, _)) in zip(
-                        function.blocks[parallel_block.idx()].insts.iter(),
-                        function.blocks[parallel_block.idx()].virt_regs.iter(),
-                    ) {
-                        if let Some((dim, tid_fork_join)) = inst.try_thread_id()
-                            && tid_fork_join == fork_join_id
-                        {
-                            thread_ids.get_mut(&dim).unwrap().push(*virt_reg);
-                        }
-                    }
-                }
-
-                // Find the reduction variable instructions.
-                let mut associative_reduce = HashMap::new();
-                let mut parallel_reduce = HashMap::new();
-                let reduce_sblock = &function.blocks[reduce_block.idx()];
-                let reduction_variables = zip(
-                    reduce_sblock.insts.iter().enumerate(),
-                    reduce_sblock.virt_regs.iter(),
-                )
-                .filter_map(|((inst_idx, inst), (virt_reg, _))| {
-                    inst.try_reduction_variable().map(|number| {
-                        let schedules = &reduce_sblock.schedules[&inst_idx];
-                        associative_reduce
-                            .insert(number, schedules.contains(&SSchedule::Associative));
-                        parallel_reduce
-                            .insert(number, schedules.contains(&SSchedule::ParallelReduce));
-                        (number, *virt_reg)
-                    })
-                })
-                .collect();
-
-                // Assemble all of the info and add it to the map.
-                let info = ParallelReduceInfo {
-                    predecessor,
-                    successor,
-
-                    top_parallel_block,
-                    bottom_parallel_block,
-                    reduce_block,
-
-                    thread_counts,
-                    reduce_inits,
-                    reduce_reducts,
-
-                    thread_ids,
-                    reduction_variables,
-
-                    parent_fork_join_id: None,
-                    vector_width,
-                    associative_reduce,
-                    parallel_reduce,
-
-                    top_level: false,
-                };
-                result.insert(fork_join_id, info);
-            }
-        }
-    }
-
-    // Figure out if any parallel-reduces are top level - that is, they are the
-    // "only thing" in the partition function.
-    for (_, parallel_reduce_info) in result.iter_mut() {
-        // A parallel-reduce is top-level if its predecessor is the entry block
-        // containing only a jump and its successor is an exit block containing
-        // just a function terminator.
-        let pred_block = &function.blocks[parallel_reduce_info.predecessor.idx()];
-        let succ_block = &function.blocks[parallel_reduce_info.successor.idx()];
-        if parallel_reduce_info.predecessor == BlockID::new(0)
-            && pred_block.insts.len() == 1
-            && pred_block.insts[0].is_jump()
-            && succ_block.insts.len() == 1
-            && (succ_block.insts[0].is_partition_exit() || succ_block.insts[0].is_return())
-        {
-            parallel_reduce_info.top_level = true;
-        }
-    }
-
-    // Compute the parallel-reduce forest last, since this requires some info we
-    // just computed above.
-    let mut parents = HashMap::new();
-    for (fork_join_id, parallel_reduce_info) in result.iter() {
-        let mut pred_block = parallel_reduce_info.predecessor;
-
-        // Keep looking at predecessors of adjacent parallel-reduce sections
-        // until one belongs to a parent parallel-reduce or is sequential, so
-        // this parallel-reduce is a root.
-        let parent = loop {
-            match function.blocks[pred_block.idx()].kind {
-                // If the predecessor is sequential, then this parallel-reduce
-                // is a root.
-                SBlockKind::Sequential => break None,
-                // If the predecessor is parallel, then this parallel-reduce is
-                // inside that parallel-reduce.
-                SBlockKind::Parallel(parent) => break Some(parent),
-                // If the predecessor is reduce, then that parallel-reduce is a
-                // child of the same parent. Iterate on its predecessor.
-                SBlockKind::Reduce(adjacent) => {
-                    pred_block = result[&adjacent].predecessor;
-                }
-            }
-        };
-        parents.insert(*fork_join_id, parent);
-    }
-
-    // Insert the information into the parallel reduce info map.
-    for (fork_join_id, parallel_reduce_info) in result.iter_mut() {
-        parallel_reduce_info.parent_fork_join_id = parents[fork_join_id];
-    }
-
-    result
-}
diff --git a/hercules_ir/src/callgraph.rs b/hercules_ir/src/callgraph.rs
index 84be922dea8a89732bf2f2ad0d9fe3f3865d5d90..3a8e6316f8213b2c665e2ad34034d070a495c0cb 100644
--- a/hercules_ir/src/callgraph.rs
+++ b/hercules_ir/src/callgraph.rs
@@ -43,6 +43,37 @@ impl CallGraph {
     pub fn num_functions(&self) -> usize {
         self.first_callees.len()
     }
+
+    // Topologically sort the functions so that callees precede their callers.
+    pub fn topo(&self) -> Vec<FunctionID> {
+        let mut pending: Vec<usize> = (0..self.num_functions())
+            .map(|idx| self.num_callees(FunctionID::new(idx)))
+            .collect();
+        let mut ready: Vec<FunctionID> = (0..self.num_functions())
+            .filter(|idx| pending[*idx] == 0)
+            .map(FunctionID::new)
+            .collect();
+        let mut order = vec![];
+        while let Some(func) = ready.pop() {
+            order.push(func);
+            for caller in self.get_callers(func) {
+                let remaining = &mut pending[caller.idx()];
+                *remaining -= 1;
+                if *remaining == 0 {
+                    ready.push(*caller);
+                }
+            }
+        }
+
+        // Mutual recursion is not currently supported, so every function must
+        // have been emitted; a shorter order means the call graph has a cycle.
+        assert_eq!(
+            order.len(),
+            self.num_functions(),
+            "PANIC: Found mutual recursion in Hercules IR."
+        );
+        order
+    }
 }
 
 /*
diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs
index 8c36c8ad31ca1e196e8a13778239d6f2a83080d4..d23a49729e1b76c46dc85b6d850d56e8914dd213 100644
--- a/hercules_ir/src/dot.rs
+++ b/hercules_ir/src/dot.rs
@@ -42,9 +42,10 @@ pub fn xdot_module(
     file.write_all(contents.as_bytes())
         .expect("PANIC: Unable to write output file contents.");
     Command::new("xdot")
-        .args([tmp_path])
+        .args([&tmp_path])
         .output()
         .expect("PANIC: Couldn't execute xdot. Is xdot installed?");
+    println!("Graphviz written to: {}", tmp_path.display());
 }
 
 /*
diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs
index 77a7ed3c971ecc68ebeb3393dc818d20013fb978..cef8e43ae9b6448a9c0239df12a9d94976b0b751 100644
--- a/hercules_ir/src/ir.rs
+++ b/hercules_ir/src/ir.rs
@@ -545,6 +545,58 @@ pub fn constants_bottom_up(constants: &Vec<Constant>) -> impl Iterator<Item = Co
     }
 }
 
+/*
+ * Create an iterator that yields the ID of every dynamic constant in the
+ * module once, bottom up: the operands of an arithmetic dynamic constant are
+ * yielded before the constant itself. The traversal runs inside a coroutine.
+ */
+pub fn dynamic_constants_bottom_up(
+    dynamic_constants: &Vec<DynamicConstant>,
+) -> impl Iterator<Item = DynamicConstantID> + '_ {
+    let mut visited = bitvec![u8, Lsb0; 0; dynamic_constants.len()];
+    let mut stack = (0..dynamic_constants.len())
+        .map(DynamicConstantID::new)
+        .collect::<Vec<DynamicConstantID>>();
+    let coroutine = #[coroutine]
+    move || {
+        // Since this is a coroutine, handle recursion manually with an explicit stack.
+        while let Some(id) = stack.pop() {
+            if visited[id.idx()] {
+                continue;
+            }
+            match dynamic_constants[id.idx()] {
+                DynamicConstant::Add(left, right)
+                | DynamicConstant::Sub(left, right)
+                | DynamicConstant::Mul(left, right)
+                | DynamicConstant::Div(left, right)
+                | DynamicConstant::Rem(left, right) => {
+                    // We have to yield the children of this node before
+                    // this node itself. We keep track of which nodes have
+                    // already been yielded using visited.
+                    let can_yield = visited[left.idx()] && visited[right.idx()];
+                    if can_yield {
+                        visited.set(id.idx(), true);
+                        yield id;
+                    } else {
+                        // Push ourselves, then children, so that children
+                        // get popped (and yielded) before we are revisited.
+                        stack.push(id);
+                        stack.push(left);
+                        stack.push(right);
+                    }
+                }
+                _ => {
+                    visited.set(id.idx(), true);
+                    yield id;
+                }
+            }
+        }
+    };
+    CoroutineIterator {
+        coroutine: Box::new(coroutine),
+    }
+}
+
 struct CoroutineIterator<G, I>
 where
     G: Coroutine<Yield = I, Return = ()> + Unpin,
@@ -671,6 +723,16 @@ impl Type {
         }
     }
 
+    // Check if this type is one of the signed fixed-width integer types
+    // (Integer8, Integer16, Integer32, or Integer64). Every other type is
+    // reported as not signed.
+    pub fn is_signed(&self) -> bool {
+        matches!(
+            self,
+            Type::Integer8 | Type::Integer16 | Type::Integer32 | Type::Integer64
+        )
+    }
+
     pub fn is_fixed(&self) -> bool {
         match self {
             Type::Integer8 => true,
@@ -685,6 +747,10 @@ impl Type {
         }
     }
 
+    pub fn is_integer(&self) -> bool {
+        self.is_fixed() || self.is_bool()
+    }
+
     pub fn is_float(&self) -> bool {
         match self {
             Type::Float32 => true,
@@ -740,6 +806,17 @@ impl Type {
             None
         }
     }
+
+    pub fn num_bits(&self) -> u8 {
+        match self {
+            Type::Boolean => 1,
+            Type::Integer8 | Type::UnsignedInteger8 => 8,
+            Type::Integer16 | Type::UnsignedInteger16 => 16,
+            Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 32,
+            Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 64,
+            _ => panic!("PANIC: num_bits is only defined for boolean, integer, and float types."),
+        }
+    }
 }
 
 impl Constant {
@@ -774,7 +851,7 @@ impl Constant {
         }
     }
 
-    pub fn is_strictly_scalar(&self) -> bool {
+    pub fn is_scalar(&self) -> bool {
         match self {
             Constant::Boolean(_) => true,
             Constant::Integer8(_) => true,
@@ -1013,6 +1090,8 @@ impl Node {
         }
     );
 
+    define_pattern_predicate!(is_undef, Node::Undef { ty: _ });
+
     pub fn try_region(&self) -> Option<&[NodeID]> {
         if let Node::Region { preds } = self {
             Some(preds)
@@ -1090,6 +1169,14 @@ impl Node {
         }
     }
 
+    // Returns the parameter index if this node is a parameter, else None.
+    pub fn try_parameter(&self) -> Option<usize> {
+        match self {
+            Node::Parameter { index } => Some(*index),
+            _ => None,
+        }
+    }
+
     pub fn try_constant(&self) -> Option<ConstantID> {
         if let Node::Constant { id } = self {
             Some(*id)
@@ -1137,6 +1224,21 @@ impl Node {
         }
     }
 
+    /*
+     * Returns the three operands if this is a ternary node with operator `bop`.
+     */
+    pub fn try_ternary(&self, bop: TernaryOperator) -> Option<(NodeID, NodeID, NodeID)> {
+        match self {
+            Node::Ternary {
+                first,
+                second,
+                third,
+                op,
+            } if *op == bop => Some((*first, *second, *third)),
+            _ => None,
+        }
+    }
+
     pub fn is_zero_constant(&self, constants: &Vec<Constant>) -> bool {
         if let Node::Constant { id } = self
             && constants[id.idx()].is_zero()
diff --git a/hercules_ir/src/loops.rs b/hercules_ir/src/loops.rs
index 5aa6bd19a65f842ab19ac855066ce894e0e568f8..7c9a0a85949efcc248439031601b2fed17f0acf6 100644
--- a/hercules_ir/src/loops.rs
+++ b/hercules_ir/src/loops.rs
@@ -1,8 +1,8 @@
 extern crate bitvec;
 
 use std::collections::hash_map;
-use std::collections::HashMap;
 use std::collections::VecDeque;
+use std::collections::{HashMap, HashSet};
 
 use self::bitvec::prelude::*;
 
@@ -37,6 +37,14 @@ impl LoopTree {
         self.loops.iter()
     }
 
+    pub fn nodes_in_loop(&self, header: NodeID) -> impl Iterator<Item = NodeID> + '_ {
+        self.loops[&header].0.iter_ones().map(NodeID::new) // Yields a NodeID per set bit.
+    }
+
+    pub fn is_in_loop(&self, header: NodeID, is_in: NodeID) -> bool {
+        header == self.root || self.loops[&header].0[is_in.idx()] // The root contains every node.
+    }
+
     /*
      * Sometimes, we need to iterate the loop tree bottom-up. Just assemble the
      * order upfront.
@@ -195,3 +203,98 @@ fn loop_reachability_helper(
         visited
     }
 }
+
+/*
+ * Top level function to calculate reduce cycles. Returns, for each reduce node
+ * in the function, the set of data nodes that form a cycle with that reduce
+ * node.
+ */
+pub fn reduce_cycles(
+    function: &Function,
+    def_use: &ImmutableDefUseMap,
+) -> HashMap<NodeID, HashSet<NodeID>> {
+    // Both traversals below only ever step onto non-control nodes.
+    let is_data = |id: NodeID| !function.nodes[id.idx()].is_control();
+    let mut cycles = HashMap::new();
+
+    // Visit the nodes in ID order, skipping everything but the reduces.
+    for reduce in (0..function.nodes.len()).map(NodeID::new) {
+        if !function.nodes[reduce.idx()].is_reduce() {
+            continue;
+        }
+        let (_, _, reduct) = function.nodes[reduce.idx()].try_reduce().unwrap();
+
+        // First, find all data nodes that the `reduct` input of the reduce
+        // transitively uses, including the `reduct` itself.
+        let mut use_reachable = HashSet::from([reduct]);
+        let mut stack = vec![reduct];
+        while let Some(item) = stack.pop() {
+            for u in get_uses(&function.nodes[item.idx()]).as_ref() {
+                if is_data(*u) && use_reachable.insert(*u) {
+                    stack.push(*u);
+                }
+            }
+        }
+
+        // Second, find all data nodes that are transitive users of the
+        // reduce node.
+        let mut user_reachable = HashSet::new();
+        let mut stack = vec![reduce];
+        while let Some(item) = stack.pop() {
+            for u in def_use.get_users(item) {
+                if is_data(*u) && user_reachable.insert(*u) {
+                    stack.push(*u);
+                }
+            }
+        }
+
+        // Nodes that are both use-reachable and user-reachable by the reduce
+        // node are in the reduce node's cycle.
+        let cycle: HashSet<NodeID> = use_reachable
+            .intersection(&user_reachable)
+            .copied()
+            .collect();
+        cycles.insert(reduce, cycle);
+    }
+
+    cycles
+}
+
+/*
+ * Top level function to calculate which data nodes are "inside" a fork-join,
+ * not including its reduces. The returned map is keyed by fork node and holds
+ * the data nodes reachable from that fork through user edges.
+ */
+pub fn data_nodes_in_fork_joins(
+    function: &Function,
+    def_use: &ImmutableDefUseMap,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+) -> HashMap<NodeID, HashSet<NodeID>> {
+    let mut inside = HashMap::new();
+
+    for (fork, join) in fork_join_map {
+        // Control users and reduces on this fork's join are ignored: they are
+        // neither recorded nor traversed through.
+        let is_excluded = |user: &NodeID| {
+            let node = &function.nodes[user.idx()];
+            node.is_control()
+                || node.try_reduce().is_some_and(|(control, _, _)| control == *join)
+        };
+
+        // Follow user edges transitively, starting from the fork itself, and
+        // collect every user that isn't excluded.
+        let mut members = HashSet::new();
+        let mut stack = vec![*fork];
+        while let Some(item) = stack.pop() {
+            for u in def_use.get_users(item) {
+                if !is_excluded(u) && members.insert(*u) {
+                    stack.push(*u);
+                }
+            }
+        }
+
+        inside.insert(*fork, members);
+    }
+
+    inside
+}
diff --git a/hercules_opt/src/ccp.rs b/hercules_opt/src/ccp.rs
index 1b28db47ac4c59a943ce58aed54e764c5c1ae085..aa3d0e680bfa5ea7b1553e5b31e34e049fcbb600 100644
--- a/hercules_opt/src/ccp.rs
+++ b/hercules_opt/src/ccp.rs
@@ -4,7 +4,6 @@ use std::collections::HashSet;
 use std::iter::zip;
 
 use self::hercules_ir::dataflow::*;
-use self::hercules_ir::def_use::*;
 use self::hercules_ir::ir::*;
 
 use crate::*;
@@ -442,12 +441,18 @@ fn ccp_flow_function(
                 if inputs[reduct.idx()].is_reachable() {
                     constant = ConstantLattice::meet(&constant, &inputs[reduct.idx()].constant);
                 }
-                CCPLattice { reachability, constant }
+                CCPLattice {
+                    reachability,
+                    constant,
+                }
             } else {
-                CCPLattice { reachability, constant: ConstantLattice::top() }
+                CCPLattice {
+                    reachability,
+                    constant: ConstantLattice::top(),
+                }
             }
-        },
-        Node::Return { control, data } => inputs[control.idx()].clone(),
+        }
+        Node::Return { control, data: _ } => inputs[control.idx()].clone(),
         Node::Parameter { index: _ } => CCPLattice::bottom(),
         // A constant node is the "source" of concrete constant lattice values.
         Node::Constant { id } => CCPLattice {
@@ -861,10 +866,7 @@ fn ccp_flow_function(
                 constant: new_constant,
             }
         }
-        Node::Read {
-            collect,
-            indices,
-        } => {
+        Node::Read { collect, indices } => {
             let mut reachability = inputs[collect.idx()].reachability.clone();
             for index in indices.iter() {
                 if let Index::Position(positions) = index {
diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs
index 7f9c9ba2077a6fd343751df812f30da3ea55b2ee..0ff58822180575979430cb98018ca6dc128ae3d9 100644
--- a/hercules_opt/src/editor.rs
+++ b/hercules_opt/src/editor.rs
@@ -218,6 +218,10 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
         &self.function
     }
 
+    pub fn get_dynamic_constants(&self) -> Ref<'_, Vec<DynamicConstant>> {
+        self.dynamic_constants.borrow() // Immutably borrows the dynamic constants.
+    }
+
     pub fn get_users(&self, id: NodeID) -> impl ExactSizeIterator<Item = NodeID> + '_ {
         self.mut_def_use[id.idx()].iter().map(|x| *x)
     }
@@ -276,8 +280,12 @@ impl<'a, 'b> FunctionEdit<'a, 'b> {
         self.editor.dynamic_constants.borrow().len() + self.added_dynamic_constants.len()
     }
 
+    pub fn num_node_ids(&self) -> usize {
+        self.editor.function.nodes.len() + self.added_nodeids.len() // Also counts added nodes.
+    }
+
     pub fn add_node(&mut self, node: Node) -> NodeID {
-        let id = NodeID::new(self.editor.function.nodes.len() + self.added_nodeids.len());
+        let id = NodeID::new(self.num_node_ids());
         // Added nodes need to have an entry in the def-use map.
         self.updated_def_use.insert(id, HashSet::new());
         // Added nodes use other nodes, and we need to update their def-use
diff --git a/hercules_opt/src/fork_concat_split.rs b/hercules_opt/src/fork_concat_split.rs
new file mode 100644
index 0000000000000000000000000000000000000000..df3652dfe4be2454161fe75d80235f157ce27786
--- /dev/null
+++ b/hercules_opt/src/fork_concat_split.rs
@@ -0,0 +1,137 @@
+extern crate hercules_ir;
+
+use std::collections::{HashMap, HashSet};
+use std::iter::zip;
+
+use self::hercules_ir::ir::*;
+
+use crate::*;
+
+/*
+ * Split each fork-join with two or more factors into a nest of one-dimensional
+ * fork-joins; other fork-joins are left untouched. Useful for code generation.
+ */
+pub fn fork_split(
+    editor: &mut FunctionEditor,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>,
+) {
+    // A single multi-dimensional fork becomes multiple forks, a join becomes
+    // multiple joins, a thread ID becomes a thread ID on the correct
+    // fork, and a reduce becomes multiple reduces to shuffle the reduction
+    // value through the fork-join nest.
+    for (fork, join) in fork_join_map {
+        let nodes = &editor.func().nodes;
+        let (fork_control, factors) = nodes[fork.idx()].try_fork().unwrap();
+        if factors.len() < 2 {
+            continue;
+        }
+        let factors: Box<[DynamicConstantID]> = factors.into();
+        let join_control = nodes[join.idx()].try_join().unwrap();
+        let tids: Vec<_> = editor
+            .get_users(*fork)
+            .filter(|id| nodes[id.idx()].is_thread_id())
+            .collect();
+        let reduces: Vec<_> = editor
+            .get_users(*join)
+            .filter(|id| nodes[id.idx()].is_reduce())
+            .collect();
+
+        let data_in_reduce_cycle: HashSet<(NodeID, NodeID)> = reduces
+            .iter()
+            .map(|reduce| editor.get_users(*reduce).map(move |user| (user, *reduce)))
+            .flatten()
+            .filter(|(user, reduce)| reduce_cycles[&reduce].contains(&user))
+            .collect();
+
+        editor.edit(|mut edit| {
+            // Create a chain of 1-D forks (first factor outermost), each with a thread ID.
+            let mut acc_fork = fork_control;
+            let mut new_tids = vec![];
+            for factor in factors {
+                acc_fork = edit.add_node(Node::Fork {
+                    control: acc_fork,
+                    factors: Box::new([factor]),
+                });
+                new_tids.push(edit.add_node(Node::ThreadID {
+                    control: acc_fork,
+                    dimension: 0,
+                }));
+            }
+
+            // Create one join per new fork, chained from innermost to outermost.
+            let mut acc_join = if join_control == *fork {
+                acc_fork
+            } else {
+                join_control
+            };
+            let mut joins = vec![];
+            for _ in new_tids.iter() {
+                acc_join = edit.add_node(Node::Join { control: acc_join });
+                joins.push(acc_join);
+            }
+
+            // Create the reduces; IDs of not-yet-added reduces are predicted from num_nodes.
+            let mut new_reduces = vec![];
+            for reduce in reduces.iter() {
+                let (_, init, reduct) = edit.get_node(*reduce).try_reduce().unwrap();
+                let num_nodes = edit.num_node_ids();
+                let mut inner_reduce = NodeID::new(0);
+                let mut outer_reduce = NodeID::new(0);
+                for (join_idx, join) in joins.iter().enumerate() {
+                    let init = if join_idx == joins.len() - 1 {
+                        init
+                    } else {
+                        NodeID::new(num_nodes + join_idx + 1)
+                    };
+                    let reduct = if join_idx == 0 {
+                        reduct
+                    } else {
+                        NodeID::new(num_nodes + join_idx - 1)
+                    };
+                    let reduce = edit.add_node(Node::Reduce {
+                        control: *join,
+                        init,
+                        reduct,
+                    });
+                    assert_eq!(reduce, NodeID::new(num_nodes + join_idx));
+                    if join_idx == 0 {
+                        inner_reduce = reduce;
+                    }
+                    if join_idx == joins.len() - 1 {
+                        outer_reduce = reduce;
+                    }
+                }
+                new_reduces.push((inner_reduce, outer_reduce));
+            }
+
+            // Replace everything. Users in a reduce's cycle get the innermost new reduce.
+            edit = edit.replace_all_uses(*fork, acc_fork)?;
+            edit = edit.replace_all_uses(*join, acc_join)?;
+            for tid in tids.iter() {
+                let dim = edit.get_node(*tid).try_thread_id().unwrap().1;
+                edit = edit.replace_all_uses(*tid, new_tids[dim])?;
+            }
+            for (reduce, (inner_reduce, outer_reduce)) in zip(reduces.iter(), new_reduces) {
+                edit = edit.replace_all_uses_where(*reduce, inner_reduce, |id| {
+                    data_in_reduce_cycle.contains(&(*id, *reduce))
+                })?;
+                edit = edit.replace_all_uses_where(*reduce, outer_reduce, |id| {
+                    !data_in_reduce_cycle.contains(&(*id, *reduce))
+                })?;
+            }
+
+            // Delete the old fork, join, thread IDs, and reduces.
+            edit = edit.delete_node(*fork)?;
+            edit = edit.delete_node(*join)?;
+            for tid in tids {
+                edit = edit.delete_node(tid)?;
+            }
+            for reduce in reduces {
+                edit = edit.delete_node(reduce)?;
+            }
+
+            Ok(edit)
+        });
+    }
+}
diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs
index 425fe315fdc6dcb2de22c60db16fb42ec9a3f273..6b9e006d489863dea79381f69528ec5cfe4741d8 100644
--- a/hercules_opt/src/inline.rs
+++ b/hercules_opt/src/inline.rs
@@ -20,32 +20,8 @@ pub fn inline(
     mut plans: Option<&mut Vec<Plan>>,
 ) {
     // Step 1: run topological sort on the call graph to inline the "deepest"
-    // function first. Mutual recursion is not currently supported, so assert
-    // that a topological sort exists.
-    let mut num_calls: Vec<usize> = (0..editors.len())
-        .map(|idx| callgraph.num_callees(FunctionID::new(idx)))
-        .collect();
-    let mut no_calls_stack: Vec<FunctionID> = num_calls
-        .iter()
-        .enumerate()
-        .filter(|(_, num)| **num == 0)
-        .map(|(idx, _)| FunctionID::new(idx))
-        .collect();
-    let mut topo = vec![];
-    while let Some(no_call_func) = no_calls_stack.pop() {
-        topo.push(no_call_func);
-        for caller in callgraph.get_callers(no_call_func) {
-            num_calls[caller.idx()] -= 1;
-            if num_calls[caller.idx()] == 0 {
-                no_calls_stack.push(*caller);
-            }
-        }
-    }
-    assert_eq!(
-        topo.len(),
-        editors.len(),
-        "PANIC: Found mutual recursion in Hercules IR."
-    );
+    // function first.
+    let topo = callgraph.topo();
 
     // Step 2: make sure each function has a single return node. If an edit
     // failed to make a function have a single return node, then we can't inline
diff --git a/hercules_opt/src/lib.rs b/hercules_opt/src/lib.rs
index a01f901b1b2b1aceaf05eacfe7fe7207a553a182..5a429e14c15ef842e5bcf475d2a78a139fb1850b 100644
--- a/hercules_opt/src/lib.rs
+++ b/hercules_opt/src/lib.rs
@@ -4,6 +4,7 @@ pub mod ccp;
 pub mod dce;
 pub mod delete_uncalled;
 pub mod editor;
+pub mod fork_concat_split;
 pub mod fork_guard_elim;
 pub mod forkify;
 pub mod gvn;
@@ -14,12 +15,14 @@ pub mod pass;
 pub mod phi_elim;
 pub mod pred;
 pub mod sroa;
+pub mod unforkify;
 pub mod utils;
 
 pub use crate::ccp::*;
 pub use crate::dce::*;
 pub use crate::delete_uncalled::*;
 pub use crate::editor::*;
+pub use crate::fork_concat_split::*;
 pub use crate::fork_guard_elim::*;
 pub use crate::forkify::*;
 pub use crate::gvn::*;
@@ -30,4 +33,5 @@ pub use crate::pass::*;
 pub use crate::phi_elim::*;
 pub use crate::pred::*;
 pub use crate::sroa::*;
+pub use crate::unforkify::*;
 pub use crate::utils::*;
diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs
index ee240846d9ea8df356e2ffe27dcf827636df9366..eb8d386c0706039a254c69810e249ef363d3060f 100644
--- a/hercules_opt/src/outline.rs
+++ b/hercules_opt/src/outline.rs
@@ -558,8 +558,9 @@ pub fn outline(
 }
 
 /*
- * Just outlines all of a function except the entry and return. Minimum work
- * needed to cause runtime Rust code to be generated as necessary.
+ * Just outlines all of a function except the entry, return, and aggregate
+ * constants. This is the minimum work needed to cause runtime Rust code to be
+ * generated as necessary.
  */
 pub fn dumb_outline(
     editor: &mut FunctionEditor,
@@ -575,7 +576,11 @@ pub fn dumb_outline(
         .node_ids()
         .filter(|id| {
             let node = &editor.func().nodes[id.idx()];
-            !(node.is_start() || node.is_parameter() || node.is_return())
+            if let Node::Constant { id } = editor.func().nodes[id.idx()] {
+                editor.get_constant(id).is_scalar()
+            } else {
+                !(node.is_start() || node.is_parameter() || node.is_return())
+            }
         })
         .collect();
     outline(
diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs
index ccba355aa936cfb5629187e039d480178b6ba005..006bd371a52c500dd750fa48e28f02f0c42a719c 100644
--- a/hercules_opt/src/pass.rs
+++ b/hercules_opt/src/pass.rs
@@ -5,7 +5,7 @@ extern crate serde;
 extern crate take_mut;
 
 use std::cell::RefCell;
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::env::temp_dir;
 use std::fs::File;
 use std::io::Write;
@@ -27,24 +27,25 @@ pub enum Pass {
     DCE,
     CCP,
     GVN,
-    Forkify,
     PhiElim,
+    Forkify,
     ForkGuardElim,
     Predication,
     SROA,
     Inline,
     Outline,
+    InterproceduralSROA,
+    DeleteUncalled,
+    ForkSplit,
+    Unforkify,
     Verify,
     // Parameterized over whether analyses that aid visualization are necessary.
     // Useful to set to false if displaying a potentially broken module.
     Xdot(bool),
-    SchedXdot,
     // Parameterized over output directory and module name.
     Codegen(String, String),
     // Parameterized over where to serialize module to.
     Serialize(String),
-    InterproceduralSROA,
-    DeleteUncalled,
 }
 
 /*
@@ -68,15 +69,14 @@ pub struct PassManager {
     pub fork_join_maps: Option<Vec<HashMap<NodeID, NodeID>>>,
     pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>,
     pub loops: Option<Vec<LoopTree>>,
+    pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
     pub antideps: Option<Vec<Vec<(NodeID, NodeID)>>>,
+    pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
     pub bbs: Option<Vec<Vec<NodeID>>>,
     pub callgraph: Option<CallGraph>,
 
     // Current plan.
     pub plans: Option<Vec<Plan>>,
-
-    // Store the manifest of a compiled object.
-    pub manifests: Option<HashMap<String, Manifest>>,
 }
 
 impl PassManager {
@@ -93,11 +93,12 @@ impl PassManager {
             fork_join_maps: None,
             fork_join_nests: None,
             loops: None,
+            reduce_cycles: None,
             antideps: None,
+            data_nodes_in_fork_joins: None,
             bbs: None,
             callgraph: None,
             plans: None,
-            manifests: None,
         }
     }
 
@@ -225,6 +226,18 @@ impl PassManager {
         }
     }
 
+    pub fn make_reduce_cycles(&mut self) {
+        // Cached analysis: only compute when no result is stored.
+        if self.reduce_cycles.is_some() {
+            return;
+        }
+        self.make_def_uses();
+        let functions = self.module.functions.iter();
+        let def_uses = self.def_uses.as_ref().unwrap().iter();
+        let cycles = zip(functions, def_uses).map(|(f, du)| reduce_cycles(f, du));
+        self.reduce_cycles = Some(cycles.collect());
+    }
+
     pub fn make_antideps(&mut self) {
         if self.antideps.is_none() {
             self.make_def_uses();
@@ -239,6 +252,26 @@ impl PassManager {
         }
     }
 
+    pub fn make_data_nodes_in_fork_joins(&mut self) {
+        // Cached analysis: only compute when no result is stored.
+        if self.data_nodes_in_fork_joins.is_some() {
+            return;
+        }
+
+        // Depends on the def-use and fork-join map analyses.
+        self.make_def_uses();
+        self.make_fork_join_maps();
+        let functions = self.module.functions.iter();
+        let def_uses = self.def_uses.as_ref().unwrap().iter();
+        let fork_join_maps = self.fork_join_maps.as_ref().unwrap().iter();
+
+        // Analyze each function together with its own def-use map and
+        // fork-join map.
+        let per_function = zip(functions, zip(def_uses, fork_join_maps))
+            .map(|(f, (du, fjm))| data_nodes_in_fork_joins(f, du, fjm));
+        self.data_nodes_in_fork_joins = Some(per_function.collect());
+    }
+
     pub fn make_bbs(&mut self) {
         if self.bbs.is_none() {
             self.make_def_uses();
@@ -804,6 +837,78 @@ impl PassManager {
 
                     assert!(self.module.functions.len() > 0, "PANIC: There are no entry functions in the Hercules module being compiled, and they all got deleted by DeleteUncalled. Please mark at least one function as an entry!");
                 }
+                Pass::ForkSplit => {
+                    self.make_def_uses();
+                    self.make_fork_join_maps();
+                    self.make_reduce_cycles();
+                    let def_uses = self.def_uses.as_ref().unwrap();
+                    let fork_join_maps = self.fork_join_maps.as_ref().unwrap();
+                    let reduce_cycles = self.reduce_cycles.as_ref().unwrap();
+                    for idx in 0..self.module.functions.len() {
+                        let constants_ref =
+                            RefCell::new(std::mem::take(&mut self.module.constants));
+                        let dynamic_constants_ref =
+                            RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
+                        let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
+                        let mut editor = FunctionEditor::new(
+                            &mut self.module.functions[idx],
+                            &constants_ref,
+                            &dynamic_constants_ref,
+                            &types_ref,
+                            &def_uses[idx],
+                        );
+                        fork_split(&mut editor, &fork_join_maps[idx], &reduce_cycles[idx]);
+
+                        self.module.constants = constants_ref.take();
+                        self.module.dynamic_constants = dynamic_constants_ref.take();
+                        self.module.types = types_ref.take();
+
+                        let edits = &editor.edits();
+                        if let Some(plans) = self.plans.as_mut() {
+                            repair_plan(&mut plans[idx], &self.module.functions[idx], edits);
+                        }
+                        let grave_mapping = self.module.functions[idx].delete_gravestones();
+                        if let Some(plans) = self.plans.as_mut() {
+                            plans[idx].fix_gravestones(&grave_mapping);
+                        }
+                    }
+                    self.clear_analyses();
+                }
+                Pass::Unforkify => {
+                    self.make_def_uses();
+                    self.make_fork_join_maps();
+                    let def_uses = self.def_uses.as_ref().unwrap();
+                    let fork_join_maps = self.fork_join_maps.as_ref().unwrap();
+                    for idx in 0..self.module.functions.len() {
+                        let constants_ref =
+                            RefCell::new(std::mem::take(&mut self.module.constants));
+                        let dynamic_constants_ref =
+                            RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
+                        let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
+                        let mut editor = FunctionEditor::new(
+                            &mut self.module.functions[idx],
+                            &constants_ref,
+                            &dynamic_constants_ref,
+                            &types_ref,
+                            &def_uses[idx],
+                        );
+                        unforkify(&mut editor, &fork_join_maps[idx]);
+
+                        self.module.constants = constants_ref.take();
+                        self.module.dynamic_constants = dynamic_constants_ref.take();
+                        self.module.types = types_ref.take();
+
+                        let edits = &editor.edits();
+                        if let Some(plans) = self.plans.as_mut() {
+                            repair_plan(&mut plans[idx], &self.module.functions[idx], edits);
+                        }
+                        let grave_mapping = self.module.functions[idx].delete_gravestones();
+                        if let Some(plans) = self.plans.as_mut() {
+                            plans[idx].fix_gravestones(&grave_mapping);
+                        }
+                    }
+                    self.clear_analyses();
+                }
                 Pass::Verify => {
                     let (
                         def_uses,
@@ -853,65 +958,66 @@ impl PassManager {
                         self.plans.as_ref(),
                     );
                 }
-                Pass::SchedXdot => {
-                    self.make_def_uses();
-                    self.make_typing();
-                    self.make_control_subgraphs();
-                    self.make_fork_join_maps();
-                    self.make_fork_join_nests();
-                    self.make_antideps();
-                    self.make_bbs();
-                    self.make_plans();
-
-                    let smodule = sched_compile(
-                        &self.module,
-                        self.def_uses.as_ref().unwrap(),
-                        self.typing.as_ref().unwrap(),
-                        self.control_subgraphs.as_ref().unwrap(),
-                        self.fork_join_maps.as_ref().unwrap(),
-                        self.fork_join_nests.as_ref().unwrap(),
-                        self.antideps.as_ref().unwrap(),
-                        self.bbs.as_ref().unwrap(),
-                        self.plans.as_ref().unwrap(),
-                    );
-
-                    xdot_sched_module(&smodule);
-                }
                 Pass::Codegen(output_dir, module_name) => {
-                    self.make_def_uses();
+                    self.make_reverse_postorders();
                     self.make_typing();
                     self.make_control_subgraphs();
-                    self.make_fork_join_maps();
-                    self.make_fork_join_nests();
-                    self.make_antideps();
                     self.make_bbs();
-                    self.make_plans();
+                    self.make_callgraph();
+                    let reverse_postorders = self.reverse_postorders.as_ref().unwrap();
+                    let typing = self.typing.as_ref().unwrap();
+                    let control_subgraphs = self.control_subgraphs.as_ref().unwrap();
+                    let bbs = self.bbs.as_ref().unwrap();
+                    let callgraph = self.callgraph.as_ref().unwrap();
 
-                    let smodule = sched_compile(
-                        &self.module,
-                        self.def_uses.as_ref().unwrap(),
-                        self.typing.as_ref().unwrap(),
-                        self.control_subgraphs.as_ref().unwrap(),
-                        self.fork_join_maps.as_ref().unwrap(),
-                        self.fork_join_nests.as_ref().unwrap(),
-                        self.antideps.as_ref().unwrap(),
-                        self.bbs.as_ref().unwrap(),
-                        self.plans.as_ref().unwrap(),
-                    );
+                    let memory_objects: Vec<_> = (0..self.module.functions.len())
+                        .map(|idx| {
+                            memory_objects(
+                                &self.module.functions[idx],
+                                &self.module.types,
+                                &reverse_postorders[idx],
+                                &typing[idx],
+                            )
+                        })
+                        .collect();
+                    let memory_objects_mutable =
+                        memory_objects_mutability(&self.module, &callgraph, &memory_objects);
 
+                    let mut rust_rt = String::new();
                     let mut llvm_ir = String::new();
-                    for manifest in smodule.manifests.values() {
-                        for partition_manifest in manifest.partitions.iter() {
-                            let function = &smodule.functions[&partition_manifest.name];
-                            match partition_manifest.device {
-                                DeviceManifest::CPU { parallel_launch: _ } => {
-                                    cpu_compile(function, partition_manifest, &mut llvm_ir).unwrap()
-                                }
-                                _ => todo!(),
-                            }
+                    for idx in 0..self.module.functions.len() {
+                        if self.module.functions[idx].entry {
+                            rt_codegen(
+                                FunctionID::new(idx),
+                                &self.module,
+                                &reverse_postorders[idx],
+                                &typing[idx],
+                                &control_subgraphs[idx],
+                                &bbs[idx],
+                                &callgraph,
+                                &memory_objects,
+                                &memory_objects_mutable,
+                                &mut rust_rt,
+                            )
+                            .unwrap();
+                        } else {
+                            // TODO: determine which backend to use for function.
+                            cpu_codegen(
+                                &self.module.functions[idx],
+                                &self.module.types,
+                                &self.module.constants,
+                                &self.module.dynamic_constants,
+                                &reverse_postorders[idx],
+                                &typing[idx],
+                                &control_subgraphs[idx],
+                                &bbs[idx],
+                                &mut llvm_ir,
+                            )
+                            .unwrap();
                         }
                     }
                     println!("{}", llvm_ir);
+                    println!("{}", rust_rt);
 
                     // Write the LLVM IR into a temporary file.
                     let mut tmp_path = temp_dir();
@@ -938,14 +1044,13 @@ impl PassManager {
                     assert!(clang_process.wait().unwrap().success());
                     println!("{}", output_archive);
 
-                    // Package manifest into a file.
-                    let hman_contents: Vec<u8> = postcard::to_allocvec(&smodule.manifests).unwrap();
-                    let mut file = File::create(format!("{}/{}.hman", output_dir, module_name))
-                        .expect("PANIC: Unable to open output manifest file.");
-                    file.write_all(&hman_contents)
-                        .expect("PANIC: Unable to write output manifest file contents.");
-                    self.manifests = Some(smodule.manifests);
-                    println!("{:?}", self.manifests);
+                    // Write the Rust runtime into a file.
+                    let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name);
+                    let mut file = File::create(&output_rt)
+                        .expect("PANIC: Unable to open output Rust runtime file.");
+                    file.write_all(rust_rt.as_bytes())
+                        .expect("PANIC: Unable to write output Rust runtime file contents.");
+                    println!("{}", output_rt);
                 }
                 Pass::Serialize(output_file) => {
                     let module_contents: Vec<u8> = postcard::to_allocvec(&self.module).unwrap();
@@ -995,10 +1100,6 @@ impl PassManager {
         self.module
     }
 
-    pub fn get_manifests(self) -> HashMap<String, Manifest> {
-        self.manifests.unwrap()
-    }
-
     fn fix_deleted_functions(&mut self, id_mapping: &[Option<usize>]) {
         let mut idx = 0;
 
diff --git a/hercules_opt/src/unforkify.rs b/hercules_opt/src/unforkify.rs
new file mode 100644
index 0000000000000000000000000000000000000000..f31b740984c5b7d0ab6baaffd3edc807fe8cadbc
--- /dev/null
+++ b/hercules_opt/src/unforkify.rs
@@ -0,0 +1,147 @@
+extern crate hercules_ir;
+
+use std::collections::HashMap;
+use std::iter::zip;
+
+use self::hercules_ir::ir::*;
+
+use crate::*;
+
+/*
+ * Rewrite each one-dimensional fork-join in fork_join_map (fork -> join) as a
+ * sequential loop, for use right before codegen when a backend is not lowering
+ * fork-joins to vector / parallel code. Emitting such loops directly in LLVM
+ * is not entirely trivial, so the transformation is done in Hercules IR.
+ */
+pub fn unforkify(editor: &mut FunctionEditor, fork_join_map: &HashMap<NodeID, NodeID>) {
+    let mut zero_cons_id = ConstantID::new(0); // placeholder until the edit below
+    let mut one_cons_id = ConstantID::new(0); // placeholder until the edit below
+    assert!(editor.edit(|mut edit| {
+        zero_cons_id = edit.add_constant(Constant::UnsignedInteger64(0)); // loop start
+        one_cons_id = edit.add_constant(Constant::UnsignedInteger64(1)); // loop step
+        Ok(edit)
+    }));
+
+    // Convert the fork to a region, its thread IDs to a single induction phi,
+    // reduces to phis, and the join to a branch at the top of the loop. Former
+    // users of the fork (the control inside the fork-join) are redirected to
+    // the selection-1 ("true") projection of that branch, and former users of
+    // the join are redirected to the selection-0 (exit) projection.
+    for (fork, join) in fork_join_map {
+        let nodes = &editor.func().nodes;
+        let (fork_control, factors) = nodes[fork.idx()].try_fork().unwrap();
+        if factors.len() > 1 {
+            // For now, don't convert multi-dimensional fork-joins. Rely on a
+            // pass that splits fork-joins to reduce them to one dimension.
+            continue;
+        }
+        let join_control = nodes[join.idx()].try_join().unwrap();
+        let tids: Vec<_> = editor // thread ID nodes among the fork's users
+            .get_users(*fork)
+            .filter(|id| nodes[id.idx()].is_thread_id())
+            .collect();
+        let reduces: Vec<_> = editor // reduce nodes among the join's users
+            .get_users(*join)
+            .filter(|id| nodes[id.idx()].is_reduce())
+            .collect();
+
+        let num_nodes = editor.node_ids().len(); // ID the next add_node should return
+        let region_id = NodeID::new(num_nodes); // loop header
+        let if_id = NodeID::new(num_nodes + 1); // loop condition branch
+        let proj_back_id = NodeID::new(num_nodes + 2); // continue-looping projection
+        let proj_exit_id = NodeID::new(num_nodes + 3); // loop-exit projection
+        let zero_id = NodeID::new(num_nodes + 4);
+        let one_id = NodeID::new(num_nodes + 5);
+        let indvar_id = NodeID::new(num_nodes + 6); // induction variable phi
+        let add_id = NodeID::new(num_nodes + 7); // indvar + 1
+        let dc_id = NodeID::new(num_nodes + 8); // loop bound
+        let neq_id = NodeID::new(num_nodes + 9); // indvar != bound
+        let phi_ids = (num_nodes + 10..num_nodes + 10 + reduces.len()).map(NodeID::new);
+
+        let region = Node::Region {
+            preds: Box::new([
+                fork_control, // entry edge: whatever controlled the fork
+                if join_control == *fork { // nothing between fork and join
+                    proj_back_id
+                } else {
+                    join_control
+                },
+            ]),
+        };
+        let if_node = Node::If {
+            control: region_id,
+            cond: neq_id,
+        };
+        let proj_back = Node::Projection {
+            control: if_id,
+            selection: 1, // the "true" side: indvar != bound
+        };
+        let proj_exit = Node::Projection {
+            control: if_id,
+            selection: 0, // the other side: indvar == bound
+        };
+        let zero = Node::Constant { id: zero_cons_id };
+        let one = Node::Constant { id: one_cons_id };
+        let indvar = Node::Phi {
+            control: region_id,
+            data: Box::new([zero_id, add_id]), // 0 first, then indvar + 1
+        };
+        let add = Node::Binary {
+            op: BinaryOperator::Add,
+            left: indvar_id,
+            right: one_id,
+        };
+        let dc = Node::DynamicConstant { id: factors[0] }; // the fork's only factor
+        let neq = Node::Binary {
+            op: BinaryOperator::NE,
+            left: indvar_id,
+            right: dc_id,
+        };
+        let phis: Vec<_> = reduces
+            .iter()
+            .map(|reduce_id| {
+                let (_, init, reduct) = nodes[reduce_id.idx()].try_reduce().unwrap();
+                Node::Phi {
+                    control: region_id,
+                    data: Box::new([init, reduct]), // same order as the region's preds
+                }
+            })
+            .collect();
+
+        editor.edit(|mut edit| { // NOTE(review): result unchecked, unlike the edit above
+            assert_eq!(edit.add_node(region), region_id); // IDs must match those precomputed
+            assert_eq!(edit.add_node(if_node), if_id);
+            assert_eq!(edit.add_node(proj_back), proj_back_id);
+            assert_eq!(edit.add_node(proj_exit), proj_exit_id);
+            assert_eq!(edit.add_node(zero), zero_id);
+            assert_eq!(edit.add_node(one), one_id);
+            assert_eq!(edit.add_node(indvar), indvar_id);
+            assert_eq!(edit.add_node(add), add_id);
+            assert_eq!(edit.add_node(dc), dc_id);
+            assert_eq!(edit.add_node(neq), neq_id);
+            for (phi_id, phi) in zip(phi_ids.clone(), phis) {
+                assert_eq!(edit.add_node(phi), phi_id);
+            }
+
+            edit = edit.replace_all_uses(*fork, proj_back_id)?; // fork users -> loop-back side
+            edit = edit.replace_all_uses(*join, proj_exit_id)?; // join users -> exit side
+            for tid in tids.iter() {
+                edit = edit.replace_all_uses(*tid, indvar_id)?; // thread ID -> induction variable
+            }
+            for (reduce, phi_id) in zip(reduces.iter(), phi_ids) {
+                edit = edit.replace_all_uses(*reduce, phi_id)?;
+            }
+
+            edit = edit.delete_node(*fork)?; // remove the nodes that were just replaced
+            edit = edit.delete_node(*join)?;
+            for tid in tids {
+                edit = edit.delete_node(tid)?;
+            }
+            for reduce in reduces {
+                edit = edit.delete_node(reduce)?;
+            }
+
+            Ok(edit)
+        });
+    }
+}
diff --git a/hercules_rt/Cargo.toml b/hercules_rt/Cargo.toml
deleted file mode 100644
index 76c940b71979ee5bb7ab1d01abb77e3f7f3a0460..0000000000000000000000000000000000000000
--- a/hercules_rt/Cargo.toml
+++ /dev/null
@@ -1,11 +0,0 @@
-[package]
-name = "hercules_rt"
-version = "0.1.0"
-authors = ["Russel Arbore <rarbore2@illinois.edu>"]
-edition = "2021"
-
-[dependencies]
-libc = "*"
-postcard = { version = "*", features = ["alloc"] }
-serde = { version = "*", features = ["derive"] }
-hercules_rt_proc = { path = "../hercules_rt_proc" }
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
deleted file mode 100644
index 95b93aa488d708b24733ed78489d0cdb08d33a19..0000000000000000000000000000000000000000
--- a/hercules_rt/src/lib.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-extern crate hercules_rt_proc;
-
-pub use hercules_rt_proc::use_hman;
-pub use hercules_rt_proc::use_hir;
diff --git a/hercules_rt_proc/Cargo.toml b/hercules_rt_proc/Cargo.toml
deleted file mode 100644
index 6d026135252836154102006f0a444f2a1f824313..0000000000000000000000000000000000000000
--- a/hercules_rt_proc/Cargo.toml
+++ /dev/null
@@ -1,17 +0,0 @@
-[package]
-name = "hercules_rt_proc"
-version = "0.1.0"
-authors = ["Russel Arbore <rarbore2@illinois.edu>"]
-edition = "2021"
-
-[lib]
-proc-macro = true
-
-[dependencies]
-postcard = { version = "*", features = ["alloc"] }
-serde = { version = "*", features = ["derive"] }
-hercules_cg = { path = "../hercules_cg" }
-hercules_ir = { path = "../hercules_ir" }
-hercules_opt = { path = "../hercules_opt" }
-anyhow = "*"
-uuid = { version = "*", features = ["v4", "fast-rng", "macro-diagnostics"] }
diff --git a/hercules_rt_proc/src/lib.rs b/hercules_rt_proc/src/lib.rs
deleted file mode 100644
index 0465a2584d3c266e77415afcdcc90be1276481f9..0000000000000000000000000000000000000000
--- a/hercules_rt_proc/src/lib.rs
+++ /dev/null
@@ -1,521 +0,0 @@
-#![feature(iter_intersperse)]
-
-extern crate anyhow;
-extern crate hercules_cg;
-extern crate hercules_ir;
-extern crate hercules_opt;
-extern crate postcard;
-extern crate proc_macro;
-
-use std::collections::{BTreeSet, HashMap};
-use std::ffi::OsStr;
-use std::fmt::Write;
-use std::fs::File;
-use std::io::prelude::*;
-use std::path::Path;
-
-use proc_macro::*;
-
-use self::hercules_cg::*;
-use self::hercules_ir::{DynamicConstant, DynamicConstantID, ID};
-
-/*
- * Convert schedule IR types to the Rust types generated in the interface.
- */
-fn generate_type_string(ty: &SType) -> String {
-    match ty {
-        SType::Boolean => "bool".to_string(),
-        SType::Integer8 => "i8".to_string(),
-        SType::Integer16 => "i16".to_string(),
-        SType::Integer32 => "i32".to_string(),
-        SType::Integer64 => "i64".to_string(),
-        SType::UnsignedInteger8 => "u8".to_string(),
-        SType::UnsignedInteger16 => "u16".to_string(),
-        SType::UnsignedInteger32 => "u32".to_string(),
-        SType::UnsignedInteger64 => "u64".to_string(),
-        SType::Float32 => "f32".to_string(),
-        SType::Float64 => "f64".to_string(),
-        SType::Product(fields) => {
-            fields.iter().fold("__Prod".to_string(), |acc, field| {
-                format!("{}_{}", acc, generate_type_name(field))
-            }) + "_"
-        }
-        SType::ArrayRef(elem) => format!("*mut {}", generate_type_string(elem)),
-    }
-}
-
-fn generate_type_name(ty: &SType) -> String {
-    match ty {
-        SType::Boolean
-        | SType::Integer8
-        | SType::Integer16
-        | SType::Integer32
-        | SType::Integer64
-        | SType::UnsignedInteger8
-        | SType::UnsignedInteger16
-        | SType::UnsignedInteger32
-        | SType::UnsignedInteger64
-        | SType::Float32
-        | SType::Float64 => generate_type_string(ty),
-        SType::Product(fields) => {
-            fields.iter().fold("__Prod".to_string(), |acc, field| {
-                format!("{}_{}", acc, generate_type_name(field))
-            }) + "_"
-        }
-        SType::ArrayRef(elem) => format!("ArrayRef_{}", generate_type_name(elem)),
-    }
-}
-
-fn compute_dynamic_constant<W: Write>(
-    dc: DynamicConstantID,
-    manifest: &Manifest,
-    rust_code: &mut W,
-) -> Result<(), anyhow::Error> {
-    match manifest.dynamic_constants[dc.idx()] {
-        DynamicConstant::Constant(cons) => write!(rust_code, "{}", cons)?,
-        DynamicConstant::Parameter(idx) => write!(rust_code, "dc_{}", idx)?,
-        DynamicConstant::Add(left, right) => {
-            write!(rust_code, "(")?;
-            compute_dynamic_constant(left, manifest, rust_code)?;
-            write!(rust_code, " + ")?;
-            compute_dynamic_constant(right, manifest, rust_code)?;
-            write!(rust_code, ")")?;
-        }
-        DynamicConstant::Sub(left, right) => {
-            write!(rust_code, "(")?;
-            compute_dynamic_constant(left, manifest, rust_code)?;
-            write!(rust_code, " - ")?;
-            compute_dynamic_constant(right, manifest, rust_code)?;
-            write!(rust_code, ")")?;
-        }
-        DynamicConstant::Mul(left, right) => {
-            write!(rust_code, "(")?;
-            compute_dynamic_constant(left, manifest, rust_code)?;
-            write!(rust_code, " * ")?;
-            compute_dynamic_constant(right, manifest, rust_code)?;
-            write!(rust_code, ")")?;
-        }
-        DynamicConstant::Div(left, right) => {
-            write!(rust_code, "(")?;
-            compute_dynamic_constant(left, manifest, rust_code)?;
-            write!(rust_code, " / ")?;
-            compute_dynamic_constant(right, manifest, rust_code)?;
-            write!(rust_code, ")")?;
-        }
-        DynamicConstant::Rem(left, right) => {
-            write!(rust_code, "(")?;
-            compute_dynamic_constant(left, manifest, rust_code)?;
-            write!(rust_code, " % ")?;
-            compute_dynamic_constant(right, manifest, rust_code)?;
-            write!(rust_code, ")")?;
-        }
-    }
-    Ok(())
-}
-
-/*
- * Generate async Rust code orchestrating partition execution.
- */
-fn codegen(
-    manifests: &HashMap<String, Manifest>,
-    link_library: &Option<String>,
-) -> Result<String, anyhow::Error> {
-    // Write to a String containing all of the Rust code.
-    let mut rust_code = "".to_string();
-
-    // Rust doesn't allow you to send pointers between threads. In order to send
-    // pointers between threads, we need to wrap them in a struct that unsafely
-    // implements Send and Sync. This passes the responsibility of
-    // synchronization onto us, which we do by being careful with how we lower
-    // parallel code. Make this type generic so that we actually wrap all
-    // arguments in it for ease of macro codegen.
-    write!(
-        rust_code,
-        "#[derive(Clone, Copy, Debug)]\nstruct SendSyncWrapper<T: Copy>(T);\nunsafe impl<T: Copy> Send for SendSyncWrapper<T> {{}}\nunsafe impl<T: Copy> Sync for SendSyncWrapper<T> {{}}\n"
-    )?;
-
-    // Emit the product types used in this module. We can't just emit product
-    // types, since we need #[repr(C)] to interact with LLVM.
-    let visible_stypes = manifests
-        .into_iter()
-        .map(|(_, manifest)| manifest.all_visible_types())
-        .flatten()
-        .collect::<BTreeSet<SType>>();
-    let all_stypes = Manifest::transitive_closure_type_set(visible_stypes);
-    for stype in all_stypes.iter() {
-        if let Some(fields) = stype.try_product() {
-            write!(
-                rust_code,
-                "#[derive(Clone, Copy, Debug)]\n#[repr(C)]\nstruct {}({});\n",
-                generate_type_string(stype),
-                fields
-                    .iter()
-                    .map(|field| generate_type_string(field))
-                    .intersperse(", ".to_string())
-                    .fold("".to_string(), |acc, token| acc + &token)
-            )?;
-        }
-    }
-
-    // Emit the async Rust functions implementing each Hercules function.
-    for (function_name, manifest) in manifests.into_iter() {
-        // Emit the function signature.
-        write!(rust_code, "async unsafe fn {}(", function_name)?;
-        for (param_ty, param_kind) in manifest.param_types.iter() {
-            match param_kind {
-                ParameterKind::HerculesParameter(idx) => write!(rust_code, "param_{}", idx)?,
-                ParameterKind::DataInput(_) => panic!(
-                    "PANIC: Parameter kind for Hercules function parameter cannot be DataInput."
-                ),
-                ParameterKind::DynamicConstant(idx) => write!(rust_code, "dc_{}", idx)?,
-                ParameterKind::ArrayConstant(array_id) => {
-                    write!(rust_code, "array_{}", array_id.idx())?
-                }
-            }
-            write!(rust_code, ": {}, ", generate_type_string(param_ty))?
-        }
-        write!(
-            rust_code,
-            ") -> {} {{\n",
-            generate_type_string(&manifest.return_type)
-        )?;
-
-        // Compute the signature for each partition function and emit the extern
-        // function signatures.
-        if let Some(link_library_name) = link_library {
-            write!(rust_code, "    #[link(name = \"{}\")]\n", link_library_name)?;
-        }
-        write!(rust_code, "    extern \"C\" {{\n")?;
-        for partition in manifest.partitions.iter() {
-            write!(rust_code, "        fn {}(", partition.name)?;
-
-            // Add parameters for SFunction signature.
-            for (param_stype, kind) in partition.parameters.iter() {
-                match kind {
-                    ParameterKind::HerculesParameter(idx) => write!(rust_code, "param_{}: ", idx)?,
-                    ParameterKind::DataInput(id) => write!(rust_code, "data_{}: ", id.idx())?,
-                    ParameterKind::DynamicConstant(idx) => write!(rust_code, "dc_{}: ", idx)?,
-                    ParameterKind::ArrayConstant(id) => write!(rust_code, "array_{}: ", id.idx())?,
-                }
-                write!(rust_code, "{}, ", generate_type_string(param_stype))?;
-            }
-
-            // Add parameters for device specific lowering details.
-            if let DeviceManifest::CPU { parallel_launch } = &partition.device {
-                for parallel_launch_dim in 0..parallel_launch.len() {
-                    write!(
-                        rust_code,
-                        "parallel_launch_low_{}: u64, parallel_launch_len_{}: u64, ",
-                        parallel_launch_dim, parallel_launch_dim
-                    )?;
-                }
-            }
-
-            // Add the return product of the SFunction signature.
-            let return_stype = if partition.returns.len() == 1 {
-                partition.returns[0].0.clone()
-            } else {
-                SType::Product(
-                    partition
-                        .returns
-                        .iter()
-                        .map(|(return_stype, _)| return_stype.clone())
-                        .collect(),
-                )
-            };
-            write!(rust_code, ") -> {};\n", generate_type_string(&return_stype),)?;
-        }
-        write!(rust_code, "    }}\n")?;
-
-        // Declare all of the intermediary data input / output variables. They
-        // are declared as MaybeUninit, since they get assigned after running a
-        // partition. MaybeUninits should always be defined before assume_init()
-        // is called on them, assuming a valid partitioning.
-        let mut data_inputs = BTreeSet::new();
-        let mut data_outputs = BTreeSet::new();
-        for partition in manifest.partitions.iter() {
-            data_inputs.extend(partition.data_inputs());
-            data_outputs.extend(partition.data_outputs());
-        }
-        assert_eq!(data_inputs, data_outputs);
-        for (node, stype) in data_inputs {
-            write!(rust_code, "    let mut node_{}: ::core::mem::MaybeUninit<{}> = ::core::mem::MaybeUninit::uninit();\n", node.idx(), generate_type_string(stype))?;
-        }
-
-        // The core executor is a Rust loop. We literally run a "control token"
-        // as described in the original sea of nodes paper through the
-        // partitions to drive execution.
-        write!(
-            rust_code,
-            "    let mut control_token: i8 = 0;\n    loop {{\n",
-        )?;
-
-        // Match on the control token position to determine which partition to
-        // execute.
-        write!(rust_code, "        match control_token {{\n")?;
-
-        // Emit the match arm per partition.
-        for (idx, partition) in manifest.partitions.iter().enumerate() {
-            // Open the arm.
-            write!(rust_code, "            {} => {{\n", idx)?;
-
-            match partition.device {
-                DeviceManifest::CPU {
-                    ref parallel_launch,
-                } => {
-                    for (idx, (_, kind)) in partition.parameters.iter().enumerate() {
-                        write!(
-                            rust_code,
-                            "                let local_param_{} = SendSyncWrapper(",
-                            idx
-                        )?;
-                        match kind {
-                            ParameterKind::HerculesParameter(idx) => {
-                                write!(rust_code, "param_{}", idx)?
-                            }
-                            ParameterKind::DataInput(id) => {
-                                write!(rust_code, "node_{}.assume_init()", id.idx())?
-                            }
-                            ParameterKind::DynamicConstant(idx) => write!(rust_code, "dc_{}", idx)?,
-                            ParameterKind::ArrayConstant(id) => {
-                                write!(rust_code, "array_{}", id.idx())?
-                            }
-                        }
-                        write!(rust_code, ");\n")?;
-                    }
-
-                    if parallel_launch.is_empty() {
-                        // Call the partition function.
-                        write!(
-                            rust_code,
-                            "                let output = {}(",
-                            partition.name
-                        )?;
-                        for idx in 0..partition.parameters.len() {
-                            write!(rust_code, "local_param_{}.0, ", idx)?;
-                        }
-                        write!(rust_code, ");\n")?;
-                    } else {
-                        // Compute the dynamic constant bounds.
-                        for (dim, (_, dc)) in parallel_launch.into_iter().enumerate() {
-                            write!(rust_code, "                let bound_{} = ", dim)?;
-                            compute_dynamic_constant(*dc, manifest, &mut rust_code)?;
-                            write!(rust_code, ";\n                let low_{} = 0;\n", dim)?;
-                        }
-
-                        // Simultaneously calculate the tiles lows and lens and
-                        // spawn the tiles. Emit the launches unrolled.
-                        let mut tile = vec![0; parallel_launch.len()];
-                        let total_num_tiles = parallel_launch
-                            .into_iter()
-                            .fold(1, |acc, (num_tiles, _)| acc * num_tiles);
-                        for tile_num in 0..total_num_tiles {
-                            // Calculate the lows and lens for this tile.
-                            for (dim, tile) in tile.iter().enumerate() {
-                                let num_tiles = parallel_launch[dim].0;
-                                write!(
-                                    rust_code,
-                                    "                let len_{} = bound_{} / {} + ({} < bound_{} % {}) as u64;\n",
-                                    dim, dim, num_tiles, tile, dim, num_tiles
-                                )?;
-                            }
-
-                            // Spawn the tile. We need to explicitly copy the
-                            // SendSyncWrappers, or else the path expression for
-                            // the parameters get interpreted as what needs to
-                            // be moved, when we want the wrapper itself to be
-                            // what gets moved. Ugh.
-                            write!(
-                                rust_code,
-                                "                let tile_{} = async_std::task::spawn(async move {{ ",
-                                tile_num,
-                            )?;
-                            for idx in 0..partition.parameters.len() {
-                                write!(
-                                    rust_code,
-                                    "let local_param_{} = local_param_{}; ",
-                                    idx, idx
-                                )?;
-                            }
-                            write!(rust_code, "SendSyncWrapper({}(", partition.name)?;
-                            for idx in 0..partition.parameters.len() {
-                                write!(rust_code, "local_param_{}.0, ", idx)?;
-                            }
-                            for dim in 0..parallel_launch.len() {
-                                write!(rust_code, "low_{}, len_{}, ", dim, dim)?;
-                            }
-                            write!(rust_code, ")) }});\n")?;
-
-                            // Go to the next tile.
-                            for dim in (0..parallel_launch.len()).rev() {
-                                tile[dim] += 1;
-                                let num_tiles = parallel_launch[dim].0;
-                                if tile[dim] < num_tiles {
-                                    write!(
-                                        rust_code,
-                                        "                let low_{} = low_{} + len_{};\n",
-                                        dim, dim, dim
-                                    )?;
-                                    break;
-                                } else {
-                                    tile[dim] = 0;
-                                    write!(rust_code, "                let low_{} = 0;\n", dim)?;
-                                }
-                            }
-                        }
-
-                        // Join the JoinHandles, and get the output from one of
-                        // them.
-                        write!(
-                            rust_code,
-                            "                let output = ::core::future::join!(",
-                        )?;
-                        for tile_num in 0..total_num_tiles {
-                            write!(rust_code, "tile_{}, ", tile_num)?;
-                        }
-                        // join! unhelpfully returns either a tuple or a single
-                        // value, but never a singleton tuple.
-                        if total_num_tiles == 1 {
-                            write!(rust_code, ").await.0;\n")?;
-                        } else {
-                            write!(rust_code, ").await.0.0;\n")?;
-                        }
-                    }
-
-                    // Assign the outputs.
-                    for (output_idx, (_, kind)) in partition.returns.iter().enumerate() {
-                        let output_ref = if partition.returns.len() == 1 {
-                            "output".to_string()
-                        } else {
-                            format!("output.{}", output_idx)
-                        };
-                        match kind {
-                            ReturnKind::HerculesReturn => {
-                                write!(rust_code, "                return {};\n", output_ref)?
-                            }
-                            ReturnKind::DataOutput(id) => write!(
-                                rust_code,
-                                "                node_{}.write({});\n",
-                                id.idx(),
-                                output_ref
-                            )?,
-                            ReturnKind::NextPartition => write!(
-                                rust_code,
-                                "                control_token = {};\n",
-                                output_ref
-                            )?,
-                        }
-                    }
-                }
-                _ => todo!(),
-            }
-
-            // If there's only one partition successor, then an explicit
-            // NextPartition isn't returned - emit the new control token here.
-            if partition.successors.len() == 1 {
-                write!(
-                    rust_code,
-                    "                control_token = {};\n",
-                    partition.successors[0].idx()
-                )?;
-            }
-
-            // Close the arm.
-            write!(rust_code, "            }}\n")?;
-        }
-
-        // Close the match, and handle invalid control token values.
-        write!(
-            rust_code,
-            "            _ => panic!(\"PANIC: Invalid control token value.\"),\n        }}\n"
-        )?;
-
-        // Close the loop.
-        write!(rust_code, "    }}\n")?;
-
-        // Close the function.
-        write!(rust_code, "}}\n")?;
-    }
-
-    Ok(rust_code)
-}
-
-/*
- * Generate the async Rust runtime from the manifest of a Hercules module.
- */
-#[proc_macro]
-pub fn use_hman(path: TokenStream) -> TokenStream {
-    use TokenTree::Literal;
-
-    // Get the path as a Rust path object, and make sure it's a .hman file.
-    let mut tokens_iter = path.into_iter();
-    let token = tokens_iter
-        .next()
-        .expect("Please provide a path to a .hman file to the use_hman! macro.");
-    assert!(tokens_iter.next().is_none(), "Too many tokens provided to the use_hman! macro. Please provide only one path to a .hman file.");
-    let literal = if let Literal(literal) = token {
-        literal
-    } else {
-        panic!("Please provide a string literal containing the path to a .hman file to the use_hman! macro.");
-    };
-    let literal_string = literal.to_string();
-    let path = Path::new(&literal_string[1..(literal_string.len() - 1)]);
-    assert_eq!(
-        path.extension(),
-        Some(OsStr::new("hman")),
-        "Please provide only .hman files to the use_hman! macro."
-    );
-    assert_eq!(
-        path.try_exists().ok(),
-        Some(true),
-        "Please provide a valid path to a .hman file to the use_hman! macro."
-    );
-
-    // Load manifest from path.
-    let mut f = File::open(path).unwrap();
-    let mut buffer = vec![];
-    f.read_to_end(&mut buffer).unwrap();
-    let manifests = postcard::from_bytes(&buffer).unwrap();
-
-    // Generate Rust code.
-    let rust_code = codegen(&manifests, &None).unwrap();
-    eprintln!("{}", rust_code);
-    rust_code.parse().unwrap()
-}
-
-#[proc_macro]
-pub fn use_hir(hir_tokens: TokenStream) -> TokenStream {
-    use std::env;
-    use TokenTree::Literal;
-
-    let mut tokens_iter = hir_tokens.into_iter();
-    let token = tokens_iter
-        .next()
-        .expect("Please provide Hercules IR to use the use_hir! macro.");
-    assert!(
-        tokens_iter.next().is_none(),
-        "Too many tokens provided to use the use_hir! macro. Please provide only Hercules IR."
-    );
-    let literal = if let Literal(literal) = token {
-        literal
-    } else {
-        panic!("Please provide a string literal containing Hercules IR.");
-    };
-    let literal_string = literal.to_string();
-
-    let module = hercules_ir::parse::parse(&literal_string[1..(literal_string.len() - 1)])
-        .expect("PANIC: Failed to parse Hercules IR string.");
-    let out_dir = env::var("OUT_DIR").unwrap();
-    let libname = format!("hir_generated_{}", uuid::Uuid::new_v4().simple());
-
-    let mut p = hercules_opt::pass::PassManager::new(module);
-    p.add_pass(hercules_opt::pass::Pass::Codegen(out_dir, libname.clone()));
-
-    p.run_passes();
-
-    let manifests = p.get_manifests();
-    let rust_code = codegen(&manifests, &Some(libname)).unwrap();
-    eprintln!("{}", rust_code);
-
-    rust_code.parse().unwrap()
-}
diff --git a/hercules_samples/call/Cargo.toml b/hercules_samples/call/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..4a2fbb862039ad0d268b85fc6e85463dc87841d7
--- /dev/null
+++ b/hercules_samples/call/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "call"
+version = "0.1.0"
+authors = ["Russel Arbore <rarbore2@illinois.edu>"]
+edition = "2021"
+
+[build-dependencies]
+juno_build = { path = "../../juno_build" }
+
+[dependencies]
+juno_build = { path = "../../juno_build" }
+rand = "*"
+async-std = "*"
+with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/call/build.rs b/hercules_samples/call/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..dbefe008a14e57785261e2757bb0e0dbbb5fa27c
--- /dev/null
+++ b/hercules_samples/call/build.rs
@@ -0,0 +1,10 @@
+extern crate juno_build;
+use juno_build::JunoCompiler;
+
+fn main() {
+    JunoCompiler::new()
+        .ir_in_src("call.hir")
+        .unwrap()
+        .build()
+        .unwrap();
+}
diff --git a/hercules_samples/call.hir b/hercules_samples/call/src/call.hir
similarity index 72%
rename from hercules_samples/call.hir
rename to hercules_samples/call/src/call.hir
index 3c4f79111cd2ee4ee4c530dacba4ceb6b5e5e9e0..937ce1ef70eae9a569692a8d8c61b2cf26646d04 100644
--- a/hercules_samples/call.hir
+++ b/hercules_samples/call/src/call.hir
@@ -1,9 +1,9 @@
-fn myfunc(x: i32) -> i32
+fn myfunc(x: u64) -> u64
   cr = region(start)
   y = call<16>(add, cr, x, x)
   r = return(cr, y)
 
-fn add<1>(x: i32, y: i32) -> i32
+fn add<1>(x: u64, y: u64) -> u64
   w = add(x, y)
   dc = dynamic_constant(#0)
   z = add(w, dc)
diff --git a/hercules_samples/call/src/main.rs b/hercules_samples/call/src/main.rs
new file mode 100644
index 0000000000000000000000000000000000000000..3bbb634c7405dd9aff81dd9c3a0068b54df45a26
--- /dev/null
+++ b/hercules_samples/call/src/main.rs
@@ -0,0 +1,27 @@
+#![feature(box_as_ptr, let_chains)]
+
+extern crate async_std;
+extern crate juno_build;
+
+// Include the Rust code generated for this sample at build time (see build.rs);
+// `myfunc` and `add` below are expected to come from that generated code.
+juno_build::juno!("call");
+
+/// Runs both functions of the `call` sample and checks that they agree.
+///
+/// In `call.hir`, `myfunc(x)` calls `add` with dynamic constant 16 and `(x, x)`.
+/// NOTE(review): the direct `add(10, 2, 18)` call presumably passes the dynamic
+/// constant first, so both sides would be 30 -- confirm against the generated API.
+fn main() {
+    async_std::task::block_on(async {
+        let x = myfunc(7).await;
+        let y = add(10, 2, 18).await;
+        assert_eq!(x, y);
+    });
+}
+
+/// Runs the sample's `main` as a test so it is exercised by `cargo test`.
+#[test]
+fn call_test() {
+    main();
+}
diff --git a/hercules_samples/ccp/Cargo.toml b/hercules_samples/ccp/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..3547aa52df766cb00445e088176c076fb5996b80
--- /dev/null
+++ b/hercules_samples/ccp/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "ccp"
+version = "0.1.0"
+authors = ["Russel Arbore <rarbore2@illinois.edu>"]
+edition = "2021"
+
+[build-dependencies]
+juno_build = { path = "../../juno_build" }
+
+[dependencies]
+juno_build = { path = "../../juno_build" }
+rand = "*"
+async-std = "*"
+with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..650b51b8b14715579de164f7fc65330e113613a1
--- /dev/null
+++ b/hercules_samples/ccp/build.rs
@@ -0,0 +1,10 @@
+extern crate juno_build;
+use juno_build::JunoCompiler;
+
+fn main() {
+    JunoCompiler::new()
+        .ir_in_src("ccp.hir")
+        .unwrap()
+        .build()
+        .unwrap();
+}
diff --git a/hercules_samples/ccp_example.hir b/hercules_samples/ccp/src/ccp.hir
similarity index 100%
rename from hercules_samples/ccp_example.hir
rename to hercules_samples/ccp/src/ccp.hir
diff --git a/hercules_samples/ccp/src/main.rs b/hercules_samples/ccp/src/main.rs
new file mode 100644
index 0000000000000000000000000000000000000000..5fc78ab51227a12ef46636bd4479cd3321cc509b
--- /dev/null
+++ b/hercules_samples/ccp/src/main.rs
@@ -0,0 +1,22 @@
+#![feature(box_as_ptr, let_chains)]
+
+extern crate async_std;
+extern crate juno_build;
+
+// Include the Rust code generated for this sample at build time (see build.rs);
+// `tricky` below is expected to come from that generated code.
+juno_build::juno!("ccp");
+
+/// Runs the `tricky` function of the `ccp` sample and checks that it returns 1.
+fn main() {
+    async_std::task::block_on(async {
+        let x = tricky(7).await;
+        assert_eq!(x, 1);
+    });
+}
+
+/// Runs the sample's `main` as a test so it is exercised by `cargo test`.
+#[test]
+fn ccp_test() {
+    main();
+}
diff --git a/hercules_samples/dot/Cargo.toml b/hercules_samples/dot/Cargo.toml
index 69cd39e388661b3f7f6dca53cf9210ab7050902c..f74ab1f6f4ed5de3b02ab45b1f6fca461fdbc192 100644
--- a/hercules_samples/dot/Cargo.toml
+++ b/hercules_samples/dot/Cargo.toml
@@ -10,7 +10,6 @@ juno_build = { path = "../../juno_build" }
 [dependencies]
 clap = { version = "*", features = ["derive"] }
 juno_build = { path = "../../juno_build" }
-hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs
index 6d4cf3800563cd17b48eb744d2e30ffe8c641bf5..0f5ee518506e6abe6cb815c244699ce12ccb45d1 100644
--- a/hercules_samples/dot/src/main.rs
+++ b/hercules_samples/dot/src/main.rs
@@ -1,14 +1,31 @@
+#![feature(box_as_ptr, let_chains)]
+
 extern crate async_std;
-extern crate clap;
 extern crate juno_build;
 
+use core::ptr::copy_nonoverlapping;
+
 juno_build::juno!("dot");
 
 fn main() {
     async_std::task::block_on(async {
-        let mut a = vec![0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
-        let mut b = vec![0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
-        let c = unsafe { dot(a.as_mut_ptr(), b.as_mut_ptr(), 8).await };
+        let a: Box<[f32]> = Box::new([0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]);
+        let b: Box<[f32]> = Box::new([0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]);
+        let mut a_bytes: Box<[u8]> = Box::new([0; 32]);
+        let mut b_bytes: Box<[u8]> = Box::new([0; 32]);
+        unsafe {
+            copy_nonoverlapping(
+                Box::as_ptr(&a) as *const u8,
+                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
+                32,
+            );
+            copy_nonoverlapping(
+                Box::as_ptr(&b) as *const u8,
+                Box::as_mut_ptr(&mut b_bytes) as *mut u8,
+                32,
+            );
+        };
+        let c = dot(8, a_bytes, b_bytes).await;
         println!("{}", c);
         assert_eq!(c, 70.0);
     });
diff --git a/hercules_samples/fac/Cargo.toml b/hercules_samples/fac/Cargo.toml
index 9082a4fc4194ac3fa9c694a5a5973f605772a384..d4b9c5fe2ae7ac6907d3c1181dabb3cb247cff2e 100644
--- a/hercules_samples/fac/Cargo.toml
+++ b/hercules_samples/fac/Cargo.toml
@@ -10,7 +10,6 @@ juno_build = { path = "../../juno_build" }
 [dependencies]
 clap = { version = "*", features = ["derive"] }
 juno_build = { path = "../../juno_build" }
-hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/fac/src/fac.hir b/hercules_samples/fac/src/fac.hir
index 0d85c5d095cef28bc88e3b9669c26ed42fde8b8c..e43dd8cae1a605bca7c3ceac4eb7c029665e86e6 100644
--- a/hercules_samples/fac/src/fac.hir
+++ b/hercules_samples/fac/src/fac.hir
@@ -1,4 +1,4 @@
-fn fac_inner(x: i32) -> i32
+fn fac(x: i32) -> i32
   zero = constant(i32, 0)
   one = constant(i32, 1)
   loop = region(start, if_true)
@@ -11,8 +11,3 @@ fn fac_inner(x: i32) -> i32
   if_false = projection(if, 0)
   if_true = projection(if, 1)
   r = return(if_false, fac_acc)
-
-fn fac(x: i32) -> i32
-  cr = region(start)
-  call = call(fac_inner, cr, x)
-  r = return(cr, call)
diff --git a/hercules_samples/fac/src/main.rs b/hercules_samples/fac/src/main.rs
index e3a307fcfd521217f2956e77f25332a54e6befee..7071fd2c115bba1d6ff60fae688b262354d4fc71 100644
--- a/hercules_samples/fac/src/main.rs
+++ b/hercules_samples/fac/src/main.rs
@@ -6,7 +6,7 @@ juno_build::juno!("fac");
 
 fn main() {
     async_std::task::block_on(async {
-        let f = unsafe { fac(8).await };
+        let f = fac(8).await;
         println!("{}", f);
         assert_eq!(f, 40320);
     });
diff --git a/hercules_samples/matmul/Cargo.toml b/hercules_samples/matmul/Cargo.toml
index 9066c1535e2c40400bdb3b5ca20a3e38237ef597..d3975c5ca58b68cdb3fef0f6d8a3cf8e106408d6 100644
--- a/hercules_samples/matmul/Cargo.toml
+++ b/hercules_samples/matmul/Cargo.toml
@@ -10,7 +10,6 @@ juno_build = { path = "../../juno_build" }
 [dependencies]
 clap = { version = "*", features = ["derive"] }
 juno_build = { path = "../../juno_build" }
-hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs
index f3ceba93d88402443b503a232abd5af71eb9ca58..12c14249aa62502c766028cef3c0518cf0fb4633 100644
--- a/hercules_samples/matmul/src/main.rs
+++ b/hercules_samples/matmul/src/main.rs
@@ -1,19 +1,39 @@
-#![feature(future_join)]
+#![feature(box_as_ptr, let_chains)]
 
 extern crate async_std;
-extern crate clap;
 extern crate juno_build;
 
+use core::ptr::copy_nonoverlapping;
+
 juno_build::juno!("matmul");
 
 fn main() {
     async_std::task::block_on(async {
-        let mut a = vec![1.0, 2.0, 3.0, 4.0];
-        let mut b = vec![5.0, 6.0, 7.0, 8.0];
-        let mut c = vec![0.0, 0.0, 0.0, 0.0];
+        let a: Box<[f32]> = Box::new([1.0, 2.0, 3.0, 4.0]);
+        let b: Box<[f32]> = Box::new([5.0, 6.0, 7.0, 8.0]);
+        let mut a_bytes: Box<[u8]> = Box::new([0; 16]);
+        let mut b_bytes: Box<[u8]> = Box::new([0; 16]);
+        unsafe {
+            copy_nonoverlapping(
+                Box::as_ptr(&a) as *const u8,
+                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
+                16,
+            );
+            copy_nonoverlapping(
+                Box::as_ptr(&b) as *const u8,
+                Box::as_mut_ptr(&mut b_bytes) as *mut u8,
+                16,
+            );
+        };
+        let c_bytes = matmul(2, 2, 2, a_bytes, b_bytes).await;
+        let mut c: Box<[f32]> = Box::new([0.0; 4]);
         unsafe {
-            matmul(a.as_mut_ptr(), b.as_mut_ptr(), c.as_mut_ptr(), 2, 2, 2).await;
-        }
+            copy_nonoverlapping(
+                Box::as_ptr(&c_bytes) as *const u8,
+                Box::as_mut_ptr(&mut c) as *mut u8,
+                16,
+            );
+        };
         println!("[[{}, {}], [{}, {}]]", c[0], c[1], c[2], c[3]);
         assert_eq!(c[0], 19.0);
         assert_eq!(c[1], 22.0);
diff --git a/juno_build/Cargo.toml b/juno_build/Cargo.toml
index 4f6234988f7e30ade972933dcec56b29412369ba..72faf4bd14da65b482f2e379c1b51ce3ede8dcf0 100644
--- a/juno_build/Cargo.toml
+++ b/juno_build/Cargo.toml
@@ -6,6 +6,5 @@ edition = "2021"
 
 [dependencies]
 juno_frontend = { path = "../juno_frontend" }
-hercules_rt = { path = "../hercules_rt" }
 hercules_ir = { path = "../hercules_ir" }
 with_builtin_macros = "0.1.0"
diff --git a/juno_build/src/lib.rs b/juno_build/src/lib.rs
index e01a518722d8e074905593f04e14391e0d905a9e..fdaf4d27cbcb7b31738e6df42d69272369d5026f 100644
--- a/juno_build/src/lib.rs
+++ b/juno_build/src/lib.rs
@@ -1,5 +1,4 @@
 extern crate hercules_ir;
-extern crate hercules_rt;
 
 use juno_compiler::*;
 
@@ -233,8 +232,8 @@ impl JunoCompiler {
 macro_rules! juno {
     ($path:expr) => {
         with_builtin_macros::with_builtin!(
-            let $hman = concat!(env!("OUT_DIR"), "/", $path, ".hman") in {
-            hercules_rt::use_hman!($hman);
+            let $hrt = concat!(env!("OUT_DIR"), "/rt_", $path, ".hrt") in {
+            include!($hrt);
         });
     };
 }
diff --git a/juno_frontend/src/lib.rs b/juno_frontend/src/lib.rs
index 4713cfeb92c13b46a7bcc413a765f9a4951cb5f5..c39faef9011b39960a74ecf4472df8c0780a8281 100644
--- a/juno_frontend/src/lib.rs
+++ b/juno_frontend/src/lib.rs
@@ -184,9 +184,15 @@ pub fn compile_ir(
     add_pass!(pm, verify, Forkify);
     add_pass!(pm, verify, ForkGuardElim);
     add_verified_pass!(pm, verify, DCE);
+    add_pass!(pm, verify, Outline);
+    add_pass!(pm, verify, InterproceduralSROA);
+    add_pass!(pm, verify, SROA);
+    add_pass!(pm, verify, ForkSplit);
+    add_pass!(pm, verify, Unforkify);
+    add_pass!(pm, verify, GVN);
+    add_verified_pass!(pm, verify, DCE);
     if x_dot {
         pm.add_pass(hercules_opt::pass::Pass::Xdot(true));
-        pm.add_pass(hercules_opt::pass::Pass::SchedXdot);
     }
 
     pm.add_pass(hercules_opt::pass::Pass::Codegen(output_dir, module_name));
diff --git a/juno_samples/casts_and_intrinsics/Cargo.toml b/juno_samples/casts_and_intrinsics/Cargo.toml
index f49797969012f5195a0338b7b14fa04414dddb03..af74c07acc3950b22b9b2c95e0d07090f99d7490 100644
--- a/juno_samples/casts_and_intrinsics/Cargo.toml
+++ b/juno_samples/casts_and_intrinsics/Cargo.toml
@@ -13,6 +13,5 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
-hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs
index 8422f7e60619b48aca2d5e783eaed45e70be71c3..fafa97bbc642751c37b2ef47a43954fa84f340d9 100644
--- a/juno_samples/casts_and_intrinsics/build.rs
+++ b/juno_samples/casts_and_intrinsics/build.rs
@@ -5,8 +5,6 @@ fn main() {
     JunoCompiler::new()
         .file_in_src("casts_and_intrinsics.jn")
         .unwrap()
-        .schedule_in_src("casts_and_intrinsics.sch")
-        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/juno_samples/casts_and_intrinsics/src/casts_and_intrinsics.sch b/juno_samples/casts_and_intrinsics/src/casts_and_intrinsics.sch
deleted file mode 100644
index 80ec2766eac8850ff736cd8a1f52dbe87bf24553..0000000000000000000000000000000000000000
--- a/juno_samples/casts_and_intrinsics/src/casts_and_intrinsics.sch
+++ /dev/null
@@ -1,2 +0,0 @@
-function casts_and_intrinsics {
-}
diff --git a/juno_samples/casts_and_intrinsics/src/main.rs b/juno_samples/casts_and_intrinsics/src/main.rs
index 344168e0e1995becff3b4580f4d957002a9ee2a5..037d4c4025ca141887034353124436a2db8f84f3 100644
--- a/juno_samples/casts_and_intrinsics/src/main.rs
+++ b/juno_samples/casts_and_intrinsics/src/main.rs
@@ -2,15 +2,12 @@
 
 extern crate async_std;
 extern crate juno_build;
-extern crate hercules_rt;
 
 juno_build::juno!("casts_and_intrinsics");
 
 fn main() {
     async_std::task::block_on(async {
-        let output = unsafe {
-            casts_and_intrinsics(16.0).await
-        };
+        let output = casts_and_intrinsics(16.0).await;
         println!("{}", output);
         assert_eq!(output, 4);
     });
diff --git a/juno_samples/matmul/Cargo.toml b/juno_samples/matmul/Cargo.toml
index dd40d2094d58bbb17da943f798c1565b12000245..c272fc443df485aaacd80fe5fdc882bd4d02225c 100644
--- a/juno_samples/matmul/Cargo.toml
+++ b/juno_samples/matmul/Cargo.toml
@@ -13,6 +13,5 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
-hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs
index e68df99828cc47d8dba49fbd11e40156c670dc01..81f645e0666dfb22e075953c3d0a1a531909f1a0 100644
--- a/juno_samples/matmul/build.rs
+++ b/juno_samples/matmul/build.rs
@@ -5,8 +5,6 @@ fn main() {
     JunoCompiler::new()
         .file_in_src("matmul.jn")
         .unwrap()
-        .schedule_in_src("matmul.sch")
-        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs
index 6d1867acd13be82b817e7363626acbbd52aa5343..6ec3dae763672075b5410f1b0350c56504f36068 100644
--- a/juno_samples/matmul/src/main.rs
+++ b/juno_samples/matmul/src/main.rs
@@ -1,19 +1,39 @@
-#![feature(future_join)]
+#![feature(future_join, box_as_ptr, let_chains)]
 
 extern crate async_std;
 extern crate juno_build;
-extern crate hercules_rt;
+
+use core::ptr::copy_nonoverlapping;
 
 juno_build::juno!("matmul");
 
 fn main() {
     async_std::task::block_on(async {
-        let mut a = vec![1.0, 2.0, 3.0, 4.0];
-        let mut b = vec![5.0, 6.0, 7.0, 8.0];
-        let mut c = vec![0.0, 0.0, 0.0, 0.0];
+        let a: Box<[f32]> = Box::new([1.0, 2.0, 3.0, 4.0]);
+        let b: Box<[f32]> = Box::new([5.0, 6.0, 7.0, 8.0]);
+        let mut a_bytes: Box<[u8]> = Box::new([0; 16]);
+        let mut b_bytes: Box<[u8]> = Box::new([0; 16]);
+        unsafe {
+            copy_nonoverlapping(
+                Box::as_ptr(&a) as *const u8,
+                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
+                16,
+            );
+            copy_nonoverlapping(
+                Box::as_ptr(&b) as *const u8,
+                Box::as_mut_ptr(&mut b_bytes) as *mut u8,
+                16,
+            );
+        };
+        let c_bytes = matmul(2, 2, 2, a_bytes, b_bytes).await;
+        let mut c: Box<[f32]> = Box::new([0.0; 4]);
         unsafe {
-            matmul(a.as_mut_ptr(), b.as_mut_ptr(), c.as_mut_ptr(), 2, 2, 2).await;
-        }
+            copy_nonoverlapping(
+                Box::as_ptr(&c_bytes) as *const u8,
+                Box::as_mut_ptr(&mut c) as *mut u8,
+                16,
+            );
+        };
         println!("[[{}, {}], [{}, {}]]", c[0], c[1], c[2], c[3]);
         assert_eq!(c[0], 19.0);
         assert_eq!(c[1], 22.0);
@@ -26,3 +46,4 @@ fn main() {
 fn matmul_test() {
     main();
 }
+
diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch
deleted file mode 100644
index 847a91214a88c8523771394506d25eb55c3f675d..0000000000000000000000000000000000000000
--- a/juno_samples/matmul/src/matmul.sch
+++ /dev/null
@@ -1,7 +0,0 @@
-function matmul {
-  partition { @outer, @middle, @inner } on cpu //gpu
-  partition @exit on cpu
-
-  parallelize @outer
-  vectorize @inner
-}
diff --git a/juno_samples/nested_ccp/Cargo.toml b/juno_samples/nested_ccp/Cargo.toml
index 8c9b969d23019b8bbd3bf28b3506e2e497ae8ec7..7ffc13f21b155dbe6028d808be97aaf0e5ffb8d6 100644
--- a/juno_samples/nested_ccp/Cargo.toml
+++ b/juno_samples/nested_ccp/Cargo.toml
@@ -13,6 +13,5 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
-hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs
index f866112f34914add4f9598d217bf6589fd033dba..80f92c0b9f9e600a157fa22784843438a7445aae 100644
--- a/juno_samples/nested_ccp/src/main.rs
+++ b/juno_samples/nested_ccp/src/main.rs
@@ -1,17 +1,24 @@
-#![feature(future_join)]
+#![feature(box_as_ptr, let_chains)]
 
 extern crate async_std;
 extern crate juno_build;
-extern crate hercules_rt;
+
+use core::ptr::copy_nonoverlapping;
 
 juno_build::juno!("nested_ccp");
 
 fn main() {
     async_std::task::block_on(async {
-        let mut a = vec![17.0, 18.0, 19.0];
-        let output = unsafe {
-            ccp_example(a.as_mut_ptr()).await
+        let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
+        let mut a_bytes: Box<[u8]> = Box::new([0; 12]);
+        unsafe {
+            copy_nonoverlapping(
+                Box::as_ptr(&a) as *const u8,
+                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
+                12,
+            );
         };
+        let output = ccp_example(a_bytes).await;
         println!("{}", output);
         assert_eq!(output, 1.0);
     });
diff --git a/juno_samples/simple3/Cargo.toml b/juno_samples/simple3/Cargo.toml
index 8060c5b3472ad898cb48e011332a852cd7b6705e..201c8d3782d4b41d7bfef5b7df4b5b29758e6e00 100644
--- a/juno_samples/simple3/Cargo.toml
+++ b/juno_samples/simple3/Cargo.toml
@@ -13,6 +13,5 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
-hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs
index 38b198b0ef4a6ee8a3aac4a7da1ff32653981f22..0e476e8d41c7880741a3f474f341a0decf0bda4b 100644
--- a/juno_samples/simple3/build.rs
+++ b/juno_samples/simple3/build.rs
@@ -5,8 +5,6 @@ fn main() {
     JunoCompiler::new()
         .file_in_src("simple3.jn")
         .unwrap()
-        .schedule_in_src("simple3.sch")
-        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/juno_samples/simple3/src/main.rs b/juno_samples/simple3/src/main.rs
index 71e2766701130fd66e3c09a34493adc11ddb842f..89be5527a2e4d1e5842778818aa934db98fcdf09 100644
--- a/juno_samples/simple3/src/main.rs
+++ b/juno_samples/simple3/src/main.rs
@@ -1,16 +1,31 @@
-#![feature(future_join)]
+#![feature(box_as_ptr, let_chains)]
 
 extern crate async_std;
-extern crate hercules_rt;
 extern crate juno_build;
 
+use core::ptr::copy_nonoverlapping;
+
 juno_build::juno!("simple3");
 
 fn main() {
     async_std::task::block_on(async {
-        let mut a = vec![1, 2, 3, 4, 5, 6, 7, 8];
-        let mut b = vec![8, 7, 6, 5, 4, 3, 2, 1];
-        let c = unsafe { simple3(a.as_mut_ptr(), b.as_mut_ptr(), 8).await };
+        let a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]);
+        let b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]);
+        let mut a_bytes: Box<[u8]> = Box::new([0; 32]);
+        let mut b_bytes: Box<[u8]> = Box::new([0; 32]);
+        unsafe {
+            copy_nonoverlapping(
+                Box::as_ptr(&a) as *const u8,
+                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
+                32,
+            );
+            copy_nonoverlapping(
+                Box::as_ptr(&b) as *const u8,
+                Box::as_mut_ptr(&mut b_bytes) as *mut u8,
+                32,
+            );
+        };
+        let c = simple3(8, a_bytes, b_bytes).await;
         println!("{}", c);
         assert_eq!(c, 120);
     });
diff --git a/juno_samples/simple3/src/simple3.sch b/juno_samples/simple3/src/simple3.sch
deleted file mode 100644
index b3842bee68ab6f5b0aee8ccc663b93cb57753b5a..0000000000000000000000000000000000000000
--- a/juno_samples/simple3/src/simple3.sch
+++ /dev/null
@@ -1,6 +0,0 @@
-function simple3 {
-  partition @loop on cpu
-  partition @exit on cpu
-
-  vectorize @loop
-}