diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d8af3a03dd006708a95167fc8dc9f7d49381a372..b80dd590ab42b4019daa285a83ec89d978884006 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,9 +1,5 @@
-build-job:
-  stage: build
-  script:
-    - cargo build
-
 test-job:
   stage: test
   script:
     - cargo test
+    - cargo test --features=cuda
diff --git a/Cargo.lock b/Cargo.lock
index 13cadc9592c8e1d95e21c013f465919ffcf6c5ec..ea295fdffe60316165c2de9f8ef41d3536579039 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -395,6 +395,7 @@ version = "0.1.0"
 dependencies = [
  "async-std",
  "clap",
+ "hercules_rt",
  "juno_build",
  "rand",
  "with_builtin_macros",
@@ -649,8 +650,13 @@ dependencies = [
  "postcard",
  "serde",
  "take_mut",
+ "tempfile",
 ]
 
+[[package]]
+name = "hercules_rt"
+version = "0.1.0"
+
 [[package]]
 name = "hermit-abi"
 version = "0.4.0"
@@ -702,6 +708,7 @@ name = "juno_antideps"
 version = "0.1.0"
 dependencies = [
  "async-std",
+ "hercules_rt",
  "juno_build",
  "with_builtin_macros",
 ]
@@ -741,11 +748,22 @@ dependencies = [
  "phf",
 ]
 
+[[package]]
+name = "juno_implicit_clone"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "hercules_rt",
+ "juno_build",
+ "with_builtin_macros",
+]
+
 [[package]]
 name = "juno_matmul"
 version = "0.1.0"
 dependencies = [
  "async-std",
+ "hercules_rt",
  "juno_build",
  "rand",
  "with_builtin_macros",
@@ -756,6 +774,7 @@ name = "juno_nested_ccp"
 version = "0.1.0"
 dependencies = [
  "async-std",
+ "hercules_rt",
  "juno_build",
  "with_builtin_macros",
 ]
@@ -775,6 +794,7 @@ name = "juno_simple3"
 version = "0.1.0"
 dependencies = [
  "async-std",
+ "hercules_rt",
  "juno_build",
  "with_builtin_macros",
 ]
@@ -896,6 +916,7 @@ version = "0.1.0"
 dependencies = [
  "async-std",
  "clap",
+ "hercules_rt",
  "juno_build",
  "rand",
  "with_builtin_macros",
@@ -1372,6 +1393,19 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
 
+[[package]]
+name = "tempfile"
+version = "3.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b"
+dependencies = [
+ "cfg-if",
+ "fastrand",
+ "once_cell",
+ "rustix",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "time"
 version = "0.3.36"
diff --git a/Cargo.toml b/Cargo.toml
index 1db806f41befe6b20d89b8484e069a4674276d6d..86307fd85b5fc707ce4bd9db7f645d5fea8c5dbb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,6 +4,7 @@ members = [
 	"hercules_cg",
 	"hercules_ir",
 	"hercules_opt",
+	"hercules_rt",
 	
 	"hercules_tools/hercules_driver",
 
@@ -25,5 +26,5 @@ members = [
 	"juno_samples/casts_and_intrinsics",
 	"juno_samples/nested_ccp",
 	"juno_samples/antideps",
-	#"juno_samples/implicit_clone",
+	"juno_samples/implicit_clone",
 ]
diff --git a/hercules_cg/Cargo.toml b/hercules_cg/Cargo.toml
index 8a60956b356465632225e5dfb477166149c44648..cf0767de04c81e35974a7aca89e9085b543e1a9d 100644
--- a/hercules_cg/Cargo.toml
+++ b/hercules_cg/Cargo.toml
@@ -2,6 +2,7 @@
 name = "hercules_cg"
 version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
+edition = "2021"
 
 [dependencies]
 rand = "*"
diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index a09eacfa621876a570d37bd96e1d1f534e94fa4e..3f600fc0b6e729711be8632442e2b756b40e26a1 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -1,11 +1,9 @@
-extern crate hercules_ir;
-
 use std::collections::BTreeMap;
 use std::fmt::{Error, Write};
 use std::iter::zip;
 use std::sync::atomic::{AtomicUsize, Ordering};
 
-use self::hercules_ir::*;
+use hercules_ir::*;
 
 use crate::*;
 
@@ -21,10 +19,8 @@ pub fn cpu_codegen<W: Write>(
     types: &Vec<Type>,
     constants: &Vec<Constant>,
     dynamic_constants: &Vec<DynamicConstant>,
-    reverse_postorder: &Vec<NodeID>,
     typing: &Vec<TypeID>,
     control_subgraph: &Subgraph,
-    antideps: &Vec<(NodeID, NodeID)>,
     bbs: &BasicBlocks,
     w: &mut W,
 ) -> Result<(), Error> {
@@ -33,10 +29,8 @@ pub fn cpu_codegen<W: Write>(
         types,
         constants,
         dynamic_constants,
-        reverse_postorder,
         typing,
         control_subgraph,
-        antideps,
         bbs,
     };
     ctx.codegen_function(w)
@@ -47,10 +41,8 @@ struct CPUContext<'a> {
     types: &'a Vec<Type>,
     constants: &'a Vec<Constant>,
     dynamic_constants: &'a Vec<DynamicConstant>,
-    reverse_postorder: &'a Vec<NodeID>,
     typing: &'a Vec<TypeID>,
     control_subgraph: &'a Subgraph,
-    antideps: &'a Vec<(NodeID, NodeID)>,
     bbs: &'a BasicBlocks,
 }
 
@@ -275,7 +267,7 @@ impl<'a> CPUContext<'a> {
                             write!(body, "double {} to double", val)?
                         }
                     }
-                    _ => panic!("PANIC: Can't dynamically allocate memory for an aggregate type within a CPU function."),
+                    _ => panic!("PANIC: Can't dynamically allocate memory for an aggregate type within a CPU function ({:?} in {}).", id, self.function.name),
                 }
             }
             Node::DynamicConstant { id: dc_id } => {
@@ -524,12 +516,6 @@ impl<'a> CPUContext<'a> {
                         self.get_value(data, true),
                         index_ptr_name
                     )?;
-                    write!(
-                        body,
-                        "  {} = bitcast {} to ptr\n",
-                        self.get_value(id, false),
-                        self.get_value(collect, true)
-                    )?;
                 } else {
                     // If the data item being written is not a primitive type,
                     // then perform a memcpy from the data collection to the
@@ -543,6 +529,12 @@ impl<'a> CPUContext<'a> {
                         data_size
                     )?;
                 }
+                write!(
+                    body,
+                    "  {} = bitcast {} to ptr\n",
+                    self.get_value(id, false),
+                    self.get_value(collect, true)
+                )?;
             }
             _ => panic!("PANIC: Can't lower {:?}.", self.function.nodes[id.idx()]),
         }
@@ -732,7 +724,7 @@ impl<'a> CPUContext<'a> {
                 // the dynamic constant bounds.
                 let mut acc_size = self.codegen_type_size(elem, body)?;
                 for dc in bounds {
-                    acc_size = Self::multiply(&acc_size, &format!("dc{}", dc.idx()), body)?;
+                    acc_size = Self::multiply(&acc_size, &format!("%dc{}", dc.idx()), body)?;
                 }
                 Ok(acc_size)
             }
diff --git a/hercules_cg/src/device.rs b/hercules_cg/src/device.rs
index 50b7dda48108c9b50774f54da23162cb9125c5e0..09a5bc2689ecf3ed1a7186f2ada52758c7b19fbb 100644
--- a/hercules_cg/src/device.rs
+++ b/hercules_cg/src/device.rs
@@ -1,6 +1,4 @@
-extern crate hercules_ir;
-
-use self::hercules_ir::*;
+use hercules_ir::*;
 
 /*
  * Top level function to definitively place functions onto devices. A function
diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs
index 2c1d3fc06980b1b9af608452a89d31a8150430b4..e41f0205433b52fd974e449f55d4cd8ea9ca7bcf 100644
--- a/hercules_cg/src/lib.rs
+++ b/hercules_cg/src/lib.rs
@@ -10,9 +10,19 @@ pub use crate::gpu::*;
 pub use crate::device::*;
 pub use crate::rt::*;
 
-extern crate hercules_ir;
+use hercules_ir::*;
 
-use self::hercules_ir::*;
+/*
+ * Basic block info consists of two things:
+ *
+ * 1. A map from node to block (named by control nodes).
+ * 2. For each node, which nodes are in its own block.
+ *
+ * Note that for #2, the structure is Vec<NodeID>, meaning the nodes are ordered
+ * inside the block. This order corresponds to the traversal order of the nodes
+ * in the block needed by the backend code generators.
+ */
+pub type BasicBlocks = (Vec<NodeID>, Vec<Vec<NodeID>>);
 
 /*
  * The alignment of a type does not depend on dynamic constants.
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 890c898d75841c85756f3b6e2dd0011678e7789b..13370c4580a9365f0a9013b573e558133656d407 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -1,10 +1,8 @@
-extern crate hercules_ir;
-
 use std::collections::BTreeMap;
 use std::fmt::{Error, Write};
 use std::iter::zip;
 
-use self::hercules_ir::*;
+use hercules_ir::*;
 
 use crate::*;
 
@@ -16,10 +14,8 @@ use crate::*;
 pub fn rt_codegen<W: Write>(
     func_id: FunctionID,
     module: &Module,
-    reverse_postorder: &Vec<NodeID>,
     typing: &Vec<TypeID>,
     control_subgraph: &Subgraph,
-    antideps: &Vec<(NodeID, NodeID)>,
     bbs: &BasicBlocks,
     collection_objects: &CollectionObjects,
     callgraph: &CallGraph,
@@ -29,10 +25,8 @@ pub fn rt_codegen<W: Write>(
     let ctx = RTContext {
         func_id,
         module,
-        reverse_postorder,
         typing,
         control_subgraph,
-        antideps,
         bbs,
         collection_objects,
         callgraph,
@@ -44,10 +38,8 @@ pub fn rt_codegen<W: Write>(
 struct RTContext<'a> {
     func_id: FunctionID,
     module: &'a Module,
-    reverse_postorder: &'a Vec<NodeID>,
     typing: &'a Vec<TypeID>,
     control_subgraph: &'a Subgraph,
-    antideps: &'a Vec<(NodeID, NodeID)>,
     bbs: &'a BasicBlocks,
     collection_objects: &'a CollectionObjects,
     callgraph: &'a CallGraph,
@@ -61,7 +53,7 @@ impl<'a> RTContext<'a> {
         // Dump the function signature.
         write!(
             w,
-            "#[allow(unused_variables,unused_mut)]\nasync fn {}(",
+            "#[allow(unused_variables,unused_mut,unused_parens)]\nasync fn {}<'a>(",
             func.name
         )?;
         let mut first_param = true;
@@ -84,75 +76,29 @@ impl<'a> RTContext<'a> {
             if !self.module.types[func.param_types[idx].idx()].is_primitive() {
                 write!(w, "mut ")?;
             }
-            write!(
-                w,
-                "p_i{}: {}",
-                idx,
-                self.get_type_interface(func.param_types[idx])
-            )?;
-        }
-        write!(w, ") -> {} {{\n", self.get_type_interface(func.return_type))?;
-
-        // Copy the "interface" parameters to "non-interface" parameters.
-        // The purpose of this is to convert collection objects from a Box<[u8]>
-        // type to a *mut u8 type. This name copying is done so that we can
-        // easily construct objects just after this by moving the "inferface"
-        // parameters.
-        for (idx, ty) in func.param_types.iter().enumerate() {
-            if self.module.types[ty.idx()].is_primitive() {
-                write!(w, "    let p{} = p_i{};\n", idx, idx)?;
-            } else {
-                write!(
-                    w,
-                    "    let p{} = ::std::boxed::Box::as_mut_ptr(&mut p_i{}) as *mut u8;\n",
-                    idx, idx
-                )?;
-            }
+            write!(w, "p{}: {}", idx, self.get_type(func.param_types[idx]))?;
         }
+        write!(w, ") -> {} {{\n", self.get_type(func.return_type))?;
 
-        // Collect the boxes representing ownership over collection objects for
-        // this function. The actual emitted computation is done entirely using
-        // pointers, so these get emitted to hold onto ownership over the
-        // underlying memory and to automatically clean them up when this
-        // function returns. Collection objects are inside Options, since their
-        // ownership may get passed to other called RT functions. If this
-        // function returns a collection object, then at the very end, right
-        // before the return, the to-be-returned pointer is compared against the
-        // owned collection objects - it should match exactly one of those
-        // objects, and that box is what's actually returned.
-        let mem_obj_ty = "::core::option::Option<::std::boxed::Box<[u8]>>";
+        // Allocate collection constants.
         for object in self.collection_objects[&self.func_id].iter_objects() {
-            match self.collection_objects[&self.func_id].origin(object) {
-                CollectionObjectOrigin::Parameter(index) => write!(
-                    w,
-                    "    let mut obj{}: {} = Some(p_i{});\n",
-                    object.idx(),
-                    mem_obj_ty,
-                    index
-                )?,
-                CollectionObjectOrigin::Constant(id) => {
-                    let size = self.codegen_type_size(self.typing[id.idx()]);
-                    write!(
-                        w,
-                        "    let mut obj{}: {} = Some((0..{}).map(|_| 0u8).collect());\n",
-                        object.idx(),
-                        mem_obj_ty,
-                        size
-                    )?
-                }
-                CollectionObjectOrigin::Call(_) | CollectionObjectOrigin::Undef(_) => write!(
+            if let CollectionObjectOrigin::Constant(id) =
+                self.collection_objects[&self.func_id].origin(object)
+            {
+                let size = self.codegen_type_size(self.typing[id.idx()]);
+                write!(
                     w,
-                    "    let mut obj{}: {} = None;\n",
+                    "    let mut obj{}: ::hercules_rt::HerculesBox = unsafe {{ ::hercules_rt::HerculesBox::__zeros({}) }};\n",
                     object.idx(),
-                    mem_obj_ty,
-                )?,
+                    size
+                )?
             }
         }
 
-        // Dump signatures for called CPU functions.
+        // Dump signatures for called device functions.
         write!(w, "    extern \"C\" {{\n")?;
         for callee in self.callgraph.get_callees(self.func_id) {
-            if self.devices[callee.idx()] != Device::LLVM {
+            if self.devices[callee.idx()] == Device::AsyncRust {
                 continue;
             }
             let callee = &self.module.functions[callee.idx()];
@@ -172,9 +118,9 @@ impl<'a> RTContext<'a> {
                 } else {
                     write!(w, ", ")?;
                 }
-                write!(w, "p{}: {}", idx, self.get_type(*ty))?;
+                write!(w, "p{}: {}", idx, self.device_get_type(*ty))?;
             }
-            write!(w, ") -> {};\n", self.get_type(callee.return_type))?;
+            write!(w, ") -> {};\n", self.device_get_type(callee.return_type))?;
         }
         write!(w, "    }}\n")?;
 
@@ -193,7 +139,7 @@ impl<'a> RTContext<'a> {
                 } else if self.module.types[self.typing[idx].idx()].is_float() {
                     "0.0"
                 } else {
-                    "::core::ptr::null::<u8>() as _"
+                    "unsafe { ::hercules_rt::HerculesBox::__null() }"
                 }
             )?;
         }
@@ -284,20 +230,7 @@ impl<'a> RTContext<'a> {
             }
             Node::Return { control: _, data } => {
                 let block = &mut blocks.get_mut(&id).unwrap();
-                let objects = self.collection_objects[&self.func_id].objects(data);
-                if objects.is_empty() {
-                    write!(block, "                return {};\n", self.get_value(data))?
-                } else {
-                    // If the value to return is a collection object, figure out
-                    // which object it actually is at runtime and return that
-                    // box.
-                    for object in objects {
-                        write!(block, "                if let Some(mut obj) = obj{} && ::std::boxed::Box::as_mut_ptr(&mut obj) as *mut u8 == {} {{\n", object.idx(), self.get_value(data))?;
-                        write!(block, "                    return obj;\n")?;
-                        write!(block, "                }}\n")?;
-                    }
-                    write!(block, "                panic!(\"HERCULES PANIC: Pointer to be returned doesn't match any known collection objects.\");\n")?
-                }
+                write!(block, "                return {};\n", self.get_value(data))?
             }
             _ => panic!("PANIC: Can't lower {:?}.", func.nodes[id.idx()]),
         }
@@ -316,12 +249,21 @@ impl<'a> RTContext<'a> {
         match func.nodes[id.idx()] {
             Node::Parameter { index } => {
                 let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap();
-                write!(
-                    block,
-                    "                {} = p{};\n",
-                    self.get_value(id),
-                    index
-                )?
+                if self.module.types[self.typing[id.idx()].idx()].is_primitive() {
+                    write!(
+                        block,
+                        "                {} = p{};\n",
+                        self.get_value(id),
+                        index
+                    )?
+                } else {
+                    write!(
+                        block,
+                        "                {} = unsafe {{ p{}.__take() }};\n",
+                        self.get_value(id),
+                        index
+                    )?
+                }
             }
             Node::Constant { id: cons_id } => {
                 let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap();
@@ -342,11 +284,7 @@ impl<'a> RTContext<'a> {
                         let objects = self.collection_objects[&self.func_id].objects(id);
                         assert_eq!(objects.len(), 1);
                         let object = objects[0];
-                        write!(
-                            block,
-                            "::std::boxed::Box::as_mut_ptr(obj{}.as_mut().unwrap()) as *mut u8",
-                            object.idx()
-                        )?
+                        write!(block, "unsafe {{ obj{}.__take() }}", object.idx())?
                     }
                 }
                 write!(block, ";\n")?
@@ -357,86 +295,100 @@ impl<'a> RTContext<'a> {
                 ref dynamic_constants,
                 ref args,
             } => {
-                match self.devices[callee_id.idx()] {
-                    Device::LLVM => {
+                let device = self.devices[callee_id.idx()];
+                match device {
+                    // The device backends ensure that device functions have the
+                    // same C interface.
+                    Device::LLVM | Device::CUDA => {
                         let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap();
+
+                        let device = match device {
+                            Device::LLVM => "cpu",
+                            Device::CUDA => "cuda",
+                            _ => panic!(),
+                        };
+
+                        // First, get the raw pointers to collections that the
+                        // device function takes as input.
+                        let callee_objs = &self.collection_objects[&callee_id];
+                        for (idx, arg) in args.into_iter().enumerate() {
+                            if let Some(obj) = callee_objs.param_to_object(idx) {
+                                // Extract a raw pointer from the HerculesBox.
+                                if callee_objs.is_mutated(obj) {
+                                    write!(
+                                        block,
+                                        "                let arg_tmp{} = unsafe {{ {}.__{}_ptr_mut() }};\n",
+                                        idx,
+                                        self.get_value(*arg),
+                                        device
+                                    )?;
+                                } else {
+                                    write!(
+                                        block,
+                                        "                let arg_tmp{} = unsafe {{ {}.__{}_ptr() }};\n",
+                                        idx,
+                                        self.get_value(*arg),
+                                        device
+                                    )?;
+                                }
+                            } else {
+                                write!(
+                                    block,
+                                    "                let arg_tmp{} = {};\n",
+                                    idx,
+                                    self.get_value(*arg)
+                                )?;
+                            }
+                        }
+
+                        // Emit the call.
                         write!(
                             block,
-                            "                {} = unsafe {{ {}(",
-                            self.get_value(id),
+                            "                let call_tmp = unsafe {{ {}(",
                             self.module.functions[callee_id.idx()].name
                         )?;
                         for dc in dynamic_constants {
                             self.codegen_dynamic_constant(*dc, block)?;
                             write!(block, ", ")?;
                         }
-                        for arg in args {
-                            write!(block, "{}, ", self.get_value(*arg))?;
+                        for idx in 0..args.len() {
+                            write!(block, "arg_tmp{}, ", idx)?;
                         }
                         write!(block, ") }};\n")?;
 
-                        // When a CPU function is called that returns a
+                        // When a device function is called that returns a
                         // collection object, that object must have come from
                         // one of its parameters. Dynamically figure out which
                         // one it came from, so that we can move it to the slot
                         // of the output object.
-                        let call_objects = self.collection_objects[&self.func_id].objects(id);
-                        if !call_objects.is_empty() {
-                            assert_eq!(call_objects.len(), 1);
-                            let call_object = call_objects[0];
-
-                            let callee_returned_objects =
-                                self.collection_objects[&callee_id].returned_objects();
-                            let possible_params: Vec<_> =
-                                (0..self.module.functions[callee_id.idx()].param_types.len())
-                                    .filter(|idx| {
-                                        let object_of_param = self.collection_objects[&callee_id]
-                                            .param_to_object(*idx);
-                                        // Look at parameters that could be the
-                                        // source of the memory object returned
-                                        // by the function.
-                                        object_of_param
-                                            .map(|object_of_param| {
-                                                callee_returned_objects.contains(&object_of_param)
-                                            })
-                                            .unwrap_or(false)
-                                    })
-                                    .collect();
-                            let arg_objects = args
-                                .into_iter()
-                                .enumerate()
-                                .filter(|(idx, _)| possible_params.contains(idx))
-                                .map(|(_, arg)| {
-                                    self.collection_objects[&self.func_id]
-                                        .objects(*arg)
-                                        .into_iter()
-                                })
-                                .flatten();
-
-                            // Dynamically check which of the memory objects
-                            // corresponding to arguments to the call was
-                            // returned by the call. Move that memory object
-                            // into the memory object of the call.
-                            let mut first_obj = true;
-                            for arg_object in arg_objects {
-                                write!(block, "                ")?;
-                                if first_obj {
-                                    first_obj = false;
-                                } else {
-                                    write!(block, "else ")?;
+                        let caller_objects = self.collection_objects[&self.func_id].objects(id);
+                        if !caller_objects.is_empty() {
+                            for (idx, arg) in args.into_iter().enumerate() {
+                                if idx != 0 {
+                                    write!(block, "                else\n")?;
                                 }
-                                write!(block, "if let Some(obj) = obj{}.as_mut() && ::std::boxed::Box::as_mut_ptr(obj) as *mut u8 == {} {{\n", arg_object.idx(), self.get_value(id))?;
                                 write!(
                                     block,
-                                    "                    obj{} = obj{}.take();\n",
-                                    call_object.idx(),
-                                    arg_object.idx()
+                                    "                if call_tmp == arg_tmp{} {{\n",
+                                    idx
                                 )?;
-                                write!(block, "                }}\n")?;
+                                write!(
+                                    block,
+                                    "                    {} = unsafe {{ {}.__take() }};\n",
+                                    self.get_value(id),
+                                    self.get_value(*arg)
+                                )?;
+                                write!(block, "                }}")?;
                             }
                             write!(block, "                else {{\n")?;
-                            write!(block, "                    panic!(\"HERCULES PANIC: Pointer returned from called function doesn't match any known collection objects.\");\n")?;
+                            write!(block, "                    panic!(\"HERCULES PANIC: Pointer returned from device function doesn't match an argument pointer.\");\n")?;
                             write!(block, "                }}\n")?;
+                        } else {
+                            write!(
+                                block,
+                                "                {} = call_tmp;\n",
+                                self.get_value(id)
+                            )?;
                         }
                     }
                     Device::AsyncRust => {
@@ -455,12 +407,11 @@ impl<'a> RTContext<'a> {
                             if self.module.types[self.typing[arg.idx()].idx()].is_primitive() {
                                 write!(block, "{}, ", self.get_value(*arg))?;
                             } else {
-                                write!(block, "{}.take(), ", self.get_value(*arg))?;
+                                write!(block, "unsafe {{ {}.__take() }}, ", self.get_value(*arg))?;
                             }
                         }
                         write!(block, ").await;\n")?;
                     }
-                    _ => todo!(),
                 }
             }
             _ => panic!(
@@ -606,8 +557,8 @@ impl<'a> RTContext<'a> {
         convert_type(&self.module.types[id.idx()])
     }
 
-    fn get_type_interface(&self, id: TypeID) -> &'static str {
-        convert_type_interface(&self.module.types[id.idx()])
+    fn device_get_type(&self, id: TypeID) -> &'static str {
+        device_convert_type(&self.module.types[id.idx()])
     }
 }
 
@@ -624,18 +575,27 @@ fn convert_type(ty: &Type) -> &'static str {
         Type::UnsignedInteger64 => "u64",
         Type::Float32 => "f32",
         Type::Float64 => "f64",
-        Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "*mut u8",
+        Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => {
+            "::hercules_rt::HerculesBox<'a>"
+        }
         _ => panic!(),
     }
 }
 
-/*
- * Collection types are passed to / returned from runtime functions through a
- * wrapper type for ownership tracking reasons.
- */
-fn convert_type_interface(ty: &Type) -> &'static str {
+fn device_convert_type(ty: &Type) -> &'static str {
     match ty {
-        Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "Box<[u8]>",
-        _ => convert_type(ty),
+        Type::Boolean => "bool",
+        Type::Integer8 => "i8",
+        Type::Integer16 => "i16",
+        Type::Integer32 => "i32",
+        Type::Integer64 => "i64",
+        Type::UnsignedInteger8 => "u8",
+        Type::UnsignedInteger16 => "u16",
+        Type::UnsignedInteger32 => "u32",
+        Type::UnsignedInteger64 => "u64",
+        Type::Float32 => "f32",
+        Type::Float64 => "f64",
+        Type::Product(_) | Type::Summation(_) | Type::Array(_, _) => "*mut u8",
+        _ => panic!(),
     }
 }
diff --git a/hercules_ir/Cargo.toml b/hercules_ir/Cargo.toml
index 44648c11baaa5539f6598270f4423b591f4422e2..deda9cc58758f6cc834aadcc8e4ec66625fefb4b 100644
--- a/hercules_ir/Cargo.toml
+++ b/hercules_ir/Cargo.toml
@@ -2,10 +2,11 @@
 name = "hercules_ir"
 version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@illinois.edu>"]
+edition = "2021"
 
 [dependencies]
 rand = "*"
 nom = "*"
 ordered-float = { version = "*", features = ["serde"] }
 bitvec = "*"
-serde = { version = "*", features = ["derive"] }
\ No newline at end of file
+serde = { version = "*", features = ["derive"] }
diff --git a/hercules_ir/src/antideps.rs b/hercules_ir/src/antideps.rs
deleted file mode 100644
index af949708dd2bc278410ecedd632a04313fa30415..0000000000000000000000000000000000000000
--- a/hercules_ir/src/antideps.rs
+++ /dev/null
@@ -1,297 +0,0 @@
-use std::collections::{BTreeMap, BTreeSet};
-use std::iter::zip;
-
-use crate::*;
-
-/*
- * In addition to collections, we need to figure out which "generation" of a
- * collection a node may take as input.
- */
-#[derive(PartialEq, Eq, Clone, Debug)]
-struct GenerationLattice {
-    objs: BTreeSet<(CollectionObjectID, NodeID)>,
-}
-
-impl Semilattice for GenerationLattice {
-    fn meet(a: &Self, b: &Self) -> Self {
-        GenerationLattice {
-            objs: a.objs.union(&b.objs).map(|x| *x).collect(),
-        }
-    }
-
-    fn top() -> Self {
-        GenerationLattice {
-            objs: BTreeSet::new(),
-        }
-    }
-
-    fn bottom() -> Self {
-        // Bottom is not representable for this lattice with our Semilattice
-        // interface, but we never need to construct it.
-        panic!()
-    }
-}
-
-/*
- * Function to assemble anti-dependence edges. Returns a list of pairs of nodes.
- * The first item in the pair is the reading node, and the second item is the
- * mutating node.
- */
-pub fn antideps(
-    function: &Function,
-    reverse_postorder: &Vec<NodeID>,
-    objects: &FunctionCollectionObjects,
-) -> Vec<(NodeID, NodeID)> {
-    // First, we analyze "generations" of collections as they are mutated.
-    // Originating, mutating, phi, and reduce nodes start a new generation of a
-    // collection. Generations are not ordered due to loops, but are rather just
-    // node IDs of node (parameter, constant, call, undef, write, phi, reduce).
-    // Other nodes operating on collections mean reads / writes can operate on
-    // potentially different generations of multiple collections (select).
-    let lattice = forward_dataflow(function, reverse_postorder, |inputs, id| {
-        match function.nodes[id.idx()] {
-            Node::Ternary {
-                op: TernaryOperator::Select,
-                first: _,
-                second: _,
-                third: _,
-            } => inputs
-                .into_iter()
-                .fold(GenerationLattice::top(), |acc, input| {
-                    GenerationLattice::meet(&acc, input)
-                }),
-            Node::Parameter { index: _ } | Node::Constant { id: _ } | Node::Undef { ty: _ } => {
-                let objs = objects.objects(id);
-                GenerationLattice {
-                    objs: objs.into_iter().map(|obj| (*obj, id)).collect(),
-                }
-            }
-            Node::Call {
-                control: _,
-                function: _,
-                dynamic_constants: _,
-                ref args,
-            } => {
-                let mut objs = BTreeSet::new();
-                let call_objs = objects.objects(id);
-
-                // If this call node might originate an object, add that to the
-                // lattice output - its generation is this call node.
-                for obj in call_objs {
-                    if objects.origin(*obj) == CollectionObjectOrigin::Call(id) {
-                        assert!(objs.len() <= 1);
-                        objs.insert((*obj, id));
-                    }
-                }
-
-                // For every argument...
-                for (arg, arg_gens) in zip(args, inputs.into_iter().skip(1)) {
-                    // Look at its objects...
-                    for arg_obj in objects.objects(*arg) {
-                        // For each object that might be returned...
-                        if call_objs.contains(&arg_obj) {
-                            let mutable = objects.mutators(*arg_obj).contains(&id);
-                            for (obj, gen) in arg_gens.objs.iter() {
-                                // Add that object to the output lattice.
-                                if obj == arg_obj && mutable {
-                                    // Set the generation to this node if the
-                                    // object might be mutated.
-                                    objs.insert((*obj, id));
-                                } else if obj == arg_obj {
-                                    // Otherwise, keep the old generation.
-                                    objs.insert((*obj, *gen));
-                                }
-                            }
-                        }
-                    }
-                }
-                GenerationLattice { objs }
-            }
-            Node::Read {
-                collect: _,
-                indices: _,
-            } => inputs[0].clone(),
-            Node::Phi {
-                control: _,
-                data: _,
-            }
-            | Node::Reduce {
-                control: _,
-                init: _,
-                reduct: _,
-            }
-            | Node::Write {
-                collect: _,
-                data: _,
-                indices: _,
-            } => {
-                // Phis, reduces, and writes update the generation to the write.
-                let objs = inputs[0].objs.iter().map(|(obj, _)| (*obj, id)).collect();
-                GenerationLattice { objs }
-            }
-            _ => GenerationLattice::top(),
-        }
-    });
-
-    // Second, we generate anti-dependence edges from the dataflow analysis.
-    // There are four cases where an anti-dependence edge is generated:
-    //
-    // 1. A read node and a write node share an object and generation pair on
-    //    their `collect` input.
-    // 2. A read node and a call node share an object and generation pair, where
-    //    the pair is on the read's `collect` input and the pair is on any input
-    //    of the call node AND the call node is a mutator of the object.
-    // 3. A call node and a write node share an object and generation pair,
-    //    where the pair is on any input of the call node and the pair is on the
-    //    write's `collect` input.
-    // 4. A call node and another call node share an object and generation pair,
-    //    where the pair is on any input of both call nodes AND the second call
-    //    node is a mutator of the object.
-    let mut reads_writes_calls_mut_calls_per_pair: BTreeMap<
-        (CollectionObjectID, NodeID),
-        (Vec<NodeID>, Vec<NodeID>, Vec<NodeID>, Vec<NodeID>),
-    > = BTreeMap::new();
-    for (idx, node) in function.nodes.iter().enumerate() {
-        let id = NodeID::new(idx);
-        match node {
-            Node::Read {
-                collect,
-                indices: _,
-            } => {
-                for pair in lattice[collect.idx()].objs.iter() {
-                    reads_writes_calls_mut_calls_per_pair
-                        .entry(*pair)
-                        .or_default()
-                        .0
-                        .push(id);
-                }
-            }
-            Node::Write {
-                collect,
-                data,
-                indices: _,
-            } => {
-                for pair in lattice[collect.idx()].objs.iter() {
-                    reads_writes_calls_mut_calls_per_pair
-                        .entry(*pair)
-                        .or_default()
-                        .1
-                        .push(id);
-                }
-
-                // When a write takes a collection on its `data` input, it
-                // memcpys that collection into the mutated collection. This is
-                // a read.
-                if !objects.objects(*data).is_empty() {
-                    for pair in lattice[collect.idx()].objs.iter() {
-                        reads_writes_calls_mut_calls_per_pair
-                            .entry(*pair)
-                            .or_default()
-                            .0
-                            .push(id);
-                    }
-                }
-            }
-            Node::Call {
-                control: _,
-                function: _,
-                dynamic_constants: _,
-                ref args,
-            } => {
-                for arg in args {
-                    for pair in lattice[arg.idx()].objs.iter() {
-                        if objects.mutators(pair.0).contains(&id) {
-                            reads_writes_calls_mut_calls_per_pair
-                                .entry(*pair)
-                                .or_default()
-                                .3
-                                .push(id);
-                        } else {
-                            reads_writes_calls_mut_calls_per_pair
-                                .entry(*pair)
-                                .or_default()
-                                .2
-                                .push(id);
-                        }
-                    }
-                }
-            }
-            _ => {}
-        }
-    }
-
-    // Once we've grouped reads / writes / calls by pairs, we create pair-wise
-    // anti-dependence edges. Due to loops, a write may technically anti-depend
-    // on a read where the read depends on the write, but we don't want to
-    // generate that anti-dependence edge, since it'll create a cycle during
-    // backend code generation. Thus, if the mutator in an anti-dependence is
-    // the same as the generation of the current pair, don't generate the edge.
-    let mut antideps = vec![];
-    for ((_, gen), (reads, writes, calls, mut_calls)) in reads_writes_calls_mut_calls_per_pair {
-        // Case 1:
-        for read in reads.iter() {
-            for write in writes.iter() {
-                if *write != gen && *read != *write {
-                    antideps.push((*read, *write));
-                }
-            }
-        }
-
-        // Case 2:
-        for read in reads.iter() {
-            for mut_call in mut_calls.iter() {
-                if *mut_call != gen && *read != *mut_call {
-                    antideps.push((*read, *mut_call));
-                }
-            }
-        }
-
-        // Case 3:
-        for call in calls.iter().chain(mut_calls.iter()) {
-            for write in writes.iter() {
-                if *write != gen && *call != *write {
-                    antideps.push((*call, *write));
-                }
-            }
-        }
-
-        // Case 4:
-        for call in calls.iter().chain(mut_calls.iter()) {
-            for mut_call in mut_calls.iter() {
-                if *mut_call != gen && *call != *mut_call {
-                    antideps.push((*call, *mut_call));
-                }
-            }
-        }
-    }
-
-    antideps
-}
-
-/*
- * Utility to make a map from node to anti-dependency uses (map mutator ->
- * reads).
- */
-pub fn flip_antideps(antideps: &Vec<(NodeID, NodeID)>) -> BTreeMap<NodeID, Vec<NodeID>> {
-    let mut result: BTreeMap<NodeID, Vec<NodeID>> = BTreeMap::new();
-
-    for (read, mutator) in antideps {
-        result.entry(*mutator).or_default().push(*read);
-    }
-
-    result
-}
-
-/*
- * Utility to make a map from node to anti-dependency users (map reads ->
- * mutators).
- */
-pub fn map_antideps(antideps: &Vec<(NodeID, NodeID)>) -> BTreeMap<NodeID, Vec<NodeID>> {
-    let mut result: BTreeMap<NodeID, Vec<NodeID>> = BTreeMap::new();
-
-    for (read, mutator) in antideps {
-        result.entry(*read).or_default().push(*mutator);
-    }
-
-    result
-}
diff --git a/hercules_ir/src/collections.rs b/hercules_ir/src/collections.rs
index 23d84b1b6629f0a477217d0657085d234bf6cfe1..8bb1b359fdbf27c44baaec5ac129419abb066331 100644
--- a/hercules_ir/src/collections.rs
+++ b/hercules_ir/src/collections.rs
@@ -285,8 +285,8 @@ pub fn collection_objects(
                 Node::Read {
                     collect: _,
                     indices: _,
-                }
-                | Node::Write {
+                } if !module.types[typing[id.idx()].idx()].is_primitive() => inputs[0].clone(),
+                Node::Write {
                     collect: _,
                     data: _,
                     indices: _,
diff --git a/hercules_ir/src/dataflow.rs b/hercules_ir/src/dataflow.rs
index 6df19d1408e06cc9e18c2f3f52e4eb5f5f618315..ced77d0ba4b107ce740fd3bafb99645f17747f96 100644
--- a/hercules_ir/src/dataflow.rs
+++ b/hercules_ir/src/dataflow.rs
@@ -1,7 +1,5 @@
-extern crate bitvec;
-
-use self::bitvec::prelude::*;
-use self::bitvec::slice::*;
+use bitvec::prelude::*;
+use bitvec::slice::*;
 
 use crate::*;
 
diff --git a/hercules_ir/src/dom.rs b/hercules_ir/src/dom.rs
index f9046b434dc707224f8fbbddb6fec766eaf0b1b9..2c0f085df7f3057380a8a9364ad1acc70a71f640 100644
--- a/hercules_ir/src/dom.rs
+++ b/hercules_ir/src/dom.rs
@@ -77,6 +77,26 @@ impl DomTree {
         .1
     }
 
+    /*
+     * Find the node with the shallowest level in the dom tree amongst the nodes
+     * given.
+     */
+    pub fn shallowest_amongst<I>(&self, x: I) -> NodeID
+    where
+        I: Iterator<Item = NodeID>,
+    {
+        x.map(|x| {
+            if x == self.root {
+                (0, x)
+            } else {
+                (self.idom[&x].0, x)
+            }
+        })
+        .min_by(|x, y| x.0.cmp(&y.0))
+        .unwrap()
+        .1
+    }
+
     /*
      * Find the least common ancestor in the tree of two nodes. This is an
      * ancestor of the two nodes that is as far down the tree as possible.
diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs
index 5ef16bb1c61846a36b78e308767838c03fb8ede0..4d526366808e8b2aea39fecd81f6c00269ffb154 100644
--- a/hercules_ir/src/dot.rs
+++ b/hercules_ir/src/dot.rs
@@ -1,5 +1,3 @@
-extern crate rand;
-
 use std::collections::HashMap;
 use std::env::temp_dir;
 use std::fmt::Write;
@@ -7,7 +5,7 @@ use std::fs::File;
 use std::io::Write as _;
 use std::process::Command;
 
-use self::rand::Rng;
+use rand::Rng;
 
 use crate::*;
 
@@ -20,7 +18,6 @@ pub fn xdot_module(
     reverse_postorders: &Vec<Vec<NodeID>>,
     doms: Option<&Vec<DomTree>>,
     fork_join_maps: Option<&Vec<HashMap<NodeID, NodeID>>>,
-    bbs: Option<&Vec<BasicBlocks>>,
 ) {
     let mut tmp_path = temp_dir();
     let mut rng = rand::thread_rng();
@@ -33,7 +30,6 @@ pub fn xdot_module(
         &reverse_postorders,
         doms,
         fork_join_maps,
-        bbs,
         &mut contents,
     )
     .expect("PANIC: Unable to generate output file contents.");
@@ -55,7 +51,6 @@ pub fn write_dot<W: Write>(
     reverse_postorders: &Vec<Vec<NodeID>>,
     doms: Option<&Vec<DomTree>>,
     fork_join_maps: Option<&Vec<HashMap<NodeID, NodeID>>>,
-    bbs: Option<&Vec<BasicBlocks>>,
     w: &mut W,
 ) -> std::fmt::Result {
     write_digraph_header(w)?;
@@ -170,28 +165,6 @@ pub fn write_dot<W: Write>(
             }
         }
 
-        // Step 4: draw BB edges in olive.
-        if let Some(bbs) = bbs {
-            let bbs = &bbs[function_id.idx()];
-            for node_idx in 0..bbs.0.len() {
-                let maybe_data = NodeID::new(node_idx);
-                let control = bbs.0[node_idx];
-                if maybe_data != control {
-                    write_edge(
-                        maybe_data,
-                        function_id,
-                        control,
-                        function_id,
-                        true,
-                        "olivedrab4, constraint=false",
-                        "dotted",
-                        &module,
-                        w,
-                    )?;
-                }
-            }
-        }
-
         write_graph_footer(w)?;
     }
 
diff --git a/hercules_ir/src/fork_join_analysis.rs b/hercules_ir/src/fork_join_analysis.rs
new file mode 100644
index 0000000000000000000000000000000000000000..130bc2ed49c637d28cc27cfd88c564222c680c12
--- /dev/null
+++ b/hercules_ir/src/fork_join_analysis.rs
@@ -0,0 +1,129 @@
+use std::collections::{HashMap, HashSet};
+
+use bitvec::prelude::*;
+
+use crate::*;
+
+/*
+ * Top level function for creating a fork-join map. Map is from fork node ID to
+ * join node ID, since a join can easily determine the fork it corresponds to
+ * (that's the mechanism used to implement this analysis). This analysis depends
+ * on type information.
+ */
+pub fn fork_join_map(function: &Function, control: &Subgraph) -> HashMap<NodeID, NodeID> {
+    let mut fork_join_map = HashMap::new();
+    for idx in 0..function.nodes.len() {
+        // We only care about join nodes.
+        if function.nodes[idx].is_join() {
+            // Iterate the control predecessors until finding a fork. Maintain a
+            // counter of unmatched fork-join pairs seen on the way, since fork-
+            // joins may be nested. Every join is dominated by their fork, so
+            // just iterate the first unseen predecessor of each control node.
+            let join_id = NodeID::new(idx);
+            let mut unpaired = 0;
+            let mut cursor = join_id;
+            let mut seen = HashSet::<NodeID>::new();
+            let fork_id = loop {
+                cursor = control
+                    .preds(cursor)
+                    .filter(|pred| !seen.contains(pred))
+                    .next()
+                    .unwrap();
+                seen.insert(cursor);
+
+                if function.nodes[cursor.idx()].is_join() {
+                    unpaired += 1;
+                } else if function.nodes[cursor.idx()].is_fork() && unpaired > 0 {
+                    unpaired -= 1;
+                } else if function.nodes[cursor.idx()].is_fork() {
+                    break cursor;
+                }
+            };
+            fork_join_map.insert(fork_id, join_id);
+        }
+    }
+    fork_join_map
+}
+
+/*
+ * Find fork/join nests that each control node is inside of. Result is a map
+ * from each control node to a list of fork nodes. The fork nodes are listed in
+ * ascending order of nesting.
+ */
+pub fn compute_fork_join_nesting(
+    function: &Function,
+    dom: &DomTree,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+) -> HashMap<NodeID, Vec<NodeID>> {
+    // For each control node, ascend dominator tree, looking for fork nodes. For
+    // each fork node, make sure each control node isn't strictly dominated by
+    // the corresponding join node.
+    (0..function.nodes.len())
+        .map(NodeID::new)
+        .filter(|id| dom.contains(*id))
+        .map(|id| {
+            (
+                id,
+                dom.ascend(id)
+                    // Filter for forks that dominate this control node,
+                    .filter(|id| function.nodes[id.idx()].is_fork())
+                    // where its corresponding join doesn't dominate the control
+                    // node (if so, then this control is after the fork-join).
+                    .filter(|fork_id| !dom.does_prop_dom(fork_join_map[&fork_id], id))
+                    .collect(),
+            )
+        })
+        .collect()
+}
+
+/*
+ * Check if a data node dominates a control node. This involves checking all
+ * immediate control uses to see if they dominate the queried control node.
+ */
+pub fn does_data_dom_control(
+    function: &Function,
+    data: NodeID,
+    control: NodeID,
+    dom: &DomTree,
+) -> bool {
+    let mut stack = vec![data];
+    let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()];
+    visited.set(data.idx(), true);
+
+    while let Some(pop) = stack.pop() {
+        let node = &function.nodes[pop.idx()];
+
+        let imm_control = match node {
+            Node::Phi { control, data: _ }
+            | Node::Reduce {
+                control,
+                init: _,
+                reduct: _,
+            }
+            | Node::Call {
+                control,
+                function: _,
+                dynamic_constants: _,
+                args: _,
+            } => Some(*control),
+            _ if node.is_control() => Some(pop),
+            _ => {
+                for u in get_uses(node).as_ref() {
+                    if !visited[u.idx()] {
+                        visited.set(u.idx(), true);
+                        stack.push(*u);
+                    }
+                }
+                None
+            }
+        };
+
+        if let Some(imm_control) = imm_control
+            && !dom.does_dom(imm_control, control)
+        {
+            return false;
+        }
+    }
+
+    true
+}
diff --git a/hercules_ir/src/gcm.rs b/hercules_ir/src/gcm.rs
deleted file mode 100644
index 3718df9b00d0e262572d4602a9c9555ea9f6bb98..0000000000000000000000000000000000000000
--- a/hercules_ir/src/gcm.rs
+++ /dev/null
@@ -1,391 +0,0 @@
-extern crate bitvec;
-
-use std::collections::{HashMap, HashSet, VecDeque};
-use std::iter::{zip, FromIterator};
-
-use self::bitvec::prelude::*;
-
-use crate::*;
-
-/*
- * Basic block info consists of two things:
- *
- * 1. A map from node to block (named by control nodes).
- * 2. For each node, which nodes are in its own block.
- *
- * Note that for #2, the structure is Vec<NodeID>, meaning the nodes are ordered
- * inside the block. This order corresponds to the traversal order of the nodes
- * in the block needed by the backend code generators.
- */
-pub type BasicBlocks = (Vec<NodeID>, Vec<Vec<NodeID>>);
-
-/*
- * Top level global code motion function. Assigns each data node to one of its
- * immediate control use / user nodes, forming (unordered) basic blocks. Returns
- * the control node / basic block each node is in. Takes in a partial
- * partitioning that must be respected. Based on the schedule-early-schedule-
- * late method from Cliff Click's PhD thesis.
- */
-pub fn gcm(
-    function: &Function,
-    def_use: &ImmutableDefUseMap,
-    reverse_postorder: &Vec<NodeID>,
-    control_subgraph: &Subgraph,
-    dom: &DomTree,
-    antideps: &Vec<(NodeID, NodeID)>,
-    loops: &LoopTree,
-    fork_join_map: &HashMap<NodeID, NodeID>,
-) -> BasicBlocks {
-    let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()];
-    let back_edges = control_subgraph.back_edges(NodeID::new(0));
-    let no_loop_reachability =
-        control_subgraph.pairwise_reachability(|src, dst| !back_edges.contains(&(src, dst)));
-    let antideps_users = map_antideps(antideps);
-    let antideps_uses = flip_antideps(antideps);
-
-    // Step 1: assign the basic block locations of all nodes that must be in a
-    // specific block. This includes control nodes as well as some special data
-    // nodes, such as phis.
-    for idx in 0..function.nodes.len() {
-        match function.nodes[idx] {
-            Node::Phi { control, data: _ } => bbs[idx] = Some(control),
-            Node::ThreadID {
-                control,
-                dimension: _,
-            } => bbs[idx] = Some(control),
-            Node::Reduce {
-                control,
-                init: _,
-                reduct: _,
-            } => bbs[idx] = Some(control),
-            Node::Call {
-                control,
-                function: _,
-                dynamic_constants: _,
-                args: _,
-            } => bbs[idx] = Some(control),
-            Node::Parameter { index: _ } => bbs[idx] = Some(NodeID::new(0)),
-            Node::Constant { id: _ } => bbs[idx] = Some(NodeID::new(0)),
-            Node::DynamicConstant { id: _ } => bbs[idx] = Some(NodeID::new(0)),
-            _ if function.nodes[idx].is_control() => bbs[idx] = Some(NodeID::new(idx)),
-            _ => {}
-        }
-    }
-
-    // Step 2: schedule early. Place nodes in the earliest position they could
-    // go - use worklist to iterate nodes.
-    let mut schedule_early = bbs.clone();
-    let mut worklist = VecDeque::from(reverse_postorder.clone());
-    while let Some(id) = worklist.pop_front() {
-        if schedule_early[id.idx()].is_some() {
-            continue;
-        }
-
-        // For every use, check what block is its "schedule early" block. This
-        // node goes in the lowest block amongst those blocks.
-        let use_places: Option<Vec<NodeID>> = get_uses(&function.nodes[id.idx()])
-            .as_ref()
-            .into_iter()
-            .map(|id| *id)
-            .map(|id| schedule_early[id.idx()])
-            .collect();
-        if let Some(use_places) = use_places {
-            // If every use has been placed, we can place this node as the
-            // lowest place in the domtree that dominates all of the use places.
-            let lowest = dom.lowest_amongst(use_places.into_iter());
-            schedule_early[id.idx()] = Some(lowest);
-        } else {
-            // If not, then just push this node back on the worklist.
-            worklist.push_back(id);
-        }
-    }
-
-    // Step 3: schedule late and pick each nodes final position. Since the late
-    // schedule of each node depends on the final positions of its users, these
-    // two steps must be fused. Compute their latest position, then use the
-    // control dependent + shallow loop heuristic to actually place them.
-    let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map
-        .into_iter()
-        .map(|(fork, join)| (*join, *fork))
-        .collect();
-    let mut worklist = VecDeque::from_iter(reverse_postorder.into_iter().map(|id| *id).rev());
-    'worklist: while let Some(id) = worklist.pop_front() {
-        if bbs[id.idx()].is_some() {
-            continue;
-        }
-
-        // Calculate the least common ancestor of user blocks, a.k.a. the "late"
-        // schedule.
-        let calculate_lca = || -> Option<_> {
-            let mut lca = None;
-            // Helper to incrementally update the LCA.
-            let mut update_lca = |a| {
-                if let Some(acc) = lca {
-                    lca = Some(dom.least_common_ancestor(acc, a));
-                } else {
-                    lca = Some(a);
-                }
-            };
-
-            // For every user, consider where we need to be to directly dominate the
-            // user.
-            for user in def_use.get_users(id).as_ref().into_iter().map(|id| *id) {
-                if let Node::Phi { control, data } = &function.nodes[user.idx()] {
-                    // For phis, we need to dominate the block jumping to the phi in
-                    // the slot that corresponds to our use.
-                    for (control, data) in
-                        zip(get_uses(&function.nodes[control.idx()]).as_ref(), data)
-                    {
-                        if id == *data {
-                            update_lca(*control);
-                        }
-                    }
-                } else if let Node::Reduce {
-                    control,
-                    init,
-                    reduct,
-                } = &function.nodes[user.idx()]
-                {
-                    // For reduces, we need to either dominate the block right
-                    // before the fork if we're the init input, or we need to
-                    // dominate the join if we're the reduct input.
-                    if id == *init {
-                        let before_fork = function.nodes[join_fork_map[control].idx()]
-                            .try_fork()
-                            .unwrap()
-                            .0;
-                        update_lca(before_fork);
-                    } else {
-                        assert_eq!(id, *reduct);
-                        update_lca(*control);
-                    }
-                } else {
-                    // For everything else, we just need to dominate the user.
-                    update_lca(bbs[user.idx()]?);
-                }
-            }
-
-            Some(lca)
-        };
-
-        // Check if all users have been placed. If one of them hasn't, then add
-        // this node back on to the worklist.
-        let Some(lca) = calculate_lca() else {
-            worklist.push_back(id);
-            continue;
-        };
-
-        // Check if all anti-dependency users have been placed. If one of them
-        // hasn't, then add this node back on to the worklist. We need to know
-        // where the anti-dependency users are, so that we can place this
-        // read "above" mutators that anti-depend on it. The condition for a
-        // potential placement location is that in the CFG *without loop back-
-        // edges* the mutator cannot reach the read. Ask Russel about why this
-        // works, hopefully I'll have a convincing argument by then ;).
-        let mut antidep_user_locations = vec![];
-        for antidep_user in antideps_users.get(&id).unwrap_or(&vec![]) {
-            if let Some(location) = bbs[antidep_user.idx()] {
-                antidep_user_locations.push(location);
-            } else {
-                worklist.push_back(id);
-                continue 'worklist;
-            }
-        }
-
-        // Look between the LCA and the schedule early location to place the
-        // node.
-        let schedule_early = schedule_early[id.idx()].unwrap();
-        let mut chain = dom
-            // If the node has no users, then it doesn't really matter where we
-            // place it - just place it at the early placement.
-            .chain(lca.unwrap_or(schedule_early), schedule_early)
-            // Only allow locations that don't violate the anti-depence property
-            // listed above.
-            .filter(|location| {
-                !antidep_user_locations.iter().any(|antidep_user_location| {
-                    antidep_user_location != location
-                        && no_loop_reachability[antidep_user_location.idx()][location.idx()]
-                })
-            });
-        let mut location = chain.next().unwrap();
-        while let Some(control_node) = chain.next() {
-            // If the next node further up the dominator tree is in a shallower
-            // loop nest or if we can get out of a reduce loop when we don't
-            // need to be in one, place this data node in a higher-up location.
-            let shallower_nest = if let (Some(old_nest), Some(new_nest)) =
-                (loops.nesting(location), loops.nesting(control_node))
-            {
-                old_nest > new_nest
-            } else {
-                false
-            };
-            // This will move all nodes that don't need to be in reduce loops
-            // outside of reduce loops. Nodes that do need to be in a reduce
-            // loop use the reduce node forming the loop, so the dominator chain
-            // will consist of one block, and this loop won't ever iterate.
-            let currently_at_join = function.nodes[location.idx()].is_join();
-            if shallower_nest || currently_at_join {
-                location = control_node;
-            }
-        }
-
-        bbs[id.idx()] = Some(location);
-    }
-    let bbs: Vec<_> = bbs.into_iter().map(Option::unwrap).collect();
-
-    // Step 4: determine the order of nodes inside each block. Use worklist to
-    // add nodes to blocks in order that obeys dependencies.
-    let mut order: Vec<Vec<NodeID>> = vec![vec![]; function.nodes.len()];
-    let mut worklist = VecDeque::from_iter(
-        reverse_postorder
-            .into_iter()
-            .filter(|id| !function.nodes[id.idx()].is_control()),
-    );
-    let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()];
-    while let Some(id) = worklist.pop_front() {
-        let node = &function.nodes[id.idx()];
-        if node.is_phi()
-            || node.is_reduce()
-            || get_uses(node)
-                .as_ref()
-                .into_iter()
-                .chain(antideps_uses.get(&id).into_iter().flatten())
-                .all(|u| {
-                    function.nodes[u.idx()].is_control()
-                        || bbs[u.idx()] != bbs[id.idx()]
-                        || visited[u.idx()]
-                })
-        {
-            order[bbs[id.idx()].idx()].push(*id);
-            visited.set(id.idx(), true);
-        } else {
-            worklist.push_back(id);
-        }
-    }
-
-    (bbs, order)
-}
-
-/*
- * Top level function for creating a fork-join map. Map is from fork node ID to
- * join node ID, since a join can easily determine the fork it corresponds to
- * (that's the mechanism used to implement this analysis). This analysis depends
- * on type information.
- */
-pub fn fork_join_map(function: &Function, control: &Subgraph) -> HashMap<NodeID, NodeID> {
-    let mut fork_join_map = HashMap::new();
-    for idx in 0..function.nodes.len() {
-        // We only care about join nodes.
-        if function.nodes[idx].is_join() {
-            // Iterate the control predecessors until finding a fork. Maintain a
-            // counter of unmatched fork-join pairs seen on the way, since fork-
-            // joins may be nested. Every join is dominated by their fork, so
-            // just iterate the first unseen predecessor of each control node.
-            let join_id = NodeID::new(idx);
-            let mut unpaired = 0;
-            let mut cursor = join_id;
-            let mut seen = HashSet::<NodeID>::new();
-            let fork_id = loop {
-                cursor = control
-                    .preds(cursor)
-                    .filter(|pred| !seen.contains(pred))
-                    .next()
-                    .unwrap();
-                seen.insert(cursor);
-
-                if function.nodes[cursor.idx()].is_join() {
-                    unpaired += 1;
-                } else if function.nodes[cursor.idx()].is_fork() && unpaired > 0 {
-                    unpaired -= 1;
-                } else if function.nodes[cursor.idx()].is_fork() {
-                    break cursor;
-                }
-            };
-            fork_join_map.insert(fork_id, join_id);
-        }
-    }
-    fork_join_map
-}
-
-/*
- * Find fork/join nests that each control node is inside of. Result is a map
- * from each control node to a list of fork nodes. The fork nodes are listed in
- * ascending order of nesting.
- */
-pub fn compute_fork_join_nesting(
-    function: &Function,
-    dom: &DomTree,
-    fork_join_map: &HashMap<NodeID, NodeID>,
-) -> HashMap<NodeID, Vec<NodeID>> {
-    // For each control node, ascend dominator tree, looking for fork nodes. For
-    // each fork node, make sure each control node isn't strictly dominated by
-    // the corresponding join node.
-    (0..function.nodes.len())
-        .map(NodeID::new)
-        .filter(|id| dom.contains(*id))
-        .map(|id| {
-            (
-                id,
-                dom.ascend(id)
-                    // Filter for forks that dominate this control node,
-                    .filter(|id| function.nodes[id.idx()].is_fork())
-                    // where its corresponding join doesn't dominate the control
-                    // node (if so, then this control is after the fork-join).
-                    .filter(|fork_id| !dom.does_prop_dom(fork_join_map[&fork_id], id))
-                    .collect(),
-            )
-        })
-        .collect()
-}
-
-/*
- * Check if a data node dominates a control node. This involves checking all
- * immediate control uses to see if they dominate the queried control node.
- */
-pub fn does_data_dom_control(
-    function: &Function,
-    data: NodeID,
-    control: NodeID,
-    dom: &DomTree,
-) -> bool {
-    let mut stack = vec![data];
-    let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()];
-    visited.set(data.idx(), true);
-
-    while let Some(pop) = stack.pop() {
-        let node = &function.nodes[pop.idx()];
-
-        let imm_control = match node {
-            Node::Phi { control, data: _ }
-            | Node::Reduce {
-                control,
-                init: _,
-                reduct: _,
-            }
-            | Node::Call {
-                control,
-                function: _,
-                dynamic_constants: _,
-                args: _,
-            } => Some(*control),
-            _ if node.is_control() => Some(pop),
-            _ => {
-                for u in get_uses(node).as_ref() {
-                    if !visited[u.idx()] {
-                        visited.set(u.idx(), true);
-                        stack.push(*u);
-                    }
-                }
-                None
-            }
-        };
-
-        if let Some(imm_control) = imm_control
-            && !dom.does_dom(imm_control, control)
-        {
-            return false;
-        }
-    }
-
-    true
-}
diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs
index 2faf2bb6925888c49f0c9d85d929a8e5508baa37..ba01d8bf138c62696c48c3b1d82bb0508a5b74b7 100644
--- a/hercules_ir/src/ir.rs
+++ b/hercules_ir/src/ir.rs
@@ -1,19 +1,13 @@
-extern crate bitvec;
-extern crate ordered_float;
-extern crate serde;
-
-use self::bitvec::prelude::*;
-use self::serde::Deserialize;
-use self::serde::Serialize;
-use std::cmp::Ordering;
-use std::cmp::{max, min};
-use std::collections::HashMap;
-use std::convert::TryInto;
-use std::fmt::{Error, Write};
+use std::fmt::Write;
 use std::ops::Coroutine;
 use std::ops::CoroutineState;
 use std::pin::Pin;
 
+use bitvec::prelude::*;
+use ordered_float::OrderedFloat;
+use serde::Deserialize;
+use serde::Serialize;
+
 use crate::*;
 
 /*
@@ -96,8 +90,8 @@ pub enum Constant {
     UnsignedInteger16(u16),
     UnsignedInteger32(u32),
     UnsignedInteger64(u64),
-    Float32(ordered_float::OrderedFloat<f32>),
-    Float64(ordered_float::OrderedFloat<f64>),
+    Float32(OrderedFloat<f32>),
+    Float64(OrderedFloat<f64>),
     Product(TypeID, Box<[ConstantID]>),
     Summation(TypeID, u32, ConstantID),
     // Array constants are always zero.
@@ -949,8 +943,8 @@ impl Constant {
             Constant::UnsignedInteger16(0) => true,
             Constant::UnsignedInteger32(0) => true,
             Constant::UnsignedInteger64(0) => true,
-            Constant::Float32(ord) => *ord == ordered_float::OrderedFloat::<f32>(0.0),
-            Constant::Float64(ord) => *ord == ordered_float::OrderedFloat::<f64>(0.0),
+            Constant::Float32(ord) => *ord == OrderedFloat::<f32>(0.0),
+            Constant::Float64(ord) => *ord == OrderedFloat::<f64>(0.0),
             _ => false,
         }
     }
@@ -965,8 +959,8 @@ impl Constant {
             Constant::UnsignedInteger16(1) => true,
             Constant::UnsignedInteger32(1) => true,
             Constant::UnsignedInteger64(1) => true,
-            Constant::Float32(ord) => *ord == ordered_float::OrderedFloat::<f32>(1.0),
-            Constant::Float64(ord) => *ord == ordered_float::OrderedFloat::<f64>(1.0),
+            Constant::Float32(ord) => *ord == OrderedFloat::<f32>(1.0),
+            Constant::Float64(ord) => *ord == OrderedFloat::<f64>(1.0),
             _ => false,
         }
     }
@@ -1141,8 +1135,8 @@ pub fn dynamic_constant_cmp(
                     return Ok(ranges);
                 }
 
-                // Scalar multiple requires both that all right terms have left 
-                // term with same positive multiplier, and there are no 
+                // Scalar multiple requires both that all right terms have left
+                // term with same positive multiplier, and there are no
                 // outstanding left terms after matching.
                 let mut is_scalar_multiple = true;
                 let mut scalar_factor = 0;
@@ -1518,6 +1512,19 @@ impl Node {
         }
     }
 
+    pub fn try_write(&self) -> Option<(NodeID, NodeID, &[Index])> {
+        if let Node::Write {
+            collect,
+            data,
+            indices,
+        } = self
+        {
+            Some((*collect, *data, indices))
+        } else {
+            None
+        }
+    }
+
     pub fn is_zero_constant(&self, constants: &Vec<Constant>) -> bool {
         if let Node::Constant { id } = self
             && constants[id.idx()].is_zero()
diff --git a/hercules_ir/src/lib.rs b/hercules_ir/src/lib.rs
index 05e5e2e860a122392a668a254d54a7a5917db3f4..32bbf6310ff7ea0415383dc3fd7176043de835ee 100644
--- a/hercules_ir/src/lib.rs
+++ b/hercules_ir/src/lib.rs
@@ -6,7 +6,6 @@
     iter_intersperse
 )]
 
-pub mod antideps;
 pub mod build;
 pub mod callgraph;
 pub mod collections;
@@ -14,7 +13,7 @@ pub mod dataflow;
 pub mod def_use;
 pub mod dom;
 pub mod dot;
-pub mod gcm;
+pub mod fork_join_analysis;
 pub mod ir;
 pub mod loops;
 pub mod parse;
@@ -22,7 +21,6 @@ pub mod subgraph;
 pub mod typecheck;
 pub mod verify;
 
-pub use crate::antideps::*;
 pub use crate::build::*;
 pub use crate::callgraph::*;
 pub use crate::collections::*;
@@ -30,7 +28,7 @@ pub use crate::dataflow::*;
 pub use crate::def_use::*;
 pub use crate::dom::*;
 pub use crate::dot::*;
-pub use crate::gcm::*;
+pub use crate::fork_join_analysis::*;
 pub use crate::ir::*;
 pub use crate::loops::*;
 pub use crate::parse::*;
diff --git a/hercules_ir/src/loops.rs b/hercules_ir/src/loops.rs
index 7c9a0a85949efcc248439031601b2fed17f0acf6..13e935e0dd151ba3a29c4d07c9f9ee50341d5091 100644
--- a/hercules_ir/src/loops.rs
+++ b/hercules_ir/src/loops.rs
@@ -1,10 +1,8 @@
-extern crate bitvec;
-
 use std::collections::hash_map;
 use std::collections::VecDeque;
 use std::collections::{HashMap, HashSet};
 
-use self::bitvec::prelude::*;
+use bitvec::prelude::*;
 
 use crate::*;
 
@@ -25,6 +23,7 @@ use crate::*;
 pub struct LoopTree {
     root: NodeID,
     loops: HashMap<NodeID, (BitVec<u8, Lsb0>, NodeID)>,
+    inverse_loops: HashMap<NodeID, NodeID>,
     nesting: HashMap<NodeID, usize>,
 }
 
@@ -45,6 +44,10 @@ impl LoopTree {
         header == self.root || self.loops[&header].0[is_in.idx()]
     }
 
+    pub fn header_of(&self, control_node: NodeID) -> Option<NodeID> {
+        self.inverse_loops.get(&control_node).map(|h| *h)
+    }
+
     /*
      * Sometimes, we need to iterate the loop tree bottom-up. Just assemble the
      * order upfront.
@@ -149,7 +152,16 @@ pub fn loops(
         })
         .collect();
 
-    // Step 6: compute loop tree nesting.
+    // Step 6: compute the inverse loop map - this maps control nodes to which
+    // loop they are in (keyed by header), if they are in one.
+    let mut inverse_loops = HashMap::new();
+    for (header, (contents, _)) in loops.iter() {
+        for idx in contents.iter_ones() {
+            inverse_loops.insert(NodeID::new(idx), *header);
+        }
+    }
+
+    // Step 7: compute loop tree nesting.
     let mut nesting = HashMap::new();
     let mut worklist: VecDeque<NodeID> = loops.keys().map(|id| *id).collect();
     while let Some(header) = worklist.pop_front() {
@@ -166,6 +178,7 @@ pub fn loops(
     LoopTree {
         root,
         loops,
+        inverse_loops,
         nesting,
     }
 }
diff --git a/hercules_ir/src/parse.rs b/hercules_ir/src/parse.rs
index 5e4b12178d1782df47d2e5d36abffdb568311799..21eb325a530907bec0aa1f34788708da76453a54 100644
--- a/hercules_ir/src/parse.rs
+++ b/hercules_ir/src/parse.rs
@@ -1,5 +1,3 @@
-extern crate nom;
-
 use std::cell::RefCell;
 use std::collections::HashMap;
 use std::str::FromStr;
diff --git a/hercules_ir/src/subgraph.rs b/hercules_ir/src/subgraph.rs
index 89e8bcc64febd6fe36ec69d0d3a68a0dc0eda348..908011d3198ee4fe982fdc7b7aa263c87f80e3c7 100644
--- a/hercules_ir/src/subgraph.rs
+++ b/hercules_ir/src/subgraph.rs
@@ -1,9 +1,7 @@
-extern crate bitvec;
-
 use std::collections::{HashMap, HashSet};
 use std::mem::take;
 
-use self::bitvec::prelude::*;
+use bitvec::prelude::*;
 
 use crate::*;
 
@@ -23,6 +21,7 @@ pub struct Subgraph {
     original_num_nodes: u32,
 }
 
+#[derive(Debug, Clone)]
 pub struct SubgraphIterator<'a> {
     nodes: &'a Vec<NodeID>,
     edges: &'a [u32],
@@ -203,6 +202,33 @@ impl Subgraph {
         edges
     }
 
+    pub fn rev_po(&self, root: NodeID) -> Vec<NodeID> {
+        let mut order = vec![];
+        let mut stack = vec![];
+        let mut visited = bitvec![u8, Lsb0; 0; self.original_num_nodes as usize];
+
+        stack.push(root);
+        visited.set(root.idx(), true);
+
+        while let Some(pop) = stack.pop() {
+            if self.succs(pop).any(|succ| !visited[succ.idx()]) {
+                stack.push(pop);
+                for succ in self.succs(pop) {
+                    if !visited[succ.idx()] {
+                        visited.set(succ.idx(), true);
+                        stack.push(succ);
+                        break;
+                    }
+                }
+            } else {
+                order.push(pop);
+            }
+        }
+
+        order.reverse();
+        order
+    }
+
     pub fn pairwise_reachability<P>(&self, p: P) -> Vec<BitVec<u8, Lsb0>>
     where
         P: Fn(NodeID, NodeID) -> bool,
diff --git a/hercules_ir/src/typecheck.rs b/hercules_ir/src/typecheck.rs
index c657d5987f005a721ffe663ee22fa6b8fc877b43..d6862c354199dc748797e47d4f663f898df24d7b 100644
--- a/hercules_ir/src/typecheck.rs
+++ b/hercules_ir/src/typecheck.rs
@@ -984,10 +984,6 @@ fn typeflow(
             data: _,
             indices,
         } => {
-            if indices.len() == 0 {
-                return Error(String::from("Write node must have at least one index."));
-            }
-
             // Traverse the collect input's type tree downwards.
             if let (Concrete(mut collect_id), Concrete(data_id)) = (inputs[0], inputs[1]) {
                 for index in indices.iter() {
diff --git a/hercules_ir/src/verify.rs b/hercules_ir/src/verify.rs
index 18ad92c3271df9dd9bea3bc5881489dd84176c70..572bb9d11d3aca8efb5bd70b6b18781da83bc0e7 100644
--- a/hercules_ir/src/verify.rs
+++ b/hercules_ir/src/verify.rs
@@ -1,9 +1,7 @@
-extern crate bitvec;
-
 use std::collections::HashMap;
 use std::iter::zip;
 
-use self::bitvec::prelude::*;
+use bitvec::prelude::*;
 
 use crate::*;
 
diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml
index e1936a97d4e717b06195188016b711735af6367b..84f6aca83e508d905ad0e13f0670e7d45c18d22b 100644
--- a/hercules_opt/Cargo.toml
+++ b/hercules_opt/Cargo.toml
@@ -2,10 +2,12 @@
 name = "hercules_opt"
 version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@illinois.edu>"]
+edition = "2021"
 
 [dependencies]
 ordered-float = "*"
 bitvec = "*"
+tempfile = "*"
 either = "*"
 itertools = "*"
 take_mut = "*"
diff --git a/hercules_opt/src/ccp.rs b/hercules_opt/src/ccp.rs
index a66bf63345c6f951e7f6e7bb8abeba5bcabdf968..39fab9da391a4ee17618fb5b5f899e37138fce1f 100644
--- a/hercules_opt/src/ccp.rs
+++ b/hercules_opt/src/ccp.rs
@@ -1,12 +1,10 @@
-extern crate hercules_ir;
-
 use std::cmp::{max, min};
 use std::collections::HashSet;
 use std::iter::zip;
 
-use self::hercules_ir::dataflow::*;
-use self::hercules_ir::def_use::get_uses;
-use self::hercules_ir::ir::*;
+use hercules_ir::dataflow::*;
+use hercules_ir::def_use::get_uses;
+use hercules_ir::ir::*;
 
 use crate::*;
 
@@ -415,8 +413,8 @@ fn ccp_flow_function(
         }),
         // If node has only one output, if doesn't directly handle crossover of
         // reachability and constant propagation. Read handles that.
-        Node::If { control, cond } => inputs[control.idx()].clone(),
-        Node::Match { control, sum } => inputs[control.idx()].clone(),
+        Node::If { control, cond: _ } => inputs[control.idx()].clone(),
+        Node::Match { control, sum: _ } => inputs[control.idx()].clone(),
         Node::Fork {
             control,
             factors: _,
diff --git a/hercules_opt/src/dce.rs b/hercules_opt/src/dce.rs
index 75268694fdccb23920cccf3cd60cb5ae6738ef0f..026672a395d783c0abd5257894c4c32335654371 100644
--- a/hercules_opt/src/dce.rs
+++ b/hercules_opt/src/dce.rs
@@ -1,13 +1,10 @@
-extern crate hercules_ir;
-
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::ir::*;
+use hercules_ir::def_use::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
 /*
- * Top level function to run dead code elimination. Deletes nodes by setting
- * nodes to gravestones. Works with a function already containing gravestones.
+ * Top level function to run dead code elimination.
  */
 pub fn dce(editor: &mut FunctionEditor) {
     // Create worklist (starts as all nodes).
diff --git a/hercules_opt/src/delete_uncalled.rs b/hercules_opt/src/delete_uncalled.rs
index 78ab428526971f121e0a2bc582e55b423395730b..1a19ee010e4772b8a2367b928f8226109dbef32c 100644
--- a/hercules_opt/src/delete_uncalled.rs
+++ b/hercules_opt/src/delete_uncalled.rs
@@ -1,9 +1,7 @@
-extern crate bitvec;
-extern crate hercules_ir;
-use self::bitvec::prelude::*;
+use bitvec::prelude::*;
 
-use self::hercules_ir::callgraph::*;
-use self::hercules_ir::ir::*;
+use hercules_ir::callgraph::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
diff --git a/hercules_opt/src/editor.rs b/hercules_opt/src/editor.rs
index 0c97abff6429a76f03481542f03c9ba7cd09a5f3..1318f032e0ac8a02c0375ae2de56311f0d97306a 100644
--- a/hercules_opt/src/editor.rs
+++ b/hercules_opt/src/editor.rs
@@ -1,18 +1,13 @@
-extern crate bitvec;
-extern crate either;
-extern crate hercules_ir;
-extern crate itertools;
-
 use std::cell::{Ref, RefCell};
 use std::collections::{BTreeMap, HashSet};
 use std::mem::take;
 use std::ops::Deref;
 
-use self::bitvec::prelude::*;
-use self::either::Either;
+use bitvec::prelude::*;
+use either::Either;
 
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::ir::*;
+use hercules_ir::def_use::*;
+use hercules_ir::ir::*;
 
 /*
  * Helper object for editing Hercules functions in a trackable manner. Edits
@@ -25,6 +20,7 @@ pub struct FunctionEditor<'a> {
     // Wraps a mutable reference to a function. Doesn't provide access to this
     // reference directly, so that we can monitor edits.
     function: &'a mut Function,
+    function_id: FunctionID,
     // Keep a RefCell to (dynamic) constants and types to allow function changes
     // to update these
     constants: &'a RefCell<Vec<Constant>>,
@@ -61,6 +57,7 @@ pub struct FunctionEdit<'a: 'b, 'b> {
     added_types: Vec<Type>,
     // Compute a def-use map entries iteratively.
     updated_def_use: BTreeMap<NodeID, HashSet<NodeID>>,
+    updated_param_types: Option<Vec<TypeID>>,
     updated_return_type: Option<TypeID>,
     // Keep track of which deleted and added node IDs directly correspond.
     sub_edits: Vec<(NodeID, NodeID)>,
@@ -69,6 +66,7 @@ pub struct FunctionEdit<'a: 'b, 'b> {
 impl<'a: 'b, 'b> FunctionEditor<'a> {
     pub fn new(
         function: &'a mut Function,
+        function_id: FunctionID,
         constants: &'a RefCell<Vec<Constant>>,
         dynamic_constants: &'a RefCell<Vec<DynamicConstant>>,
         types: &'a RefCell<Vec<Type>>,
@@ -87,6 +85,7 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
 
         FunctionEditor {
             function,
+            function_id,
             constants,
             dynamic_constants,
             types,
@@ -110,6 +109,7 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
             added_dynamic_constants: Vec::new().into(),
             added_types: Vec::new().into(),
             updated_def_use: BTreeMap::new(),
+            updated_param_types: None,
             updated_return_type: None,
             sub_edits: vec![],
         };
@@ -120,13 +120,14 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
             let FunctionEdit {
                 editor,
                 deleted_nodeids,
-                added_nodeids,
+                added_nodeids: _,
                 added_and_updated_nodes,
                 added_and_updated_schedules,
                 added_constants,
                 added_dynamic_constants,
                 added_types,
                 updated_def_use,
+                updated_param_types,
                 updated_return_type,
                 sub_edits,
             } = populated_edit;
@@ -203,7 +204,12 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
             editor_dynamic_constants.extend(added_dynamic_constants);
             editor_types.extend(added_types);
 
-            // Step 8: update return type if necessary.
+            // Step 8: update parameter types if necessary.
+            if let Some(param_types) = updated_param_types {
+                editor.function.param_types = param_types;
+            }
+
+            // Step 9: update return type if necessary.
             if let Some(return_type) = updated_return_type {
                 editor.function.return_type = return_type;
             }
@@ -218,6 +224,10 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
         &self.function
     }
 
+    pub fn func_id(&self) -> FunctionID {
+        self.function_id
+    }
+
     pub fn get_dynamic_constants(&self) -> Ref<'_, Vec<DynamicConstant>> {
         self.dynamic_constants.borrow()
     }
@@ -573,6 +583,10 @@ impl<'a, 'b> FunctionEdit<'a, 'b> {
         }
     }
 
+    pub fn set_param_types(&mut self, tys: Vec<TypeID>) {
+        self.updated_param_types = Some(tys);
+    }
+
     pub fn set_return_type(&mut self, ty: TypeID) {
         self.updated_return_type = Some(ty);
     }
@@ -585,8 +599,8 @@ mod editor_tests {
 
     use std::mem::replace;
 
-    use self::hercules_ir::dataflow::reverse_postorder;
-    use self::hercules_ir::parse::parse;
+    use hercules_ir::dataflow::reverse_postorder;
+    use hercules_ir::parse::parse;
 
     fn canonicalize(function: &mut Function) -> Vec<Option<NodeID>> {
         // The reverse postorder traversal from the Start node is a map from new
@@ -660,6 +674,7 @@ fn func(x: i32) -> i32
         // Edit the function by replacing the add with a multiply.
         let mut editor = FunctionEditor::new(
             func,
+            FunctionID::new(0),
             &constants_ref,
             &dynamic_constants_ref,
             &types_ref,
diff --git a/hercules_opt/src/float_collections.rs b/hercules_opt/src/float_collections.rs
new file mode 100644
index 0000000000000000000000000000000000000000..faa38375f68e93fad04b632c2f2e5491403a6a3e
--- /dev/null
+++ b/hercules_opt/src/float_collections.rs
@@ -0,0 +1,105 @@
+use hercules_ir::*;
+
+use crate::*;
+
+/*
+ * Float collections constants out of device functions, where allocation isn't
+ * allowed.
+ */
+pub fn float_collections(
+    editors: &mut [FunctionEditor],
+    typing: &ModuleTyping,
+    callgraph: &CallGraph,
+    devices: &Vec<Device>,
+) {
+    let topo = callgraph.topo();
+    for to_float_id in topo {
+        // Collection constants float until reaching an AsyncRust function.
+        if devices[to_float_id.idx()] == Device::AsyncRust {
+            continue;
+        }
+
+        // Find the target constant nodes in the function.
+        let cons: Vec<(NodeID, Node)> = editors[to_float_id.idx()]
+            .func()
+            .nodes
+            .iter()
+            .enumerate()
+            .filter(|(_, node)| {
+                node.try_constant()
+                    .map(|cons_id| !editors[to_float_id.idx()].get_constant(cons_id).is_scalar())
+                    .unwrap_or(false)
+            })
+            .map(|(idx, node)| (NodeID::new(idx), node.clone()))
+            .collect();
+        if cons.is_empty() {
+            continue;
+        }
+
+        // Each constant node becomes a new parameter.
+        let mut new_param_types = editors[to_float_id.idx()].func().param_types.clone();
+        let old_num_params = new_param_types.len();
+        for (id, _) in cons.iter() {
+            new_param_types.push(typing[to_float_id.idx()][id.idx()]);
+        }
+        let success = editors[to_float_id.idx()].edit(|mut edit| {
+            for (idx, (id, _)) in cons.iter().enumerate() {
+                let param = edit.add_node(Node::Parameter {
+                    index: idx + old_num_params,
+                });
+                edit = edit.replace_all_uses(*id, param)?;
+                edit = edit.delete_node(*id)?;
+            }
+            edit.set_param_types(new_param_types);
+            Ok(edit)
+        });
+        if !success {
+            continue;
+        }
+
+        // Add constants in callers and pass them into calls.
+        for caller in callgraph.get_callers(to_float_id) {
+            let calls: Vec<(NodeID, Node)> = editors[caller.idx()]
+                .func()
+                .nodes
+                .iter()
+                .enumerate()
+                .filter(|(_, node)| {
+                    node.try_call()
+                        .map(|(_, callee, _, _)| callee == to_float_id)
+                        .unwrap_or(false)
+                })
+                .map(|(idx, node)| (NodeID::new(idx), node.clone()))
+                .collect();
+            let success = editors[caller.idx()].edit(|mut edit| {
+                let cons_ids: Vec<_> = cons
+                    .iter()
+                    .map(|(_, node)| edit.add_node(node.clone()))
+                    .collect();
+                for (id, node) in calls {
+                    let Node::Call {
+                        control,
+                        function,
+                        dynamic_constants,
+                        args,
+                    } = node
+                    else {
+                        panic!()
+                    };
+                    let mut args = Vec::from(args);
+                    args.extend(cons_ids.iter());
+                    let new_call = edit.add_node(Node::Call {
+                        control,
+                        function,
+                        dynamic_constants,
+                        args: args.into_boxed_slice(),
+                    });
+                    edit = edit.replace_all_uses(id, new_call)?;
+                    edit = edit.delete_node(id)?;
+                }
+                Ok(edit)
+            });
+            assert!(success);
+        }
+    }
+}
diff --git a/hercules_opt/src/fork_concat_split.rs b/hercules_opt/src/fork_concat_split.rs
index 232b43f70374427842b425f912928d316535d886..186cd6a6eaad70ba4b26e25dee8714c7988ee611 100644
--- a/hercules_opt/src/fork_concat_split.rs
+++ b/hercules_opt/src/fork_concat_split.rs
@@ -1,9 +1,7 @@
-extern crate hercules_ir;
-
 use std::collections::{HashMap, HashSet};
 use std::iter::zip;
 
-use self::hercules_ir::ir::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
diff --git a/hercules_opt/src/fork_guard_elim.rs b/hercules_opt/src/fork_guard_elim.rs
index cfa2a6ff344f5641a28d01b7b1fa44feb480b03e..842c83086f9ecedbbe5c8c96bf160de8968a953a 100644
--- a/hercules_opt/src/fork_guard_elim.rs
+++ b/hercules_opt/src/fork_guard_elim.rs
@@ -1,10 +1,8 @@
-extern crate hercules_ir;
-
 use std::collections::{HashMap, HashSet};
 
-use self::hercules_ir::get_uses_mut;
-use self::hercules_ir::ir::*;
-use self::hercules_ir::ImmutableDefUseMap;
+use hercules_ir::get_uses_mut;
+use hercules_ir::ir::*;
+use hercules_ir::ImmutableDefUseMap;
 
 /*
  * This is a Hercules IR transformation that:
diff --git a/hercules_opt/src/forkify.rs b/hercules_opt/src/forkify.rs
index e32bef383a077d6444bf3f3817c4adbe96fee78b..fb53a5e4a84bb08da7606aea914d368878c94b41 100644
--- a/hercules_opt/src/forkify.rs
+++ b/hercules_opt/src/forkify.rs
@@ -1,10 +1,8 @@
-extern crate hercules_ir;
-
 use std::iter::zip;
 
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::ir::*;
-use self::hercules_ir::loops::*;
+use hercules_ir::def_use::*;
+use hercules_ir::ir::*;
+use hercules_ir::loops::*;
 
 /*
  * Top level function to convert natural loops with simple induction variables
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
new file mode 100644
index 0000000000000000000000000000000000000000..a7df9bd9c8409daf4daa004dcd2b29a3ec2660c8
--- /dev/null
+++ b/hercules_opt/src/gcm.rs
@@ -0,0 +1,881 @@
+use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
+use std::iter::{empty, once, zip, FromIterator};
+
+use bitvec::prelude::*;
+use either::Either;
+
+use hercules_cg::*;
+use hercules_ir::*;
+
+use crate::*;
+
+/*
+ * Top level function to legalize the reference semantics of a Hercules IR
+ * function. Hercules IR is a value semantics representation, meaning that all
+ * program state is in the form of copyable values, and mutation takes place by
+ * making a new value that is a copy of the old value with some modification.
+ * This representation is extremely convenient for optimization, but is not good
+ * for code generation, where we need to generate code with references to get
+ * good performance. Hercules IR can alternatively be interpreted using
+ * reference semantics, where pointers to collection objects are passed around,
+ * read from, and written to. However, the value semantics and reference
+ * semantics interpretation of a Hercules IR function may not be equal - this
+ * pass transforms a Hercules IR function such that its new value semantics is
+ * the same as its old value semantics and that its new reference semantics is
+ * the same as its new value semantics. This pass returns a placement of nodes
+ * into ordered basic blocks, since the reference semantics of a function
+ * depends on the order of execution with respect to anti-dependencies. This
+ * is analogous to global code motion from the original sea of nodes paper.
+ *
+ * Our strategy for handling multiple mutating users of a collection is to treat
+ * the problem similar to register allocation; we perform a liveness analysis,
+ * spill constants into newly allocated constants, and read back the spilled
+ * contents when they are used after the first mutation. It's not obvious how
+ * many spills are needed upfront, and newly spilled constants may affect the
+ * liveness analysis result, so every spill restarts the process of checking for
+ * spills. Once no more spills are found, the process terminates. When a spill
+ * is found, the basic block assignments, and all the other analyses, are not
+ * necessarily valid anymore, so this function is called in a loop in pass.rs
+ * until no more spills are found.
+ */
+pub fn gcm(
+    editor: &mut FunctionEditor,
+    def_use: &ImmutableDefUseMap,
+    reverse_postorder: &Vec<NodeID>,
+    typing: &Vec<TypeID>,
+    control_subgraph: &Subgraph,
+    dom: &DomTree,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    loops: &LoopTree,
+    objects: &CollectionObjects,
+) -> Option<BasicBlocks> {
+    let bbs = basic_blocks(
+        editor.func(),
+        editor.func_id(),
+        def_use,
+        reverse_postorder,
+        dom,
+        loops,
+        fork_join_map,
+        objects,
+    );
+    if spill_clones(editor, typing, control_subgraph, objects, &bbs) {
+        None
+    } else {
+        Some(bbs)
+    }
+}
+
+/*
+ * Top level global code motion function. Assigns each data node to one of its
+ * immediate control use / user nodes, forming (unordered) basic blocks. Returns
+ * the control node / basic block each node is in. Takes in a partial
+ * partitioning that must be respected. Based on the schedule-early-schedule-
+ * late method from Cliff Click's PhD thesis. May fail if an anti-dependency
+ * edge can't be satisfied - in this case, a clone that has to be induced is
+ * returned instead.
+ */
+fn basic_blocks(
+    function: &Function,
+    func_id: FunctionID,
+    def_use: &ImmutableDefUseMap,
+    reverse_postorder: &Vec<NodeID>,
+    dom: &DomTree,
+    loops: &LoopTree,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    objects: &CollectionObjects,
+) -> BasicBlocks {
+    let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()];
+
+    // Step 1: assign the basic block locations of all nodes that must be in a
+    // specific block. This includes control nodes as well as some special data
+    // nodes, such as phis.
+    for idx in 0..function.nodes.len() {
+        match function.nodes[idx] {
+            Node::Phi { control, data: _ } => bbs[idx] = Some(control),
+            Node::ThreadID {
+                control,
+                dimension: _,
+            } => bbs[idx] = Some(control),
+            Node::Reduce {
+                control,
+                init: _,
+                reduct: _,
+            } => bbs[idx] = Some(control),
+            Node::Call {
+                control,
+                function: _,
+                dynamic_constants: _,
+                args: _,
+            } => bbs[idx] = Some(control),
+            Node::Parameter { index: _ } => bbs[idx] = Some(NodeID::new(0)),
+            Node::Constant { id: _ } => bbs[idx] = Some(NodeID::new(0)),
+            Node::DynamicConstant { id: _ } => bbs[idx] = Some(NodeID::new(0)),
+            _ if function.nodes[idx].is_control() => bbs[idx] = Some(NodeID::new(idx)),
+            _ => {}
+        }
+    }
+
+    // Step 2: schedule early. Place nodes in the earliest position they could
+    // go - use worklist to iterate nodes.
+    let mut schedule_early = bbs.clone();
+    let mut worklist = VecDeque::from(reverse_postorder.clone());
+    while let Some(id) = worklist.pop_front() {
+        if schedule_early[id.idx()].is_some() {
+            continue;
+        }
+
+        // For every use, check what block is its "schedule early" block. This
+        // node goes in the lowest block amongst those blocks.
+        let use_places: Option<Vec<NodeID>> = get_uses(&function.nodes[id.idx()])
+            .as_ref()
+            .into_iter()
+            .map(|id| *id)
+            .map(|id| schedule_early[id.idx()])
+            .collect();
+        if let Some(use_places) = use_places {
+            // If every use has been placed, we can place this node as the
+            // lowest place in the domtree that dominates all of the use places.
+            let lowest = dom.lowest_amongst(use_places.into_iter());
+            schedule_early[id.idx()] = Some(lowest);
+        } else {
+            // If not, then just push this node back on the worklist.
+            worklist.push_back(id);
+        }
+    }
+
+    // Step 3: find anti-dependence edges. An anti-dependence edge needs to be
+    // drawn between a collection reading node and a collection mutating node
+    // when the following conditions are true:
+    //
+    // 1: The reading and mutating nodes may involve the same collection.
+    // 2: The node producing the collection used by the reading node is in a
+    //    schedule early block that dominates the schedule early block of the
+    //    mutating node. The node producing the collection used by the reading
+    //    node may be an originator of a collection, phi or reduce, or mutator,
+    //    but not forwarding read - forwarding reads are collapsed, and the
+    //    bottom read is treated as reading from the transitive parent of the
+    //    forwarding read(s).
+    let mut antideps = BTreeSet::new();
+    for id in reverse_postorder.iter() {
+        // Find a terminating read node and the collections it reads.
+        let terminating_reads: BTreeSet<_> =
+            terminating_reads(function, func_id, *id, objects).collect();
+        if !terminating_reads.is_empty() {
+            // Walk forwarding reads to find anti-dependency roots.
+            let mut workset = terminating_reads.clone();
+            let mut roots = BTreeSet::new();
+            while let Some(pop) = workset.pop_first() {
+                let forwarded: BTreeSet<_> =
+                    forwarding_reads(function, func_id, pop, objects).collect();
+                if forwarded.is_empty() {
+                    roots.insert(pop);
+                } else {
+                    workset.extend(forwarded);
+                }
+            }
+
+            // For each root, find mutating nodes dominated by the root that
+            // modify an object read on any input of the current node (the
+            // terminating read).
+            // TODO: make this less outrageously inefficient.
+            let func_objects = &objects[&func_id];
+            for root in roots.iter() {
+                let root_early = schedule_early[root.idx()].unwrap();
+                let mut root_block_iterated_users: BTreeSet<NodeID> = BTreeSet::new();
+                let mut workset = BTreeSet::new();
+                workset.insert(*root);
+                while let Some(pop) = workset.pop_first() {
+                    let users = def_use.get_users(pop).into_iter().filter(|user| {
+                        !function.nodes[user.idx()].is_phi()
+                            && !function.nodes[user.idx()].is_reduce()
+                            && schedule_early[user.idx()].unwrap() == root_early
+                    });
+                    workset.extend(users.clone());
+                    root_block_iterated_users.extend(users);
+                }
+                let read_objs: BTreeSet<_> = terminating_reads
+                    .iter()
+                    .map(|read_use| func_objects.objects(*read_use).into_iter())
+                    .flatten()
+                    .map(|id| *id)
+                    .collect();
+                for mutator in reverse_postorder.iter() {
+                    let mutator_early = schedule_early[mutator.idx()].unwrap();
+                    if dom.does_dom(root_early, mutator_early)
+                        && (root_early != mutator_early
+                            || root_block_iterated_users.contains(&mutator))
+                        && mutating_objects(function, func_id, *mutator, objects)
+                            .any(|mutated| read_objs.contains(&mutated))
+                        && id != mutator
+                    {
+                        antideps.insert((*id, *mutator));
+                    }
+                }
+            }
+        }
+    }
+    let mut antideps_uses = vec![vec![]; function.nodes.len()];
+    let mut antideps_users = vec![vec![]; function.nodes.len()];
+    for (reader, mutator) in antideps.iter() {
+        antideps_uses[mutator.idx()].push(*reader);
+        antideps_users[reader.idx()].push(*mutator);
+    }
+
+    // Step 4: schedule late and pick each node's final position. Since the late
+    // schedule of each node depends on the final positions of its users, these
+    // two steps must be fused. Compute their latest position, then use the
+    // control dependent + shallow loop heuristic to actually place them. A
+    // placement might not necessarily be found due to anti-dependency edges.
+    // These are optional and not necessary to consider, but we do since obeying
+    // them can reduce the number of clones. If the worklist stops making
+    // progress, stop considering the anti-dependency edges.
+    let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map
+        .into_iter()
+        .map(|(fork, join)| (*join, *fork))
+        .collect();
+    let mut worklist = VecDeque::from_iter(reverse_postorder.into_iter().map(|id| *id).rev());
+    let mut num_skip_iters = 0;
+    let mut consider_antidependencies = true;
+    while let Some(id) = worklist.pop_front() {
+        if num_skip_iters >= worklist.len() {
+            consider_antidependencies = false;
+        }
+
+        if bbs[id.idx()].is_some() {
+            num_skip_iters = 0;
+            continue;
+        }
+
+        // Calculate the least common ancestor of user blocks, a.k.a. the "late"
+        // schedule.
+        let calculate_lca = || -> Option<_> {
+            let mut lca = None;
+            // Helper to incrementally update the LCA.
+            let mut update_lca = |a| {
+                if let Some(acc) = lca {
+                    lca = Some(dom.least_common_ancestor(acc, a));
+                } else {
+                    lca = Some(a);
+                }
+            };
+
+            // For every user, consider where we need to be to directly dominate the
+            // user.
+            for user in def_use
+                .get_users(id)
+                .as_ref()
+                .into_iter()
+                .chain(if consider_antidependencies {
+                    Either::Left(antideps_users[id.idx()].iter())
+                } else {
+                    Either::Right(empty())
+                })
+                .map(|id| *id)
+            {
+                if let Node::Phi { control, data } = &function.nodes[user.idx()] {
+                    // For phis, we need to dominate the block jumping to the phi in
+                    // the slot that corresponds to our use.
+                    for (control, data) in
+                        zip(get_uses(&function.nodes[control.idx()]).as_ref(), data)
+                    {
+                        if id == *data {
+                            update_lca(*control);
+                        }
+                    }
+                } else if let Node::Reduce {
+                    control,
+                    init,
+                    reduct,
+                } = &function.nodes[user.idx()]
+                {
+                    // For reduces, we need to either dominate the block right
+                    // before the fork if we're the init input, or we need to
+                    // dominate the join if we're the reduct input.
+                    if id == *init {
+                        let before_fork = function.nodes[join_fork_map[control].idx()]
+                            .try_fork()
+                            .unwrap()
+                            .0;
+                        update_lca(before_fork);
+                    } else {
+                        assert_eq!(id, *reduct);
+                        update_lca(*control);
+                    }
+                } else {
+                    // For everything else, we just need to dominate the user.
+                    update_lca(bbs[user.idx()]?);
+                }
+            }
+
+            Some(lca)
+        };
+
+        // Check if all users have been placed. If one of them hasn't, then add
+        // this node back on to the worklist.
+        let Some(lca) = calculate_lca() else {
+            worklist.push_back(id);
+            num_skip_iters += 1;
+            continue;
+        };
+
+        // Look between the LCA and the schedule early location to place the
+        // node.
+        let schedule_early = schedule_early[id.idx()].unwrap();
+        let schedule_late = lca.unwrap_or(schedule_early);
+        let mut chain = dom
+            // If the node has no users, then it doesn't really matter where we
+            // place it - just place it at the early placement.
+            .chain(schedule_late, schedule_early);
+
+        if let Some(mut location) = chain.next() {
+            while let Some(control_node) = chain.next() {
+                // If the next node further up the dominator tree is in a shallower
+                // loop nest or if we can get out of a reduce loop when we don't
+                // need to be in one, place this data node in a higher-up location.
+                let old_nest = loops
+                    .header_of(location)
+                    .map(|header| loops.nesting(header).unwrap());
+                let new_nest = loops
+                    .header_of(control_node)
+                    .map(|header| loops.nesting(header).unwrap());
+                let shallower_nest = if let (Some(old_nest), Some(new_nest)) = (old_nest, new_nest)
+                {
+                    old_nest > new_nest
+                } else {
+                    // If the new location isn't a loop, its nesting level should
+                    // be considered "shallower" if the current location is in a
+                    // loop.
+                    old_nest.is_some()
+                };
+                // This will move all nodes that don't need to be in reduce loops
+                // outside of reduce loops. Nodes that do need to be in a reduce
+                // loop use the reduce node forming the loop, so the dominator chain
+                // will consist of one block, and this loop won't ever iterate.
+                let currently_at_join = function.nodes[location.idx()].is_join();
+                if shallower_nest || currently_at_join {
+                    location = control_node;
+                }
+            }
+
+            bbs[id.idx()] = Some(location);
+            num_skip_iters = 0;
+        } else {
+            // If there is no valid location for this node, then it's a reading
+            // node of a collection that can't be placed above a mutation that
+            // anti-depend uses it. Push the node back on the list, and we'll
+            // stop considering anti-dependencies soon. Don't immediately stop
+            // considering anti-dependencies, as we may be able to eke out some
+            // more use of them.
+            worklist.push_back(id);
+            num_skip_iters += 1;
+            continue;
+        }
+    }
+    let bbs: Vec<_> = bbs.into_iter().map(Option::unwrap).collect();
+    // Calculate the number of phis and reduces per basic block. We use this to
+    // emit phis and reduces at the top of basic blocks. We want to emit phis
+    // and reduces first into ordered basic blocks for two reasons:
+    // 1. This is useful for liveness analysis.
+    // 2. This is needed for some backends - LLVM expects phis to be at the top
+    //    of basic blocks.
+    let mut num_phis_reduces = vec![0; function.nodes.len()];
+    for (node_idx, bb) in bbs.iter().enumerate() {
+        let node = &function.nodes[node_idx];
+        if node.is_phi() || node.is_reduce() {
+            num_phis_reduces[bb.idx()] += 1;
+        }
+    }
+
+    // Step 5: determine the order of nodes inside each block. Use worklist to
+    // add nodes to blocks in order that obeys dependencies.
+    let mut order: Vec<Vec<NodeID>> = vec![vec![]; function.nodes.len()];
+    let mut worklist = VecDeque::from_iter(
+        reverse_postorder
+            .into_iter()
+            .filter(|id| !function.nodes[id.idx()].is_control()),
+    );
+    let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()];
+    let mut num_skip_iters = 0;
+    let mut consider_antidependencies = true;
+    while let Some(id) = worklist.pop_front() {
+        // If the worklist isn't making progress, then there's at least one
+        // reading node of a collection that is in an anti-depend + normal depend
+        // use cycle with a mutating node. See above comment about anti-
+        // dependencies being optional; we just stop considering them here.
+        if num_skip_iters >= worklist.len() {
+            consider_antidependencies = false;
+        }
+
+        // Phis and reduces always get emitted. Other nodes need to obey
+        // dependency relationships and need to come after phis and reduces.
+        let node = &function.nodes[id.idx()];
+        let bb = bbs[id.idx()];
+        if node.is_phi()
+            || node.is_reduce()
+            || (num_phis_reduces[bb.idx()] == 0
+                && get_uses(node)
+                    .as_ref()
+                    .into_iter()
+                    .chain(if consider_antidependencies {
+                        Either::Left(antideps_uses[id.idx()].iter())
+                    } else {
+                        Either::Right(empty())
+                    })
+                    .all(|u| {
+                        function.nodes[u.idx()].is_control()
+                            || bbs[u.idx()] != bbs[id.idx()]
+                            || visited[u.idx()]
+                    }))
+        {
+            order[bb.idx()].push(*id);
+            visited.set(id.idx(), true);
+            num_skip_iters = 0;
+            if node.is_phi() || node.is_reduce() {
+                num_phis_reduces[bb.idx()] -= 1;
+            }
+        } else {
+            worklist.push_back(id);
+            num_skip_iters += 1;
+        }
+    }
+
+    (bbs, order)
+}
+
+fn terminating_reads<'a>(
+    function: &'a Function,
+    func_id: FunctionID,
+    reader: NodeID,
+    objects: &'a CollectionObjects,
+) -> Box<dyn Iterator<Item = NodeID> + 'a> {
+    match function.nodes[reader.idx()] {
+        Node::Read {
+            collect,
+            indices: _,
+        } if objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)),
+        Node::Write {
+            collect: _,
+            data,
+            indices: _,
+        } if !objects[&func_id].objects(data).is_empty() => Box::new(once(data)),
+        Node::Call {
+            control: _,
+            function: callee,
+            dynamic_constants: _,
+            ref args,
+        } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| {
+            let objects = &objects[&callee];
+            let returns = objects.returned_objects();
+            let param_obj = objects.param_to_object(idx)?;
+            if !objects.is_mutated(param_obj) && !returns.contains(&param_obj) {
+                Some(*arg)
+            } else {
+                None
+            }
+        })),
+        _ => Box::new(empty()),
+    }
+}
+
+fn forwarding_reads<'a>(
+    function: &'a Function,
+    func_id: FunctionID,
+    reader: NodeID,
+    objects: &'a CollectionObjects,
+) -> Box<dyn Iterator<Item = NodeID> + 'a> {
+    match function.nodes[reader.idx()] {
+        Node::Read {
+            collect,
+            indices: _,
+        } if !objects[&func_id].objects(reader).is_empty() => Box::new(once(collect)),
+        Node::Ternary {
+            op: TernaryOperator::Select,
+            first: _,
+            second,
+            third,
+        } if !objects[&func_id].objects(reader).is_empty() => {
+            Box::new(once(second).chain(once(third)))
+        }
+        Node::Call {
+            control: _,
+            function: callee,
+            dynamic_constants: _,
+            ref args,
+        } => Box::new(args.into_iter().enumerate().filter_map(move |(idx, arg)| {
+            let objects = &objects[&callee];
+            let returns = objects.returned_objects();
+            let param_obj = objects.param_to_object(idx)?;
+            if !objects.is_mutated(param_obj) && returns.contains(&param_obj) {
+                Some(*arg)
+            } else {
+                None
+            }
+        })),
+        _ => Box::new(empty()),
+    }
+}
+
+fn mutating_objects<'a>(
+    function: &'a Function,
+    func_id: FunctionID,
+    mutator: NodeID,
+    objects: &'a CollectionObjects,
+) -> Box<dyn Iterator<Item = CollectionObjectID> + 'a> {
+    match function.nodes[mutator.idx()] {
+        Node::Write {
+            collect,
+            data: _,
+            indices: _,
+        } => Box::new(objects[&func_id].objects(collect).into_iter().map(|id| *id)),
+        Node::Call {
+            control: _,
+            function: callee,
+            dynamic_constants: _,
+            ref args,
+        } => Box::new(
+            args.into_iter()
+                .enumerate()
+                .filter_map(move |(idx, arg)| {
+                    let callee_objects = &objects[&callee];
+                    let param_obj = callee_objects.param_to_object(idx)?;
+                    if callee_objects.is_mutated(param_obj) {
+                        Some(objects[&func_id].objects(*arg).into_iter().map(|id| *id))
+                    } else {
+                        None
+                    }
+                })
+                .flatten(),
+        ),
+        _ => Box::new(empty()),
+    }
+}
+
+type Liveness = BTreeMap<NodeID, Vec<BTreeSet<NodeID>>>;
+
+/*
+ * Top level function to find implicit clones that need to be spilled. Returns
+ * whether a clone was spilled, in which case the whole scheduling process must
+ * be restarted.
+ */
+fn spill_clones(
+    editor: &mut FunctionEditor,
+    typing: &Vec<TypeID>,
+    control_subgraph: &Subgraph,
+    objects: &CollectionObjects,
+    bbs: &BasicBlocks,
+) -> bool {
+    // Step 1: compute a liveness analysis of collection values in the IR. This
+    // requires a dataflow analysis over the scheduled IR, which is not a common
+    // need in Hercules, so just hardcode the analysis.
+    let liveness = liveness_dataflow(
+        editor.func(),
+        editor.func_id(),
+        control_subgraph,
+        objects,
+        bbs,
+    );
+
+    // Step 2: compute an interference graph from the liveness result. This
+    // graph contains a vertex per node ID producing a collection value and an
+    // edge per pair of node IDs that interfere. Nodes A and B interfere if node
+    // A is defined right above a point where node B is live.
+    let mut edges = vec![];
+    for (bb, liveness) in liveness {
+        let insts = &bbs.1[bb.idx()];
+        for (node, live) in zip(insts, liveness.into_iter().skip(1)) {
+            for live_node in live {
+                if *node != live_node {
+                    edges.push((*node, live_node));
+                }
+            }
+        }
+    }
+
+    // Step 3: filter edges (A, B) to just see edges where A uses B and A isn't
+    // a terminating read. These are the edges that may require a spill.
+    let mut spill_edges = edges.into_iter().filter(|(a, b)| {
+        get_uses(&editor.func().nodes[a.idx()])
+            .as_ref()
+            .into_iter()
+            .any(|u| *u == *b)
+            && !terminating_reads(editor.func(), editor.func_id(), *a, objects).any(|id| id == *b)
+    });
+
+    // Step 4: if there is a spill edge, spill it and return true. Otherwise,
+    // return false.
+    if let Some((user, obj)) = spill_edges.next() {
+        // Figure out the most immediate dominating region for every basic
+        // block. These are the points where spill slot phis get placed.
+        let nodes = &editor.func().nodes;
+        let mut imm_dom_reg = vec![NodeID::new(0); editor.func().nodes.len()];
+        for (idx, node) in nodes.into_iter().enumerate() {
+            if node.is_region() {
+                imm_dom_reg[idx] = NodeID::new(idx);
+            }
+        }
+        let rev_po = control_subgraph.rev_po(NodeID::new(0));
+        for bb in rev_po.iter() {
+            if !nodes[bb.idx()].is_region() && !nodes[bb.idx()].is_start() {
+                imm_dom_reg[bb.idx()] =
+                    imm_dom_reg[control_subgraph.preds(*bb).next().unwrap().idx()];
+            }
+        }
+
+        let other_obj_users: Vec<_> = editor.get_users(obj).filter(|id| *id != user).collect();
+        let mut dummy_phis = vec![NodeID::new(0); imm_dom_reg.len()];
+        let mut success = editor.edit(|mut edit| {
+            // Construct the spill slot. This is just a constant that gets phi-
+            // ed throughout the entire function.
+            let cons_id = edit.add_zero_constant(typing[obj.idx()]);
+            let slot_id = edit.add_node(Node::Constant { id: cons_id });
+
+            // Allocate IDs for phis that move the spill slot throughout the
+            // function without implicit clones. These are dummy phis, since
+            // there are potentially cycles between them. We will replace them
+            // later.
+            for (idx, reg) in imm_dom_reg.iter().enumerate().skip(1) {
+                if idx == reg.idx() {
+                    dummy_phis[idx] = edit.add_node(Node::Phi {
+                        control: *reg,
+                        data: empty().collect(),
+                    });
+                }
+            }
+
+            // Spill `obj` before `user` potentially modifies it.
+            let spill_region = imm_dom_reg[bbs.0[obj.idx()].idx()];
+            let spill_id = edit.add_node(Node::Write {
+                collect: if spill_region == NodeID::new(0) {
+                    slot_id
+                } else {
+                    dummy_phis[spill_region.idx()]
+                },
+                data: obj,
+                indices: empty().collect(),
+            });
+
+            // Before each other user, unspill `obj`.
+            for other_user in other_obj_users {
+                let other_region = imm_dom_reg[bbs.0[other_user.idx()].idx()];
+                // If this assert fails, then `obj` is not in the first basic
+                // block, but it has a user that is in the first basic block,
+                // which violates SSA.
+                assert!(other_region == spill_region || other_region != NodeID::new(0));
+
+                // If an other user is a phi, we need to be a little careful
+                // about how we insert unspilling code for `obj`. Instead of
+                // inserting an unspill in the same block as the user, we need
+                // to insert one in each predecessor of the phi that corresponds
+                // to a use of `obj`. Since this requires modifying individual
+                // uses in a phi, just rebuild the node entirely.
+                if let Node::Phi { control, data } = edit.get_node(other_user).clone() {
+                    assert_eq!(control, other_region);
+                    let mut new_data = vec![];
+                    for (pred, data) in zip(control_subgraph.preds(control), data) {
+                        let pred = imm_dom_reg[pred.idx()];
+                        if data == obj {
+                            let unspill_id = edit.add_node(Node::Write {
+                                collect: obj,
+                                data: if pred == spill_region {
+                                    spill_id
+                                } else {
+                                    dummy_phis[pred.idx()]
+                                },
+                                indices: empty().collect(),
+                            });
+                            new_data.push(unspill_id);
+                        } else {
+                            new_data.push(data);
+                        }
+                    }
+                    let new_phi = edit.add_node(Node::Phi {
+                        control,
+                        data: new_data.into_boxed_slice(),
+                    });
+                    edit = edit.replace_all_uses(other_user, new_phi)?;
+                    edit = edit.delete_node(other_user)?;
+                } else {
+                    let unspill_id = edit.add_node(Node::Write {
+                        collect: obj,
+                        data: if other_region == spill_region {
+                            spill_id
+                        } else {
+                            dummy_phis[other_region.idx()]
+                        },
+                        indices: empty().collect(),
+                    });
+                    edit = edit.replace_all_uses_where(obj, unspill_id, |id| *id == other_user)?;
+                }
+            }
+
+            // Create and hook up all the real phis. Phi elimination will clean
+            // this up.
+            let mut real_phis = vec![NodeID::new(0); imm_dom_reg.len()];
+            for (idx, reg) in imm_dom_reg.iter().enumerate().skip(1) {
+                if idx == reg.idx() {
+                    real_phis[idx] = edit.add_node(Node::Phi {
+                        control: *reg,
+                        data: control_subgraph
+                            .preds(*reg)
+                            .map(|pred| {
+                                let pred = imm_dom_reg[pred.idx()];
+                                if pred == spill_region {
+                                    spill_id
+                                } else if pred == NodeID::new(0) {
+                                    slot_id
+                                } else {
+                                    dummy_phis[pred.idx()]
+                                }
+                            })
+                            .collect(),
+                    });
+                }
+            }
+            for (dummy, real) in zip(dummy_phis.iter(), real_phis) {
+                if *dummy != real {
+                    edit = edit.replace_all_uses(*dummy, real)?;
+                }
+            }
+
+            Ok(edit)
+        });
+        success = success
+            && editor.edit(|mut edit| {
+                for dummy in dummy_phis {
+                    if dummy != NodeID::new(0) {
+                        edit = edit.delete_node(dummy)?;
+                    }
+                }
+                Ok(edit)
+            });
+        assert!(success, "PANIC: GCM cannot fail to edit a function, as it needs to legalize the reference semantics of every function before code generation.");
+        true
+    } else {
+        false
+    }
+}
+
+/*
+ * Liveness dataflow analysis on scheduled Hercules IR. Just look at nodes that
+ * involve collections.
+ */
+fn liveness_dataflow(
+    function: &Function,
+    func_id: FunctionID,
+    control_subgraph: &Subgraph,
+    objects: &CollectionObjects,
+    bbs: &BasicBlocks,
+) -> Liveness {
+    let mut po = control_subgraph.rev_po(NodeID::new(0));
+    po.reverse();
+    let mut liveness = Liveness::default();
+    for (bb_idx, insts) in bbs.1.iter().enumerate() {
+        liveness.insert(NodeID::new(bb_idx), vec![BTreeSet::new(); insts.len() + 1]);
+    }
+    let mut num_phis_reduces = vec![0; function.nodes.len()];
+    let mut reducing = vec![false; function.nodes.len()];
+    for (node_idx, bb) in bbs.0.iter().enumerate() {
+        let node = &function.nodes[node_idx];
+        if node.is_phi() || node.is_reduce() {
+            num_phis_reduces[bb.idx()] += 1;
+            // Phis and reduces can't be in the same basic block.
+            if node.is_reduce() {
+                assert!(num_phis_reduces[bb.idx()] == 0 || reducing[bb.idx()]);
+                reducing[bb.idx()] = true;
+            } else {
+                assert!(!reducing[bb.idx()]);
+            }
+        }
+    }
+    let is_obj = |id: NodeID| !objects[&func_id].objects(id).is_empty();
+
+    loop {
+        let mut changed = false;
+
+        for bb in po.iter() {
+            // First, calculate the liveness set for the bottom of this block.
+            let last_pt = bbs.1[bb.idx()].len();
+            let old_value = &liveness[&bb][last_pt];
+            let mut new_value = BTreeSet::new();
+            for succ in control_subgraph.succs(*bb).chain(if reducing[bb.idx()] {
+                Either::Left(once(*bb))
+            } else {
+                Either::Right(empty())
+            }) {
+                // The liveness at the bottom of a basic block is the union of:
+                // 1. The liveness of each successor right after its phis and
+                //    reduces.
+                // 2. Every data use in a phi or reduce that corresponds to this
+                //    block as the predecessor.
+                let after_phis_reduces_pt = num_phis_reduces[succ.idx()];
+                new_value.extend(&liveness[&succ][after_phis_reduces_pt]);
+                for inst_idx in 0..after_phis_reduces_pt {
+                    let id = bbs.1[succ.idx()][inst_idx];
+                    new_value.remove(&id);
+                    match function.nodes[id.idx()] {
+                        Node::Phi { control, ref data } if is_obj(data[0]) => {
+                            assert_eq!(control, succ);
+                            new_value.extend(
+                                zip(control_subgraph.preds(succ), data)
+                                    .filter(|(pred, _)| *pred == *bb)
+                                    .map(|(_, data)| *data),
+                            );
+                        }
+                        Node::Reduce {
+                            control,
+                            init,
+                            reduct,
+                        } if is_obj(init) => {
+                            assert_eq!(control, succ);
+                            if succ == *bb {
+                                new_value.insert(reduct);
+                            } else {
+                                new_value.insert(init);
+                            }
+                        }
+                        _ => {}
+                    }
+                }
+            }
+            changed |= *old_value != new_value;
+            liveness.get_mut(&bb).unwrap()[last_pt] = new_value;
+
+            // Second, calculate the liveness set above each instruction in this block.
+            for pt in (0..last_pt).rev() {
+                let old_value = &liveness[&bb][pt];
+                let mut new_value = liveness[&bb][pt + 1].clone();
+                let id = bbs.1[bb.idx()][pt];
+                let uses = get_uses(&function.nodes[id.idx()]);
+                new_value.remove(&id);
+                new_value.extend(
+                    if let Node::Write {
+                        collect: _,
+                        data,
+                        ref indices,
+                    } = function.nodes[id.idx()]
+                        && indices.is_empty()
+                    {
+                        // If this write is a cloning write, the `collect` input
+                        // isn't actually live, because its value doesn't
+                        // matter.
+                        Either::Left(once(data).filter(|id| is_obj(*id)))
+                    } else {
+                        Either::Right(
+                            uses.as_ref()
+                                .into_iter()
+                                .map(|id| *id)
+                                .filter(|id| is_obj(*id)),
+                        )
+                    },
+                );
+                changed |= *old_value != new_value;
+                liveness.get_mut(&bb).unwrap()[pt] = new_value;
+            }
+        }
+
+        if !changed {
+            return liveness;
+        }
+    }
+}
diff --git a/hercules_opt/src/gvn.rs b/hercules_opt/src/gvn.rs
index a9db0cd879a0796f0ae1545c7bb1048d012b0d39..42835e853c36a994f656a16e1662e8837fd55239 100644
--- a/hercules_opt/src/gvn.rs
+++ b/hercules_opt/src/gvn.rs
@@ -1,8 +1,6 @@
-extern crate hercules_ir;
-
 use std::collections::HashMap;
 
-use self::hercules_ir::ir::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
@@ -11,7 +9,7 @@ use crate::*;
  * fairly simple compared to in a normal CFG. Needs access to constants for
  * identity function simplification.
  */
-pub fn gvn(editor: &mut FunctionEditor) {
+pub fn gvn(editor: &mut FunctionEditor, gvn_constants_and_clones: bool) {
     // Create worklist (starts as all nodes) and value number hashmap.
     let mut worklist: Vec<NodeID> = (0..editor.func().nodes.len()).map(NodeID::new).collect();
     let mut value_numbers: HashMap<Node, NodeID> = HashMap::new();
@@ -28,7 +26,18 @@ pub fn gvn(editor: &mut FunctionEditor) {
         // Next, check if there is a value number for this simplified value yet.
         if let Some(number) = value_numbers.get(&editor.func().nodes[value.idx()]) {
             // If the number is this worklist item, there's nothing to be done.
-            if *number == work {
+            // Also, don't GVN constants and clones if indicated to not do so.
+            if *number == work
+                || (!gvn_constants_and_clones
+                    && (editor.func().nodes[work.idx()]
+                        .try_constant()
+                        .map(|cons_id| !editor.get_constant(cons_id).is_scalar())
+                        .unwrap_or(false)
+                        || editor.func().nodes[work.idx()]
+                            .try_write()
+                            .map(|(_, _, indices)| indices.is_empty())
+                            .unwrap_or(false)))
+            {
                 continue;
             }
 
diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs
index 9ab158742d140448c6aadead23cd237e7f7be1c9..63a05b0c10bd4efca7da76ceb21969b118770e53 100644
--- a/hercules_opt/src/inline.rs
+++ b/hercules_opt/src/inline.rs
@@ -1,11 +1,9 @@
-extern crate hercules_ir;
-
 use std::collections::HashMap;
 use std::iter::zip;
 
-use self::hercules_ir::callgraph::*;
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::ir::*;
+use hercules_ir::callgraph::*;
+use hercules_ir::def_use::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
diff --git a/hercules_opt/src/interprocedural_sroa.rs b/hercules_opt/src/interprocedural_sroa.rs
index c6cf448b0615cac528bbbff055d91c4bd0d371d6..9edb4d02134e4c5c05aaf398cddbd7bb19686625 100644
--- a/hercules_opt/src/interprocedural_sroa.rs
+++ b/hercules_opt/src/interprocedural_sroa.rs
@@ -1,9 +1,9 @@
-extern crate hercules_ir;
 use std::collections::HashMap;
+use std::iter::zip;
+
+use hercules_ir::ir::*;
 
-use self::hercules_ir::ir::*;
 use crate::*;
-use std::iter::zip;
 
 /**
  * Given an editor for each function in a module, return V s.t.
diff --git a/hercules_opt/src/lib.rs b/hercules_opt/src/lib.rs
index 4a4011b1f19c62d741f8d30189998039a1dd1b30..08d183a7f8c2ddfc650bb1ef3ce385761a137def 100644
--- a/hercules_opt/src/lib.rs
+++ b/hercules_opt/src/lib.rs
@@ -4,13 +4,14 @@ pub mod ccp;
 pub mod dce;
 pub mod delete_uncalled;
 pub mod editor;
+pub mod float_collections;
 pub mod fork_concat_split;
 pub mod fork_guard_elim;
 pub mod forkify;
+pub mod gcm;
 pub mod gvn;
 pub mod inline;
 pub mod interprocedural_sroa;
-pub mod materialize_clones;
 pub mod outline;
 pub mod pass;
 pub mod phi_elim;
@@ -24,13 +25,14 @@ pub use crate::ccp::*;
 pub use crate::dce::*;
 pub use crate::delete_uncalled::*;
 pub use crate::editor::*;
+pub use crate::float_collections::*;
 pub use crate::fork_concat_split::*;
 pub use crate::fork_guard_elim::*;
 pub use crate::forkify::*;
+pub use crate::gcm::*;
 pub use crate::gvn::*;
 pub use crate::inline::*;
 pub use crate::interprocedural_sroa::*;
-pub use crate::materialize_clones::*;
 pub use crate::outline::*;
 pub use crate::pass::*;
 pub use crate::phi_elim::*;
diff --git a/hercules_opt/src/materialize_clones.rs b/hercules_opt/src/materialize_clones.rs
deleted file mode 100644
index 687ac10c87d595c6f53a3fff4435e14f6dc4f375..0000000000000000000000000000000000000000
--- a/hercules_opt/src/materialize_clones.rs
+++ /dev/null
@@ -1,21 +0,0 @@
-extern crate hercules_ir;
-
-use self::hercules_ir::*;
-
-use crate::*;
-
-/*
- * Top level function to materialize clones of collections. This transformation
- * eliminates the possibility of multiple independent writes (including dynamic
- * writes) to a single collection by introducing extra collection constants and
- * inserting explicit clones. This allows us to make the simplifying assumption
- * in the backend that collections have reference, rather than value, semantics.
- * The pass calling this function is mandatory for correctness.
- */
-pub fn materialize_clones(
-    editor: &mut FunctionEditor,
-    objects: &FunctionCollectionObjects,
-    bbs: &BasicBlocks,
-) {
-    todo!()
-}
diff --git a/hercules_opt/src/outline.rs b/hercules_opt/src/outline.rs
index 70062bbcbce0001f0d07a3cfb25fbf2cd94d0433..80f97c7f079ad57bb1edb0929dfe56578eae338c 100644
--- a/hercules_opt/src/outline.rs
+++ b/hercules_opt/src/outline.rs
@@ -1,14 +1,12 @@
-extern crate hercules_ir;
-
 use std::collections::{BTreeMap, BTreeSet};
 use std::iter::zip;
 use std::sync::atomic::{AtomicUsize, Ordering};
 
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::dom::*;
-use self::hercules_ir::gcm::*;
-use self::hercules_ir::ir::*;
-use self::hercules_ir::subgraph::*;
+use hercules_ir::def_use::*;
+use hercules_ir::dom::*;
+use hercules_ir::fork_join_analysis::*;
+use hercules_ir::ir::*;
+use hercules_ir::subgraph::*;
 
 use crate::*;
 
@@ -567,9 +565,7 @@ pub fn outline(
 }
 
 /*
- * Just outlines all of a function except the entry, return, and aggregate
- * constants. This is the minimum work needed to cause runtime Rust code to be
- * generated as necessary.
+ * Just outlines all of a function except the start and return nodes.
  */
 pub fn dumb_outline(
     editor: &mut FunctionEditor,
@@ -585,11 +581,7 @@ pub fn dumb_outline(
         .node_ids()
         .filter(|id| {
             let node = &editor.func().nodes[id.idx()];
-            if let Node::Constant { id } = editor.func().nodes[id.idx()] {
-                editor.get_constant(id).is_scalar()
-            } else {
-                !(node.is_start() || node.is_parameter() || node.is_return())
-            }
+            !(node.is_start() || node.is_parameter() || node.is_return())
         })
         .collect();
     outline(
diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs
index 08faa5b0760c0312688bbd902a53e2ff350bb1be..7366a3362e9942ce78d4fe0b2238fb8ce1324124 100644
--- a/hercules_opt/src/pass.rs
+++ b/hercules_opt/src/pass.rs
@@ -1,21 +1,16 @@
-extern crate hercules_cg;
-extern crate hercules_ir;
-extern crate postcard;
-extern crate serde;
-extern crate take_mut;
-
 use std::cell::RefCell;
 use std::collections::{HashMap, HashSet};
-use std::env::temp_dir;
 use std::fs::File;
 use std::io::Write;
 use std::iter::zip;
 use std::process::{Command, Stdio};
 
-use self::serde::Deserialize;
+use serde::Deserialize;
+
+use tempfile::TempDir;
 
-use self::hercules_cg::*;
-use self::hercules_ir::*;
+use hercules_cg::*;
+use hercules_ir::*;
 
 use crate::*;
 
@@ -38,8 +33,9 @@ pub enum Pass {
     DeleteUncalled,
     ForkSplit,
     Unforkify,
-    MaterializeClones,
     InferSchedules,
+    GCM,
+    FloatCollections,
     Verify,
     // Parameterized over whether analyses that aid visualization are necessary.
     // Useful to set to false if displaying a potentially broken module.
@@ -72,7 +68,6 @@ pub struct PassManager {
     pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>,
     pub loops: Option<Vec<LoopTree>>,
     pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
-    pub antideps: Option<Vec<Vec<(NodeID, NodeID)>>>,
     pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
     pub bbs: Option<Vec<BasicBlocks>>,
     pub collection_objects: Option<CollectionObjects>,
@@ -94,7 +89,6 @@ impl PassManager {
             fork_join_nests: None,
             loops: None,
             reduce_cycles: None,
-            antideps: None,
             data_nodes_in_fork_joins: None,
             bbs: None,
             collection_objects: None,
@@ -238,28 +232,6 @@ impl PassManager {
         }
     }
 
-    pub fn make_antideps(&mut self) {
-        if self.antideps.is_none() {
-            self.make_reverse_postorders();
-            self.make_collection_objects();
-            self.antideps = Some(
-                zip(
-                    self.module.functions.iter(),
-                    zip(
-                        self.reverse_postorders.as_ref().unwrap().iter(),
-                        self.collection_objects.as_ref().unwrap().iter(),
-                    ),
-                )
-                // Fine since collection_objects is a BTreeMap - iteration order
-                // is fixed.
-                .map(|(function, (reverse_postorder, objects))| {
-                    antideps(function, reverse_postorder, objects.1)
-                })
-                .collect(),
-            );
-        }
-    }
-
     pub fn make_data_nodes_in_fork_joins(&mut self) {
         if self.data_nodes_in_fork_joins.is_none() {
             self.make_def_uses();
@@ -280,64 +252,6 @@ impl PassManager {
         }
     }
 
-    pub fn make_bbs(&mut self) {
-        if self.bbs.is_none() {
-            self.make_def_uses();
-            self.make_reverse_postorders();
-            self.make_control_subgraphs();
-            self.make_doms();
-            self.make_antideps();
-            self.make_loops();
-            self.make_fork_join_maps();
-            let def_uses = self.def_uses.as_ref().unwrap().iter();
-            let reverse_postorders = self.reverse_postorders.as_ref().unwrap().iter();
-            let control_subgraphs = self.control_subgraphs.as_ref().unwrap().iter();
-            let doms = self.doms.as_ref().unwrap().iter();
-            let antideps = self.antideps.as_ref().unwrap().iter();
-            let loops = self.loops.as_ref().unwrap().iter();
-            let fork_join_maps = self.fork_join_maps.as_ref().unwrap().iter();
-            self.bbs = Some(
-                zip(
-                    self.module.functions.iter(),
-                    zip(
-                        def_uses,
-                        zip(
-                            reverse_postorders,
-                            zip(
-                                control_subgraphs,
-                                zip(doms, zip(antideps, zip(loops, fork_join_maps))),
-                            ),
-                        ),
-                    ),
-                )
-                .map(
-                    |(
-                        function,
-                        (
-                            def_use,
-                            (
-                                reverse_postorder,
-                                (control_subgraph, (dom, (antideps, (loops, fork_join_map)))),
-                            ),
-                        ),
-                    )| {
-                        gcm(
-                            function,
-                            def_use,
-                            reverse_postorder,
-                            control_subgraph,
-                            dom,
-                            antideps,
-                            loops,
-                            fork_join_map,
-                        )
-                    },
-                )
-                .collect(),
-            );
-        }
-    }
-
     pub fn make_collection_objects(&mut self) {
         if self.collection_objects.is_none() {
             self.make_reverse_postorders();
@@ -375,6 +289,7 @@ impl PassManager {
                         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                         let mut editor = FunctionEditor::new(
                             &mut self.module.functions[idx],
+                            FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
@@ -409,6 +324,7 @@ impl PassManager {
                         .map(|(i, f)| {
                             FunctionEditor::new(
                                 f,
+                                FunctionID::new(i),
                                 &constants_ref,
                                 &dynamic_constants_ref,
                                 &types_ref,
@@ -442,6 +358,7 @@ impl PassManager {
                         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                         let mut editor = FunctionEditor::new(
                             &mut self.module.functions[idx],
+                            FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
@@ -468,12 +385,13 @@ impl PassManager {
                         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                         let mut editor = FunctionEditor::new(
                             &mut self.module.functions[idx],
+                            FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
                             &def_uses[idx],
                         );
-                        gvn(&mut editor);
+                        gvn(&mut editor, false);
 
                         self.module.constants = constants_ref.take();
                         self.module.dynamic_constants = dynamic_constants_ref.take();
@@ -515,6 +433,7 @@ impl PassManager {
                         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                         let mut editor = FunctionEditor::new(
                             &mut self.module.functions[idx],
+                            FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
@@ -590,6 +509,7 @@ impl PassManager {
                         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                         let mut editor = FunctionEditor::new(
                             &mut self.module.functions[idx],
+                            FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
@@ -614,18 +534,21 @@ impl PassManager {
                     let dynamic_constants_ref =
                         RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
                     let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
-                    let mut editors: Vec<_> =
-                        zip(self.module.functions.iter_mut(), def_uses.iter())
-                            .map(|(func, def_use)| {
-                                FunctionEditor::new(
-                                    func,
-                                    &constants_ref,
-                                    &dynamic_constants_ref,
-                                    &types_ref,
-                                    def_use,
-                                )
-                            })
-                            .collect();
+                    let mut editors: Vec<_> = zip(
+                        self.module.functions.iter_mut().enumerate(),
+                        def_uses.iter(),
+                    )
+                    .map(|((idx, func), def_use)| {
+                        FunctionEditor::new(
+                            func,
+                            FunctionID::new(idx),
+                            &constants_ref,
+                            &dynamic_constants_ref,
+                            &types_ref,
+                            def_use,
+                        )
+                    })
+                    .collect();
                     inline(&mut editors, callgraph);
 
                     self.module.constants = constants_ref.take();
@@ -645,18 +568,21 @@ impl PassManager {
                         RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
                     let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                     let old_num_funcs = self.module.functions.len();
-                    let mut editors: Vec<_> =
-                        zip(self.module.functions.iter_mut(), def_uses.iter())
-                            .map(|(func, def_use)| {
-                                FunctionEditor::new(
-                                    func,
-                                    &constants_ref,
-                                    &dynamic_constants_ref,
-                                    &types_ref,
-                                    def_use,
-                                )
-                            })
-                            .collect();
+                    let mut editors: Vec<_> = zip(
+                        self.module.functions.iter_mut().enumerate(),
+                        def_uses.iter(),
+                    )
+                    .map(|((idx, func), def_use)| {
+                        FunctionEditor::new(
+                            func,
+                            FunctionID::new(idx),
+                            &constants_ref,
+                            &dynamic_constants_ref,
+                            &types_ref,
+                            def_use,
+                        )
+                    })
+                    .collect();
                     for editor in editors.iter_mut() {
                         collapse_returns(editor);
                         ensure_between_control_flow(editor);
@@ -678,18 +604,21 @@ impl PassManager {
                     let dynamic_constants_ref =
                         RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
                     let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
-                    let mut editors: Vec<_> =
-                        zip(self.module.functions.iter_mut(), def_uses.iter())
-                            .map(|(func, def_use)| {
-                                FunctionEditor::new(
-                                    func,
-                                    &constants_ref,
-                                    &dynamic_constants_ref,
-                                    &types_ref,
-                                    def_use,
-                                )
-                            })
-                            .collect();
+                    let mut editors: Vec<_> = zip(
+                        self.module.functions.iter_mut().enumerate(),
+                        def_uses.iter(),
+                    )
+                    .map(|((idx, func), def_use)| {
+                        FunctionEditor::new(
+                            func,
+                            FunctionID::new(idx),
+                            &constants_ref,
+                            &dynamic_constants_ref,
+                            &types_ref,
+                            def_use,
+                        )
+                    })
+                    .collect();
                     let mut new_funcs = vec![];
                     for (idx, editor) in editors.iter_mut().enumerate() {
                         let new_func_id = FunctionID::new(old_num_funcs + new_funcs.len());
@@ -726,18 +655,21 @@ impl PassManager {
 
                     // By default in an editor all nodes are mutable, which is desired in this case
                     // since we are only modifying the IDs of functions that we call.
-                    let mut editors: Vec<_> =
-                        zip(self.module.functions.iter_mut(), def_uses.iter())
-                            .map(|(func, def_use)| {
-                                FunctionEditor::new(
-                                    func,
-                                    &constants_ref,
-                                    &dynamic_constants_ref,
-                                    &types_ref,
-                                    def_use,
-                                )
-                            })
-                            .collect();
+                    let mut editors: Vec<_> = zip(
+                        self.module.functions.iter_mut().enumerate(),
+                        def_uses.iter(),
+                    )
+                    .map(|((idx, func), def_use)| {
+                        FunctionEditor::new(
+                            func,
+                            FunctionID::new(idx),
+                            &constants_ref,
+                            &dynamic_constants_ref,
+                            &types_ref,
+                            def_use,
+                        )
+                    })
+                    .collect();
 
                     let new_idx = delete_uncalled(&mut editors, callgraph);
                     self.module.constants = constants_ref.take();
@@ -768,6 +700,7 @@ impl PassManager {
                         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                         let mut editor = FunctionEditor::new(
                             &mut self.module.functions[idx],
+                            FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
@@ -796,6 +729,7 @@ impl PassManager {
                         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                         let mut editor = FunctionEditor::new(
                             &mut self.module.functions[idx],
+                            FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
@@ -811,13 +745,24 @@ impl PassManager {
                     }
                     self.clear_analyses();
                 }
-                Pass::MaterializeClones => {
+                Pass::GCM => loop {
                     self.make_def_uses();
+                    self.make_reverse_postorders();
+                    self.make_typing();
+                    self.make_control_subgraphs();
+                    self.make_doms();
+                    self.make_fork_join_maps();
+                    self.make_loops();
                     self.make_collection_objects();
-                    self.make_bbs();
                     let def_uses = self.def_uses.as_ref().unwrap();
+                    let reverse_postorders = self.reverse_postorders.as_ref().unwrap();
+                    let typing = self.typing.as_ref().unwrap();
+                    let doms = self.doms.as_ref().unwrap();
+                    let fork_join_maps = self.fork_join_maps.as_ref().unwrap();
+                    let loops = self.loops.as_ref().unwrap();
+                    let control_subgraphs = self.control_subgraphs.as_ref().unwrap();
                     let collection_objects = self.collection_objects.as_ref().unwrap();
-                    let bbs = self.bbs.as_ref().unwrap();
+                    let mut bbs = vec![];
                     for idx in 0..self.module.functions.len() {
                         let constants_ref =
                             RefCell::new(std::mem::take(&mut self.module.constants));
@@ -826,16 +771,25 @@ impl PassManager {
                         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                         let mut editor = FunctionEditor::new(
                             &mut self.module.functions[idx],
+                            FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
                             &def_uses[idx],
                         );
-                        materialize_clones(
+                        if let Some(bb) = gcm(
                             &mut editor,
-                            &collection_objects[&FunctionID::new(idx)],
-                            &bbs[idx],
-                        );
+                            &def_uses[idx],
+                            &reverse_postorders[idx],
+                            &typing[idx],
+                            &control_subgraphs[idx],
+                            &doms[idx],
+                            &fork_join_maps[idx],
+                            &loops[idx],
+                            collection_objects,
+                        ) {
+                            bbs.push(bb);
+                        }
 
                         self.module.constants = constants_ref.take();
                         self.module.dynamic_constants = dynamic_constants_ref.take();
@@ -844,6 +798,48 @@ impl PassManager {
                         self.module.functions[idx].delete_gravestones();
                     }
                     self.clear_analyses();
+                    if bbs.len() == self.module.functions.len() {
+                        self.bbs = Some(bbs);
+                        break;
+                    }
+                },
+                Pass::FloatCollections => {
+                    self.make_def_uses();
+                    self.make_typing();
+                    self.make_callgraph();
+                    let def_uses = self.def_uses.as_ref().unwrap();
+                    let typing = self.typing.as_ref().unwrap();
+                    let callgraph = self.callgraph.as_ref().unwrap();
+                    let devices = device_placement(&self.module.functions, &callgraph);
+                    let constants_ref = RefCell::new(std::mem::take(&mut self.module.constants));
+                    let dynamic_constants_ref =
+                        RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
+                    let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
+                    let mut editors: Vec<_> = zip(
+                        self.module.functions.iter_mut().enumerate(),
+                        def_uses.iter(),
+                    )
+                    .map(|((idx, func), def_use)| {
+                        FunctionEditor::new(
+                            func,
+                            FunctionID::new(idx),
+                            &constants_ref,
+                            &dynamic_constants_ref,
+                            &types_ref,
+                            def_use,
+                        )
+                    })
+                    .collect();
+                    float_collections(&mut editors, typing, callgraph, &devices);
+
+                    self.module.constants = constants_ref.take();
+                    self.module.dynamic_constants = dynamic_constants_ref.take();
+                    self.module.types = types_ref.take();
+
+                    for func in self.module.functions.iter_mut() {
+                        func.delete_gravestones();
+                    }
+                    self.clear_analyses();
                 }
                 Pass::InferSchedules => {
                     self.make_def_uses();
@@ -860,12 +856,17 @@ impl PassManager {
                         let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
                         let mut editor = FunctionEditor::new(
                             &mut self.module.functions[idx],
+                            FunctionID::new(idx),
                             &constants_ref,
                             &dynamic_constants_ref,
                             &types_ref,
                             &def_uses[idx],
                         );
-                        infer_parallel_reduce(&mut editor, &fork_join_maps[idx], &reduce_cycles[idx]);
+                        infer_parallel_reduce(
+                            &mut editor,
+                            &fork_join_maps[idx],
+                            &reduce_cycles[idx],
+                        );
                         infer_parallel_fork(&mut editor, &fork_join_maps[idx]);
                         infer_vectorizable(&mut editor, &fork_join_maps[idx]);
                         infer_tight_associative(&mut editor, &reduce_cycles[idx]);
@@ -905,28 +906,21 @@ impl PassManager {
                     if *force_analyses {
                         self.make_doms();
                         self.make_fork_join_maps();
-                        self.make_bbs();
                     }
                     xdot_module(
                         &self.module,
                         self.reverse_postorders.as_ref().unwrap(),
                         self.doms.as_ref(),
                         self.fork_join_maps.as_ref(),
-                        self.bbs.as_ref(),
                     );
                 }
                 Pass::Codegen(output_dir, module_name) => {
-                    self.make_reverse_postorders();
                     self.make_typing();
                     self.make_control_subgraphs();
-                    self.make_antideps();
-                    self.make_bbs();
                     self.make_collection_objects();
                     self.make_callgraph();
-                    let reverse_postorders = self.reverse_postorders.as_ref().unwrap();
                     let typing = self.typing.as_ref().unwrap();
                     let control_subgraphs = self.control_subgraphs.as_ref().unwrap();
-                    let antideps = self.antideps.as_ref().unwrap();
                     let bbs = self.bbs.as_ref().unwrap();
                     let collection_objects = self.collection_objects.as_ref().unwrap();
                     let callgraph = self.callgraph.as_ref().unwrap();
@@ -943,10 +937,8 @@ impl PassManager {
                                 &self.module.types,
                                 &self.module.constants,
                                 &self.module.dynamic_constants,
-                                &reverse_postorders[idx],
                                 &typing[idx],
                                 &control_subgraphs[idx],
-                                &antideps[idx],
                                 &bbs[idx],
                                 &mut llvm_ir,
                             )
@@ -954,10 +946,8 @@ impl PassManager {
                             Device::AsyncRust => rt_codegen(
                                 FunctionID::new(idx),
                                 &self.module,
-                                &reverse_postorders[idx],
                                 &typing[idx],
                                 &control_subgraphs[idx],
-                                &antideps[idx],
                                 &bbs[idx],
                                 &collection_objects,
                                 &callgraph,
@@ -985,16 +975,18 @@ impl PassManager {
                     println!("{}", rust_rt);
 
                     // Write the LLVM IR into a temporary file.
-                    let mut tmp_path = temp_dir();
+                    let tmp_dir = TempDir::new().unwrap();
+                    let mut tmp_path = tmp_dir.path().to_path_buf();
                     tmp_path.push(format!("{}.ll", module_name));
+                    println!("{}", tmp_path.display());
                     let mut file = File::create(&tmp_path)
                         .expect("PANIC: Unable to open output LLVM IR file.");
                     file.write_all(llvm_ir.as_bytes())
                         .expect("PANIC: Unable to write output LLVM IR file contents.");
-                    println!("{}", tmp_path.display());
 
                     // Compile LLVM IR into an ELF object file.
                     let output_archive = format!("{}/lib{}.a", output_dir, module_name);
+                    println!("{}", output_archive);
                     let mut clang_process = Command::new("clang")
                         .arg(&tmp_path)
                         .arg("--emit-static-lib")
@@ -1007,15 +999,14 @@ impl PassManager {
                         .spawn()
                         .expect("Error running clang. Is it installed?");
                     assert!(clang_process.wait().unwrap().success());
-                    println!("{}", output_archive);
 
                     // Write the Rust runtime into a file.
                     let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name);
+                    println!("{}", output_rt);
                     let mut file = File::create(&output_rt)
                         .expect("PANIC: Unable to open output Rust runtime file.");
                     file.write_all(rust_rt.as_bytes())
                         .expect("PANIC: Unable to write output Rust runtime file contents.");
-                    println!("{}", output_rt);
                 }
                 Pass::Serialize(output_file) => {
                     let module_contents: Vec<u8> = postcard::to_allocvec(&self.module).unwrap();
@@ -1040,7 +1031,6 @@ impl PassManager {
         self.fork_join_nests = None;
         self.loops = None;
         self.reduce_cycles = None;
-        self.antideps = None;
         self.data_nodes_in_fork_joins = None;
         self.bbs = None;
         self.collection_objects = None;
diff --git a/hercules_opt/src/phi_elim.rs b/hercules_opt/src/phi_elim.rs
index 2788e56a9199d20997e1272fe495e45048d91767..a79b43d3ca6a6b7fa85f09127ce53ba70b0063be 100644
--- a/hercules_opt/src/phi_elim.rs
+++ b/hercules_opt/src/phi_elim.rs
@@ -1,12 +1,9 @@
-extern crate bitvec;
-extern crate hercules_ir;
-
 use std::collections::VecDeque;
 use std::iter::FromIterator;
 
-use self::bitvec::prelude::*;
+use bitvec::prelude::*;
 
-use self::hercules_ir::ir::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
diff --git a/hercules_opt/src/pred.rs b/hercules_opt/src/pred.rs
index 09d9753d0c09a561483c733eb87415beda435944..be1b4a0bb77e3ab225f495577eae9d6a24e52561 100644
--- a/hercules_opt/src/pred.rs
+++ b/hercules_opt/src/pred.rs
@@ -1,15 +1,12 @@
-extern crate bitvec;
-extern crate hercules_ir;
-
 use std::collections::HashMap;
 use std::collections::HashSet;
 use std::collections::VecDeque;
 
-use self::bitvec::prelude::*;
+use bitvec::prelude::*;
 
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::dom::*;
-use self::hercules_ir::ir::*;
+use hercules_ir::def_use::*;
+use hercules_ir::dom::*;
+use hercules_ir::ir::*;
 
 /*
  * Top level function to convert acyclic control flow in vectorized fork-joins
diff --git a/hercules_opt/src/schedule.rs b/hercules_opt/src/schedule.rs
index ff895b1651b298cc59a8b0afbd921da2aaf04f82..2c8209aae002beffca3c8f7ee40d6986fed49fb0 100644
--- a/hercules_opt/src/schedule.rs
+++ b/hercules_opt/src/schedule.rs
@@ -1,9 +1,7 @@
-extern crate hercules_ir;
-
 use std::collections::{HashMap, HashSet};
 
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::ir::*;
+use hercules_ir::def_use::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
diff --git a/hercules_opt/src/sroa.rs b/hercules_opt/src/sroa.rs
index a73ecb2b42ef42c94f3e97faa8d777a7040cc6f4..6461ad717d5b7cbeac0e237916d11d4a3a4ae6d6 100644
--- a/hercules_opt/src/sroa.rs
+++ b/hercules_opt/src/sroa.rs
@@ -1,8 +1,6 @@
-extern crate hercules_ir;
-
 use std::collections::{BTreeMap, HashMap, VecDeque};
 
-use self::hercules_ir::ir::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
diff --git a/hercules_opt/src/unforkify.rs b/hercules_opt/src/unforkify.rs
index 61c86a276c6f71be3790ec1d9342d6b364d020c4..a5df7a7c404e820a92d27842211dd0e3396dae41 100644
--- a/hercules_opt/src/unforkify.rs
+++ b/hercules_opt/src/unforkify.rs
@@ -1,9 +1,7 @@
-extern crate hercules_ir;
-
 use std::collections::HashMap;
 use std::iter::zip;
 
-use self::hercules_ir::ir::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs
index c32225b50bd1d02500ad210171cde0a58d67b17e..77fa1ff6525c575414b94e12905502b257ac1a12 100644
--- a/hercules_opt/src/utils.rs
+++ b/hercules_opt/src/utils.rs
@@ -1,7 +1,5 @@
-extern crate hercules_ir;
-
-use self::hercules_ir::def_use::*;
-use self::hercules_ir::ir::*;
+use hercules_ir::def_use::*;
+use hercules_ir::ir::*;
 
 use crate::*;
 
diff --git a/hercules_rt/Cargo.toml b/hercules_rt/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..c4678b18918f323e601a75b0efa1f0289b9b8de9
--- /dev/null
+++ b/hercules_rt/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "hercules_rt"
+version = "0.1.0"
+authors = ["Russel Arbore <rarbore2@illinois.edu>"]
+edition = "2021"
+
+[features]
+cuda = []
+
+[dependencies]
+
diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..15b9f6396d2c90eee66dfecb5a255cef2890726b
--- /dev/null
+++ b/hercules_rt/build.rs
@@ -0,0 +1,25 @@
+use std::env::var;
+use std::path::Path;
+use std::process::Command;
+
+fn main() {
+    if var("CARGO_FEATURE_CUDA").is_ok() {
+        let out_dir = var("OUT_DIR").unwrap();
+        let nvcc = Command::new("nvcc")
+            .args(&["src/rtdefs.cu", "-c", "-o"])
+            .arg(&format!("{}/rtdefs.o", out_dir))
+            .status()
+            .expect("PANIC: NVCC failed when building runtime. Is NVCC installed?");
+        assert!(nvcc.success());
+        let ar = Command::new("ar")
+            .args(&["crus", "librtdefs.a", "rtdefs.o"])
+            .current_dir(&Path::new(&out_dir))
+            .status().unwrap();
+        assert!(ar.success());
+        println!("cargo::rustc-link-search=native={}", out_dir);
+        println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
+        println!("cargo::rustc-link-lib=static=rtdefs");
+        println!("cargo::rustc-link-lib=cudart");
+        println!("cargo::rerun-if-changed=src/rtdefs.cu");
+    }
+}
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
new file mode 100644
index 0000000000000000000000000000000000000000..2a96970e88e2b432c8c9c2ed896dc1c623781cbd
--- /dev/null
+++ b/hercules_rt/src/lib.rs
@@ -0,0 +1,211 @@
+use std::alloc::{alloc, alloc_zeroed, dealloc, Layout};
+use std::marker::PhantomData;
+use std::mem::swap;
+use std::ptr::{copy_nonoverlapping, NonNull};
+use std::slice::from_raw_parts;
+
+#[cfg(feature = "cuda")]
+extern "C" {
+    fn cuda_alloc(size: usize) -> *mut u8;
+    fn cuda_alloc_zeroed(size: usize) -> *mut u8;
+    fn cuda_dealloc(ptr: *mut u8);
+    fn copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
+    fn copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
+    fn copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
+}
+
+/*
+ * An in-memory collection object that can be used by functions compiled by the
+ * Hercules compiler.
+ */
+pub struct HerculesBox<'a> {
+    cpu_shared: Option<NonNull<u8>>,
+    cpu_exclusive: Option<NonNull<u8>>,
+    cpu_owned: Option<NonNull<u8>>,
+
+    #[cfg(feature = "cuda")]
+    cuda_owned: Option<NonNull<u8>>,
+
+    size: usize,
+    _phantom: PhantomData<&'a u8>,
+}
+
+impl<'b, 'a: 'b> HerculesBox<'a> {
+    pub fn from_slice<T>(slice: &'a [T]) -> Self {
+        HerculesBox {
+            cpu_shared: Some(unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) }),
+            cpu_exclusive: None,
+            cpu_owned: None,
+
+            #[cfg(feature = "cuda")]
+            cuda_owned: None,
+
+            size: slice.len() * size_of::<T>(),
+            _phantom: PhantomData,
+        }
+    }
+
+    pub fn from_slice_mut<T>(slice: &'a mut [T]) -> Self {
+        HerculesBox {
+            cpu_shared: None,
+            cpu_exclusive: Some(unsafe { NonNull::new_unchecked(slice.as_mut_ptr() as *mut u8) }),
+            cpu_owned: None,
+
+            #[cfg(feature = "cuda")]
+            cuda_owned: None,
+
+            size: slice.len() * size_of::<T>(),
+            _phantom: PhantomData,
+        }
+    }
+
+    pub fn as_slice<T>(&'b mut self) -> &'b [T] {
+        assert_eq!(self.size % size_of::<T>(), 0);
+        unsafe { from_raw_parts(self.__cpu_ptr() as *const T, self.size / size_of::<T>()) }
+    }
+
+    unsafe fn get_cpu_ptr(&self) -> Option<NonNull<u8>> {
+        self.cpu_owned.or(self.cpu_exclusive).or(self.cpu_shared)
+    }
+
+    #[cfg(feature = "cuda")]
+    unsafe fn get_cuda_ptr(&self) -> Option<NonNull<u8>> {
+        self.cuda_owned
+    }
+
+    unsafe fn allocate_cpu(&mut self) -> NonNull<u8> {
+        if let Some(ptr) = self.cpu_owned {
+            ptr
+        } else {
+            let ptr =
+                NonNull::new(alloc(Layout::from_size_align_unchecked(self.size, 16))).unwrap();
+            self.cpu_owned = Some(ptr);
+            ptr
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    unsafe fn allocate_cuda(&mut self) -> NonNull<u8> {
+        if let Some(ptr) = self.cuda_owned {
+            ptr
+        } else {
+            let ptr = cuda_alloc(self.size);
+            self.cuda_owned = Some(NonNull::new(ptr).unwrap());
+            self.cuda_owned.unwrap()
+        }
+    }
+
+    unsafe fn deallocate_cpu(&mut self) {
+        if let Some(ptr) = self.cpu_owned {
+            dealloc(
+                ptr.as_ptr(),
+                Layout::from_size_align_unchecked(self.size, 16),
+            );
+            self.cpu_owned = None;
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    unsafe fn deallocate_cuda(&mut self) {
+        if let Some(ptr) = self.cuda_owned {
+            cuda_dealloc(ptr.as_ptr());
+            self.cuda_owned = None;
+        }
+    }
+
+    pub unsafe fn __zeros(size: u64) -> Self {
+        assert_ne!(size, 0);
+        let size = size as usize;
+        HerculesBox {
+            cpu_shared: None,
+            cpu_exclusive: None,
+            cpu_owned: Some(
+                NonNull::new(alloc_zeroed(Layout::from_size_align_unchecked(size, 16))).unwrap(),
+            ),
+
+            #[cfg(feature = "cuda")]
+            cuda_owned: None,
+
+            size: size,
+            _phantom: PhantomData,
+        }
+    }
+
+    pub unsafe fn __null() -> Self {
+        HerculesBox {
+            cpu_shared: None,
+            cpu_exclusive: None,
+            cpu_owned: None,
+
+            #[cfg(feature = "cuda")]
+            cuda_owned: None,
+
+            size: 0,
+            _phantom: PhantomData,
+        }
+    }
+
+    pub unsafe fn __take(&mut self) -> Self {
+        let mut ret = Self::__null();
+        swap(&mut ret, self);
+        ret
+    }
+
+    pub unsafe fn __cpu_ptr(&mut self) -> *mut u8 {
+        if let Some(ptr) = self.get_cpu_ptr() {
+            return ptr.as_ptr();
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let cuda_ptr = self.get_cuda_ptr().unwrap();
+            let cpu_ptr = self.allocate_cpu();
+            copy_cuda_to_cpu(cpu_ptr.as_ptr(), cuda_ptr.as_ptr(), self.size);
+            return cpu_ptr.as_ptr();
+        }
+        panic!()
+    }
+
+    pub unsafe fn __cpu_ptr_mut(&mut self) -> *mut u8 {
+        let cpu_ptr = self.__cpu_ptr();
+        if Some(cpu_ptr) == self.cpu_shared.map(|nn| nn.as_ptr()) {
+            self.allocate_cpu();
+            copy_nonoverlapping(cpu_ptr, self.cpu_owned.unwrap().as_ptr(), self.size);
+        }
+        self.cpu_shared = None;
+        self.cpu_exclusive = None;
+        #[cfg(feature = "cuda")]
+        self.deallocate_cuda();
+        cpu_ptr
+    }
+
+    #[cfg(feature = "cuda")]
+    pub unsafe fn __cuda_ptr(&mut self) -> *mut u8 {
+        if let Some(ptr) = self.get_cuda_ptr() {
+            ptr.as_ptr()
+        } else {
+            let cpu_ptr = self.get_cpu_ptr().unwrap();
+            let cuda_ptr = self.allocate_cuda();
+            copy_cpu_to_cuda(cuda_ptr.as_ptr(), cpu_ptr.as_ptr(), self.size);
+            cuda_ptr.as_ptr()
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    pub unsafe fn __cuda_ptr_mut(&mut self) -> *mut u8 {
+        let cuda_ptr = self.__cuda_ptr();
+        self.cpu_shared = None;
+        self.cpu_exclusive = None;
+        self.deallocate_cpu();
+        cuda_ptr
+    }
+}
+
+impl<'a> Drop for HerculesBox<'a> {
+    fn drop(&mut self) {
+        unsafe {
+            self.deallocate_cpu();
+            #[cfg(feature = "cuda")]
+            self.deallocate_cuda();
+        }
+    }
+}
diff --git a/hercules_rt/src/rtdefs.cu b/hercules_rt/src/rtdefs.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b7378d816c65666b445a989fc6b0530d821beb7b
--- /dev/null
+++ b/hercules_rt/src/rtdefs.cu
@@ -0,0 +1,38 @@
+extern "C" {
+	void *cuda_alloc(size_t size) {
+		void *ptr = NULL;
+		cudaError_t res = cudaMalloc(&ptr, size);
+		if (res != cudaSuccess) {
+			ptr = NULL;
+		}
+		return ptr;
+	}
+	
+	void *cuda_alloc_zeroed(size_t size) {
+		void *ptr = cuda_alloc(size);
+		if (!ptr) {
+			return NULL;
+		}
+		cudaError_t res = cudaMemset(ptr, 0, size);
+		if (res != cudaSuccess) {
+			cudaFree(ptr); return NULL;
+		}
+		return ptr;
+	}
+	
+	void cuda_dealloc(void *ptr) {
+		cudaFree(ptr);
+	}
+	
+	void copy_cpu_to_cuda(void *dst, void *src, size_t size) {
+		cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
+	}
+	
+	void copy_cuda_to_cpu(void *dst, void *src, size_t size) {
+		cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
+	}
+	
+	void copy_cuda_to_cuda(void *dst, void *src, size_t size) {
+		cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice);
+	}
+}
diff --git a/hercules_samples/call/build.rs b/hercules_samples/call/build.rs
index dbefe008a14e57785261e2757bb0e0dbbb5fa27c..af48fe64f7c2b778c9841e45ecf13b8a6a5740d2 100644
--- a/hercules_samples/call/build.rs
+++ b/hercules_samples/call/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/hercules_samples/call/src/main.rs b/hercules_samples/call/src/main.rs
index b5c999fdac3738f6d3fced2164fbf9320d5a1034..0b657dd81e7ade1491f3c6c51fdd9a9f0abb5d57 100644
--- a/hercules_samples/call/src/main.rs
+++ b/hercules_samples/call/src/main.rs
@@ -1,8 +1,5 @@
 #![feature(box_as_ptr, let_chains)]
 
-extern crate async_std;
-extern crate juno_build;
-
 juno_build::juno!("call");
 
 fn main() {
diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs
index 650b51b8b14715579de164f7fc65330e113613a1..f04d48c7d0ea6df8b16d70b05cedabfc04c1f6f2 100644
--- a/hercules_samples/ccp/build.rs
+++ b/hercules_samples/ccp/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/hercules_samples/ccp/src/main.rs b/hercules_samples/ccp/src/main.rs
index 9e2aced90044d1af68e6a47f9205ee1a21a69328..7f6459a0c3593c38794d5534ac7af41e404db8c1 100644
--- a/hercules_samples/ccp/src/main.rs
+++ b/hercules_samples/ccp/src/main.rs
@@ -1,8 +1,5 @@
 #![feature(box_as_ptr, let_chains)]
 
-extern crate async_std;
-extern crate juno_build;
-
 juno_build::juno!("ccp");
 
 fn main() {
diff --git a/hercules_samples/dot/Cargo.toml b/hercules_samples/dot/Cargo.toml
index f74ab1f6f4ed5de3b02ab45b1f6fca461fdbc192..69cd39e388661b3f7f6dca53cf9210ab7050902c 100644
--- a/hercules_samples/dot/Cargo.toml
+++ b/hercules_samples/dot/Cargo.toml
@@ -10,6 +10,7 @@ juno_build = { path = "../../juno_build" }
 [dependencies]
 clap = { version = "*", features = ["derive"] }
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs
index cfa03fd3b989748fe530071f289fb31f036283b4..2a239bc6c3ebd3780cb15358375c59bdfb2e25ae 100644
--- a/hercules_samples/dot/build.rs
+++ b/hercules_samples/dot/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs
index 0f5ee518506e6abe6cb815c244699ce12ccb45d1..0b5c6a93c3527d4c6863c48956f1d10947bdb017 100644
--- a/hercules_samples/dot/src/main.rs
+++ b/hercules_samples/dot/src/main.rs
@@ -1,31 +1,16 @@
 #![feature(box_as_ptr, let_chains)]
 
-extern crate async_std;
-extern crate juno_build;
-
-use core::ptr::copy_nonoverlapping;
+use hercules_rt::HerculesBox;
 
 juno_build::juno!("dot");
 
 fn main() {
     async_std::task::block_on(async {
-        let a: Box<[f32]> = Box::new([0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]);
-        let b: Box<[f32]> = Box::new([0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]);
-        let mut a_bytes: Box<[u8]> = Box::new([0; 32]);
-        let mut b_bytes: Box<[u8]> = Box::new([0; 32]);
-        unsafe {
-            copy_nonoverlapping(
-                Box::as_ptr(&a) as *const u8,
-                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
-                32,
-            );
-            copy_nonoverlapping(
-                Box::as_ptr(&b) as *const u8,
-                Box::as_mut_ptr(&mut b_bytes) as *mut u8,
-                32,
-            );
-        };
-        let c = dot(8, a_bytes, b_bytes).await;
+        let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
+        let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
+        let a = HerculesBox::from_slice(&a);
+        let b = HerculesBox::from_slice(&b);
+        let c = dot(8, a, b).await;
         println!("{}", c);
         assert_eq!(c, 70.0);
     });
diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs
index 49a6024828516f994522e66bf1de1d714f63e75b..4d8226f11183d9500e6affec4c46110e8626ee69 100644
--- a/hercules_samples/fac/build.rs
+++ b/hercules_samples/fac/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/hercules_samples/fac/src/main.rs b/hercules_samples/fac/src/main.rs
index 7071fd2c115bba1d6ff60fae688b262354d4fc71..b6e0257b0be48e39555050194be6d29459c3c765 100644
--- a/hercules_samples/fac/src/main.rs
+++ b/hercules_samples/fac/src/main.rs
@@ -1,7 +1,3 @@
-extern crate async_std;
-extern crate clap;
-extern crate juno_build;
-
 juno_build::juno!("fac");
 
 fn main() {
diff --git a/hercules_samples/matmul/Cargo.toml b/hercules_samples/matmul/Cargo.toml
index d3975c5ca58b68cdb3fef0f6d8a3cf8e106408d6..9066c1535e2c40400bdb3b5ca20a3e38237ef597 100644
--- a/hercules_samples/matmul/Cargo.toml
+++ b/hercules_samples/matmul/Cargo.toml
@@ -10,6 +10,7 @@ juno_build = { path = "../../juno_build" }
 [dependencies]
 clap = { version = "*", features = ["derive"] }
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs
index ec6eb892326c5d6eb341c89239b0465de0cf4a49..08478deaac459d9a94f79fdabce37da9a1205f89 100644
--- a/hercules_samples/matmul/build.rs
+++ b/hercules_samples/matmul/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs
index 93d007c791579a75dea65d5680ab3018e9b00085..767fda07195544ae291e0e9e731a1ee43bd1615b 100644
--- a/hercules_samples/matmul/src/main.rs
+++ b/hercules_samples/matmul/src/main.rs
@@ -1,13 +1,9 @@
 #![feature(box_as_ptr, let_chains)]
 
-extern crate async_std;
-extern crate juno_build;
-extern crate rand;
-
-use core::ptr::copy_nonoverlapping;
-
 use rand::random;
 
+use hercules_rt::HerculesBox;
+
 juno_build::juno!("matmul");
 
 fn main() {
@@ -15,31 +11,8 @@ fn main() {
         const I: usize = 256;
         const J: usize = 64;
         const K: usize = 128;
-        let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect();
-        let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect();
-        let mut a_bytes: Box<[u8]> = Box::new([0; I * J * 4]);
-        let mut b_bytes: Box<[u8]> = Box::new([0; J * K * 4]);
-        unsafe {
-            copy_nonoverlapping(
-                Box::as_ptr(&a) as *const u8,
-                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
-                I * J * 4,
-            );
-            copy_nonoverlapping(
-                Box::as_ptr(&b) as *const u8,
-                Box::as_mut_ptr(&mut b_bytes) as *mut u8,
-                J * K * 4,
-            );
-        };
-        let c_bytes = matmul(I as u64, J as u64, K as u64, a_bytes, b_bytes).await;
-        let mut c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
-        unsafe {
-            copy_nonoverlapping(
-                Box::as_ptr(&c_bytes) as *const u8,
-                Box::as_mut_ptr(&mut c) as *mut u8,
-                I * K * 4,
-            );
-        };
+        let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect();
+        let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect();
         let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
         for i in 0..I {
             for k in 0..K {
@@ -48,7 +21,10 @@ fn main() {
                 }
             }
         }
-        assert_eq!(c, correct_c);
+        let a = HerculesBox::from_slice_mut(&mut a);
+        let b = HerculesBox::from_slice_mut(&mut b);
+        let mut c = matmul(I as u64, J as u64, K as u64, a, b).await;
+        assert_eq!(c.as_slice::<i32>(), &*correct_c);
     });
 }
 
diff --git a/hercules_test/hercules_interpreter/src/interpreter.rs b/hercules_test/hercules_interpreter/src/interpreter.rs
index a166427fe062f90577f1b29b193cc08e02e01193..621260e5a25597e3dedcc279348e0be7c1fb9472 100644
--- a/hercules_test/hercules_interpreter/src/interpreter.rs
+++ b/hercules_test/hercules_interpreter/src/interpreter.rs
@@ -1,19 +1,14 @@
-extern crate itertools;
-extern crate ordered_float;
 
-use crate::value;
-
-use self::itertools::Itertools;
 use std::collections::HashMap;
 use std::panic;
 use std::collections::hash_map::Entry::Occupied;
 
-use value::*;
+use itertools::Itertools;
 
-extern crate hercules_ir;
-extern crate hercules_opt;
+use hercules_ir::*;
 
-use self::hercules_ir::*;
+use crate::value;
+use value::*;
 
 /* High level design details / discussion for this:
  *
diff --git a/hercules_test/hercules_interpreter/src/main.rs b/hercules_test/hercules_interpreter/src/main.rs
index 4d2ebd31853aedf36fb5871e62af9422f49b3c0f..5db31cd730fe802dd9ccbf1b8e0d603c736fb196 100644
--- a/hercules_test/hercules_interpreter/src/main.rs
+++ b/hercules_test/hercules_interpreter/src/main.rs
@@ -1,18 +1,13 @@
-extern crate clap;
-extern crate hercules_ir;
-extern crate hercules_opt;
-extern crate rand;
+use std::fs::File;
+use std::io::prelude::*;
 
+use clap::Parser;
+
+use hercules_ir::*;
 
 use hercules_interpreter::interpreter::*;
 use hercules_interpreter::*;
 use hercules_interpreter::value;
-use std::fs::File;
-use std::io::prelude::*;
-
-use self::hercules_ir::*;
-
-use clap::Parser;
 
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
diff --git a/hercules_test/hercules_interpreter/src/value.rs b/hercules_test/hercules_interpreter/src/value.rs
index d236145c00e1bfd63a1b45ff5bebad93398062ea..e032bd5b711a713c55c6cd911c99e327ff1d2e0b 100644
--- a/hercules_test/hercules_interpreter/src/value.rs
+++ b/hercules_test/hercules_interpreter/src/value.rs
@@ -1,23 +1,18 @@
 #![allow(unused)]
 
-extern crate derive_more;
 use derive_more::From;
 
 /* Defines semantic meaning of IR operations. */
-extern crate itertools;
 
 use crate::dyn_const_value;
 
-use self::itertools::Itertools;
+use itertools::Itertools;
 use std::clone;
 use std::convert::TryInto;
 use std::panic;
 
-extern crate hercules_ir;
-extern crate hercules_opt;
-
-use self::hercules_ir::*;
-use self::hercules_opt::*;
+use hercules_ir::*;
+use hercules_opt::*;
 
 #[derive(PartialEq, Debug, Clone, Eq)]
 pub enum InterpreterVal {
diff --git a/hercules_test/hercules_tests/tests/opt_tests.rs b/hercules_test/hercules_tests/tests/opt_tests.rs
index c14d4db5b0072ec0f724cdde9b467c7568e1b41c..388dfeddf38eecab3ffb191a344682395e71d5cd 100644
--- a/hercules_test/hercules_tests/tests/opt_tests.rs
+++ b/hercules_test/hercules_tests/tests/opt_tests.rs
@@ -1,11 +1,10 @@
 use std::env;
 
+use rand::Rng;
+
 use hercules_interpreter::*;
 use hercules_opt::pass::Pass;
 
-extern crate rand;
-use rand::Rng;
-
 #[test]
 fn matmul_int() {
     let module = parse_file("../test_inputs/matmul_int.hir");
@@ -198,4 +197,4 @@ fn sum_int2_smaller() {
         Pass::DCE,
     ],
     vec![1; 100]);
-}
\ No newline at end of file
+}
diff --git a/hercules_tools/hercules_driver/Cargo.toml b/hercules_tools/hercules_driver/Cargo.toml
index 9236c34a948e61e962f2fddc3d23a4fd72cfaee6..ad9397b140052539a341084646d5f7fde1cbafff 100644
--- a/hercules_tools/hercules_driver/Cargo.toml
+++ b/hercules_tools/hercules_driver/Cargo.toml
@@ -2,6 +2,7 @@
 name = "hercules_driver"
 version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
+edition = "2021"
 
 [dependencies]
 clap = { version = "*", features = ["derive"] }
diff --git a/hercules_tools/hercules_driver/src/main.rs b/hercules_tools/hercules_driver/src/main.rs
index 97c9fe2ef9f03c87f50d079d4b1aabad371b894f..a2550022129029387664fb4327528d09078c2e02 100644
--- a/hercules_tools/hercules_driver/src/main.rs
+++ b/hercules_tools/hercules_driver/src/main.rs
@@ -1,6 +1,3 @@
-extern crate clap;
-extern crate postcard;
-
 use std::fs::File;
 use std::io::prelude::*;
 use std::path::Path;
diff --git a/juno_build/src/lib.rs b/juno_build/src/lib.rs
index fdaf4d27cbcb7b31738e6df42d69272369d5026f..0c676e4c1b203c53e98a8430e0f2354104540e07 100644
--- a/juno_build/src/lib.rs
+++ b/juno_build/src/lib.rs
@@ -1,5 +1,3 @@
-extern crate hercules_ir;
-
 use juno_compiler::*;
 
 use std::env::{current_dir, var};
diff --git a/juno_frontend/src/lib.rs b/juno_frontend/src/lib.rs
index b18b29791b54aa267945ec8e658fdce069e250f3..906d780560626d7aff1716b4d159aa82d76a7df5 100644
--- a/juno_frontend/src/lib.rs
+++ b/juno_frontend/src/lib.rs
@@ -9,8 +9,6 @@ mod semant;
 mod ssa;
 mod types;
 
-extern crate hercules_ir;
-
 use std::fmt;
 use std::path::Path;
 
@@ -187,18 +185,24 @@ pub fn compile_ir(
     //add_pass!(pm, verify, Forkify);
     //add_pass!(pm, verify, ForkGuardElim);
     add_verified_pass!(pm, verify, DCE);
+    add_pass!(pm, verify, ForkSplit);
+    add_pass!(pm, verify, Unforkify);
+    add_pass!(pm, verify, GVN);
+    add_verified_pass!(pm, verify, DCE);
+    add_pass!(pm, verify, DCE);
     add_pass!(pm, verify, Outline);
     add_pass!(pm, verify, InterproceduralSROA);
     add_pass!(pm, verify, SROA);
     add_pass!(pm, verify, InferSchedules);
-    add_pass!(pm, verify, ForkSplit);
-    add_pass!(pm, verify, Unforkify);
-    add_pass!(pm, verify, GVN);
     add_verified_pass!(pm, verify, DCE);
     if x_dot {
         pm.add_pass(hercules_opt::pass::Pass::Xdot(true));
     }
 
+    add_pass!(pm, verify, GCM);
+    add_verified_pass!(pm, verify, DCE);
+    add_pass!(pm, verify, FloatCollections);
+    add_pass!(pm, verify, GCM);
     pm.add_pass(hercules_opt::pass::Pass::Codegen(output_dir, module_name));
     pm.run_passes();
 
diff --git a/juno_frontend/src/main.rs b/juno_frontend/src/main.rs
index 4624e7160d242b064fdae85e7eba889e752fff75..d98c1e29e5aedeac5ce2a4e791c669a7af34c320 100644
--- a/juno_frontend/src/main.rs
+++ b/juno_frontend/src/main.rs
@@ -1,5 +1,3 @@
-extern crate clap;
-
 use juno_compiler::*;
 
 use clap::{ArgGroup, Parser};
diff --git a/juno_frontend/src/semant.rs b/juno_frontend/src/semant.rs
index 1ccdf3ed80b33deab7fe51c51cf44c118f4c4518..660d8afe35a803e7395d67871236e0f596b09a87 100644
--- a/juno_frontend/src/semant.rs
+++ b/juno_frontend/src/semant.rs
@@ -1,5 +1,3 @@
-extern crate hercules_ir;
-
 use std::collections::{HashMap, LinkedList};
 use std::fmt;
 use std::fs::File;
diff --git a/juno_frontend/src/ssa.rs b/juno_frontend/src/ssa.rs
index f5e1d8302d8a407e2a6e1b6d89cc9b62d4e5c40e..578f7a9af120644d265d5c5b679fb97b35806d36 100644
--- a/juno_frontend/src/ssa.rs
+++ b/juno_frontend/src/ssa.rs
@@ -4,12 +4,11 @@
  *  Compiler Construction. CC 2013. Lecture Notes in Computer Science, vol 7791. Springer, Berlin,
  *  Heidelberg. https://doi.org/10.1007/978-3-642-37051-9_6
  */
-extern crate hercules_ir;
 
 use std::collections::{HashMap, HashSet};
 
-use self::hercules_ir::build::*;
-use self::hercules_ir::ir::*;
+use hercules_ir::build::*;
+use hercules_ir::ir::*;
 use crate::labeled_builder::LabeledBuilder;
 
 pub struct SSA {
diff --git a/juno_frontend/src/types.rs b/juno_frontend/src/types.rs
index 582e7cfddb87cc47e467662d52e87029319e99ad..5f907cd9370a343eb501e0c4b5b1dcef2a55f651 100644
--- a/juno_frontend/src/types.rs
+++ b/juno_frontend/src/types.rs
@@ -1,11 +1,12 @@
 use std::collections::{HashMap, HashSet, VecDeque};
 
 use crate::dynconst::DynConst;
-use crate::hercules_ir::build::*;
-use crate::hercules_ir::ir::*;
 use crate::locs::Location;
 use crate::parser;
 
+use hercules_ir::build::*;
+use hercules_ir::ir::*;
+
 #[derive(Copy, Clone, PartialEq, Eq, Debug)]
 pub enum Either<A, B> {
     Left(A),
diff --git a/juno_samples/antideps/Cargo.toml b/juno_samples/antideps/Cargo.toml
index 40b4d47c57bbe422bfe983ee73a1d140a2ab1b94..9bd1d5a0d484e257ee3ad1e425a09f503da4b503 100644
--- a/juno_samples/antideps/Cargo.toml
+++ b/juno_samples/antideps/Cargo.toml
@@ -13,5 +13,6 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs
index 757243b829c3f805f8d87d38676e72e2b532f072..7ed716a444460d7a90965f5b7f5faf3a7aadcb14 100644
--- a/juno_samples/antideps/build.rs
+++ b/juno_samples/antideps/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/juno_samples/antideps/src/antideps.jn b/juno_samples/antideps/src/antideps.jn
index 5949c91a4e64bea9e3afa966f1f9f6a160d8553a..9efe71f10963aacf1620c4348abef6a74d8cb502 100644
--- a/juno_samples/antideps/src/antideps.jn
+++ b/juno_samples/antideps/src/antideps.jn
@@ -7,7 +7,20 @@ fn simple_antideps(a : usize, b : usize) -> i32 {
 }
 
 #[entry]
-fn complex_antideps(x : i32) -> i32 {
+fn loop_antideps(input : i32) -> i32 {
+  let arr1 : i32[1];
+  arr1[0] = 2;
+  let p1 = arr1[0];
+  while input > 10 {
+    arr1[0] = arr1[0] + 1;
+    input -= 10;
+  }
+  let p2 = arr1[0];
+  return p1 + p2;
+}
+
+#[entry]
+fn complex_antideps1(x : i32) -> i32 {
   let arr : i32[4];
   let arr2 : i32[12];
   arr[1] = 7 + arr2[0];
@@ -28,6 +41,23 @@ fn complex_antideps(x : i32) -> i32 {
   return r;
 }
 
+#[entry]
+fn complex_antideps2(input : i32) -> i32 {
+  let arr1 : i32[2];
+  arr1[0] = 2;
+  arr1[1] = 3;
+  let p1 = arr1[0] + arr1[1];
+  if input > 0 {
+    while input > 10 {
+      arr1[0] = arr1[1] + input;
+      arr1[1] = arr1[0] + input;
+      input -= 10;
+    }
+  }
+  let p2 = arr1[0];
+  return p1 + p2;
+}
+
 #[entry]
 fn very_complex_antideps(x: usize) -> usize {
   let arr1 : usize[203];
diff --git a/juno_samples/antideps/src/main.rs b/juno_samples/antideps/src/main.rs
index b0a991637bde67a0229fb749213927b8e14c06dd..0b065cbaa6e6cffcaf9ff7b3fbc5a2c882dc7248 100644
--- a/juno_samples/antideps/src/main.rs
+++ b/juno_samples/antideps/src/main.rs
@@ -1,8 +1,5 @@
 #![feature(future_join, box_as_ptr)]
 
-extern crate async_std;
-extern crate juno_build;
-
 juno_build::juno!("antideps");
 
 fn main() {
@@ -11,10 +8,18 @@ fn main() {
         println!("{}", output);
         assert_eq!(output, 5);
 
-        let output = complex_antideps(9).await;
+        let output = loop_antideps(11).await;
+        println!("{}", output);
+        assert_eq!(output, 5);
+
+        let output = complex_antideps1(9).await;
         println!("{}", output);
         assert_eq!(output, 20);
 
+        let output = complex_antideps2(44).await;
+        println!("{}", output);
+        assert_eq!(output, 226);
+
         let output = very_complex_antideps(3).await;
         println!("{}", output);
         assert_eq!(output, 144);
diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs
index fafa97bbc642751c37b2ef47a43954fa84f340d9..16d5c7a4f7fcb00344fc7669b67103a27f71a7c6 100644
--- a/juno_samples/casts_and_intrinsics/build.rs
+++ b/juno_samples/casts_and_intrinsics/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/juno_samples/casts_and_intrinsics/src/main.rs b/juno_samples/casts_and_intrinsics/src/main.rs
index 037d4c4025ca141887034353124436a2db8f84f3..8ee509bfb401aca4bd42751c0fe34beee94c7324 100644
--- a/juno_samples/casts_and_intrinsics/src/main.rs
+++ b/juno_samples/casts_and_intrinsics/src/main.rs
@@ -1,8 +1,5 @@
 #![feature(future_join)]
 
-extern crate async_std;
-extern crate juno_build;
-
 juno_build::juno!("casts_and_intrinsics");
 
 fn main() {
diff --git a/juno_samples/implicit_clone/Cargo.toml b/juno_samples/implicit_clone/Cargo.toml
index 928fa1f2718b9637da2ac3fe740a0d893345d576..b312f5def295a6c31a7d2c9eab5c23f4e16ccd2f 100644
--- a/juno_samples/implicit_clone/Cargo.toml
+++ b/juno_samples/implicit_clone/Cargo.toml
@@ -13,5 +13,6 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs
index 3378966b64a71a239178edc6a010f9081939f58a..75c1afc41a75b2006b26042323df3bdc3fcf5a17 100644
--- a/juno_samples/implicit_clone/build.rs
+++ b/juno_samples/implicit_clone/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/juno_samples/implicit_clone/src/implicit_clone.jn b/juno_samples/implicit_clone/src/implicit_clone.jn
index 17e345e51e80db27c0d2f21854e22abf1eefcdb8..882e5abc51bb596be63512eaff96434a2c45d43a 100644
--- a/juno_samples/implicit_clone/src/implicit_clone.jn
+++ b/juno_samples/implicit_clone/src/implicit_clone.jn
@@ -1,5 +1,5 @@
 #[entry]
-fn implicit_clone(input : i32) -> i32 {
+fn simple_implicit_clone(input : i32) -> i32 {
   let arr : i32[3];
   arr[0] = 2;
   let arr2 = arr;
@@ -7,3 +7,150 @@ fn implicit_clone(input : i32) -> i32 {
   arr[2] = 4;
   return arr[0] + arr2[0] + arr[1] + arr2[1] + arr[2] + arr2[2];
 }
+
+#[entry]
+fn loop_implicit_clone(input : i32) -> i32 {
+  let arr : i32[3];
+  let r : i32 = 5;
+  while input > 0 {
+    r = arr[0];
+    let arr2 = arr;
+    let x = arr2[input as usize - input as usize];
+    arr2[input as usize - input as usize] = 9;
+    if x == 0 {
+      input -= arr2[0];
+    } else {
+      r = 99;
+      break;
+    }
+  }
+  return r + 7;
+}
+
+#[entry]
+fn double_loop_implicit_clone(a : usize) -> usize {
+  for i = 0 to a {
+    let arr : i32[1];
+    for j = 0 to a {
+      arr[0] = 1;
+    }
+  }
+  return 42;
+}
+
+#[entry]
+fn tricky_loop_implicit_clone(a : usize, b : usize) -> i32 {
+  let x = 0;
+  for j = 0 to 2 {
+    for i = 0 to 5 {
+      let arr : i32[3];
+      let arr2 : i32[1];
+      if a == b {
+        arr[a] += 7;
+      } else {
+        arr[a] += 1;
+      }
+      for k = 0 to (a + b - 1) {
+        arr[a] += 2;
+	arr2[0] += 1;
+      }
+      x += arr[b];
+    }
+  }
+  return x;
+}
+
+#[entry]
+fn tricky2_loop_implicit_clone(a : usize, b : usize) -> i32 {
+  let x = 0;
+  for i = 0 to 3 {
+    let arr1 : i32[1];
+    let arr2 : i32[1];
+    if a == b {
+      arr1[0] = 6;
+    } else {
+      arr2[0] = 9;
+    }
+    arr1[0] = 2;
+    for j = 0 to 4 {
+      arr2[0] += 1;
+    }
+    x += arr2[0];
+  }
+  return x;
+}
+
+#[entry]
+fn tricky3_loop_implicit_clone(a : usize, b : usize) -> usize {
+  let x = 0;
+  for i = 0 to b {
+    let arr1 : usize[10];
+    let arr2 : usize[10];
+    arr1[1] = 1;
+    for kk = 0 to 10 {
+      arr2[kk] += arr1[kk];
+    }
+    x += arr2[1];
+  }
+  return x;
+}
+
+#[entry]
+fn no_implicit_clone(input : i32) -> i32 {
+  let arr : i32[2];
+  arr[0] = input;
+  while input > 0 {
+    arr[0] += 1;
+    input -= 1;
+  }
+  let arr2 : i32[1];
+  if input == 0 {
+    arr2[0] = 5;
+  } else {
+    arr2[0] = 3;
+  }
+  return arr[0] + arr2[0];
+}
+
+#[entry]
+fn mirage_implicit_clone(input : i32) -> i32 {
+  let arr1 : i32[2];
+  let arr2 : i32[2];
+  let arr3 : i32[2];
+  let arr4 : i32[2];
+  arr1[0] = 7;
+  arr1[1] = 3;
+  arr2[0] = input;
+  arr2[1] = 45;
+  arr3[0] = -14;
+  arr3[1] = -5;
+  arr4[0] = -1;
+  arr4[1] = 0;
+  arr2 = arr4;
+  arr3 = arr2;
+  arr2 = arr1;
+  let p1 = arr1[0] + arr1[1] + arr2[0] + arr2[1] + arr3[0] + arr3[1] + arr4[0] + arr4[1]; // 18
+  arr4 = arr2;
+  let p2 = arr1[0] + arr1[1] + arr2[0] + arr2[1] + arr3[0] + arr3[1] + arr4[0] + arr4[1]; // 29
+  if input > 0 {
+    while input > 10 {
+      arr1[0] = arr1[1] + input;
+      arr1[1] = arr1[0] + input;
+      input -= 10;
+    }
+  }
+  let p3 = arr1[0]; // 592
+  let x : i32 = 0;
+  while input < 20 {
+    let arr5 : i32[2];
+    arr5[0] = 7;
+    let y = arr5[0] + arr5[1];
+    arr5 = arr4;
+    arr5[1] += 2;
+    y += arr5[1];
+    x += 12;
+    input += 1;
+  }
+  let p4 = x; // 204
+  return p1 + p2 + p3 + p4;
+}
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index ca7ddeb1571be6698f6a9c3971ef617b3a6fd4ca..bc687ed386f091e674c5eb81122adf30c06f8da9 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -1,15 +1,40 @@
 #![feature(future_join, box_as_ptr)]
 
-extern crate async_std;
-extern crate juno_build;
-
 juno_build::juno!("implicit_clone");
 
 fn main() {
     async_std::task::block_on(async {
-        let output = implicit_clone(3).await;
+        let output = simple_implicit_clone(3).await;
         println!("{}", output);
         assert_eq!(output, 11);
+
+        let output = loop_implicit_clone(100).await;
+        println!("{}", output);
+        assert_eq!(output, 7);
+
+        let output = double_loop_implicit_clone(3).await;
+        println!("{}", output);
+        assert_eq!(output, 42);
+
+        let output = tricky_loop_implicit_clone(2, 2).await;
+        println!("{}", output);
+        assert_eq!(output, 130);
+
+        let output = tricky2_loop_implicit_clone(2, 3).await;
+        println!("{}", output);
+        assert_eq!(output, 39);
+
+        let output = tricky3_loop_implicit_clone(5, 7).await;
+        println!("{}", output);
+        assert_eq!(output, 7);
+
+        let output = no_implicit_clone(4).await;
+        println!("{}", output);
+        assert_eq!(output, 13);
+
+        let output = mirage_implicit_clone(73).await;
+        println!("{}", output);
+        assert_eq!(output, 843);
     });
 }
 
diff --git a/juno_samples/matmul/Cargo.toml b/juno_samples/matmul/Cargo.toml
index ea705dddd2fac0e4b5a4b8fe0ddfeef72039e3c4..8ad95853d25509d91713a080c0b63b01c0469110 100644
--- a/juno_samples/matmul/Cargo.toml
+++ b/juno_samples/matmul/Cargo.toml
@@ -13,6 +13,7 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
 rand = "*"
diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs
index 81f645e0666dfb22e075953c3d0a1a531909f1a0..926fbc33ecfa5ab31b40a92f778bb4d3b7f6a77e 100644
--- a/juno_samples/matmul/build.rs
+++ b/juno_samples/matmul/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs
index 948459dd4badfbdf31fea25be70520df22ff5b6e..bace3765789010655ae058aa92f95862b93b3084 100644
--- a/juno_samples/matmul/src/main.rs
+++ b/juno_samples/matmul/src/main.rs
@@ -1,13 +1,9 @@
-#![feature(future_join, box_as_ptr, let_chains)]
-
-extern crate async_std;
-extern crate juno_build;
-extern crate rand;
-
-use core::ptr::copy_nonoverlapping;
+#![feature(box_as_ptr, let_chains)]
 
 use rand::random;
 
+use hercules_rt::HerculesBox;
+
 juno_build::juno!("matmul");
 
 fn main() {
@@ -17,45 +13,6 @@ fn main() {
         const K: usize = 128;
         let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect();
         let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect();
-        let mut a_bytes: Box<[u8]> = Box::new([0; I * J * 4]);
-        let mut b_bytes: Box<[u8]> = Box::new([0; J * K * 4]);
-        unsafe {
-            copy_nonoverlapping(
-                Box::as_ptr(&a) as *const u8,
-                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
-                I * J * 4,
-            );
-            copy_nonoverlapping(
-                Box::as_ptr(&b) as *const u8,
-                Box::as_mut_ptr(&mut b_bytes) as *mut u8,
-                J * K * 4,
-            );
-        };
-        let c_bytes = matmul(
-            I as u64,
-            J as u64,
-            K as u64,
-            a_bytes.clone(),
-            b_bytes.clone(),
-        )
-        .await;
-        let mut c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
-        unsafe {
-            copy_nonoverlapping(
-                Box::as_ptr(&c_bytes) as *const u8,
-                Box::as_mut_ptr(&mut c) as *mut u8,
-                I * K * 4,
-            );
-        };
-        let tiled_c_bytes = matmul(I as u64, J as u64, K as u64, a_bytes, b_bytes).await;
-        let mut tiled_c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
-        unsafe {
-            copy_nonoverlapping(
-                Box::as_ptr(&tiled_c_bytes) as *const u8,
-                Box::as_mut_ptr(&mut tiled_c) as *mut u8,
-                I * K * 4,
-            );
-        };
         let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
         for i in 0..I {
             for k in 0..K {
@@ -64,8 +21,18 @@ fn main() {
                 }
             }
         }
-        assert_eq!(c, correct_c);
-        assert_eq!(tiled_c, correct_c);
+        let mut c = {
+            let a = HerculesBox::from_slice(&a);
+            let b = HerculesBox::from_slice(&b);
+            matmul(I as u64, J as u64, K as u64, a, b).await
+        };
+        assert_eq!(c.as_slice::<i32>(), &*correct_c);
+        let mut tiled_c = {
+            let a = HerculesBox::from_slice(&a);
+            let b = HerculesBox::from_slice(&b);
+            tiled_64_matmul(I as u64, J as u64, K as u64, a, b).await
+        };
+        assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
     });
 }
 
@@ -73,3 +40,4 @@ fn main() {
 fn matmul_test() {
     main();
 }
+
diff --git a/juno_samples/matmul/src/matmul.jn b/juno_samples/matmul/src/matmul.jn
index 775bb382a46a640cc45ddbf5d68b31c5c1d58bd3..ca9be73a86144ebe57048ed35adc2f36bfcd905b 100644
--- a/juno_samples/matmul/src/matmul.jn
+++ b/juno_samples/matmul/src/matmul.jn
@@ -17,25 +17,25 @@ fn matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]) -> i32[
 #[entry]
 fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]) -> i32[n, l] {
   let res : i32[n, l];
+  let atile : i32[64, 64];
+  let btile : i32[64, 64];
+  let ctile : i32[64, 64];
   
   for bi = 0 to n / 64 {
     for bk = 0 to l / 64 {
-      // TODO: make these all the same size, clone analysis should undo GVN's
-      // combining of these three arrays.
-      let atile : i32[66, 64];
-      let btile : i32[65, 64];
-      let ctile : i32[64, 64];
+      for ti = 0 to 64 {
+        for tk = 0 to 64 {
+	  atile[ti, tk] = 0;
+	  btile[ti, tk] = 0;
+	  ctile[ti, tk] = 0;
+	}
+      }
 
       for tile_idx = 0 to m / 64 {
         for ti = 0 to 64 {
 	  for tk = 0 to 64 {
 	    atile[ti, tk] = a[bi * 64 + ti, tile_idx * 64 + tk];
 	    btile[ti, tk] = b[tile_idx * 64 + ti, bk * 64 + tk];
-	    // TODO: remove setting ctile to zero explicitly, clone analysis
-	    // should see a lack of a phi for ctile in the block loops and
-	    // induce a copy of an initial value of ctile (all zeros) on each
-	    // iteration of the block loops.
-	    ctile[ti, tk] = 0;
 	  }
 	}
         for ti = 0 to 64 {
diff --git a/juno_samples/nested_ccp/Cargo.toml b/juno_samples/nested_ccp/Cargo.toml
index 7ffc13f21b155dbe6028d808be97aaf0e5ffb8d6..8c9b969d23019b8bbd3bf28b3506e2e497ae8ec7 100644
--- a/juno_samples/nested_ccp/Cargo.toml
+++ b/juno_samples/nested_ccp/Cargo.toml
@@ -13,5 +13,6 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs
index 0fb54be347f011f4c29f1d7df98a0f29c85193c6..c5c7ca6a1b9ab5decf6a8cf0b8e8f13ff7122834 100644
--- a/juno_samples/nested_ccp/build.rs
+++ b/juno_samples/nested_ccp/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs
index 83132aca516bd59a6e5b5c0e5ca4562a2e1cc3cc..974a488c8f3574aaecfd40b1110b6a7022891f0f 100644
--- a/juno_samples/nested_ccp/src/main.rs
+++ b/juno_samples/nested_ccp/src/main.rs
@@ -1,32 +1,17 @@
 #![feature(box_as_ptr, let_chains)]
 
-extern crate async_std;
-extern crate juno_build;
-
-use core::ptr::copy_nonoverlapping;
+use hercules_rt::HerculesBox;
 
 juno_build::juno!("nested_ccp");
 
 fn main() {
     async_std::task::block_on(async {
-        let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
-        let b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]);
-        let mut a_bytes: Box<[u8]> = Box::new([0; 12]);
-        let mut b_bytes: Box<[u8]> = Box::new([0; 36]);
-        unsafe {
-            copy_nonoverlapping(
-                Box::as_ptr(&a) as *const u8,
-                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
-                12,
-            );
-            copy_nonoverlapping(
-                Box::as_ptr(&b) as *const u8,
-                Box::as_mut_ptr(&mut b_bytes) as *mut u8,
-                36,
-            );
-        };
-        let output_example = ccp_example(a_bytes).await;
-        let output_median = median_array(9, b_bytes).await;
+        let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
+        let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]);
+        let a = HerculesBox::from_slice_mut(&mut a);
+        let b = HerculesBox::from_slice_mut(&mut b);
+        let output_example = ccp_example(a).await;
+        let output_median = median_array(9, b).await;
         println!("{}", output_example);
         println!("{}", output_median);
         assert_eq!(output_example, 1.0);
diff --git a/juno_samples/simple3/Cargo.toml b/juno_samples/simple3/Cargo.toml
index 201c8d3782d4b41d7bfef5b7df4b5b29758e6e00..8060c5b3472ad898cb48e011332a852cd7b6705e 100644
--- a/juno_samples/simple3/Cargo.toml
+++ b/juno_samples/simple3/Cargo.toml
@@ -13,5 +13,6 @@ juno_build = { path = "../../juno_build" }
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs
index 0e476e8d41c7880741a3f474f341a0decf0bda4b..94760025d53abe7e10914052e1a7783386b316b0 100644
--- a/juno_samples/simple3/build.rs
+++ b/juno_samples/simple3/build.rs
@@ -1,4 +1,3 @@
-extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
diff --git a/juno_samples/simple3/src/main.rs b/juno_samples/simple3/src/main.rs
index 89be5527a2e4d1e5842778818aa934db98fcdf09..1f6e213c41a27ceeb9c2988fe85bd956dec879d1 100644
--- a/juno_samples/simple3/src/main.rs
+++ b/juno_samples/simple3/src/main.rs
@@ -1,31 +1,16 @@
 #![feature(box_as_ptr, let_chains)]
 
-extern crate async_std;
-extern crate juno_build;
-
-use core::ptr::copy_nonoverlapping;
+use hercules_rt::HerculesBox;
 
 juno_build::juno!("simple3");
 
 fn main() {
     async_std::task::block_on(async {
-        let a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]);
-        let b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]);
-        let mut a_bytes: Box<[u8]> = Box::new([0; 32]);
-        let mut b_bytes: Box<[u8]> = Box::new([0; 32]);
-        unsafe {
-            copy_nonoverlapping(
-                Box::as_ptr(&a) as *const u8,
-                Box::as_mut_ptr(&mut a_bytes) as *mut u8,
-                32,
-            );
-            copy_nonoverlapping(
-                Box::as_ptr(&b) as *const u8,
-                Box::as_mut_ptr(&mut b_bytes) as *mut u8,
-                32,
-            );
-        };
-        let c = simple3(8, a_bytes, b_bytes).await;
+        let mut a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]);
+        let mut b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]);
+        let a = HerculesBox::from_slice_mut(&mut a);
+        let b = HerculesBox::from_slice_mut(&mut b);
+        let c = simple3(8, a, b).await;
         println!("{}", c);
         assert_eq!(c, 120);
     });
diff --git a/juno_scheduler/src/lib.rs b/juno_scheduler/src/lib.rs
index 7e558d6bcb7a5c9250f3efe862ff235463c3599c..d515633eec468012ceee7305544b1859c1ebd621 100644
--- a/juno_scheduler/src/lib.rs
+++ b/juno_scheduler/src/lib.rs
@@ -1,5 +1,3 @@
-extern crate hercules_ir;
-
 use std::collections::{HashMap, HashSet};
 use std::fs::File;
 use std::io::Read;
@@ -7,7 +5,7 @@ use std::io::Read;
 use lrlex::DefaultLexerTypes;
 use lrpar::NonStreamingLexer;
 
-use self::hercules_ir::ir::*;
+use hercules_ir::ir::*;
 
 mod parser;
 use crate::parser::lexer;
diff --git a/reports/technical1/main.tex b/reports/technical1/main.tex
index 4743d2069ce05baae6e37ea3f8c24582cbd0e9ff..e70625d4cd8adcf9157e1234d45eac177fa1f255 100644
--- a/reports/technical1/main.tex
+++ b/reports/technical1/main.tex
@@ -502,9 +502,6 @@ The manifest plus the Hercules module itself are loaded from a \texttt{.hbin} fi
 Here's an example host code written in Rust:
 
 \begin{lstlisting}[style=colouredRust]
-extern crate async_std;
-extern crate hercules_rt;
-
 hercules_rt::use_hbin!("my_module.hbin");
 
 fn main() {