diff --git a/.gitignore b/.gitignore
index f8a684ce5223884e46f65b7fa67b6328eadf3100..87af5349ee5aa22c4f3763c6f8905b69f2773bb2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,8 +4,12 @@
 *.out
 *.ll
 *.c
+*.cu
 *.o
 *.a
 *.hrt
-.*.swp
+*.png
+*.swp
 .vscode
+*_env
+*.txt
diff --git a/Cargo.lock b/Cargo.lock
index 2802b8b14af4c87362529671057ebf4eadbc8c9e..49630436a0f0b90d8252824046c29f0e18b78af2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1217,9 +1217,9 @@ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
 
 [[package]]
 name = "libfuzzer-sys"
-version = "0.4.8"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa"
+checksum = "cf78f52d400cf2d84a3a973a78a592b4adc535739e0a5597a0da6f0c357adc75"
 dependencies = [
  "arbitrary",
  "cc",
@@ -2174,9 +2174,9 @@ dependencies = [
 
 [[package]]
 name = "toml_edit"
-version = "0.22.22"
+version = "0.22.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5"
+checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee"
 dependencies = [
  "indexmap",
  "serde",
@@ -2444,9 +2444,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
 [[package]]
 name = "winnow"
-version = "0.6.24"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a"
+checksum = "7e49d2d35d3fad69b39b94139037ecfb4f359f08958b9c11e7315ce770462419"
 dependencies = [
  "memchr",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index a9197bf4fb4490ef0c25f37c787d766f9b6fae53..ced011a96c96793891228876314debeabcb561ed 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,12 +5,12 @@ members = [
 	"hercules_ir",
 	"hercules_opt",
 	"hercules_rt",
-	
+
 	"juno_utils",
 	"juno_frontend",
 	"juno_scheduler",
 	"juno_build",
-	
+
 	"hercules_test/hercules_interpreter",
 	"hercules_test/hercules_tests",
 
diff --git a/hercules_cg/Cargo.toml b/hercules_cg/Cargo.toml
index cf0767de04c81e35974a7aca89e9085b543e1a9d..0952ee57e0dd9e4b8f7e9639e10a3f2b0fc35a72 100644
--- a/hercules_cg/Cargo.toml
+++ b/hercules_cg/Cargo.toml
@@ -4,6 +4,9 @@ version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = []
+
 [dependencies]
 rand = "*"
 ordered-float = "*"
diff --git a/hercules_cg/src/fork_tree.rs b/hercules_cg/src/fork_tree.rs
new file mode 100644
index 0000000000000000000000000000000000000000..64a93160aabc5564c784bf0ff8b14e82045b81bb
--- /dev/null
+++ b/hercules_cg/src/fork_tree.rs
@@ -0,0 +1,37 @@
+use std::collections::{HashMap, HashSet};
+
+use crate::*;
+
+/*
+ * Construct a map from each fork node F to all control nodes (including F itself) satisfying:
+ * a) domination by F
+ * b) no domination by F's join
+ * c) no domination by any other fork that's also dominated by F, where we do count self-domination
+ * The map also includes the non-fork start node, as the key for all controls outside any fork.
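+ *
+ * Concretely, each control node lands in the set keyed by the first entry of
+ * its fork_join_nesting vector: its innermost enclosing fork (itself, for a
+ * fork node), or the start node if it is nested in no fork.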
+ */
+pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> {
+    let mut fork_control_map = HashMap::new();
+    for (control, forks) in fork_join_nesting {
+        let fork = forks.first().copied().unwrap_or(NodeID::new(0));
+        fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(*control);
+    }
+    fork_control_map
+}
+
+/*
+ * Construct a map from each fork node F to all fork nodes satisfying:
+ * a) domination by F
+ * b) no domination by F's join
+ * c) no domination by any other fork that's also dominated by F, where we don't count self-domination
+ * Note that the fork_tree also includes the non-fork start node, as the unique root node.
+ */
+pub fn fork_tree(function: &Function, fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> {
+    let mut fork_tree = HashMap::new();
+    for (control, forks) in fork_join_nesting {
+        if function.nodes[control.idx()].is_fork() {
+            fork_tree.entry(*control).or_insert_with(HashSet::new);
+            let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0));
+            fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(*control);
+        }
+    }
+    fork_tree
+}
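+
+/*
+ * Example (illustrative): if fork F2 is directly nested inside fork F1, and F1
+ * is nested in no other fork, then fork_tree contains start -> {F1},
+ * F1 -> {F2}, and F2 -> {} (assuming F2 has no child forks).
+ */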
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
new file mode 100644
index 0000000000000000000000000000000000000000..d7a6d258358920d8a42aea1530843c08e223022a
--- /dev/null
+++ b/hercules_cg/src/gpu.rs
@@ -0,0 +1,1894 @@
+extern crate bitvec;
+extern crate hercules_ir;
+
+use std::collections::{BTreeMap, HashMap, HashSet};
+use std::fmt::{Error, Write};
+use std::fs::{OpenOptions, File};
+use std::io::Write as _;
+
+use self::hercules_ir::*;
+
+use crate::*;
+
+/*
+ * The top level function to compile a Hercules IR function into a CUDA
+ * kernel for execution on the GPU. We generate CUDA C textually, with many
+ * similarities to the CPU LLVM generation plus custom GPU parallelization.
+ */
+pub fn gpu_codegen<W: Write>(
+    function: &Function,
+    types: &Vec<Type>,
+    constants: &Vec<Constant>,
+    dynamic_constants: &Vec<DynamicConstant>,
+    typing: &Vec<TypeID>,
+    control_subgraph: &Subgraph,
+    bbs: &BasicBlocks,
+    backing_allocation: &FunctionBackingAllocation,
+    collection_objects: &FunctionCollectionObjects,
+    def_use_map: &ImmutableDefUseMap,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    fork_control_map: &HashMap<NodeID, HashSet<NodeID>>,
+    fork_tree: &HashMap<NodeID, HashSet<NodeID>>,
+    w: &mut W,
+) -> Result<(), Error> {
+    /*
+     * We assert the following:
+     * - Every Fork node must have >= 1 Reduce node
+     * - (Later in code) If the returned data type is a collection, it must have
+     *   originated from (potentially multiple) parameter(s).
+     *
+     * We don't assert but assume the following:
+     * - max_num_blocks in KernelParams is within constraint of 1D grid size. This
+     *   can be relaxed if we want to support larger grids.
+     * - Product types are packed with padding inserted for each element to
+     *   be aligned for its type and for full product to be aligned to its
+     *   largest element
+     * - Summation types must be aligned to their largest element
+     *
+     * Notes on GPU parallelization strategy and tips for IR transformations:
+     * - The top level block fork and any lower thread forks require a known Fork
+     *   size. Thus for an otherwise parallelizable Fork with unknown size,
+     *   consider splitting it into two Forks with one of known size. For block
+     *   level, the known fork has to be the (only) top-most fork.
+     * - The thread-level strategy is determined by starting at the most nested
+     *   Forks and working outwards in a greedy manner, with caps by GPU spec.
+     *   Thus, to ensure some outer Fork is parallelized, ensure the inner
+     *   parallelizable Forks aren't too large or consider removing schedule
+     *   annotations.
+     * - Tight-Associative reductions can only be efficiently implemented if
+     *   different Hercules ThreadIDs correspond to consecutive CUDA threads. But
+     *   this prevents nested parallelization since each parallel group must always
+     *   be a contiguous tile of threads. We use a heuristic of choosing the larger
+     *   factor when this results in a conflict between a Fork and its subtree,
+     *   but this choice may not be optimal.
+     * - A given Fork (not talking about its children) can only be parallelized
+     *   if all its Reduces are Parallel-Reduce or Tight-Associative. So if the
+     *   Fork contains expensive parallelizable operations, ensure all reductions
+     *   are parallelizable or if not try pulling those out into a different Fork.
+     * - We do nothing to mitigate intra-warp divergence. To mitigate this, the
+     *   IR, for example, should ensure the innermost parallelizable Forks either
+     *   have factor >= warp size (32) or remove Fork/Reduce node schedule
+     *   annotations.
+     *
+     * Main TODOs:
+     * - Fix dynamic shared memory allocation to reuse old shmem. The main case
+     *   for improvement is when we have serialized forks with unused intermediate
+     *   values from previous iterations.
+     * - Add mapping from Region node to Fork node if there's a reduce whose control
+     *   is a Region not Join.
+     * - Matmul/Conv detection
+     * - Add float8, float16, bfloat16 dtypes if they come
+     */
+
+    let reduce_nodes: Vec<NodeID> = (0..function.nodes.len())
+        .filter(|idx| function.nodes[*idx].is_reduce())
+        .map(NodeID::new)
+        .collect();
+
+    let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map
+        .iter()
+        .map(|(fork, join)| (*join, *fork))
+        .collect();
+    // Fork Reduce map should have all reduces contained in some key
+    let mut fork_reduce_map: HashMap<NodeID, Vec<NodeID>> = HashMap::new();
+    // Reduct Reduce map should have all non-parallel and non-associative reduces
+    // contained in some key. Unlike Fork, Reduct is not involved in any assertions.
+    // It's placed here for convenience but can be moved.
+    let mut reduct_reduce_map: HashMap<NodeID, Vec<NodeID>> = HashMap::new();
+    for reduce_node in &reduce_nodes {
+        if let Node::Reduce {
+            control,
+            init: _,
+            reduct,
+        } = &function.nodes[reduce_node.idx()]
+        {
+            match function.nodes[control.idx()] {
+                Node::Join {..} => {
+                    let fork_node = join_fork_map[control];
+                    fork_reduce_map
+                        .entry(fork_node)
+                        .or_default()
+                        .push(*reduce_node);
+                }
+                Node::Region { preds: _ } => {
+                    // TODO: map region node to fork node
+                }
+                _ => {
+                    panic!("Reduce's control must be a join or region node");
+                }
+            }
+            reduct_reduce_map
+                .entry(*reduct)
+                .or_default()
+                .push(*reduce_node);
+        }
+    }
+    for idx in 0..function.nodes.len() {
+        if function.nodes[idx].is_fork() {
+            assert!(fork_reduce_map
+                .get(&NodeID::new(idx))
+                .is_some_and(|reduces| !reduces.is_empty()),
+                "Fork node {} has no reduce nodes",
+                idx
+            );
+        }
+    }
+
+    // Map from control to pairs of data to update phi
+    // For each phi, we go to its region and get region's controls
+    let mut control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>> = HashMap::new();
+    for (idx, node) in function.nodes.iter().enumerate() {
+        if let Node::Phi { control, data } = node {
+            let Node::Region { preds } = &function.nodes[control.idx()] else {
+                panic!("Phi's control must be a region node");
+            };
+            for (i, &pred) in preds.iter().enumerate() {
+                control_data_phi_map.entry(pred).or_default().push((data[i], NodeID::new(idx)));
+            }
+        }
+    }
+
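+    // If the function returns exactly one collection object, it must have
+    // originated from a parameter (see the invariants noted above); record
+    // that parameter so the host stub can return it directly instead of
+    // copying a value back from the device.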
+    let return_parameter = if collection_objects.returned_objects().len() == 1 {
+        collection_objects.origin(*collection_objects.returned_objects()
+            .first().unwrap()).try_parameter()
+    } else {
+        None
+    };
+
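+    // 1024 threads per block and a warp size of 32 match the limits of current
+    // NVIDIA GPUs; adjust these if targeting hardware with different limits.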
+    let kernel_params = &GPUKernelParams {
+        max_num_threads: 1024,
+        threads_per_warp: 32,
+    };
+
+    let ctx = GPUContext {
+        function,
+        types,
+        constants,
+        dynamic_constants,
+        typing,
+        control_subgraph,
+        bbs,
+        backing_allocation,
+        collection_objects,
+        def_use_map,
+        fork_join_map,
+        fork_control_map,
+        fork_tree,
+        join_fork_map,
+        fork_reduce_map,
+        reduct_reduce_map,
+        control_data_phi_map,
+        return_parameter,
+        kernel_params,
+    };
+    ctx.codegen_function(w)
+}
+
+struct GPUKernelParams {
+    max_num_threads: usize,
+    threads_per_warp: usize,
+}
+
+struct GPUContext<'a> {
+    function: &'a Function,
+    types: &'a Vec<Type>,
+    constants: &'a Vec<Constant>,
+    dynamic_constants: &'a Vec<DynamicConstant>,
+    typing: &'a Vec<TypeID>,
+    control_subgraph: &'a Subgraph,
+    bbs: &'a BasicBlocks,
+    backing_allocation: &'a FunctionBackingAllocation,
+    collection_objects: &'a FunctionCollectionObjects,
+    def_use_map: &'a ImmutableDefUseMap,
+    fork_join_map: &'a HashMap<NodeID, NodeID>,
+    fork_control_map: &'a HashMap<NodeID, HashSet<NodeID>>,
+    fork_tree: &'a HashMap<NodeID, HashSet<NodeID>>,
+    join_fork_map: HashMap<NodeID, NodeID>,
+    fork_reduce_map: HashMap<NodeID, Vec<NodeID>>,
+    reduct_reduce_map: HashMap<NodeID, Vec<NodeID>>,
+    control_data_phi_map: HashMap<NodeID, Vec<(NodeID, NodeID)>>,
+    return_parameter: Option<usize>,
+    kernel_params: &'a GPUKernelParams,
+}
+
+/*
+ * For all control nodes besides serialized Forks, Init, Body, and Term compose the
+ * main basic block, with Init and Term populated by control-flow codegen (Init is
+ * used only by Fork and Join) and Body populated by data-flow codegen.
+ * Serialized Fork nodes, which may be jumped back to by the corresponding Join
+ * node, additionally use post_init to separate one-time code (currently just
+ * cooperative group creation) from repeated code.
+ */
+#[derive(Default, Debug)]
+struct CudaGoto {
+    init: String,
+    post_init: String,
+    body: String,
+    term: String,
+}
+
+/*
+ * KernelState is used for data and control node organization and generation.
+ * We define a block fork as one with each ThreadID being a block, and a thread
+ * fork as one with each ThreadID being a subset of threads within a block.
+ * OutBlock is outside a potential block fork at the full grid level, InBlock
+ * is inside a block fork but outside any thread forks, and InThread is inside
+ * a thread fork.
+ */
+#[derive(Clone, Copy, PartialEq, Debug)]
+enum KernelState {
+    OutBlock,
+    InBlock,
+    InThread,
+}
+
+/*
+ * CGType is used to track cooperative group types. UsePerId is the group of (CUDA)
+ * threads for a current ThreadID, Use is the union of such threads for all ThreadIDs
+ * in the current innermost Fork, and Available is Use plus additional threads not
+ * used in the current Fork.
+ */
+#[derive(Clone, Copy, PartialEq, Debug)]
+enum CGType {
+    UsePerId,
+    Use,
+    Available,
+}
+
+impl GPUContext<'_> {
+    fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> {
+        // Emit all code up to the "goto" to Start's block
+        let mut top = String::new();
+        self.codegen_kernel_begin(self.return_parameter.is_none(), &mut top)?;
+        let mut dynamic_shared_offset = "0".to_string();
+        self.codegen_dynamic_constants(&mut top)?;
+        self.codegen_declare_data(&mut top)?;
+        self.codegen_helpers(&mut top)?;
+        self.codegen_goto_start(&mut top)?;
+        write!(w, "{}", top)?;
+
+        // Setup for CUDA's "goto" for control flow between basic blocks.
+        let mut gotos: BTreeMap<_, _> = (0..self.function.nodes.len())
+            .filter(|idx| self.function.nodes[*idx].is_control())
+            .map(|idx| {
+                let node_id = NodeID::new(idx);
+                let goto = CudaGoto::default();
+                (node_id, goto)
+            })
+            .collect();
+
+        // If there are no forks, fast forward to single-block, single-thread codegen
+        let (num_blocks, num_threads) = if self.fork_join_map.is_empty() {
+            self.codegen_data_control_no_forks(&HashSet::new(), &mut dynamic_shared_offset, &mut gotos)?;
+            ("1".to_string(), "1".to_string())
+        } else {
+            // Create structures and determine block and thread parallelization strategy
+            let (root_forks, num_blocks, is_block_parallel) =
+                self.get_root_forks_and_num_blocks(self.fork_tree);
+            let (thread_root_root_fork, thread_root_forks) = self.get_thread_root_forks(&root_forks, self.fork_tree, is_block_parallel);
+            let (fork_thread_quota_map, num_threads) = self.get_thread_quotas(self.fork_tree, thread_root_root_fork);
+            // TODO: Uncomment and adjust once we know logic of extra dim. This will affect constant
+            // collections, reads, and writes.
+            // let extra_dim_collects = self.get_extra_dim_collects(&fork_control_map, &fork_thread_quota_map);
+            let extra_dim_collects = HashSet::new();
+
+            // Core function for the CUDA code of all data and control nodes.
+            self.codegen_data_control(
+                if is_block_parallel {
+                    Some(thread_root_root_fork)
+                } else {
+                    None
+                },
+                &thread_root_forks,
+                &fork_thread_quota_map,
+                &extra_dim_collects,
+                &mut dynamic_shared_offset,
+                is_block_parallel,
+                num_threads,
+                &mut gotos,
+            )?;
+            (num_blocks, num_threads.to_string())
+        };
+
+        // Emit all GPU kernel code from previous steps
+        let mut kernel_body = String::new();
+        self.codegen_gotos(false, &mut gotos, &mut kernel_body)?;
+        write!(w, "{}", kernel_body)?;
+        write!(w, "}}\n")?;
+
+        // Emit host launch code
+        let mut host_launch = String::new();
+        self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?;
+        write!(w, "{}", host_launch)?;
+
+        Ok(())
+    }
+
+    /*
+     * Emit kernel headers, signature, arguments, and dynamic shared memory declaration
+     */
+    fn codegen_kernel_begin(&self, has_ret_var: bool, w: &mut String) -> Result<(), Error> {
+        write!(w, "
+#include <assert.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <math_constants.h>
+#include <mma.h>
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
+namespace cg = cooperative_groups;
+
+#define uabs(a) (a)
+#define umin(a, b) ((a) < (b) ? (a) : (b))
+#define umax(a, b) ((a) > (b) ? (a) : (b))
+#define powi(a, b) ({{ int res = 1; for(int i = 0; i < b; ++i) res *= a; res; }})
+#define roundi(a) (a)
+#define isqrt(a) ((int)sqrtf((float)(a)))
+
+",
+        )?;
+
+        write!(
+            w,
+            "__global__ void __launch_bounds__({}) {}_gpu(",
+            self.kernel_params.max_num_threads, self.function.name
+        )?;
+        let mut first_param = true;
+        // The first parameter is a pointer to GPU backing memory, if it's
+        // needed.
+        if self.backing_allocation.contains_key(&Device::CUDA) {
+            first_param = false;
+            write!(w, "char* backing")?;
+        }
+        // The second set of parameters are dynamic constants.
+        for idx in 0..self.function.num_dynamic_constants {
+            if first_param {
+                first_param = false;
+            } else {
+                write!(w, ", ")?;
+            }
+            write!(w, "unsigned long long dc_p{}", idx)?;
+        }
+        // The third set of parameters are normal arguments.
+        for (idx, ty) in self.function.param_types.iter().enumerate() {
+            if first_param {
+                first_param = false;
+            } else {
+                write!(w, ", ")?;
+            }
+            let param_type = if self.types[ty.idx()].is_primitive() {
+                self.get_type(*ty, false)
+            } else {
+                format!("{} __restrict__", self.get_type(*ty, false))
+            };
+            write!(w, "{} p{}", param_type, idx)?;
+        }
+        if has_ret_var {
+            if !first_param {
+                write!(w, ", ")?;
+            }
+            write!(
+                w,
+                "void* __restrict__ ret",
+            )?;
+        }
+
+        // Type is char since it's simplest to use single bytes for indexing
+        // and it's required for heterogeneous Product and Summation types.
+        // Casting is later used for conversion to different types like int.
+        write!(w, ") {{\n")?;
+        write!(w, "\textern __shared__ char dynamic_shared[];\n")?;
+        // This will only get used by thread rank 0 in each block, since it
+        // does all shared memory "allocation". The actual offset state is
+        // tracked in a Rust-side string; this variable is assigned only for
+        // readability of the generated code.
+        write!(w, "\tuint64_t dynamic_shared_offset;\n")?;
+
+        Ok(())
+    }
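+
+    /*
+     * For reference, the emitted kernel header looks roughly like this
+     * (illustrative, for a hypothetical function "foo" with GPU backing memory,
+     * one dynamic constant, and one non-primitive parameter):
+     *
+     *   __global__ void __launch_bounds__(1024) foo_gpu(char* backing,
+     *           unsigned long long dc_p0, float* __restrict__ p0) {
+     *       extern __shared__ char dynamic_shared[];
+     *       uint64_t dynamic_shared_offset;
+     */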
+
+    /*
+     * Emit calculation of all dynamic constants
+     */
+    fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> {
+        for dc in dynamic_constants_bottom_up(&self.dynamic_constants) {
+            let dc_val = format!("unsigned long long dc{}", dc.idx());
+            match &self.dynamic_constants[dc.idx()] {
+                DynamicConstant::Constant(val) => write!(w, "\t{} = {}ull;\n", dc_val, val)?,
+                DynamicConstant::Parameter(idx) => {
+                    if *idx < self.function.num_dynamic_constants as usize {
+                        write!(w, "\t{} = dc_p{};\n", dc_val, idx)?
+                    } else {
+                        write!(w, "\t{} = 0;\n", dc_val)?
+                    }
+                }
+                DynamicConstant::Add(args) => {
+                    let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" + ");
+                    write!(w, "\t{} = {};\n", dc_val, rhs)?
+                }
+                DynamicConstant::Mul(args) => {
+                    let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" * ");
+                    write!(w, "\t{} = {};\n", dc_val, rhs)?
+                }
+                DynamicConstant::Min(args) => {
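+                    // Nest the calls: for three args this emits
+                    // "min(dc0, min(dc1, dc2))"; Max below is analogous.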
+                    let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("min(dc{}, ", arg.idx())).collect();
+                    let rhs_last = format!("dc{}", args.last().unwrap().idx());
+                    let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect();
+                    write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)?
+                }
+                DynamicConstant::Max(args) => {
+                    let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("max(dc{}, ", arg.idx())).collect();
+                    let rhs_last = format!("dc{}", args.last().unwrap().idx());
+                    let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect();
+                    write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)?
+                }
+                DynamicConstant::Sub(left, right) => {
+                    write!(w, "\t{} = dc{} - dc{};\n", dc_val, left.idx(), right.idx())?
+                }
+                DynamicConstant::Div(left, right) => {
+                    write!(w, "\t{} = dc{} / dc{};\n", dc_val, left.idx(), right.idx())?
+                }
+                DynamicConstant::Rem(left, right) => {
+                    write!(w, "\t{} = dc{} % dc{};\n", dc_val, left.idx(), right.idx())?
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /*
+     * To abide by C++'s restriction that gotos may not jump over declarations
+     * with initialization, we declare all data values upfront.
+     */
+    fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> {
+        for id in (0..self.function.nodes.len()).map(NodeID::new) {
+            if !self.function.nodes[id.idx()].is_control() &&
+                !self.function.nodes[id.idx()].is_dynamic_constant() &&
+                !self.function.nodes[id.idx()].is_parameter() {
+                write!(w, "\t{};\n", self.get_value(id, true, false))?;
+            }
+            if self.function.nodes[id.idx()].is_phi() {
+                write!(w, "\t{}_tmp;\n", self.get_value(id, true, false))?;
+            }
+        }
+        Ok(())
+    }
+
+    /*
+     * Emit helper registers that are used throughout the kernel. grid and block
+     * are from CUDA's cooperative groups API and are used specifically for reads
+     * and writes.
+     */
+    fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
+        write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
+        write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?;
+        Ok(())
+    }
+
+    fn codegen_goto_start(&self, w: &mut String) -> Result<(), Error> {
+        let block_start = self.get_block_name(NodeID::new(0), false);
+        write!(w, "\tgoto {};\n", block_start)?;
+        Ok(())
+    }
+
+    fn codegen_gotos(&self, goto_debug: bool, gotos: &mut BTreeMap<NodeID, CudaGoto>, w: &mut String) -> Result<(), Error> {
+        write!(w, "\n")?;
+        for (id, goto) in gotos.iter() {
+            let goto_block = self.get_block_name(*id, false);
+            write!(w, "{}:\n", goto_block)?;
+            if goto_debug {
+                write!(w, "\tprintf(\"goto {}\\n\");\n", goto_block)?;
+            }
+            write!(w, "{}", goto.init)?;
+            if !goto.post_init.is_empty() {
+                let goto_block = self.get_block_name(*id, true);
+                write!(w, "{}:\n", goto_block)?;
+                write!(w, "{}", goto.post_init)?;
+            }
+            write!(w, "{}", goto.body)?;
+            write!(w, "{}\n", goto.term)?;
+        }
+        Ok(())
+    }
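+
+    /*
+     * Each CudaGoto is emitted roughly as (illustrative):
+     *
+     *   <block label>:
+     *       <init>
+     *   <post-init label>:   // only when post_init is non-empty
+     *       <post_init>
+     *       <body>
+     *       <term>
+     */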
+
+    fn codegen_launch_code(&self, num_blocks: String, num_threads: String, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> {
+        // The following steps emit the host-side C function arguments, but we
+        // also need to pass the same arguments to the kernel, so we accumulate
+        // them in pass_args along the way.
+        let ret_type = self.get_type(self.function.return_type, false);
+        let mut pass_args = String::new();
+        write!(w, "
+extern \"C\" {} {}(", ret_type, self.function.name)?;
+        let mut first_param = true;
+        // The first parameter is a pointer to GPU backing memory, if it's
+        // needed.
+        if self.backing_allocation.contains_key(&Device::CUDA) {
+            first_param = false;
+            write!(w, "char* backing")?;
+            write!(pass_args, "backing")?;
+        }
+        // The second set of parameters are dynamic constants.
+        for idx in 0..self.function.num_dynamic_constants {
+            if first_param {
+                first_param = false;
+            } else {
+                write!(w, ", ")?;
+                write!(pass_args, ", ")?;
+            }
+            write!(w, "unsigned long long dc_p{}", idx)?;
+            write!(pass_args, "dc_p{}", idx)?;
+        }
+        // The third set of parameters are normal arguments.
+        for (idx, ty) in self.function.param_types.iter().enumerate() {
+            if first_param {
+                first_param = false;
+            } else {
+                write!(w, ", ")?;
+                write!(pass_args, ", ")?;
+            }
+            let param_type = self.get_type(*ty, false);
+            write!(w, "{} p{}", param_type, idx)?;
+            write!(pass_args, "p{}", idx)?;
+        }
+        write!(w, ") {{\n")?;
+        // For case of dynamic block count
+        self.codegen_dynamic_constants(w)?;
+        let has_ret_var = self.return_parameter.is_none();
+        if has_ret_var {
+            // Allocate return parameter and lift to kernel argument
+            let ret_type_pnt = self.get_type(self.function.return_type, true);
+            write!(w, "\t{} ret;\n", ret_type_pnt)?;
+            if !first_param {
+                write!(pass_args, ", ")?;
+            }
+            write!(pass_args, "ret")?;
+            write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
+        }
+        write!(w, "\tcudaError_t err;\n")?;
+        write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?;
+        write!(w, "\terr = cudaGetLastError();\n")?;
+        write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n")?;
+        write!(w, "\tcudaDeviceSynchronize();\n")?;
+        write!(w, "\terr = cudaGetLastError();\n")?;
+        write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n")?;
+        if has_ret_var {
+            // Copy return from device to host, whether it's primitive value or collection pointer
+            write!(w, "\t{} host_ret;\n", ret_type)?;
+            write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?;
+            write!(w, "\treturn host_ret;\n")?;
+        } else {
+            write!(w, "\treturn p{};\n", self.return_parameter.unwrap())?;
+        }
+        write!(w, "}}\n")?;
+        Ok(())
+    }
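+
+    /*
+     * For reference, the emitted host stub looks roughly like this
+     * (illustrative, for a hypothetical function "foo" returning a primitive,
+     * with one dynamic constant and no other parameters):
+     *
+     *   extern "C" float foo(unsigned long long dc_p0) {
+     *       ...dynamic constant setup...
+     *       float* ret;
+     *       cudaMalloc((void**)&ret, sizeof(float));
+     *       cudaError_t err;
+     *       foo_gpu<<<1, 1, 0>>>(dc_p0, ret);
+     *       ...error checks and cudaDeviceSynchronize()...
+     *       float host_ret;
+     *       cudaMemcpy(&host_ret, ret, sizeof(float), cudaMemcpyDeviceToHost);
+     *       return host_ret;
+     *   }
+     */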
+
+    /*
+     * If the tree has a single root fork with the ParallelFork schedule, set
+     * num_blocks to the product of that fork's factors (assumed to fit within
+     * the 1D grid limit; see the notes at the top of this file), else set
+     * num_blocks to 1. Also return the root fork(s) for the parallelization
+     * strategy within threadblocks and their eventual code generation.
+     */
+    fn get_root_forks_and_num_blocks(
+        &self,
+        fork_tree: &HashMap<NodeID, HashSet<NodeID>>,
+    ) -> (HashSet<NodeID>, String, bool) {
+        let root_forks: HashSet<NodeID> = fork_tree.get(&NodeID::new(0)).unwrap().clone();
+        if root_forks.len() != 1 {
+            return (root_forks, "1".to_string(), false);
+        }
+
+        let root_fork = root_forks.iter().next().unwrap();
+        let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else {
+            panic!("Expected fork node");
+        };
+        if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork)
+        {
+            let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * ");
+            (root_forks, fork_size, true)
+        } else {
+            (root_forks, "1".to_string(), false)
+        }
+    }
+
+    /*
+     * If there's a block fork, then the thread root forks are its child forks. If
+     * not, thread root forks are the root forks. This will be used to begin the
+     * thread fork tree traversal for codegen.
+     */
+    fn get_thread_root_forks(
+        &self,
+        root_forks: &HashSet<NodeID>,
+        fork_tree: &HashMap<NodeID, HashSet<NodeID>>,
+        is_block_parallel: bool,
+    ) -> (NodeID, HashSet<NodeID>) {
+        if is_block_parallel {
+            let root_fork = root_forks.iter().next().unwrap();
+            (*root_fork, fork_tree.get(root_fork).unwrap().clone())
+        } else {
+            (NodeID::new(0), root_forks.clone())
+        }
+    }
+
+    /*
+     * This analysis determines the parallelization strategy within threadblocks.
+     * We run post-order traversal on the fork tree to get the thread quota per
+     * subtree. In particular, each fork starts with a base factor as the
+     * maximum over its descendants (leaves have base 1). We traverse up (details
+     * in the helper) and pass the factor and a map from fork node to a tuple of
+     * (max quota of its siblings (including itself), its quota, its fork factor)
+     * from each node to its parents. The parent then compares the received quota
+     * of its subtree vs just its own. If the reduction is associative, it chooses
+     * the larger of the two; if not, it can parallelize both when applicable and
+     * when they fit.
+     *
+     * Finally, the map is returned such that a node is in the map iff it will
+     * be parallelized. If not, the fork will use the parent's quota and serialize
+     * over the Fork's ThreadIDs. Nodes may be removed from the map while
+     * traversing up the tree when a conflicting ancestor (conflicting due to
+     * associativity or the thread limit) has a larger factor.
+     */
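+    /*
+     * Worked example (illustrative): with max_num_threads = 1024, a root-level
+     * fork of factor 4 whose only child fork has factor 8, and every reduce
+     * carrying the ParallelReduce schedule: the child returns quota 8; the
+     * parent sees no associative reduces and 4 * 8 <= 1024, so both forks land
+     * in the map and the subtree quota becomes 32.
+     */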
+    fn get_thread_quotas(
+        &self,
+        fork_tree: &HashMap<NodeID, HashSet<NodeID>>,
+        root_fork: NodeID,
+    ) -> (HashMap<NodeID, (usize, usize, usize)>, usize) {
+        let (tree_map, tree_quota, _) = self.recurse_thread_quotas(root_fork, fork_tree, true);
+        (tree_map, tree_quota)
+    }
+
+    fn recurse_thread_quotas(
+        &self,
+        curr_fork: NodeID,
+        fork_tree: &HashMap<NodeID, HashSet<NodeID>>,
+        is_root: bool,
+    ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) {
+        // subsubtree_map is the union of the maps returned for grandchildren and
+        // lower nodes. children_quota_map maps each parallelized child to its
+        // quota and is used to promote the subsubtree map (grandchildren level)
+        // to the subtree map (children level). subtree_quota is the cumulative
+        // factor of the subtree and is then compared to this fork's own factor.
+        let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree
+            .get(&curr_fork)
+            .unwrap()
+            .iter()
+            .map(|child| (child, self.recurse_thread_quotas(*child, fork_tree, false)))
+            .fold(
+                (HashMap::new(), HashMap::new(), 1),
+                |(mut subsubtree_map, mut children_quota_map, subtree_quota), (child, (curr_map, curr_quota, use_curr))| {
+                    subsubtree_map.extend(curr_map);
+                    if use_curr {
+                        children_quota_map.insert(child, curr_quota);
+                    }
+                    (subsubtree_map, children_quota_map, subtree_quota.max(curr_quota))
+                },
+            );
+        // First update children_quota_map items with full information and add
+        // to subsubtree_map
+        for (&child, quota) in children_quota_map.iter() {
+            let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else {
+                panic!("Expected fork node");
+            };
+            let fork_size = self.multiply_fork_factors(factors).unwrap();
+            subsubtree_map.insert(*child, (subtree_quota, *quota, fork_size));
+        }
+        let subtree_map = subsubtree_map;
+        if is_root {
+            return (subtree_map, subtree_quota, true)
+        }
+        // A node can only be considered for parallelization if:
+        // a) it has statically known size
+        // b) the known size is less than or equal to the max_num_threads
+        // c) the known size is a power of 2
+        // d) all reduces are parallel-reduce or associative
+        //
+        // If not, just take the max cumulative factor of its subtree
+        let reduces = &self.fork_reduce_map[&curr_fork];
+        if let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()]
+            && let Some(fork_size) = self.multiply_fork_factors(factors)
+            && fork_size <= self.kernel_params.max_num_threads
+            && fork_size.is_power_of_two()
+            && reduces.iter().all(|&reduce| {
+                self.function.schedules[reduce.idx()].contains(&Schedule::ParallelReduce)
+                    || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative)
+            })
+        {
+            // If there's an associative Reduce, parallelize the larger factor
+            // between the Fork and subtree
+            // Else, all Reduces must be only parallel-reduce, so parallelize
+            // both if they fit and the larger if not.
+            // The reason for this distinction is that we only perform Reduces over
+            // ThreadID-based values on consecutive CUDA threads, so there's no
+            // opportunity for further nested parallelization. In contrast, this
+            // restriction doesn't apply to parallel Writes, so nested
+            // parallelization is possible.
+            if reduces.iter().any(|&reduce| {
+                self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative)
+            }) || fork_size > self.kernel_params.max_num_threads / subtree_quota {
+                if fork_size >= subtree_quota {
+                    (HashMap::new(), fork_size, true)
+                } else {
+                    (subtree_map, subtree_quota, false)
+                }
+            } else {
+                (subtree_map, fork_size * subtree_quota, true)
+            }
+        } else {
+            (subtree_map, subtree_quota, false)
+        }
+    }
+
+    /*
+     * All non-reduced-over collections used in fork-joins have an extra dimension.
+     * However, this is only useful if ThreadIDs run in parallel, not serially;
+     * otherwise it unnecessarily consumes shared memory. This function returns
+     * the set of collections that have an unnecessary extra dimension.
+     */
+    fn get_extra_dim_collects(
+        &self,
+        fork_control_map: &HashMap<NodeID, HashSet<NodeID>>,
+        fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>,
+    ) -> HashSet<TypeID> {
+        // Determine which fork each collection is used in, and check if it's
+        // parallelized via the fork_thread_quota_map.
+        todo!()
+    }
+
+    fn codegen_data_control_no_forks(
+        &self,
+        extra_dim_collects: &HashSet<TypeID>,
+        dynamic_shared_offset: &mut String,
+        gotos: &mut BTreeMap<NodeID, CudaGoto>,
+    ) -> Result<(), Error> {
+        (0..self.function.nodes.len())
+            .filter(|idx| self.function.nodes[*idx].is_control())
+            .try_for_each(|idx| -> Result<(), Error> {
+                let control = NodeID::new(idx);
+                let goto = gotos.get_mut(&control).unwrap();
+                let init = &mut goto.init;
+                let post_init = &mut goto.post_init;
+                let body = &mut goto.body;
+                let term = &mut goto.term;
+                let mut tabs = self.codegen_control_node(control, None, None, None, init, post_init, term)?;
+                for data in self.bbs.1[control.idx()].iter() {
+                    self.codegen_data_node(*data, KernelState::OutBlock, Some(false), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?;
+                }
+                Ok(())
+            })
+    }
+
+    /*
+     * Codegen for all control and data nodes.
+     */
+    fn codegen_data_control(
+        &self,
+        block_fork: Option<NodeID>,
+        thread_root_forks: &HashSet<NodeID>,
+        fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>,
+        extra_dim_collects: &HashSet<TypeID>,
+        dynamic_shared_offset: &mut String,
+        is_block_parallel: bool,
+        num_threads: usize,
+        gotos: &mut BTreeMap<NodeID, CudaGoto>,
+    ) -> Result<(), Error> {
+        // First emit data and control gen for each control node outside any fork.
+        // Recall that this was tracked through a fake fork node with NodeID 0.
+        let mut state = KernelState::OutBlock;
+        for control in self.fork_control_map.get(&NodeID::new(0)).unwrap() {
+            let goto = gotos.get_mut(control).unwrap();
+            let init = &mut goto.init;
+            let post_init = &mut goto.post_init;
+            let body = &mut goto.body;
+            let term = &mut goto.term;
+            let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?;
+            for data in self.bbs.1[control.idx()].iter() {
+                self.codegen_data_node(*data, state, Some(is_block_parallel), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?;
+            }
+        }
+        // Then generate data and control for the single block fork if it exists
+        if block_fork.is_some() {
+            state = KernelState::InBlock;
+            for control in self.fork_control_map.get(&block_fork.unwrap()).unwrap() {
+                let goto = gotos.get_mut(control).unwrap();
+                let init = &mut goto.init;
+                let post_init = &mut goto.post_init;
+                let body = &mut goto.body;
+                let term = &mut goto.term;
+                let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?;
+                for data in self.bbs.1[control.idx()].iter() {
+                    self.codegen_data_node(*data, state, None, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?;
+                }
+            }
+        }
+        // Then generate for the thread fork tree through Fork node traversal.
+        state = KernelState::InThread;
+        for &root_fork in thread_root_forks {
+            self.codegen_data_control_traverse(
+                root_fork,
+                state,
+                fork_thread_quota_map,
+                1,
+                num_threads,
+                extra_dim_collects,
+                dynamic_shared_offset,
+                gotos,
+            )?;
+        }
+        Ok(())
+    }
+
+    /*
+     * The important feature of this traversal is that we update the available
+     * thread quota, use thread quota, and parallel factor for each Fork node.
+     * Either this information is in the precomputed map, or we use the parent's
+     * quota with no parallel factor.
+     */
+    fn codegen_data_control_traverse(
+        &self,
+        curr_fork: NodeID,
+        state: KernelState,
+        fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>,
+        parent_quota: usize,
+        num_threads: usize,
+        extra_dim_collections: &HashSet<TypeID>,
+        dynamic_shared_offset: &mut String,
+        gotos: &mut BTreeMap<NodeID, CudaGoto>,
+    ) -> Result<(), Error> {
+        let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map
+            .get(&curr_fork)
+            .map(|(a, u, f)| (*a, *u, Some(*f)))
+            .unwrap_or((parent_quota, parent_quota, None));
+        let reduces = &self.fork_reduce_map[&curr_fork];
+        let reducts = if parallel_factor.is_some() {
+            reduces
+            .iter()
+            .map(|&reduce| {
+                let Node::Reduce { control: _, init: _, reduct} = &self.function.nodes[reduce.idx()] else {
+                    panic!("Expected reduce node");
+                };
+                *reduct
+            })
+            .collect()
+        } else {
+            HashSet::new()
+        };
+        for control in self.fork_control_map.get(&curr_fork).unwrap() {
+            let goto = gotos.get_mut(control).unwrap();
+            let init = &mut goto.init;
+            let post_init = &mut goto.post_init;
+            let body = &mut goto.body;
+            let term = &mut goto.term;
+            let mut tabs = self.codegen_control_node(*control, Some(available_thread_quota), Some(use_thread_quota), parallel_factor, init, post_init, term)?;
+            for data in self.bbs.1[control.idx()].iter() {
+                self.codegen_data_node(
+                    *data,
+                    state,
+                    None,
+                    Some(use_thread_quota),
+                    parallel_factor,
+                    Some(curr_fork),
+                    reducts.contains(data),
+                    extra_dim_collections,
+                    dynamic_shared_offset,
+                    body,
+                    &mut tabs,
+                )?;
+            }
+        }
+        for child in self.fork_tree.get(&curr_fork).unwrap() {
+            self.codegen_data_control_traverse(
+                *child,
+                state,
+                fork_thread_quota_map,
+                use_thread_quota,
+                num_threads,
+                extra_dim_collections,
+                dynamic_shared_offset,
+                gotos,
+            )?;
+        }
+        Ok(())
+    }
+
+    fn codegen_data_node(
+        &self,
+        id: NodeID,
+        state: KernelState,
+        is_block_parallel: Option<bool>,
+        use_thread_quota: Option<usize>,
+        parallel_factor: Option<usize>,
+        nesting_fork: Option<NodeID>,
+        is_special_reduct: bool,
+        extra_dim_collects: &HashSet<TypeID>,
+        dynamic_shared_offset: &mut String,
+        w: &mut String,
+        num_tabs: &mut usize,
+    ) -> Result<(), Error> {
+        let define_variable = self.get_value(id, false, false).to_string();
+        let tabs = "\t".repeat(*num_tabs);
+        match &self.function.nodes[id.idx()] {
+            Node::Phi { control: _, data: _ } => {
+                write!(w, "{}{} = {}_tmp;\n", tabs, define_variable, define_variable)?;
+            }
+            Node::ThreadID { control, dimension } => {
+                let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else {
+                    panic!("Expected ThreadID's control to be a fork node");
+                };
+                let divide = multiply_dcs(&factors[dimension + 1..]);
+                let modulo = format!("dc{}", factors[*dimension].idx());
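+                // Row-major unflattening: for factors [f0, f1, f2] and dimension
+                // 1, the ThreadID is (flat index / dc(f2)) % dc(f1).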
+                match state {
+                    KernelState::InBlock => {
+                        write!(
+                            w,
+                            "{}{} = (blockIdx.x / ({})) % {};\n",
+                            tabs, define_variable, divide, modulo
+                        )?;
+                    }
+                    KernelState::InThread => {
+                        if parallel_factor.is_none() {
+                            // No dependence on threadIdx.x because each used thread
+                            // will run this Fork serially
+                            let fork_iter = self.get_fork_iter(*control, false);
+                            write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?;
+                        } else {
+                            // We can directly use use_thread_quota and not worry about available
+                            // because Fork basic block's init section already does gating
+                            write!(w, "{}{} = (threadIdx.x % {}) / {};\n", tabs, define_variable, use_thread_quota.unwrap(), use_thread_quota.unwrap() / parallel_factor.unwrap())?;
+                        }
+                    }
+                    _ => {
+                        panic!("Unsupported state for ThreadID")
+                    }
+                }
+            }
+            // The Reduce node only generates its initialization, as the reduct will
+            // perform the update. If serialized, add a gate to prevent re-assignment
+            // when we hit this reduce again due to the control flow loop between
+            // the Fork and Join.
+            Node::Reduce {
+                control: _,
+                init,
+                reduct: _,
+            } => {
+                let init_val = self.get_value(*init, false, false);
+                if parallel_factor.is_none() && KernelState::InThread == state {
+                    let Some(nesting_fork) = nesting_fork else {
+                        panic!("Expected reduce to be nested in a fork node");
+                    };
+                    let fork_iter = self.get_fork_iter(nesting_fork, false);
+                    write!(w, "{}if ({} == 0) {{\n", tabs, fork_iter)?;
+                    write!(w, "{}\t{} = {};\n", tabs, define_variable, init_val)?;
+                    write!(w, "{}}}\n", tabs)?;
+                } else {
+                    write!(w, "{}{} = {};\n", tabs, define_variable, init_val)?;
+                }
+            }
+            // Parameters emitted at top
+            Node::Parameter { index: _ } => {}
+            // If the constant is primitive, it's stored in a register, so we repeat
+            // the assignment for all threads. Otherwise, it can be inside or outside
+            // the block fork. If inside, it's stored in shared memory, so we
+            // "allocate" it once and parallelize the memset to 0. If outside, we
+            // initialize it as an offset into the backing memory, but for a
+            // multi-block grid we skip the memset to avoid a grid-level sync.
+            Node::Constant { id: cons_id } => {
+                let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
+                let cg_tile = match state {
+                    KernelState::OutBlock | KernelState::InBlock => "block".to_string(),
+                    KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId),
+                };
+                if !is_primitive && state != KernelState::OutBlock {
+                    write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?;
+                    *num_tabs += 1;
+                }
+                if is_primitive || state != KernelState::OutBlock {
+                    self.codegen_constant(
+                        define_variable.clone(),
+                        *cons_id,
+                        true,
+                        Some(extra_dim_collects),
+                        dynamic_shared_offset,
+                        w,
+                        *num_tabs,
+                    )?;
+                }
+                if !is_primitive && state != KernelState::OutBlock {
+                    write!(w, "{}}}\n", tabs)?;
+                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                    *num_tabs -= 1;
+                }
+                if !is_primitive && state == KernelState::OutBlock {
+                    let (_, offsets) = &self.backing_allocation[&Device::CUDA];
+                    let offset = offsets[&id];
+                    write!(w, "{}{} = backing + dc{};\n", tabs, define_variable, offset.idx())?;
+                }
+                if !is_primitive && (state != KernelState::OutBlock || is_block_parallel.is_none() || !is_block_parallel.unwrap()) {
+                    let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects));
+                    write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
+                    write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
+                    write!(w, "{}}}\n", tabs)?;
+                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                }
+            }
+            // Dynamic constants emitted at top
+            Node::DynamicConstant { id: _ } => {}
+            Node::Unary { op, input } => match op {
+                UnaryOperator::Not => match &self.types[self.typing[input.idx()].idx()] {
+                    Type::Boolean => {
+                        write!(
+                            w,
+                            "{}{} = !{};\n",
+                            tabs,
+                            define_variable,
+                            self.get_value(*input, false, false),
+                        )?;
+                    }
+                    ty if ty.is_fixed() => {
+                        write!(
+                            w,
+                            "{}{} = ~{};\n",
+                            tabs,
+                            define_variable,
+                            self.get_value(*input, false, false),
+                        )?;
+                    }
+                    _ => panic!("Unsupported type for not operator"),
+                },
+                UnaryOperator::Neg => match &self.types[self.typing[input.idx()].idx()] {
+                    ty if ty.is_signed() || ty.is_float() => {
+                        write!(
+                            w,
+                            "{}{} = -{};\n",
+                            tabs,
+                            define_variable,
+                            self.get_value(*input, false, false),
+                        )?;
+                    }
+                    _ => {
+                        panic!("Unsupported type for neg operator")
+                    }
+                },
+                UnaryOperator::Cast(dst_ty_id) => {
+                    write!(
+                        w,
+                        "{}{} = static_cast<{}>({});\n",
+                        tabs,
+                        define_variable,
+                        self.get_type(*dst_ty_id, false),
+                        self.get_value(*input, false, false),
+                    )?;
+                }
+            },
+            Node::Binary { op, left, right } => {
+                let mut left_val = self.get_value(*left, false, false);
+                let mut right_val = self.get_value(*right, false, false);
+                let id_type = self.typing[id.idx()];
+                if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And
+                    | BinaryOperator::Xor) && is_special_reduct {
+                    // For parallelized associative Reduces, use the cooperative
+                    // groups reduce API. Associative multiplication is not
+                    // supported. We need to use CGType::Use not CGType::UsePerId
+                    // because for parallelized reduction we only have one thread
+                    // per ThreadID and the reduction is over Use, not UsePerId.
+                    let (reduce_val, non_reduce_val) = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[left.idx()] {
+                        (left_val, right_val)
+                    } else {
+                        (right_val, left_val)
+                    };
+                    // Special reduct is only enabled for thread parallelization
+                    // so don't need to worry about grid and block cases
+                    let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use);
+                    #[allow(unreachable_patterns)]
+                    let cg_op = match op {
+                        BinaryOperator::Add => "plus",
+                        BinaryOperator::Or => "bit_or",
+                        BinaryOperator::And => "bit_and",
+                        BinaryOperator::Xor => "bit_xor",
+                        _ => unreachable!(),
+                    };
+                    let id_type_name = self.get_type(id_type, false);
+                    write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, cg_tile, non_reduce_val, cg_op, id_type_name)?;
+                    // Set up the binop between the reduce's init and the reduced
+                    // reduct. Since the op is associative, reordering is safe.
+                    left_val = define_variable.clone();
+                    right_val = reduce_val;
+                }
+                match (op, &self.types[id_type.idx()]) {
+                    (BinaryOperator::Or, Type::Boolean) => write!(
+                        w,
+                        "{}{} = {} || {};\n",
+                        tabs, define_variable, left_val, right_val,
+                    )?,
+                    (BinaryOperator::And, Type::Boolean) => write!(
+                        w,
+                        "{}{} = {} && {};\n",
+                        tabs, define_variable, left_val, right_val,
+                    )?,
+                    (BinaryOperator::Rem, Type::Float32) => write!(
+                        w,
+                        "{}{} = fmodf({}, {});\n",
+                        tabs, define_variable, left_val, right_val,
+                    )?,
+                    (BinaryOperator::Rem, Type::Float64) => write!(
+                        w,
+                        "{}{} = fmod({}, {});\n",
+                        tabs, define_variable, left_val, right_val,
+                    )?,
+                    (op, _) => write!(
+                        w,
+                        "{}{} = {} {} {};\n",
+                        tabs,
+                        define_variable,
+                        left_val,
+                        match op {
+                            BinaryOperator::Add => "+",
+                            BinaryOperator::Sub => "-",
+                            BinaryOperator::Mul => "*",
+                            BinaryOperator::Div => "/",
+                            BinaryOperator::Rem => "%",
+                            BinaryOperator::LT => "<",
+                            BinaryOperator::LTE => "<=",
+                            BinaryOperator::GT => ">",
+                            BinaryOperator::GTE => ">=",
+                            BinaryOperator::EQ => "==",
+                            BinaryOperator::NE => "!=",
+                            BinaryOperator::Or => "|",
+                            BinaryOperator::And => "&",
+                            BinaryOperator::Xor => "^",
+                            BinaryOperator::LSh => "<<",
+                            BinaryOperator::RSh => ">>",
+                        },
+                        right_val,
+                    )?,
+                };
+            }
+            Node::Ternary {
+                op,
+                first,
+                second,
+                third,
+            } => match op {
+                TernaryOperator::Select => {
+                    write!(
+                        w,
+                        "{}{} = {} ? {} : {};\n",
+                        tabs,
+                        define_variable,
+                        self.get_value(*first, false, false),
+                        self.get_value(*second, false, false),
+                        self.get_value(*third, false, false),
+                    )?;
+                }
+            },
+            Node::IntrinsicCall { intrinsic, args } => {
+                let id_type = self.typing[id.idx()];
+                if matches!(intrinsic, Intrinsic::Max | Intrinsic::Min) && is_special_reduct {
+                    // Similar to associative Binops
+                    let non_reduce_arg = if let Node::Reduce { control: _, init: _, reduct: _ } = &self.function.nodes[args[0].idx()] {
+                        self.get_value(args[1], false, false)
+                    } else {
+                        self.get_value(args[0], false, false)
+                    };
+                    let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use);
+                    #[allow(unreachable_patterns)]
+                    let cg_op = match intrinsic {
+                        Intrinsic::Max => "max",
+                        Intrinsic::Min => "min",
+                        _ => unreachable!(),
+                    };
+                    let id_type_name = self.get_type(id_type, false);
+                    write!(w, "{}{} = cg::reduce({}, {}, cg::{}<{}>());\n", tabs, define_variable, cg_tile, non_reduce_arg, cg_op, id_type_name)?;
+                } else {
+                    let ty = &self.types[id_type.idx()];
+                    let intrinsic = self.codegen_intrinsic(intrinsic, ty);
+                    let args = args.iter()
+                        .map(|arg| self.get_value(*arg, false, false))
+                        .collect::<Vec<_>>()
+                        .join(", ");
+                    write!(
+                        w,
+                        "{}{} = {}({});\n",
+                        tabs,
+                        define_variable,
+                        intrinsic,
+                        args,
+                    )?;
+                }
+            }
+            // Read of primitive requires load after pointer math.
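+            // E.g. an i32 read lowers to roughly
+            // `read7 = *reinterpret_cast<int*>(collect5 + <offset math>);` (illustrative).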
+            Node::Read { collect, indices } => {
+                let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects);
+                let data_type_id = self.typing[id.idx()];
+                if self.types[data_type_id.idx()].is_primitive() {
+                    let type_name = self.get_type(data_type_id, true);
+                    write!(w, "{}{} = *reinterpret_cast<{}>({});\n", tabs, define_variable, type_name, collect_with_indices)?;
+                } else {
+                    write!(w, "{}{} = {};\n", tabs, define_variable, collect_with_indices)?;
+                }
+            }
+            // Write of primitive needs a thread rank gate for safety. Write of
+            // collection is memcpy that we distribute among threads.
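+            // The distributed copy below has roughly the shape
+            // `for (i = rank; i < size; i += tile.size()) dst[i] = src[i];` over bytes.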
+            Node::Write {
+                collect,
+                data,
+                indices,
+            } => {
+                let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects);
+                let data_variable = self.get_value(*data, false, false);
+                let data_type_id = self.typing[data.idx()];
+                let cg_tile = match state {
+                    KernelState::OutBlock | KernelState::InBlock => "block".to_string(),
+                    KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId),
+                };
+                if self.types[data_type_id.idx()].is_primitive() {
+                    write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?;
+                    let type_name = self.get_type(data_type_id, true);
+                    write!(w, "{}\t*reinterpret_cast<{}>({}) = {};\n", tabs, type_name, collect_with_indices, data_variable)?;
+                    write!(w, "{}}}\n", tabs)?;
+                } else {
+                    let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects));
+                    write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
+                    write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, collect_with_indices, data_variable)?;
+                    write!(w, "{}}}\n", tabs)?;
+                    write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
+                    write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?;
+                    write!(w, "{}}}\n", tabs)?;
+                }
+                write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                let collect_variable = self.get_value(*collect, false, false);
+                write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?;
+            }
+            _ => {
+                panic!("Unsupported data node type: {:?}", self.function.nodes[id.idx()])
+            }
+        }
+        // Since reducts are responsible for updating Reduce nodes, we check
+        // whether this data node is a reduct and, if so, copy its value into
+        // each Reduce node it feeds.
+        if let Some(reduces) = self.reduct_reduce_map.get(&id) {
+            let val = self.get_value(id, false, false);
+            for reduce in reduces {
+                let reduce_val = self.get_value(*reduce, false, false);
+                write!(w, "{}{} = {};\n", tabs, reduce_val, val)?;
+            }
+        }
+        Ok(())
+    }
+
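+    /*
+     * Emit a control node's code into its block's init, post-init, and
+     * terminator sections. Returns the number of tabs the block's data
+     * nodes should be indented by.
+     */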
+    fn codegen_control_node(
+        &self,
+        id: NodeID,
+        available_thread_quota: Option<usize>,
+        use_thread_quota: Option<usize>,
+        parallel_factor: Option<usize>,
+        w_init: &mut String,
+        w_post_init: &mut String,
+        w_term: &mut String,
+    ) -> Result<usize, Error> {
+        for (data, phi) in self.control_data_phi_map.get(&id).unwrap_or(&vec![]).iter() {
+            let data = self.get_value(*data, false, false);
+            let phi = self.get_value(*phi, false, false);
+            write!(w_term, "\t{}_tmp = {};\n", phi, data)?;
+        }
+        let tabs = match &self.function.nodes[id.idx()] {
+            Node::Start
+            | Node::Region { preds: _ }
+            | Node::Projection { control: _, selection: _ } => {
+                let succ = self.control_subgraph.succs(id).next().unwrap();
+                write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?;
+                1
+            }
+            Node::If { control: _, cond } => {
+                let mut succs = self.control_subgraph.succs(id);
+                let succ1 = succs.next().unwrap();
+                let succ2 = succs.next().unwrap();
+                let succ1_is_true = self.function.nodes[succ1.idx()].try_projection(1).is_some();
+                let succ1_block_name = self.get_block_name(succ1, false);
+                let succ2_block_name = self.get_block_name(succ2, false);
+                write!(
+                    w_term,
+                    "\tif ({}) {{\n",
+                    self.get_value(*cond, false, false)
+                )?;
+                write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ1_block_name.clone() } else { succ2_block_name.clone() })?;
+                write!(w_term, "\t}} else {{\n")?;
+                write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ2_block_name } else { succ1_block_name })?;
+                write!(w_term, "\t}}\n")?;
+                1
+            }
+            Node::Fork { control: _, factors: _ } => {
+                // We create a cooperative group tile for each of: threads used
+                // per thread ID (for reads and writes), threads used across all
+                // thread IDs (for parallelized reductions), and all available
+                // threads (to synchronize between used and unused threads). We
+                // want to create these only once, so we emit two goto sections
+                // per fork: one that runs only once to create the groups, and
+                // another that may run multiple times if the Fork is serialized
+                // and the Join jumps back to it.
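+                // E.g. with 4 threads per ThreadID, parallel factor 8, and 64
+                // available threads, this emits roughly (names illustrative):
+                //   cg::thread_block_tile<4> cg_fork1 = cg::tiled_partition<4>(block);
+                //   cg::thread_block_tile<32> cg_fork1_use = cg::tiled_partition<32>(block);
+                //   cg::thread_block_tile<64> cg_fork1_available = cg::tiled_partition<64>(block);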
+                let cg_tile = self.get_cg_tile(id, CGType::UsePerId);
+                if use_thread_quota.is_some() {
+                    let use_thread_quota = use_thread_quota.unwrap();
+                    let use_thread_per_id = if parallel_factor.is_some() {
+                        use_thread_quota / parallel_factor.unwrap()
+                    } else {
+                        use_thread_quota
+                    };
+                    write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id)?;
+                    let cg_tile_use = self.get_cg_tile(id, CGType::Use);
+                    write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota)?;
+                    let available_thread_quota = available_thread_quota.unwrap();
+                    let cg_tile_available = self.get_cg_tile(id, CGType::Available);
+                    write!(w_init, "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota)?;
+                    if parallel_factor.is_none() {
+                        write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?;
+                        write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?;
+                    }
+                }
+                // Fork nodes gate the used threads from the unused threads out
+                // of all available threads. Unused threads jump straight to the
+                // Join, while used threads jump to the successor as normal.
+                let succ = self.control_subgraph.succs(id).next().unwrap();
+                if let Some(available_thread_quota) = available_thread_quota && let Some(use_thread_quota) = use_thread_quota && use_thread_quota < available_thread_quota {
+                    let w_target = if parallel_factor.is_none() { w_post_init } else { w_init };
+                    write!(w_target, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?;
+                    write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?;
+                    write!(w_term, "\t}}\n")?;
+                    write!(w_term, "\telse {{\n")?;
+                    let join = self.fork_join_map.get(&id).unwrap();
+                    write!(w_term, "\t\tgoto {};\n", self.get_block_name(*join, false))?;
+                    write!(w_term, "\t}}\n")?;
+                    2
+                } else {
+                    // Make sure post-init isn't empty so that its goto header
+                    // gets generated.
+                    if use_thread_quota.is_some() && parallel_factor.is_none() {
+                        write!(w_post_init, " ")?;
+                    }
+                    write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?;
+                    1
+                }
+            }
+            Node::Join { control: _ } => {
+                // Join nodes also gate the used vs unused threads with a tile
+                // sync after the body.
+                let succ = self.control_subgraph.succs(id).next().unwrap();
+                let has_thread_quota = available_thread_quota.is_some();
+                let mut tabs = 1;
+                if has_thread_quota {
+                    let available_thread_quota = available_thread_quota.unwrap();
+                    let use_thread_quota = use_thread_quota.unwrap();
+                    if use_thread_quota < available_thread_quota {
+                        write!(w_init, "\tif (threadIdx.x % {} < {}) {{\n", available_thread_quota, use_thread_quota)?;
+                        write!(w_term, "\t}}\n")?;
+                        tabs += 1;
+                    }
+                    let fork = self.join_fork_map.get(&id).unwrap();
+                    let cg_tile_available = self.get_cg_tile(*fork, CGType::Available);
+                    write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
+                }
+                // If the Fork was parallelized, each thread or UsePerId tile of
+                // threads only runs one ThreadID, so we can jump straight to the
+                // successor. Else, we jump back to the Fork until all ThreadIDs,
+                // or equivalently the Fork's full factor number of iterations,
+                // have been completed.
+                if parallel_factor.is_some() {
+                    write!(w_term, "\tgoto {};\n", self.get_block_name(succ, false))?;
+                } else {
+                    let fork = self.join_fork_map.get(&id).unwrap();
+                    let Node::Fork { factors, .. } = &self.function.nodes[fork.idx()] else {
+                        panic!("Expected join_fork_map to point to a fork node");
+                    };
+                    let fork_size = multiply_dcs(factors);
+                    let fork_iter = self.get_fork_iter(*fork, false);
+                    write!(w_term, "\t{} += 1;\n", fork_iter)?;
+                    write!(w_term, "\tif ({} == {}) {{\n", fork_iter, fork_size)?;
+                    write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ, false))?;
+                    write!(w_term, "\t}}\n")?;
+                    write!(w_term, "\telse {{\n")?;
+                    write!(w_term, "\t\tgoto {};\n", self.get_block_name(*fork, true))?;
+                    write!(w_term, "\t}}\n")?;
+                }
+                tabs
+            }
+            Node::Return { control: _, data } => {
+                if self.return_parameter.is_none() {
+                    // Since we lift the return value into a kernel argument, we
+                    // write to that argument upon return.
+                    let return_val = self.get_value(*data, false, false);
+                    let return_type_ptr = self.get_type(self.function.return_type, true);
+                    write!(w_term, "\tif (grid.thread_rank() == 0) {{\n")?;
+                    write!(w_term, "\t\t*(reinterpret_cast<{}>(ret)) = {};\n", return_type_ptr, return_val)?;
+                    write!(w_term, "\t}}\n")?;
+                }
+                write!(w_term, "\treturn;\n")?;
+                1
+            }
+            _ => {
+                panic!("Unsupported control node type")
+            }
+        };
+        Ok(tabs)
+    }
+
+    /*
+     * This function emits collection name + pointer math for the provided indices.
+     * All collection types use char pointers.
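+     * E.g. indexing a 2-D i32 array `a` with extents (dc0, dc1) at (i, j)
+     * yields roughly `a + 0 + 1 * (j + dc1 * (i)) * 4` (illustrative).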
+     */
+    fn codegen_collect(&self, collect: NodeID, indices: &[Index], extra_dim_collects: &HashSet<TypeID>) -> String {
+        let mut index_ptr = "0".to_string();
+        let type_id = self.typing[collect.idx()];
+        for index in indices {
+            match index {
+                Index::Field(field) => {
+                    // Add the byte offset of this field within the product.
+                    let offset = self.get_size(type_id, Some(*field), Some(extra_dim_collects));
+                    index_ptr.push_str(&format!(" + {}", offset));
+                }
+                // Variants of summations have zero offset
+                Index::Variant(_) => {}
+                // Convert the multi-dimensional array index to a 1-D index, and
+                // then to a byte offset by multiplying by the element size.
+                Index::Position(array_indices) => {
+                    let has_extra_dim = extra_dim_collects.contains(&self.typing[collect.idx()]);
+                    if has_extra_dim {
+                        continue;
+                    }
+                    let Type::Array(element_type, extents) =
+                        &self.types[self.typing[collect.idx()].idx()]
+                    else {
+                        panic!("Expected array type")
+                    };
+                    let mut cumulative_offset = multiply_dcs(&extents[array_indices.len()..]);
+                    let max_left_array_index = array_indices.len() - 1;
+                    for (i, index) in array_indices.iter().rev().enumerate() {
+                        cumulative_offset = format!(
+                            "{} * ({}{}",
+                            cumulative_offset,
+                            self.get_value(*index, false, false),
+                            if i != max_left_array_index {
+                                format!(" + dc{}", extents[max_left_array_index - i].idx())
+                            } else {
+                                "".to_string()
+                            }
+                        );
+                    }
+                    index_ptr.push_str(&format!(
+                        " + {}{}",
+                        cumulative_offset,
+                        ")".repeat(array_indices.len())
+                    ));
+                    let element_size = self.get_size(*element_type, None, Some(extra_dim_collects));
+                    index_ptr.push_str(&format!(" * {}", element_size));
+                }
+            }
+        }
+        let name = self.get_value(collect, false, false);
+        format!("{} + {}", name, index_ptr)
+    }
+
+    /*
+     * The outlined codegen for constants allows us to handle recursive initialization
+     * of collections. We perform "allocation" by bumping an offset into dynamic
+     * shared memory; CUDA's support for dynamic shared memory is limited to a
+     * single extern array. Dynamic allocation is required here because not all
+     * dynamic constants, and therefore not all array sizes, are known at compile
+     * time. This approach needs further work, as we currently keep allocating new
+     * shared memory and never reuse old, dead allocations. `allow_allocate`
+     * prevents unnecessary shared memory allocations for nested product and
+     * summation collections, since the outermost allocation covers the full
+     * collection. Since array collections are not initialized, they don't need
+     * to be recursed into.
+     */
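+    // For example, allocating a collection of size `s` at alignment `a` emits
+    // roughly:
+    //   dynamic_shared_offset = ((<offset> + a - 1) / a) * a;
+    //   collect9 = dynamic_shared + dynamic_shared_offset;
+    // with the tracked offset string becoming "<aligned offset> + s" (illustrative).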
+    fn codegen_constant(
+        &self,
+        name: String,
+        cons_id: ConstantID,
+        allow_allocate: bool,
+        extra_dim_collects: Option<&HashSet<TypeID>>,
+        dynamic_shared_offset: &mut String,
+        w: &mut String,
+        num_tabs: usize,
+    ) -> Result<(), Error> {
+        let tabs = "\t".repeat(num_tabs);
+        match &self.constants[cons_id.idx()] {
+            Constant::Boolean(val) => write!(w, "{}{} = {};\n", tabs, name, val)?,
+            Constant::Integer8(val) => write!(w, "{}{} = {};\n", tabs, name, val)?,
+            Constant::UnsignedInteger8(val) => write!(w, "{}{} = {};\n", tabs, name, val)?,
+            Constant::Integer16(val) => write!(w, "{}{} = {};\n", tabs, name, val)?,
+            Constant::UnsignedInteger16(val) => write!(w, "{}{} = {};\n", tabs, name, val)?,
+            Constant::Integer32(val) => write!(w, "{}{} = {};\n", tabs, name, val)?,
+            Constant::UnsignedInteger32(val) => write!(w, "{}{} = {}u;\n", tabs, name, val)?,
+            Constant::Integer64(val) => write!(w, "{}{} = {}ll;\n", tabs, name, val)?,
+            Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?,
+            Constant::Float32(val) => write!(w, "{}{} = {}f;\n", tabs, name, format_float(**val as f64))?,
+            Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, format_float(**val))?,
+            // All three following collections involve align then allocate from the
+            // single dynamic shared memory buffer by using and updating the offset.
+            Constant::Product(type_id, constant_fields) => {
+                if allow_allocate {
+                    let alignment = self.get_alignment(*type_id);
+                    let size = self.get_size(*type_id, None, extra_dim_collects);
+                    *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment);
+                    write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?;
+                    write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?;
+                    *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size);
+                }
+                let Type::Product(type_fields) = &self.types[type_id.idx()] else {
+                    panic!("Product constant should have product type")
+                };
+                for i in 0..constant_fields.len() {
+                    // For each field update offset and issue recursive call
+                    let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects);
+                    let field_constant = &self.constants[constant_fields[i].idx()];
+                    if field_constant.is_scalar() {
+                        let field_type = self.get_type(type_fields[i], true);
+                        self.codegen_constant(
+                            format!("*reinterpret_cast<{}>({}+{})", field_type, name, offset),
+                            constant_fields[i],
+                            false,
+                            None,
+                            dynamic_shared_offset,
+                            w,
+                            num_tabs,
+                        )?;
+                    } else if !field_constant.is_array() {
+                        self.codegen_constant(format!("{}+{}", name, offset), constant_fields[i], false, extra_dim_collects, dynamic_shared_offset, w, num_tabs)?;
+                    }
+                }
+            }
+            Constant::Summation(type_id, variant, field) => {
+                if allow_allocate {
+                    let alignment = self.get_alignment(*type_id);
+                    let size = self.get_size(*type_id, None, extra_dim_collects);
+                    *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment);
+                    write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?;
+                    write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?;
+                    *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size);
+                }
+                // No offset updating needed since all variants start at 0
+                let Type::Summation(variants) = &self.types[type_id.idx()] else {
+                    panic!("Summation constant should have summation type")
+                };
+                let variant_constant = &self.constants[field.idx()];
+                if variant_constant.is_scalar() {
+                    let variant_type = self.get_type(variants[*variant as usize], true);
+                    self.codegen_constant(
+                        format!("*reinterpret_cast<{}>({})", variant_type, name),
+                        *field,
+                        false,
+                        extra_dim_collects,
+                        dynamic_shared_offset,
+                        w,
+                        num_tabs,
+                    )?;
+                } else if !variant_constant.is_array() {
+                    self.codegen_constant(name, *field, false, extra_dim_collects, dynamic_shared_offset, w, num_tabs)?;
+                };
+            }
+            Constant::Array(type_id) => {
+                if !allow_allocate {
+                    panic!("Nested array constant should not be re-allocated");
+                }
+                let alignment = self.get_alignment(*type_id);
+                let size = self.get_size(*type_id, None, extra_dim_collects);
+                *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment);
+                write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?;
+                write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?;
+                *dynamic_shared_offset = format!("{} + {}", dynamic_shared_offset, size);
+            }
+        }
+        Ok(())
+    }
+
+    /*
+     * Emit code to calculate data size. For Product types, setting `num_fields`
+     * gives the size up to but not including that field, so `num_fields = 2`
+     * gives the size of the 1st field, i.e. the offset of the 2nd field. This
+     * is useful for constant initialization and read/write index math.
+     */
+    fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>) -> String {
+        match &self.types[type_id.idx()] {
+            Type::Array(element_type, extents) => {
+                let array_size = if extra_dim_collects.is_some_and(|set| set.contains(&type_id)) {
+                    "1".to_string()
+                } else {
+                    multiply_dcs(extents)
+                };
+                format!("{} * {}", self.get_alignment(*element_type), array_size)
+            }
+            Type::Product(fields) => {
+                let num_fields = num_fields.unwrap_or(fields.len());
+                let with_field = fields
+                    .iter()
+                    .enumerate()
+                    .filter(|(i, _)| *i < num_fields)
+                    .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects), self.get_alignment(*id)))
+                    .fold(String::from("0"), |acc, (size, align)| {
+                        if acc == "0" {
+                            size
+                        } else {
+                            format!(
+                                "({} + {} - 1) / {} * {} + {}",
+                                acc, align, align, align, size
+                            )
+                        }
+                    });
+                if num_fields < fields.len() {
+                    format!(
+                        "{} - {}",
+                        with_field,
+                        self.get_size(fields[num_fields], None, extra_dim_collects)
+                    )
+                } else {
+                    with_field
+                }
+            }
+            Type::Summation(variants) => {
+                // The argmax variant by size is not guaranteed to be same as
+                // argmax variant by alignment, eg product of 3 4-byte primitives
+                // vs 1 8-byte primitive, so we need to calculate both.
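+                // E.g. the product of three 4-byte primitives has size 12 but
+                // alignment 4, while one 8-byte primitive has size 8 but
+                // alignment 8.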
+                let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects)).fold(
+                    String::from("0"),
+                    |acc, x| {
+                        if acc == "0" {
+                            x
+                        } else {
+                            format!("umax({}, {})", acc, x)
+                        }
+                    },
+                );
+                let max_alignment = variants
+                    .iter()
+                    .map(|id| self.get_alignment(*id))
+                    .max()
+                    .unwrap_or(0);
+                format!(
+                    "({} + {} - 1) / {} * {}",
+                    max_size, max_alignment, max_alignment, max_alignment
+                )
+            }
+            _ => format!("{}", self.get_alignment(type_id)),
+        }
+    }
+
+    fn get_alignment(&self, type_id: TypeID) -> usize {
+        match &self.types[type_id.idx()] {
+            Type::Array(element_type, _) => self.get_alignment(*element_type),
+            Type::Product(fields) | Type::Summation(fields) => fields
+                .iter()
+                .map(|field| self.get_alignment(*field))
+                .max()
+                .unwrap_or(0),
+            Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => 1,
+            Type::Integer16 | Type::UnsignedInteger16 => 2,
+            Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4,
+            Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8,
+            _ => panic!("Unsupported type for alignment"),
+        }
+    }
+
+    fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String {
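+        // `__`-prefixed names are CUDA fast-math intrinsics; the rest are
+        // standard CUDA math functions or helpers (e.g. smax, uabs, isqrt)
+        // presumably defined in the emitted kernel preamble.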
+        let func_name = match intrinsic {
+            Intrinsic::Abs => match ty {
+                Type::Float32 => "fabsf",
+                Type::Float64 => "fabs",
+                ty if ty.is_signed() => "abs",
+                ty if ty.is_unsigned() => "uabs",
+                _ => panic!("Unsupported type for Abs"),
+            },
+            Intrinsic::ACos => match ty {
+                Type::Float32 => "acosf",
+                _ => "acos",
+            },
+            Intrinsic::ASin => match ty {
+                Type::Float32 => "asinf",
+                _ => "asin",
+            },
+            Intrinsic::ATan => match ty {
+                Type::Float32 => "atanf",
+                _ => "atan",
+            },
+            Intrinsic::ATan2 => match ty {
+                Type::Float32 => "atan2f",
+                _ => "atan2",
+            },
+            Intrinsic::Ceil => match ty {
+                Type::Float32 => "ceilf",
+                _ => "ceil",
+            },
+            Intrinsic::Cos => match ty {
+                ty if ty.is_float() => "__cosf",
+                _ => "cos",
+            },
+            Intrinsic::Cosh => match ty {
+                ty if ty.is_float() => "coshf",
+                _ => "cosh",
+            },
+            Intrinsic::Exp => match ty {
+                ty if ty.is_float() => "__expf",
+                _ => "exp",
+            },
+            Intrinsic::Exp2 => match ty {
+                Type::Float32 => "exp2f",
+                _ => "exp2",
+            },
+            Intrinsic::Floor => match ty {
+                Type::Float32 => "floorf",
+                _ => "floor",
+            },
+            Intrinsic::Ln => match ty {
+                ty if ty.is_float() => "__logf",
+                _ => "log",
+            },
+            Intrinsic::Log10 => match ty {
+                ty if ty.is_float() => "__log10f",
+                _ => "log10",
+            },
+            Intrinsic::Log2 => match ty {
+                ty if ty.is_float() => "__log2f",
+                _ => "log2",
+            },
+            Intrinsic::Max => match ty {
+                Type::Float32 => "fmaxf",
+                Type::Float64 => "fmax",
+                ty if ty.is_signed() => "smax",
+                ty if ty.is_unsigned() => "umax",
+                _ => "max",
+            },
+            Intrinsic::Min => match ty {
+                Type::Float32 => "fminf",
+                Type::Float64 => "fmin",
+                ty if ty.is_signed() => "smin",
+                ty if ty.is_unsigned() => "umin",
+                _ => "min",
+            },
+            Intrinsic::Pow | Intrinsic::Powf => match ty {
+                Type::Float32 => "__powf",
+                Type::Float64 => "pow",
+                _ => panic!("Unsupported type for Pow"),
+            },
+            Intrinsic::Powi => match ty {
+                ty if ty.is_signed() || ty.is_unsigned() => "powi",
+                _ => panic!("Unsupported type for Powi"),
+            },
+            Intrinsic::Round => match ty {
+                Type::Float32 => "roundf",
+                ty if ty.is_signed() || ty.is_unsigned() => "roundi",
+                _ => "round",
+            },
+            Intrinsic::Sin => match ty {
+                ty if ty.is_float() => "__sinf",
+                _ => "sin",
+            },
+            Intrinsic::Sinh => match ty {
+                ty if ty.is_float() => "sinhf",
+                _ => "sinh",
+            },
+            Intrinsic::Sqrt => match ty {
+                Type::Float32 => "sqrtf",
+                ty if ty.is_signed() || ty.is_unsigned() => "isqrt",
+                _ => "sqrt",
+            },
+            Intrinsic::Tan => match ty {
+                ty if ty.is_float() => "__tanf",
+                _ => "tan",
+            },
+            Intrinsic::Tanh => match ty {
+                ty if ty.is_float() => "tanhf",
+                _ => "tanh",
+            },
+            _ => panic!("Unsupported intrinsic {:?}", intrinsic),
+        };
+        func_name.to_string()
+    }
+
+    fn get_cg_tile(&self, fork: NodeID, cg_type: CGType) -> String {
+        let suffix = if cg_type == CGType::Use {
+            "_use"
+        } else if cg_type == CGType::Available {
+            "_available"
+        } else {
+            ""
+        };
+        format!("cg_{}{}", self.get_value(fork, false, false), suffix)
+    }
+
+    fn get_fork_iter(&self, fork: NodeID, ty: bool) -> String {
+        if ty {
+            format!("unsigned int iter_{}", self.get_value(fork, false, false))
+        } else {
+            format!("iter_{}", self.get_value(fork, false, false))
+        }
+    }
+
+    fn get_block_name(&self, id: NodeID, post: bool) -> String {
+        format!("bb_{}{}", self.get_value(id, false, false), if post { "_post" } else { "" })
+    }
+
+    /*
+     * Setting `ty = true` will return with type in declaration format. `make_pointer`
+     * is only considered if `ty = true` and only relevant for primitive types-
+     * otherwise it makes no difference because collections are already pointers.
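+     * E.g. node 7, an i32 constant, becomes "constant7"; with `ty = true` it
+     * becomes "int constant7", and with `make_pointer` also set, "int* constant7".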
+     */
+    fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String {
+        if let Node::DynamicConstant { id: dc_id } = &self.function.nodes[id.idx()] {
+            if ty {
+                panic!("Dynamic constants shouldn't be re-initialized")
+            }
+            format!("dc{}", dc_id.idx())
+        } else if let Node::Parameter { index } = &self.function.nodes[id.idx()] {
+            if ty {
+                panic!("Parameters shouldn't be re-initialized")
+            }
+            format!("p{}", index)
+        } else if ty {
+            format!(
+                "{} {}",
+                self.get_type(self.typing[id.idx()], make_pointer),
+                self.get_value(id, false, false)
+            )
+        } else {
+            format!(
+                "{}{}",
+                self.function.nodes[id.idx()].lower_case_name(),
+                id.idx()
+            )
+        }
+    }
+
+    fn get_type(&self, id: TypeID, make_pointer: bool) -> String {
+        let ty = &self.types[id.idx()];
+        if ty.is_primitive() {
+            convert_type(ty, make_pointer)
+        } else {
+            format!("char*{}", if make_pointer { "*" } else { "" })
+        }
+    }
+
+    fn multiply_fork_factors(&self, factors: &[DynamicConstantID]) -> Option<usize> {
+        factors.iter().try_fold(1usize, |acc, &factor_id| {
+            evaluate_dynamic_constant(factor_id, self.dynamic_constants)
+                .map(|val| acc.saturating_mul(val))
+        })
+    }
+}
+
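+/* Emit a C expression multiplying the given dynamic constants, e.g. "dc0 * dc2";
+ * an empty list yields "1". */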
+fn multiply_dcs(dcs: &[DynamicConstantID]) -> String {
+    if dcs.is_empty() {
+        "1".to_string()
+    } else {
+        dcs.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * ")
+    }
+}
+
+fn convert_type(ty: &Type, make_pointer: bool) -> String {
+    let mut result = match ty {
+        Type::Boolean => "bool".to_string(),
+        Type::Integer8 => "int8_t".to_string(),
+        Type::UnsignedInteger8 => "uint8_t".to_string(),
+        Type::Integer16 => "short".to_string(),
+        Type::UnsignedInteger16 => "unsigned short".to_string(),
+        Type::Integer32 => "int".to_string(),
+        Type::UnsignedInteger32 => "unsigned int".to_string(),
+        Type::Integer64 => "long long".to_string(),
+        Type::UnsignedInteger64 => "unsigned long long".to_string(),
+        Type::Float32 => "float".to_string(),
+        Type::Float64 => "double".to_string(),
+        _ => panic!("Unsupported type"),
+    };
+    if make_pointer {
+        result.push('*');
+    }
+    result
+}
+
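+/* Rust formats e.g. 3.0f64 as "3", which C reads as an integer literal, so
+ * append ".0" when the formatted value has no '.' or exponent. */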
+fn format_float(val: f64) -> String {
+    let mut s = val.to_string();
+    if !s.contains('.') && !s.contains('e') && !s.contains('E') {
+        s.push_str(".0");
+    }
+    s
+}
diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs
index 6a12901f262c295985f9309b6798d4caa142af37..dab4dbacd31e1f7603466ef8941f07cbdd81e6df 100644
--- a/hercules_cg/src/lib.rs
+++ b/hercules_cg/src/lib.rs
@@ -1,11 +1,17 @@
 #![feature(if_let_guard, let_chains)]
 
 pub mod cpu;
+pub mod gpu;
 pub mod rt;
 
+pub mod fork_tree;
+
 pub use crate::cpu::*;
+pub use crate::gpu::*;
 pub use crate::rt::*;
 
+pub use crate::fork_tree::*;
+
 use std::collections::BTreeMap;
 
 use hercules_ir::*;
diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs
index 9f9188af9708c0d82ab49a8330b8127554b3d8e5..187b3f986b6ae7a62ec69e388b1627761880959f 100644
--- a/hercules_ir/src/ir.rs
+++ b/hercules_ir/src/ir.rs
@@ -893,6 +893,14 @@ impl Type {
         }
     }
 
+    pub fn is_summation(&self) -> bool {
+        if let Type::Summation(_) = self {
+            true
+        } else {
+            false
+        }
+    }
+
     pub fn is_array(&self) -> bool {
         if let Type::Array(_, _) = self {
             true
diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml
index 1db5dc2f555709341ef331149602c29a6bf917c3..0b250d28d9460bd0b19f574c27916e82802664c8 100644
--- a/hercules_opt/Cargo.toml
+++ b/hercules_opt/Cargo.toml
@@ -4,6 +4,9 @@ version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>, Aaron Councilman <aaronjc4@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = ["hercules_cg/cuda"]
+
 [dependencies]
 ordered-float = "*"
 bitvec = "*"
diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs
index d9c689b42b161dcb5db29f8fca91cad4fbea11bb..8f82e86cab73b28f44222a8a3d8eaa4c49f04218 100644
--- a/hercules_rt/build.rs
+++ b/hercules_rt/build.rs
@@ -11,13 +11,14 @@ fn main() {
             .status()
             .expect("PANIC: NVCC failed when building runtime. Is NVCC installed?");
         Command::new("ar")
-            .args(&["crus", "librtdefs.a", "rtdefs.o"])
             .current_dir(&Path::new(&out_dir))
+            .args(&["crus", "librtdefs.a", "rtdefs.o"])
             .status()
             .unwrap();
 
         println!("cargo::rustc-link-search=native={}", out_dir);
         println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
+        println!("cargo::rustc-link-search=native=/usr/local/cuda/lib64");
         println!("cargo::rustc-link-search=native=/opt/cuda/lib/");
         println!("cargo::rustc-link-lib=static=rtdefs");
         println!("cargo::rustc-link-lib=cudart");
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index c0da1096cf65cb643c4ee21b2713e148cbb73ef1..ed5dca1d817f7d738a29e7c4be4a6f589a7dad41 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -152,6 +152,19 @@ impl<'a> HerculesCPURefMut<'a> {
 
 #[cfg(feature = "cuda")]
 impl<'a> HerculesCUDARef<'a> {
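+    /// Copy the device buffer into `dst` and return a CPU reference to the copy.
+    /// Assumes `dst` is at least `self.size` bytes long.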
+    pub fn to_cpu_ref<'b, T>(self, dst: &'b mut [T]) -> HerculesCPURefMut<'b> {
+        unsafe {
+            let size = self.size;
+            let ptr = NonNull::new(dst.as_mut_ptr() as *mut u8).unwrap();
+            __copy_cuda_to_cpu(ptr.as_ptr(), self.ptr.as_ptr(), size);
+            HerculesCPURefMut {
+                ptr,
+                size,
+                _phantom: PhantomData,
+            }
+        }
+    }
+
     pub unsafe fn __ptr(&self) -> *mut u8 {
         self.ptr.as_ptr()
     }
@@ -179,6 +192,19 @@ impl<'a> HerculesCUDARefMut<'a> {
         }
     }
 
+    pub fn to_cpu_ref<'b, T>(self, dst: &'b mut [T]) -> HerculesCPURefMut<'b> {
+        unsafe {
+            let size = self.size;
+            let ptr = NonNull::new(dst.as_mut_ptr() as *mut u8).unwrap();
+            __copy_cuda_to_cpu(ptr.as_ptr(), self.ptr.as_ptr(), size);
+            HerculesCPURefMut {
+                ptr,
+                size,
+                _phantom: PhantomData,
+            }
+        }
+    }
+
     pub unsafe fn __ptr(&self) -> *mut u8 {
         self.ptr.as_ptr()
     }
diff --git a/hercules_rt/src/rtdefs.cu b/hercules_rt/src/rtdefs.cu
index 24bbf2d572e00f99216da9be878d68e86406ba1d..534b297db5fc5b850453e865086af781c16995f0 100644
--- a/hercules_rt/src/rtdefs.cu
+++ b/hercules_rt/src/rtdefs.cu
@@ -7,7 +7,7 @@ extern "C" {
 		}
 		return ptr;
 	}
-	
+
 	void __cuda_dealloc(void *ptr, size_t size) {
 		(void) size;
 		cudaFree(ptr);
@@ -16,15 +16,15 @@ extern "C" {
 	void __cuda_zero_mem(void *ptr, size_t size) {
 		cudaMemset(ptr, 0, size);
 	}
-	
+
 	void __copy_cpu_to_cuda(void *dst, void *src, size_t size) {
 		cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
 	}
-	
+
 	void __copy_cuda_to_cpu(void *dst, void *src, size_t size) {
 		cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
 	}
-	
+
 	void __copy_cuda_to_cuda(void *dst, void *src, size_t size) {
 		cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice);
 	}
diff --git a/hercules_samples/call/Cargo.toml b/hercules_samples/call/Cargo.toml
index c8b570affa63657422f3857f96f6bbc06a52ef1e..38a3d1b6cf4a247b4078e62d380ea942d7fb6e09 100644
--- a/hercules_samples/call/Cargo.toml
+++ b/hercules_samples/call/Cargo.toml
@@ -4,6 +4,9 @@ version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = ["juno_build/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/hercules_samples/call/build.rs b/hercules_samples/call/build.rs
index af48fe64f7c2b778c9841e45ecf13b8a6a5740d2..e7b6dee9041ff6c3e862dec173f9d1c0784c8653 100644
--- a/hercules_samples/call/build.rs
+++ b/hercules_samples/call/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .ir_in_src("call.hir")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .ir_in_src("call.hir")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .ir_in_src("call.hir")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/hercules_samples/call/src/gpu.sch b/hercules_samples/call/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..cc4ef88fc71754e38d8d0b1b03bebe5fc806ea94
--- /dev/null
+++ b/hercules_samples/call/src/gpu.sch
@@ -0,0 +1,17 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.add);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/hercules_samples/ccp/Cargo.toml b/hercules_samples/ccp/Cargo.toml
index 97d4b2ef8efc96af059284df345282213dddab68..c10aced18c8a2261852e49f5f569bf13e689f72b 100644
--- a/hercules_samples/ccp/Cargo.toml
+++ b/hercules_samples/ccp/Cargo.toml
@@ -4,8 +4,8 @@ version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
-[build-dependencies]
-juno_build = { path = "../../juno_build" }
+[features]
+cuda = ["juno_build/cuda"]
 
 [dependencies]
 juno_build = { path = "../../juno_build" }
@@ -13,3 +13,6 @@ hercules_rt = { path = "../../hercules_rt" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
+
+[build-dependencies]
+juno_build = { path = "../../juno_build" }
diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs
index f04d48c7d0ea6df8b16d70b05cedabfc04c1f6f2..d74547ad222b243e8d809ac0ec1a94131f30ba98 100644
--- a/hercules_samples/ccp/build.rs
+++ b/hercules_samples/ccp/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .ir_in_src("ccp.hir")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .ir_in_src("ccp.hir")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .ir_in_src("ccp.hir")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/hercules_samples/ccp/src/gpu.sch b/hercules_samples/ccp/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..d49af8f5a3ec3d41ea23f74fe6ffd319bf1e53d5
--- /dev/null
+++ b/hercules_samples/ccp/src/gpu.sch
@@ -0,0 +1,17 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.tricky);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/hercules_samples/dot/Cargo.toml b/hercules_samples/dot/Cargo.toml
index 99a48115197ce853941223ed360079ec5376583e..9b11ddc10b5d185e020d639e677f5b56a2c3b8d0 100644
--- a/hercules_samples/dot/Cargo.toml
+++ b/hercules_samples/dot/Cargo.toml
@@ -5,7 +5,7 @@ authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
 [features]
-cuda = ["hercules_rt/cuda"]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
 
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs
index 4cfd2a87fba14d3c542bb54806a65da2d1a9b8f5..8657fdc166fe68ad2565a8a0736984c7991be0a7 100644
--- a/hercules_samples/dot/build.rs
+++ b/hercules_samples/dot/build.rs
@@ -4,8 +4,7 @@ fn main() {
     JunoCompiler::new()
         .ir_in_src("dot.hir")
         .unwrap()
-        //.schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
-        .schedule_in_src("cpu.sch")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch
index 956eb99628a03a3efb3d77e97d93a8cb677bbd6a..c65827fd8bb218a53aaad5cf69b8ab4c77a38059 100644
--- a/hercules_samples/dot/src/gpu.sch
+++ b/hercules_samples/dot/src/gpu.sch
@@ -2,12 +2,16 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-auto-outline(*);
-gpu(*);
-host(dot);
+let out = auto-outline(*);
+gpu(out.dot);
 
 ip-sroa(*);
 sroa(*);
 dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
 
 gcm(*);
diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs
index 335e8909375d061438920fed66d58e5bf87daa11..8862c11a9273f9808f2148f1067dcf3f5953c11f 100644
--- a/hercules_samples/dot/src/main.rs
+++ b/hercules_samples/dot/src/main.rs
@@ -1,19 +1,37 @@
 #![feature(concat_idents)]
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("dot");
 
 fn main() {
     async_std::task::block_on(async {
-        let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
-        let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
-        let a = HerculesCPURef::from_slice(&a);
-        let b = HerculesCPURef::from_slice(&b);
-        let mut r = runner!(dot);
-        let c = r.run(8, a, b).await;
-        println!("{}", c);
-        assert_eq!(c, 70.0);
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
+            let a = HerculesCPURef::from_slice(&a);
+            let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
+            let b = HerculesCPURef::from_slice(&b);
+            let mut r = runner!(dot);
+            let c = r.run(8, a, b).await;
+            println!("{}", c);
+            assert_eq!(c, 70.0);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
+            let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
+            let a = a_box.get_ref();
+            let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
+            let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
+            let b = b_box.get_ref();
+            let mut r = runner!(dot);
+            let c = r.run(8, a, b).await;
+            println!("{}", c);
+            assert_eq!(c, 70.0);
+        }
     });
 }
 
diff --git a/hercules_samples/fac/Cargo.toml b/hercules_samples/fac/Cargo.toml
index 9082a4fc4194ac3fa9c694a5a5973f605772a384..72f82672b18e1bf9529075c233a3efab742fda81 100644
--- a/hercules_samples/fac/Cargo.toml
+++ b/hercules_samples/fac/Cargo.toml
@@ -4,6 +4,9 @@ version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = ["juno_build/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs
index 4d8226f11183d9500e6affec4c46110e8626ee69..aa1f73a92d06e3e74c800f9d657ff911e9f85157 100644
--- a/hercules_samples/fac/build.rs
+++ b/hercules_samples/fac/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .ir_in_src("fac.hir")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .ir_in_src("fac.hir")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .ir_in_src("fac.hir")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/hercules_samples/fac/src/gpu.sch b/hercules_samples/fac/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..ac1f60261c5b99d2c8a61e0d503621b1db693ecd
--- /dev/null
+++ b/hercules_samples/fac/src/gpu.sch
@@ -0,0 +1,17 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.fac);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/hercules_samples/matmul/Cargo.toml b/hercules_samples/matmul/Cargo.toml
index 89e46dd682024012942e6a5014cc5f2f6ec12b83..49f05f29af907a72df7cb188c62f65544070f232 100644
--- a/hercules_samples/matmul/Cargo.toml
+++ b/hercules_samples/matmul/Cargo.toml
@@ -5,7 +5,7 @@ authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
 [features]
-cuda = ["hercules_rt/cuda"]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
 
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs
index c15ca97fa4b0730622f28e6cf16f7ab24de7310a..735458c0c8be76bdae6cd7b3b308e38ccae78edd 100644
--- a/hercules_samples/matmul/build.rs
+++ b/hercules_samples/matmul/build.rs
@@ -4,8 +4,7 @@ fn main() {
     JunoCompiler::new()
         .ir_in_src("matmul.hir")
         .unwrap()
-        // .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
-        .schedule_in_src("cpu.sch")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/hercules_samples/matmul/src/cpu.sch b/hercules_samples/matmul/src/cpu.sch
index c00a33144f012fb4d3e06a42d70a6f06840f32d5..aeed7e10583a9161fa261cd6d87d1a9a0d1623a7 100644
--- a/hercules_samples/matmul/src/cpu.sch
+++ b/hercules_samples/matmul/src/cpu.sch
@@ -16,4 +16,6 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
+infer-schedules(*);
+
 gcm(*);
diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch
index 99ac21a60db89d44aff27331c9a5832bc50e8c12..c0a1a5cebd46a47378132d357b7c0fc83a4446a6 100644
--- a/hercules_samples/matmul/src/gpu.sch
+++ b/hercules_samples/matmul/src/gpu.sch
@@ -2,17 +2,20 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-auto-outline(*);
-gpu(*);
-host(matmul);
+let out = auto-outline(*);
+gpu(out.matmul);
 
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
 
+forkify(*);
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
 gcm(*);
-xdot[true](*);
diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs
index 8757a0fd6c8905d5dee8a71312876265f0dabee2..abd25ec9cddbd4be508b3f484cffc1df1365dc4d 100644
--- a/hercules_samples/matmul/src/main.rs
+++ b/hercules_samples/matmul/src/main.rs
@@ -3,13 +3,15 @@
 use rand::random;
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("matmul");
 
 fn main() {
     async_std::task::block_on(async {
         const I: usize = 256;
-        const J: usize = 64;
+        const J: usize = 8; // hardcoded constant in matmul.hir
         const K: usize = 128;
         let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect();
         let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect();
@@ -21,11 +23,24 @@ fn main() {
                 }
             }
         }
-        let a = HerculesCPURef::from_slice(&mut a);
-        let b = HerculesCPURef::from_slice(&mut b);
-        let mut r = runner!(matmul);
-        let c = r.run(I as u64, J as u64, K as u64, a, b).await;
-        assert_eq!(c.as_slice::<i32>(), &*correct_c);
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&mut a);
+            let b = HerculesCPURef::from_slice(&mut b);
+            let mut r = runner!(matmul);
+            let c = r.run(I as u64, J as u64, K as u64, a, b).await;
+            assert_eq!(c.as_slice::<i32>(), &*correct_c);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
+            let mut r = runner!(matmul);
+            let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
+            let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
+            c.to_cpu_ref(&mut c_cpu);
+            assert_eq!(&*c_cpu, &*correct_c);
+        }
     });
 }
 
diff --git a/hercules_samples/matmul/src/matmul.hir b/hercules_samples/matmul/src/matmul.hir
index ab0f384a563ccb6144e59b811745fe5aa76f08dd..f9d37afcbb9e74038aafd35dfacf91f533bc4b99 100644
--- a/hercules_samples/matmul/src/matmul.hir
+++ b/hercules_samples/matmul/src/matmul.hir
@@ -1,9 +1,9 @@
-fn matmul<3>(a: array(i32, #0, #1), b: array(i32, #1, #2)) -> array(i32, #0, #2)
+fn matmul<3>(a: array(i32, #0, 8), b: array(i32, 8, #2)) -> array(i32, #0, #2)
   c = constant(array(i32, #0, #2), [])
   i_j_ctrl = fork(start, #0, #2)
   i_idx = thread_id(i_j_ctrl, 0)
   j_idx = thread_id(i_j_ctrl, 1)
-  k_ctrl = fork(i_j_ctrl, #1)
+  k_ctrl = fork(i_j_ctrl, 8)
   k_idx = thread_id(k_ctrl, 0)
   k_join_ctrl = join(k_ctrl)
   i_j_join_ctrl = join(k_join_ctrl)
@@ -15,4 +15,4 @@ fn matmul<3>(a: array(i32, #0, #1), b: array(i32, #1, #2)) -> array(i32, #0, #2)
   add = add(mul, dot)
   dot = reduce(k_join_ctrl, zero, add)
   update_c = write(update_i_j_c, dot, position(i_idx, j_idx))
-  update_i_j_c = reduce(i_j_join_ctrl, c, update_c)
\ No newline at end of file
+  update_i_j_c = reduce(i_j_join_ctrl, c, update_c)
diff --git a/juno_build/Cargo.toml b/juno_build/Cargo.toml
index 72faf4bd14da65b482f2e379c1b51ce3ede8dcf0..11ef85db46ba2c7958fa5becc57623a3c5875868 100644
--- a/juno_build/Cargo.toml
+++ b/juno_build/Cargo.toml
@@ -4,7 +4,10 @@ version = "0.1.0"
 authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = ["juno_frontend/cuda"]
+
 [dependencies]
-juno_frontend = { path = "../juno_frontend" }
+juno_frontend = { path = "../juno_frontend", default-features = false }
 hercules_ir = { path = "../hercules_ir" }
 with_builtin_macros = "0.1.0"
diff --git a/juno_build/build.rs b/juno_build/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..7ba34c8c9395926bd16a92dcf831eadda3f3af26
--- /dev/null
+++ b/juno_build/build.rs
@@ -0,0 +1,8 @@
+fn main() {
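+    // With the cuda feature enabled, tell rustc where to find the CUDA
+    // runtime and to link against it; the paths below assume a standard
+    // Linux CUDA install.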
+    #[cfg(feature = "cuda")]
+    println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
+    #[cfg(feature = "cuda")]
+    println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
+    #[cfg(feature = "cuda")]
+    println!("cargo:rustc-link-lib=cudart");
+}
diff --git a/juno_frontend/Cargo.toml b/juno_frontend/Cargo.toml
index ad35b84c85ee18ed0d390c8005fc8d07a1c2916b..648daf5f4757f055259c1053d10c5849259941f8 100644
--- a/juno_frontend/Cargo.toml
+++ b/juno_frontend/Cargo.toml
@@ -4,6 +4,10 @@ version = "0.1.0"
 authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = ["hercules_opt/cuda", "juno_scheduler/cuda"]
+default = []
+
 [[bin]]
 name = "juno"
 path = "src/main.rs"
@@ -18,6 +22,7 @@ lrlex = "0.13"
 lrpar = "0.13"
 
 [dependencies]
+hercules_opt = { path = "../hercules_opt", default-features = false }
 cfgrammar = "0.13"
 clap = { version = "*", features = ["derive"] }
 lrlex = "0.13"
@@ -27,6 +32,5 @@ num-traits = "*"
 ordered-float = "*"
 phf = { version = "0.11", features = ["macros"] }
 hercules_ir = { path = "../hercules_ir" }
-hercules_opt = { path = "../hercules_opt" }
 juno_scheduler = { path = "../juno_scheduler" }
 juno_utils = { path = "../juno_utils" }
diff --git a/juno_samples/antideps/Cargo.toml b/juno_samples/antideps/Cargo.toml
index 9bd1d5a0d484e257ee3ad1e425a09f503da4b503..e6f38e095f492b2d7934b6ff969fa468137d1d2c 100644
--- a/juno_samples/antideps/Cargo.toml
+++ b/juno_samples/antideps/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_antideps"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs
index 7ed716a444460d7a90965f5b7f5faf3a7aadcb14..d74d1a49054ef2c8d22a99a1b60eae4160a1d778 100644
--- a/juno_samples/antideps/build.rs
+++ b/juno_samples/antideps/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("antideps.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("antideps.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("antideps.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..e166515dc5562f2e142792229fea92309a42e526
--- /dev/null
+++ b/juno_samples/antideps/src/gpu.sch
@@ -0,0 +1,20 @@
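+// GPU schedule for this sample: clean up the IR, outline the entry functions,
+// mark the outlined bodies for GPU codegen, then rerun cleanup and code
+// placement before lowering.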
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.simple_antideps, out.loop_antideps, out.complex_antideps1, out.complex_antideps2, out.very_complex_antideps, out.read_chains, out.array_of_structs);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/casts_and_intrinsics/Cargo.toml b/juno_samples/casts_and_intrinsics/Cargo.toml
index f49797969012f5195a0338b7b14fa04414dddb03..b2e7c815b0e93ba1741503cb29fe7528b78b5996 100644
--- a/juno_samples/casts_and_intrinsics/Cargo.toml
+++ b/juno_samples/casts_and_intrinsics/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_casts_and_intrinsics"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs
index 16d5c7a4f7fcb00344fc7669b67103a27f71a7c6..342c4a053349716874d988499fb6570c5c2d971f 100644
--- a/juno_samples/casts_and_intrinsics/build.rs
+++ b/juno_samples/casts_and_intrinsics/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("casts_and_intrinsics.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("casts_and_intrinsics.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("casts_and_intrinsics.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..c997cca91ae606eda36af2fba3721cf8975ac929
--- /dev/null
+++ b/juno_samples/casts_and_intrinsics/src/gpu.sch
@@ -0,0 +1,17 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.casts_and_intrinsics);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml
index ff375d80d637e2ac9baa1377e168f912454b5fbc..63b6b2ac98dcc022a45f3c4084930cbfb24956ff 100644
--- a/juno_samples/cava/Cargo.toml
+++ b/juno_samples/cava/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_cava"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/cava/build.rs b/juno_samples/cava/build.rs
index 7f60f8019f105dda64cf12a2664668efbd637662..1b6dddf48053d5fb31c38947a651282867b7bcf3 100644
--- a/juno_samples/cava/build.rs
+++ b/juno_samples/cava/build.rs
@@ -1,6 +1,15 @@
 use juno_build::JunoCompiler;
 
 fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("cava.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
     JunoCompiler::new()
         .file_in_src("cava.jn")
         .unwrap()
diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn
index f3096ec34230ad838ca803e8d8de79c176aa9bd2..359a83ed20db37117a8c5eef5b3e7cf31986e8cd 100644
--- a/juno_samples/cava/src/cava.jn
+++ b/juno_samples/cava/src/cava.jn
@@ -116,7 +116,8 @@ fn denoise<row : usize, col : usize>(input : f32[CHAN, row, col]) -> f32[CHAN, r
               filter[i, j] = input[chan, r + i - 1, c + j - 1];
             }
           }
-          res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter);
+
+          res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter);
         } else {
           res[chan, r, c] = input[chan, r, c];
         }
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..a5570b8d98324410aea9902cf29bce386126ce12
--- /dev/null
+++ b/juno_samples/cava/src/gpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+inline(*);
+let out = auto-outline(*);
+gpu(out.cava);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+// forkify(*);
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index 8ad6824f01d4d94a76f088ba1540ba44d2ce7b71..482bbf8deb0af323255b004a7e33e70202acb886 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -9,11 +9,15 @@ use self::cava_rust::CHAN;
 use self::image_proc::*;
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 use image::ImageError;
 
 use clap::Parser;
 
+use std::mem;
+
 juno_build::juno!("cava");
 
 fn run_cava(
@@ -27,42 +31,67 @@ fn run_cava(
     coefs: &[f32],
     tonemap: &[f32],
 ) -> Box<[u8]> {
-    assert_eq!(image.len(), CHAN * rows * cols);
-    let image = HerculesCPURef::from_slice(image);
 
+    assert_eq!(image.len(), CHAN * rows * cols);
     assert_eq!(tstw.len(), CHAN * CHAN);
-    let tstw = HerculesCPURef::from_slice(tstw);
-
     assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN);
-    let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts);
-
     assert_eq!(weights.len(), num_ctrl_pts * CHAN);
-    let weights = HerculesCPURef::from_slice(weights);
-
     assert_eq!(coefs.len(), 4 * CHAN);
-    let coefs = HerculesCPURef::from_slice(coefs);
-
     assert_eq!(tonemap.len(), 256 * CHAN);
-    let tonemap = HerculesCPURef::from_slice(tonemap);
-
-    let mut r = runner!(cava);
-    async_std::task::block_on(async {
-        r.run(
-            rows as u64,
-            cols as u64,
-            num_ctrl_pts as u64,
-            image,
-            tstw,
-            ctrl_pts,
-            weights,
-            coefs,
-            tonemap,
-        )
-        .await
-    })
-    .as_slice::<u8>()
-    .to_vec()
-    .into_boxed_slice()
+
+    #[cfg(not(feature = "cuda"))]
+    {
+        let image = HerculesCPURef::from_slice(image);
+        let tstw = HerculesCPURef::from_slice(tstw);
+        let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts);
+        let weights = HerculesCPURef::from_slice(weights);
+        let coefs = HerculesCPURef::from_slice(coefs);
+        let tonemap = HerculesCPURef::from_slice(tonemap);
+        let mut r = runner!(cava);
+        async_std::task::block_on(async {
+            r.run(
+                rows as u64,
+                cols as u64,
+                num_ctrl_pts as u64,
+                image,
+                tstw,
+                ctrl_pts,
+                weights,
+                coefs,
+                tonemap,
+            )
+            .await
+        })
+        .as_slice::<u8>()
+        .to_vec()
+        .into_boxed_slice()
+    }
+
+    #[cfg(feature = "cuda")]
+    {
+        let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image));
+        let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw));
+        let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts));
+        let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
+        let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
+        let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
+        let mut r = runner!(cava);
+        let res = async_std::task::block_on(async {
+            r.run(
+                rows as u64,
+                cols as u64,
+                num_ctrl_pts as u64,
+                image.get_ref(),
+                tstw.get_ref(),
+                ctrl_pts.get_ref(),
+                weights.get_ref(),
+                coefs.get_ref(),
+                tonemap.get_ref(),
+            )
+            .await
+        });
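+        // The result lives in device memory; recover its element count from
+        // the size in bytes before copying it back to the host.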
+        let num_out = unsafe { res.__size() / mem::size_of::<u8>() };
+        let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
+        res.to_cpu_ref(&mut res_cpu);
+        res_cpu
+    }
 }
 
 enum Error {
@@ -175,7 +204,17 @@ fn cava_harness(args: CavaInputs) {
                 .expect("Error saving verification image");
         }
 
-        assert_eq!(result, cpu_result.into(), "Verification failed, mismatch");
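+        // The GPU path may round floating-point intermediates differently
+        // than the CPU reference, so compare with a small per-pixel tolerance
+        // rather than exact equality.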
+        let max_diff = result.iter()
+            .zip(cpu_result.iter())
+            .map(|(a, b)| (*a as i16 - *b as i16).abs())
+            .max()
+            .unwrap_or(0);
+
+        assert!(
+            max_diff <= 3,
+            "Verification failed: maximum pixel difference of {} exceeds threshold of 3",
+            max_diff
+        );
     }
 }
 
diff --git a/juno_samples/concat/Cargo.toml b/juno_samples/concat/Cargo.toml
index 24ba1acff56dc3997162c1f3afc2334fd3428155..f2f902379ae4c8844c02593a93d71c33996a40e3 100644
--- a/juno_samples/concat/Cargo.toml
+++ b/juno_samples/concat/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_concat"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/concat/build.rs b/juno_samples/concat/build.rs
index f7784b999492955289fc83ad1297907c2d0ce996..c9ef720dad8e05c4fb133decc667d1522d0c4c9b 100644
--- a/juno_samples/concat/build.rs
+++ b/juno_samples/concat/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("concat.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("concat.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("concat.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/concat/src/concat.jn b/juno_samples/concat/src/concat.jn
index 70c741b6c36ed780cbc4107028850c198d943483..01a549690336be138f41ce4ba5f5ca2e569e399f 100644
--- a/juno_samples/concat/src/concat.jn
+++ b/juno_samples/concat/src/concat.jn
@@ -18,17 +18,9 @@ fn sum<t : number, c : usize>(arr : t[c]) -> t {
 }
 
 #[entry]
-fn concat_entry(a : i32) -> i32 {
-  let arr1 : i32[3];
-  let arr2 : i32[6];
-  arr1[0] = a;
-  arr1[1] = a;
-  arr2[0] = a;
-  arr2[1] = a;
-  arr2[4] = a;
-  arr2[5] = a;
-  let arr3 = concat::<i32, 3, 6>(arr1, arr2);
-  return sum::<i32, 9>(arr3);
+fn concat_entry<a : usize, b: usize>(arr1 : i32[a], arr2 : i32[b]) -> i32 {
+  let arr3 = concat::<i32, a, b>(arr1, arr2);
+  return sum::<i32, a + b>(arr3);
 }
 
 #[entry]
diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..084f020c5a97a3432675253f66c9755954a3aaef
--- /dev/null
+++ b/juno_samples/concat/src/gpu.sch
@@ -0,0 +1,21 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+inline(*);
+let out = auto-outline(*);
+gpu(out.concat_entry);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs
index 8bcd7ba54a7b61fe15c97ba74d37fc0e867aa277..9674c2c54b328aefcb4e670dc7e9ec482f8b2508 100644
--- a/juno_samples/concat/src/main.rs
+++ b/juno_samples/concat/src/main.rs
@@ -2,30 +2,47 @@
 
 use hercules_rt::runner;
 use hercules_rt::HerculesCPURef;
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("concat");
 
 fn main() {
     async_std::task::block_on(async {
         let mut r = runner!(concat_entry);
-        let output = r.run(7).await;
-        println!("{}", output);
-        assert_eq!(output, 42);
+        let mut a_data = [7, 7, 0];
+        let mut b_data = [7, 7, 0, 0, 7, 7];
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&mut a_data);
+            let b = HerculesCPURef::from_slice(&mut b_data);
+            let output = r.run(3, 6, a, b).await;
+            assert_eq!(output, 42);
 
-        const N: usize = 3;
-        let arr : Box<[i32]> = (2..=4).collect();
-        let arr = HerculesCPURef::from_slice(&arr);
+            const N: usize = 3;
+            let arr: Box<[i32]> = (2..=4).collect();
+            let arr = HerculesCPURef::from_slice(&arr);
 
-        let mut r = runner!(concat_switch);
-        let output = r.run(N as u64, 50, arr.clone()).await;
-        let result = output.as_slice::<i32>();
-        println!("{:?}", result);
-        assert_eq!(result, [0, 1, 2, 3, 4]);
+            let mut r = runner!(concat_switch);
+            let output = r.run(N as u64, 50, arr.clone()).await;
+            let result = output.as_slice::<i32>();
+            println!("{:?}", result);
+            assert_eq!(result, [0, 1, 2, 3, 4]);
 
-        let output = r.run(N as u64, 30, arr).await;
-        let result = output.as_slice::<i32>();
-        println!("{:?}", result);
-        assert_eq!(result, [2, 3, 4, 0, 1]);
+            let output = r.run(N as u64, 30, arr).await;
+            let result = output.as_slice::<i32>();
+            println!("{:?}", result);
+            assert_eq!(result, [2, 3, 4, 0, 1]);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data));
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data));
+            let output = r.run(3, 6, a.get_ref(), b.get_ref()).await;
+            assert_eq!(output, 42);
+        }
     });
 }
 
diff --git a/juno_samples/cpu.sch b/juno_samples/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..4c684da2f176dd3675dc79fa506d2d60fe6e0577
--- /dev/null
+++ b/juno_samples/cpu.sch
@@ -0,0 +1,19 @@
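+// Shared CPU schedule: outline each entry function, then lower fork-joins to
+// sequential loops (fork-split, unforkify) so everything goes through the
+// CPU backend.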
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+fork-split(*);
+unforkify(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/gpu.sch b/juno_samples/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..9a714789ac768405a2782cac6e68338c2f0697bb
--- /dev/null
+++ b/juno_samples/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.matmul);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/implicit_clone/Cargo.toml b/juno_samples/implicit_clone/Cargo.toml
index b312f5def295a6c31a7d2c9eab5c23f4e16ccd2f..9612937190366bea144ade4b9ae334b228233b49 100644
--- a/juno_samples/implicit_clone/Cargo.toml
+++ b/juno_samples/implicit_clone/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_implicit_clone"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs
index 75c1afc41a75b2006b26042323df3bdc3fcf5a17..8e465874b9904a785a85dbee885086585dbf40c0 100644
--- a/juno_samples/implicit_clone/build.rs
+++ b/juno_samples/implicit_clone/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("implicit_clone.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("implicit_clone.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("implicit_clone.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/implicit_clone/src/gpu.sch b/juno_samples/implicit_clone/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..0f7c80213d20481ce51628736f0ed07be21da399
--- /dev/null
+++ b/juno_samples/implicit_clone/src/gpu.sch
@@ -0,0 +1,20 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.simple_implicit_clone, out.loop_implicit_clone, out.double_loop_implicit_clone, out.tricky_loop_implicit_clone, out.tricky2_loop_implicit_clone, out.tricky3_loop_implicit_clone, out.no_implicit_clone, out.mirage_implicit_clone);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index 1e94ff8977a6a5e340b6180ceff777351dda088b..c1f82528de40598007291b9258ec1cf59acbcc07 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -8,42 +8,34 @@ fn main() {
     async_std::task::block_on(async {
         let mut r = runner!(simple_implicit_clone);
         let output = r.run(3).await;
-        println!("{}", output);
         assert_eq!(output, 11);
 
         let mut r = runner!(loop_implicit_clone);
         let output = r.run(100).await;
-        println!("{}", output);
         assert_eq!(output, 7);
 
         let mut r = runner!(double_loop_implicit_clone);
         let output = r.run(3).await;
-        println!("{}", output);
         assert_eq!(output, 42);
 
         let mut r = runner!(tricky_loop_implicit_clone);
         let output = r.run(2, 2).await;
-        println!("{}", output);
         assert_eq!(output, 130);
 
         let mut r = runner!(tricky2_loop_implicit_clone);
         let output = r.run(2, 3).await;
-        println!("{}", output);
         assert_eq!(output, 39);
 
         let mut r = runner!(tricky3_loop_implicit_clone);
         let output = r.run(5, 7).await;
-        println!("{}", output);
         assert_eq!(output, 7);
 
         let mut r = runner!(no_implicit_clone);
         let output = r.run(4).await;
-        println!("{}", output);
         assert_eq!(output, 13);
 
         let mut r = runner!(mirage_implicit_clone);
         let output = r.run(73).await;
-        println!("{}", output);
         assert_eq!(output, 843);
     });
 }
diff --git a/juno_samples/matmul/Cargo.toml b/juno_samples/matmul/Cargo.toml
index 8ad95853d25509d91713a080c0b63b01c0469110..eac83d15f430a7752d14a1ec3105c88209143477 100644
--- a/juno_samples/matmul/Cargo.toml
+++ b/juno_samples/matmul/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_matmul"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs
index 926fbc33ecfa5ab31b40a92f778bb4d3b7f6a77e..0be838c620761e8726590e2dbaf7bfdb7a82e3df 100644
--- a/juno_samples/matmul/build.rs
+++ b/juno_samples/matmul/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("matmul.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("matmul.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("matmul.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..3d3f919cd26e4eba480540df06f839a8b86976b0
--- /dev/null
+++ b/juno_samples/matmul/src/gpu.sch
@@ -0,0 +1,20 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.matmul, out.tiled_64_matmul);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs
index 624ee5652a78d9c2ab7bc84d3974bf2df5b02838..50fe1760eeeedc946d510a6d5285d76e1346f3cc 100644
--- a/juno_samples/matmul/src/main.rs
+++ b/juno_samples/matmul/src/main.rs
@@ -3,6 +3,8 @@
 use rand::random;
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("matmul");
 
@@ -11,8 +13,8 @@ fn main() {
         const I: usize = 256;
         const J: usize = 64;
         const K: usize = 128;
         let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect();
         let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect();
         let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
         for i in 0..I {
             for k in 0..K {
@@ -21,18 +23,32 @@ fn main() {
                 }
             }
         }
-        let a = HerculesCPURef::from_slice(&a);
-        let b = HerculesCPURef::from_slice(&b);
-        let mut r = runner!(matmul);
-        let c = r
-            .run(I as u64, J as u64, K as u64, a.clone(), b.clone())
-            .await;
-        assert_eq!(c.as_slice::<i32>(), &*correct_c);
-        let mut r = runner!(tiled_64_matmul);
-        let tiled_c = r
-            .run(I as u64, J as u64, K as u64, a.clone(), b.clone())
-            .await;
-        assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            let b = HerculesCPURef::from_slice(&b);
+            let mut r = runner!(matmul);
+            let c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
+            assert_eq!(c.as_slice::<i32>(), &*correct_c);
+            let mut r = runner!(tiled_64_matmul);
+            let tiled_c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
+            assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b));
+            let mut r = runner!(matmul);
+            let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
+            let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
+            c.to_cpu_ref(&mut c_cpu);
+            assert_eq!(&*c_cpu, &*correct_c);
+            let mut r = runner!(tiled_64_matmul);
+            let tiled_c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
+            let mut tiled_c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
+            tiled_c.to_cpu_ref(&mut tiled_c_cpu);
+            assert_eq!(&*tiled_c_cpu, &*correct_c);
+        }
     });
 }
 
diff --git a/juno_samples/matmul/src/matmul.jn b/juno_samples/matmul/src/matmul.jn
index ca9be73a86144ebe57048ed35adc2f36bfcd905b..fb6de5bd89e0c48b3a0ac4aa66d4764894f2fcff 100644
--- a/juno_samples/matmul/src/matmul.jn
+++ b/juno_samples/matmul/src/matmul.jn
@@ -20,7 +20,7 @@ fn tiled_64_matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]
   let atile : i32[64, 64];
   let btile : i32[64, 64];
   let ctile : i32[64, 64];
-  
+
   for bi = 0 to n / 64 {
     for bk = 0 to l / 64 {
       for ti = 0 to 64 {
diff --git a/juno_samples/nested_ccp/Cargo.toml b/juno_samples/nested_ccp/Cargo.toml
index 8c9b969d23019b8bbd3bf28b3506e2e497ae8ec7..5ee3f747477c901806642818553e94b046b50242 100644
--- a/juno_samples/nested_ccp/Cargo.toml
+++ b/juno_samples/nested_ccp/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_nested_ccp"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs
index c5c7ca6a1b9ab5decf6a8cf0b8e8f13ff7122834..074937e7b0a0ce50032928f30c72feb39b5ecd79 100644
--- a/juno_samples/nested_ccp/build.rs
+++ b/juno_samples/nested_ccp/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("nested_ccp.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("nested_ccp.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("nested_ccp.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..c56d046a686ea8bddd8c9e9f91d88eb39a2eaf31
--- /dev/null
+++ b/juno_samples/nested_ccp/src/gpu.sch
@@ -0,0 +1,17 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.ccp_example, out.median_array, out.no_underflow);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs
index 423b66fb8c84fd1af6b0267c7c63aa204ab1dc6c..bc99a4bdd071ff19c70977e29241b76e3e249014 100644
--- a/juno_samples/nested_ccp/src/main.rs
+++ b/juno_samples/nested_ccp/src/main.rs
@@ -1,6 +1,8 @@
 #![feature(concat_idents)]
 
 use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("nested_ccp");
 
@@ -8,19 +10,30 @@ fn main() {
     async_std::task::block_on(async {
         let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
         let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]);
-        let a = HerculesCPURef::from_slice(&a);
-        let b = HerculesCPURefMut::from_slice(&mut b);
-        let mut r = runner!(ccp_example);
-        let output_example = r.run(a).await;
-        let mut r = runner!(median_array);
-        let output_median = r.run(9, b).await;
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            let b = HerculesCPURefMut::from_slice(&mut b);
+            let mut r = runner!(ccp_example);
+            let output_example = r.run(a).await;
+            let mut r = runner!(median_array);
+            let output_median = r.run(9, b).await;
+            assert_eq!(output_example, 1.0);
+            assert_eq!(output_median, 18);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b));
+            let mut r = runner!(ccp_example);
+            let output_example = r.run(a.get_ref_mut()).await;
+            let mut r = runner!(median_array);
+            let output_median = r.run(9, b.get_ref_mut()).await;
+            assert_eq!(output_example, 1.0);
+            assert_eq!(output_median, 18);
+        }
         let mut r = runner!(no_underflow);
         let out_no_underflow = r.run().await;
-        println!("{}", output_example);
-        println!("{}", output_median);
-        println!("{}", out_no_underflow);
-        assert_eq!(output_example, 1.0);
-        assert_eq!(output_median, 18);
         assert_eq!(out_no_underflow, 7);
     });
 }
diff --git a/juno_samples/patterns/Cargo.toml b/juno_samples/patterns/Cargo.toml
index a8dda157ff331ae9b1c5e1cb2a120db9bab3bb82..bedaf7ca01c0d7cfadbba5e11b8e94203ccda2b4 100644
--- a/juno_samples/patterns/Cargo.toml
+++ b/juno_samples/patterns/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_patterns"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/patterns/build.rs b/juno_samples/patterns/build.rs
index 8ac92f003e549a9aeb289001b8792ee4dcb51284..625da0a5fed548984b724aa89085c456ef22f12c 100644
--- a/juno_samples/patterns/build.rs
+++ b/juno_samples/patterns/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("patterns.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("patterns.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("patterns.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/patterns/src/gpu.sch b/juno_samples/patterns/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..3d9c8c9e6ddb588fbaa96e3c27f78cf21abd8f1a
--- /dev/null
+++ b/juno_samples/patterns/src/gpu.sch
@@ -0,0 +1,17 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.entry);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/schedule_test/Cargo.toml b/juno_samples/schedule_test/Cargo.toml
index be5d949bf1959d48c3463717f28b9fd186e05170..c783217a816960c764083c24aed1fdc5f0d1fb77 100644
--- a/juno_samples/schedule_test/Cargo.toml
+++ b/juno_samples/schedule_test/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_schedule_test"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/schedule_test/build.rs b/juno_samples/schedule_test/build.rs
index 4a4282473e87d6c24e12b5e3d59521ee8c99141e..749a660c551e8b231f63287898adb2863aef826e 100644
--- a/juno_samples/schedule_test/build.rs
+++ b/juno_samples/schedule_test/build.rs
@@ -4,7 +4,7 @@ fn main() {
     JunoCompiler::new()
         .file_in_src("code.jn")
         .unwrap()
-        .schedule_in_src("sched.sch")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/schedule_test/src/sched.sch b/juno_samples/schedule_test/src/cpu.sch
similarity index 100%
rename from juno_samples/schedule_test/src/sched.sch
rename to juno_samples/schedule_test/src/cpu.sch
diff --git a/juno_samples/schedule_test/src/gpu.sch b/juno_samples/schedule_test/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..edca678ee103ec053f54a8e2caf0ee3e9e721e18
--- /dev/null
+++ b/juno_samples/schedule_test/src/gpu.sch
@@ -0,0 +1,47 @@
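+// Schedules can define macros: named, reusable sequences of passes that are
+// expanded where invoked (juno-setup! and codegen-prep! below).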
+macro juno-setup!(X) {
+  //gvn(X);
+  phi-elim(X);
+  dce(X);
+  lift-dc-math(X);
+}
+macro codegen-prep!(X) {
+  infer-schedules(X);
+  dce(X);
+  gcm(X);
+  dce(X);
+  phi-elim(X);
+  float-collections(X);
+  gcm(X);
+}
+
+juno-setup!(*);
+
+let first = outline(test@outer);
+let second = outline(test@row);
+
+// We can use the functions produced by outlining in our schedules
+gvn(first, second, test);
+
+ip-sroa(*);
+sroa(*);
+
+// We can evaluate expressions using labels and save them for later use
+let inner = first@inner;
+
+// A fixpoint runs a series of passes until no more changes are made (though
+// some passes seem to make edits even when nothing really changed, so this
+// is fragile).
+// We could just let it run until it converges, but we can also tell it to
+// panic if it hasn't converged after a number of iterations (as done here),
+// to just stop after a certain number of iterations (stop after #), or to
+// print the iteration number as it runs (print iter).
+fixpoint panic after 2 {
+  phi-elim(*);
+}
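+// For example, the non-panicking variant of the bound above would be written
+// fixpoint stop after 2 { phi-elim(*); }.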
+
+host(*);
+gpu(first, second);
+
+codegen-prep!(*);
+//xdot[true](*);
diff --git a/juno_samples/schedule_test/src/main.rs b/juno_samples/schedule_test/src/main.rs
index 2e63babf29e84bc74a7649306dc28af88225cf39..1505d4e5ff620a53d1095cdc4185a5a6d665e71e 100644
--- a/juno_samples/schedule_test/src/main.rs
+++ b/juno_samples/schedule_test/src/main.rs
@@ -3,6 +3,8 @@
 use rand::random;
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("code");
 
@@ -26,12 +28,26 @@ fn main() {
             }
         }
 
-        let a = HerculesCPURef::from_slice(&a);
-        let b = HerculesCPURef::from_slice(&b);
-        let c = HerculesCPURef::from_slice(&c);
-        let mut r = runner!(test);
-        let res = r.run(N as u64, M as u64, K as u64, a, b, c).await;
-        assert_eq!(res.as_slice::<i32>(), &*correct_res);
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            let b = HerculesCPURef::from_slice(&b);
+            let c = HerculesCPURef::from_slice(&c);
+            let mut r = runner!(test);
+            let res = r.run(N as u64, M as u64, K as u64, a, b, c).await;
+            assert_eq!(res.as_slice::<i32>(), &*correct_res);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b));
+            let c = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&c));
+            let mut r = runner!(test);
+            let res = r.run(N as u64, M as u64, K as u64, a.get_ref(), b.get_ref(), c.get_ref()).await;
+            let mut res_cpu: Box<[i32]> = vec![0; correct_res.len()].into_boxed_slice();
+            res.to_cpu_ref(&mut res_cpu);
+            assert_eq!(&*res_cpu, &*correct_res);
+        }
     });
 }
 
diff --git a/juno_samples/simple3/Cargo.toml b/juno_samples/simple3/Cargo.toml
index 8060c5b3472ad898cb48e011332a852cd7b6705e..36d50dbd75c410b88142bb5ca5d680fbac0be3ce 100644
--- a/juno_samples/simple3/Cargo.toml
+++ b/juno_samples/simple3/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_simple3"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs
index 94760025d53abe7e10914052e1a7783386b316b0..58c2c5aab14e65facfe5154db13f453be411c55b 100644
--- a/juno_samples/simple3/build.rs
+++ b/juno_samples/simple3/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("simple3.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("simple3.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("simple3.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/simple3/src/cpu.sch b/juno_samples/simple3/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..7e6be7eefb05b7660d3c27ae9d937ce00cf79a0e
--- /dev/null
+++ b/juno_samples/simple3/src/cpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..d6c2a9d666c11d4ee07ce89a573ac73d8ea25bff
--- /dev/null
+++ b/juno_samples/simple3/src/gpu.sch
@@ -0,0 +1,17 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.simple3);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/simple3/src/main.rs b/juno_samples/simple3/src/main.rs
index 4f9fe6a708ec50fe65b1c31a2823580a117985ce..8eb78f7c93b0f195ffee1be120376dbe3f9a2a62 100644
--- a/juno_samples/simple3/src/main.rs
+++ b/juno_samples/simple3/src/main.rs
@@ -1,6 +1,8 @@
 #![feature(concat_idents)]
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("simple3");
 
@@ -8,12 +10,22 @@ fn main() {
     async_std::task::block_on(async {
         let a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]);
         let b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]);
-        let a = HerculesCPURef::from_slice(&a);
-        let b = HerculesCPURef::from_slice(&b);
-        let mut r = runner!(simple3);
-        let c = r.run(8, a, b).await;
-        println!("{}", c);
-        assert_eq!(c, 120);
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            let b = HerculesCPURef::from_slice(&b);
+            let mut r = runner!(simple3);
+            let c = r.run(8, a, b).await;
+            assert_eq!(c, 120);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b));
+            let mut r = runner!(simple3);
+            let c = r.run(8, a.get_ref(), b.get_ref()).await;
+            assert_eq!(c, 120);
+        }
     });
 }
 
diff --git a/juno_scheduler/Cargo.toml b/juno_scheduler/Cargo.toml
index 04ab156c543fab4112eb0179eb1821a67c817d7a..26055b03fc6fd94fe6440f88a95faf1e2371b404 100644
--- a/juno_scheduler/Cargo.toml
+++ b/juno_scheduler/Cargo.toml
@@ -4,6 +4,9 @@ version = "0.0.1"
 authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = []
+
 [build-dependencies]
 cfgrammar = "0.13"
 lrlex = "0.13"
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 584867e5ebc4dcf1771295717315e45af52ff141..2371e0f20e776e97d39596f656594fc679065264 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -178,6 +178,8 @@ pub struct PassManager {
     pub postdoms: Option<Vec<DomTree>>,
     pub fork_join_maps: Option<Vec<HashMap<NodeID, NodeID>>>,
     pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>,
+    pub fork_control_maps: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
+    pub fork_trees: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
     pub loops: Option<Vec<LoopTree>>,
     pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
     pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
@@ -214,6 +216,8 @@ impl PassManager {
             postdoms: None,
             fork_join_maps: None,
             fork_join_nests: None,
+            fork_control_maps: None,
+            fork_trees: None,
             loops: None,
             reduce_cycles: None,
             data_nodes_in_fork_joins: None,
@@ -337,6 +341,29 @@ impl PassManager {
         }
     }
 
+    pub fn make_fork_control_maps(&mut self) {
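+        // Memoized analysis: derive each function's fork control map from its
+        // fork-join nesting, computing that nesting first if needed.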
+        if self.fork_control_maps.is_none() {
+            self.make_fork_join_nests();
+            self.fork_control_maps = Some(
+                self.fork_join_nests.as_ref().unwrap().iter().map(fork_control_map).collect(),
+            );
+        }
+    }
+
+    pub fn make_fork_trees(&mut self) {
+        if self.fork_trees.is_none() {
+            self.make_fork_join_nests();
+            self.fork_trees = Some(
+                zip(
+                    self.functions.iter(),
+                    self.fork_join_nests.as_ref().unwrap().iter(),
+                )
+                .map(|(function, fork_join_nesting)| fork_tree(function, fork_join_nesting))
+                .collect(),
+            );
+        }
+    }
+
     pub fn make_loops(&mut self) {
         if self.loops.is_none() {
             self.make_control_subgraphs();
@@ -577,6 +604,10 @@ impl PassManager {
         self.make_control_subgraphs();
         self.make_collection_objects();
         self.make_callgraph();
+        self.make_def_uses();
+        self.make_fork_join_maps();
+        self.make_fork_control_maps();
+        self.make_fork_trees();
         self.make_devices();
 
         let PassManager {
@@ -589,6 +620,10 @@ impl PassManager {
             control_subgraphs: Some(control_subgraphs),
             collection_objects: Some(collection_objects),
             callgraph: Some(callgraph),
+            def_uses: Some(def_uses),
+            fork_join_maps: Some(fork_join_maps),
+            fork_control_maps: Some(fork_control_maps),
+            fork_trees: Some(fork_trees),
             devices: Some(devices),
             bbs: Some(bbs),
             node_colors: Some(node_colors),
@@ -612,6 +647,7 @@ impl PassManager {
 
         let mut rust_rt = String::new();
         let mut llvm_ir = String::new();
+        let mut cuda_ir = String::new();
         for idx in 0..module.functions.len() {
             match devices[idx] {
                 Device::LLVM => cpu_codegen(
@@ -629,6 +665,26 @@ impl PassManager {
                     pass: "cpu codegen".to_string(),
                     error: format!("{}", e),
                 })?,
+                Device::CUDA => gpu_codegen(
+                    &module.functions[idx],
+                    &module.types,
+                    &module.constants,
+                    &module.dynamic_constants,
+                    &typing[idx],
+                    &control_subgraphs[idx],
+                    &bbs[idx],
+                    &backing_allocations[&FunctionID::new(idx)],
+                    &collection_objects[&FunctionID::new(idx)],
+                    &def_uses[idx],
+                    &fork_join_maps[idx],
+                    &fork_control_maps[idx],
+                    &fork_trees[idx],
+                    &mut cuda_ir,
+                )
+                .map_err(|e| SchedulerError::PassError {
+                    pass: "cuda codegen".to_string(),
+                    error: format!("{}", e),
+                })?,
                 Device::AsyncRust => rt_codegen(
                     FunctionID::new(idx),
                     &module,
@@ -646,41 +702,76 @@ impl PassManager {
                     pass: "rust codegen".to_string(),
                     error: format!("{}", e),
                 })?,
-                _ => todo!(),
             }
         }
         println!("{}", llvm_ir);
+        println!("{}", cuda_ir);
         println!("{}", rust_rt);
 
+        let output_archive = format!("{}/lib{}.a", output_dir, module_name);
+        println!("{}", output_archive);
+
         // Write the LLVM IR into a temporary file.
         let tmp_dir = TempDir::new().unwrap();
-        let mut tmp_path = tmp_dir.path().to_path_buf();
-        tmp_path.push(format!("{}.ll", module_name));
-        println!("{}", tmp_path.display());
-        let mut file = File::create(&tmp_path).expect("PANIC: Unable to open output LLVM IR file.");
+        let mut llvm_path = tmp_dir.path().to_path_buf();
+        llvm_path.push(format!("{}.ll", module_name));
+        println!("{}", llvm_path.display());
+        let mut file = File::create(&llvm_path)
+            .expect("PANIC: Unable to open output LLVM IR file.");
         file.write_all(llvm_ir.as_bytes())
             .expect("PANIC: Unable to write output LLVM IR file contents.");
 
         // Compile LLVM IR into an ELF object file.
-        let output_archive = format!("{}/lib{}.a", output_dir, module_name);
+        let llvm_object = format!("{}/{}_cpu.o", tmp_dir.path().to_str().unwrap(), module_name);
         let mut clang_process = Command::new("clang")
-            .arg(&tmp_path)
-            .arg("--emit-static-lib")
+            .arg(&llvm_path)
+            .arg("-c")
             .arg("-O3")
             .arg("-march=native")
             .arg("-o")
-            .arg(&output_archive)
+            .arg(&llvm_object)
             .stdin(Stdio::piped())
             .stdout(Stdio::piped())
             .spawn()
             .expect("Error running clang. Is it installed?");
         assert!(clang_process.wait().unwrap().success());
 
+        let mut ar_args = vec!["crus", &output_archive, &llvm_object];
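+        // `ar crus` creates the archive if needed, inserts or replaces the
+        // listed objects, and writes a symbol index.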
+
+        let cuda_object = format!("{}/{}_cuda.o", tmp_dir.path().to_str().unwrap(), module_name);
+        if cfg!(feature = "cuda") {
+            // Write the CUDA IR into a temporary file.
+            let mut cuda_path = tmp_dir.path().to_path_buf();
+            cuda_path.push(format!("{}.cu", module_name));
+            let mut file = File::create(&cuda_path)
+                .expect("PANIC: Unable to open output CUDA IR file.");
+            file.write_all(cuda_ir.as_bytes())
+                .expect("PANIC: Unable to write output CUDA IR file contents.");
+
+            let mut nvcc_process = Command::new("nvcc")
+                .arg("-c")
+                .arg("-O3")
+                .arg("-o")
+                .arg(&cuda_object)
+                .arg(&cuda_path)
+                .spawn()
+                .expect("Error running nvcc. Is it installed?");
+            assert!(nvcc_process.wait().unwrap().success());
+
+            ar_args.push(&cuda_object);
+        }
+
+        let mut ar_process = Command::new("ar")
+            .args(&ar_args)
+            .spawn()
+            .expect("Error running ar. Is it installed?");
+        assert!(ar_process.wait().unwrap().success());
+
         // Write the Rust runtime into a file.
         let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name);
         println!("{}", output_rt);
-        let mut file =
-            File::create(&output_rt).expect("PANIC: Unable to open output Rust runtime file.");
+        let mut file = File::create(&output_rt)
+            .expect("PANIC: Unable to open output Rust runtime file.");
         file.write_all(rust_rt.as_bytes())
             .expect("PANIC: Unable to write output Rust runtime file contents.");