diff --git a/hercules_cg/src/device.rs b/hercules_cg/src/device.rs
index 09a5bc2689ecf3ed1a7186f2ada52758c7b19fbb..866fa6adeeb41c2c8106a1b7de5ce1ffdc45db40 100644
--- a/hercules_cg/src/device.rs
+++ b/hercules_cg/src/device.rs
@@ -9,8 +9,6 @@ pub fn device_placement(functions: &Vec<Function>, callgraph: &CallGraph) -> Vec
     let mut devices = vec![];
 
     for (idx, function) in functions.into_iter().enumerate() {
-        devices.push(Device::CUDA);
-        continue;
         if let Some(device) = function.device {
             devices.push(device);
         } else if function.entry || callgraph.num_callees(FunctionID::new(idx)) != 0 {
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 499ecce8f8da47aaec0d7f72b6f5805c7a6135bf..ab6e8f41f1c2262e8ea60c89c0a596caafc18f0d 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -31,24 +31,24 @@ pub fn gpu_codegen<W: Write>(
      * one of multiple parameters.
      *
      * We don't assert but assume the following:
-     * - max_num_blocks in KernelParams is within constraint of 1D grid size. This 
+     * - max_num_blocks in KernelParams is within constraint of 1D grid size. This
      *   can be relaxed if we want to support larger grids.
      * - Product types are packed with padding inserted for each element to
     *   be aligned for its type and for full product to be aligned to its
      *   largest element
      * - Summation types must be aligned to their largest element
-     * 
+     *
      * Notes on GPU parallelization strategy and tips for IR transformations:
      * - The top level block fork and any lower thread forks require a known Fork
-     *   size. Thus for an otherwise parallelizable Fork with unknown size, 
+     *   size. Thus for an otherwise parallelizable Fork with unknown size,
      *   consider splitting it into two Forks with one of known size. For block
      *   level, the known fork has to be the (only) top-most fork.
-     * - The thread-level strategy is determined by starting at the most nested 
+     * - The thread-level strategy is determined by starting at the most nested
      *   Forks and working outwards in a greedy manner, with caps by GPU spec.
      *   Thus, to ensure some outer Fork is parallelized, ensure the inner
      *   parallelizable Forks aren't too large or consider removing schedule
      *   annotations.
-     * - Tight-Associative reductions can only be efficiently implemented if 
+     * - Tight-Associative reductions can only be efficiently implemented if
      *   different Hercules ThreadIDs correspond to consecutive CUDA threads. But
      *   this prevents nested parallelization since each parallel group must always
      *   be a contiguous tile of threads. We use a heuristic of choosing the larger
@@ -59,10 +59,10 @@ pub fn gpu_codegen<W: Write>(
      *   Fork contains expensive parallelizable operations, ensure all reductions
      *   are parallelizable or if not try pulling those out into a different Fork.
      * - We do nothing to mitigate intra-warp divergence. To mitigate this, the
-     *   IR, for example, should ensure the innermost parallelizable Forks either 
+     *   IR, for example, should ensure the innermost parallelizable Forks either
      *   have factor >= warp size (32) or remove Fork/Reduce node schedule
      *   annotations.
-     * 
+     *
      * Main TODOs:
      * - Fix dynamic shared memory allocation to reuse old shmem. The main case
      *   for improvement is when we have serialized forks with unused intermediate
@@ -135,7 +135,7 @@ pub fn gpu_codegen<W: Write>(
     }
 
     // Obtain the Return node and if it's a collection, use the collection objects
-    // analysis to determine the origin. Also save the return node id for later 
+    // analysis to determine the origin. Also save the return node id for later
     // conversion of primitive Return into Parameter.
     let (return_node_id, data_node_id) = {
         let pos = function
@@ -248,11 +248,11 @@ struct GPUContext<'a> {
 }
 
 /*
- * For all control nodes besides forks, Init, Body, and Term compose the main basic 
- * block, with Init and Term populated by control flow (Init used only by Fork and 
- * Join) and Body populated by data flow. 
+ * For all control nodes besides forks, Init, Body, and Term compose the main basic
+ * block, with Init and Term populated by control flow (Init used only by Fork and
+ * Join) and Body populated by data flow.
  * For serialized Fork nodes which may be jumped back to by corresponding Join node,
- * init and post_init separate one-time code (currently just cooperative group 
+ * init and post_init separate one-time code (currently just cooperative group
  * creation) from repeated code.
  */
 #[derive(Default, Debug)]
@@ -279,9 +279,9 @@ enum KernelState {
 }
 
 /*
- * CGType is used to track cooperative group types. UsePerId is the group of (CUDA) 
+ * CGType is used to track cooperative group types. UsePerId is the group of (CUDA)
  * threads for a current ThreadID, Use is the union of such threads for all ThreadIDs
- * in the current innermost Fork, and Available is Use plus additional threads not 
+ * in the current innermost Fork, and Available is Use plus additional threads not
  * used in the current Fork.
  */
 #[derive(Clone, Copy, PartialEq, Debug)]
@@ -348,7 +348,7 @@ impl GPUContext<'_> {
 
         // Emit host launch code
         let mut host_launch = String::new();
-        self.codegen_launch_code(true, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?;
+        self.codegen_launch_code(false, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?;
         write!(w, "{}", host_launch)?;
 
         Ok(())
@@ -375,7 +375,7 @@ namespace cg = cooperative_groups;
 #define roundi(a) (a)
 #define isqrt(a) ((int)sqrtf((float)(a)))
 
-", 
+",
         )?;
 
         write!(
@@ -463,12 +463,12 @@ namespace cg = cooperative_groups;
         Ok(())
     }
 
-    // To abide by c++ reassignment restrictions, we declare all data values 
+    // To abide by c++ reassignment restrictions, we declare all data values
     // upfront.
     fn codegen_declare_data(&self, w: &mut String) -> Result<(), Error> {
         for id in (0..self.function.nodes.len()).map(NodeID::new) {
-            if !self.function.nodes[id.idx()].is_control() && 
-                !self.function.nodes[id.idx()].is_dynamic_constant() && 
+            if !self.function.nodes[id.idx()].is_control() &&
+                !self.function.nodes[id.idx()].is_dynamic_constant() &&
                 !self.function.nodes[id.idx()].is_parameter() {
                 write!(w, "\t{};\n", self.get_value(id, true, false))?;
             }
@@ -477,8 +477,8 @@ namespace cg = cooperative_groups;
     }
 
     /*
-     * Emit helper registers that are used throughout the kernel. grid and block 
-     * are from CUDA's cooperative groups API and are used specifically for reads 
+     * Emit helper registers that are used throughout the kernel. grid and block
+     * are from CUDA's cooperative groups API and are used specifically for reads
      * and writes.
      */
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
@@ -517,7 +517,7 @@ namespace cg = cooperative_groups;
         write!(w, "
 int main() {{
 ")?;
-        // The following steps are for host-side C function arguments, but we also 
+        // The following steps are for host-side C function arguments, but we also
         // need to pass arguments to kernel, so we keep track of the arguments here.
         let mut pass_args = String::new();
         if run_debug {
@@ -586,8 +586,8 @@ int main() {{
             }
             write!(w, "\treturn 0;\n");
             write!(w, "}}\n");
-        } 
-        
+        }
+
         else {
             // The first set of parameters are dynamic constants.
             let mut first_param = true;
@@ -633,9 +633,9 @@ int main() {{
      * a) domination by F
      * b) no domination by F's join
      * c) no domination by any other fork that's also dominated by F, where we don't count self-domination
-     * Note that the fork_tree also includes the start node, to include all controls 
+     * Note that the fork_tree also includes the start node, to include all controls
      *   outside any fork.
-     * 
+     *
      * Second, fork_control_map is a map from fork node to all control nodes (including itself) satisfying:
      * a) domination by F
      * b) no domination by F's join
@@ -665,8 +665,8 @@ int main() {{
 
     /*
      * If tree has a single root fork of known size s <= max_num_blocks
-     * with parallel-fork schedule, then set num_blocks to s, else set num_blocks 
-     * to 1. Also return the root fork(s) for parallelization strategy within 
+     * with parallel-fork schedule, then set num_blocks to s, else set num_blocks
+     * to 1. Also return the root fork(s) for parallelization strategy within
      * threadblocks for threads and their eventual generation.
      */
     fn get_root_forks_and_num_blocks(
@@ -720,10 +720,10 @@ int main() {{
      * maximum over its descendants (leafs have base 1). We traverse up (details
      * in helper) and pass the factor and a map from fork node to a tuple of
      * (max quota of its siblings (including itself), its quota, its fork factor)
-     * from each node to its parents. The parent then compares 
+     * from each node to its parents. The parent then compares
      * - all three are needed for codegen. A node is in the map IFF it will be parallelized.
-     * If not, the fork will use the parent's quota and serialize over the Fork's 
-     * ThreadIDs. Nodes may be removed from the map when traversing up the tree 
+     * If not, the fork will use the parent's quota and serialize over the Fork's
+     * ThreadIDs. Nodes may be removed from the map when traversing up the tree
      * due to an ancestor having a larger factor that conflicts.
      */
     fn get_thread_quotas(
@@ -743,9 +743,9 @@ int main() {{
         is_root: bool,
     ) -> (HashMap<NodeID, (usize, usize, usize)>, usize, bool) {
         // Subsubtree map is the union of all keys for grandchildren and lower
-        // nodes. children_quota_map is a constructed map from parallelized children 
+        // nodes. children_quota_map is a constructed map from parallelized children
         // to their quota to update the subsubtree map at grandchildren level to
-        // subtreemap at children level. subtree_quota is cumulative factor of 
+        // subtreemap at children level. subtree_quota is cumulative factor of
         // subtree and is then compared to this fork's factor.
         let (mut subsubtree_map, children_quota_map, subtree_quota) = fork_tree
             .get(&curr_fork)
@@ -762,7 +762,7 @@ int main() {{
                     (subsubtree_map, children_quota_map, subtree_quota.max(curr_quota))
                 },
             );
-        // First update children_quota_map items with full information and add 
+        // First update children_quota_map items with full information and add
         // to subsubtree_map
         for (&child, quota) in children_quota_map.iter() {
             let Node::Fork { factors, .. } = &self.function.nodes[child.idx()] else {
@@ -780,7 +780,7 @@ int main() {{
         // b) the known size is less than or equal to the max_num_threads
         // c) the known size is a power of 2
         // d) all reduces are parallel-reduce or associative
-        // 
+        //
        // If not, just take the max cumulative factor of its subtree
         let reduces = &self.fork_reduce_map[&curr_fork];
         if let Node::Fork { factors, .. } = &self.function.nodes[curr_fork.idx()]
@@ -792,12 +792,12 @@ int main() {{
                     || self.function.schedules[reduce.idx()].contains(&Schedule::TightAssociative)
             })
         {
-            // If there's an associative Reduce, parallelize the larger factor 
+            // If there's an associative Reduce, parallelize the larger factor
             // between the Fork and subtree
-            // Else, all Reduces must be only parallel-reduce, so parallelize 
+            // Else, all Reduces must be only parallel-reduce, so parallelize
             // both if they fit and the larger if not.
             // The reason for this distinction is that we only perform Reduces over
-            // ThreadID-based values over consecutive CUDA threads, so there's no 
+            // ThreadID-based values over consecutive CUDA threads, so there's no
             // opportunity for further nested parallelization. In contrast, this
             // restriction doesn't help for parallel Writes, so nested parallelization
             // is possible.
@@ -817,10 +817,10 @@ int main() {{
         }
     }
 
-    /* 
+    /*
      * All non reduced-over collections used in fork joins have an extra dimension.
-     * However, this is only useful if ThreadIDs run in parallel not serially, 
-     * otherwise it's unnecessarily consuming shared memory. This function returns 
+     * However, this is only useful if ThreadIDs run in parallel not serially,
+     * otherwise it's unnecessarily consuming shared memory. This function returns
      * the set of collections that have an unnecessary extra dimension.
      */
     fn get_extra_dim_collects(
@@ -1036,8 +1036,8 @@ int main() {{
                     let fork_iter = self.get_fork_iter(*control, false);
                     write!(w, "{}{} = ({} / {}) % {};\n", tabs, define_variable, fork_iter, divide, modulo)?;
                 } else {
-                    // We can directly use use_thread_quota and not worry about available 
-                    // because Fork basic block's init section already does gating 
+                    // We can directly use use_thread_quota and not worry about available
+                    // because Fork basic block's init section already does gating
                     write!(w, "{}{} = (threadIdx.x % {}) / {};\n", tabs, define_variable, use_thread_quota.unwrap(), use_thread_quota.unwrap() / parallel_factor.unwrap())?;
                 }
             }
@@ -1046,8 +1046,8 @@ int main() {{
                     }
                 }
             }
-            // The Reduce node only generates it's initialization, as reduct will 
-            // perform the update. If serialized, add gate to prevent re-assignment 
+            // The Reduce node only generates it's initialization, as reduct will
+            // perform the update. If serialized, add gate to prevent re-assignment
             // when we hit this reduce again due to the control flow loop between
             // the Fork and Join.
             Node::Reduce {
@@ -1076,7 +1076,7 @@ int main() {{
             Node::Constant { id: cons_id } => {
                 let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
                 if (!is_primitive) {
-                    let cg_tile = { 
+                    let cg_tile = {
                         let KernelState::OutBlock = state
                         else {
                             panic!("Expected constant to be in start basic block outside any fork");
@@ -1153,9 +1153,9 @@ int main() {{
                 let left_val = self.get_value(*left, false, false);
                 let right_val = self.get_value(*right, false, false);
                 let id_type = self.typing[id.idx()];
-                if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And 
+                if matches!(op, BinaryOperator::Add | BinaryOperator::Or | BinaryOperator::And
                     | BinaryOperator::Xor) && is_special_reduct {
-                    // For parallelized associative Reduces, use the cooperative 
+                    // For parallelized associative Reduces, use the cooperative
                     // groups reduce API. Associative multiplication is not
                     // supported. We need to use CGType::Use not CGType::UsePerId
                     // because for parallelized reduction we only have one thread
@@ -1349,7 +1349,7 @@ int main() {{
                 panic!("Unsupported data node type")
             }
         }
-        // Since the data uses and reducts are responsible for updating Phi and 
+        // Since the data uses and reducts are responsible for updating Phi and
         // Reduce nodes, respectively, we check and emit those for each data node.
         if let Some(phis) = self.label_data_for_phi.get(&id) {
             let val = self.get_value(id, false, false);
@@ -1403,12 +1403,12 @@ int main() {{
             }
             Node::Fork { control: _, factors: _ } => {
                 // We create a cooperative group tile for each of: used threads per
-                // thread ID- for reads and writes-, used threads across all thread 
+                // thread ID- for reads and writes-, used threads across all thread
                 // IDs- for parallelized reductions-, and available threads- to
                 // synchronize between used and unused threads. We want to create
                 // these only once, so we create two goto sections for each fork-
                 // one run only once for creating groups, and other may be ran
-                // multiple times if the Fork is serialized and Join jumps back 
+                // multiple times if the Fork is serialized and Join jumps back
                 // to it.
                 let cg_tile = self.get_cg_tile(id, CGType::UsePerId);
                 if use_thread_quota.is_some() {
@@ -1453,7 +1453,7 @@ int main() {{
                 }
             }
             Node::Join { control: _ } => {
-                // Join nodes also gate the used vs unused threads with a tile 
+                // Join nodes also gate the used vs unused threads with a tile
                 // sync after the body.
                 let succ = self.control_subgraph.succs(id).next().unwrap();
                 let has_thread_quota = available_thread_quota.is_some();
@@ -1516,7 +1516,7 @@ int main() {{
     /*
      * This function emits collection name + pointer math for the provided indices.
      * One nuance is whether the collection is represented as char pointer or
-     * the original primitive pointer. For Field, it's always char, for Variant, 
+     * the original primitive pointer. For Field, it's always char, for Variant,
      * it doesn't matter here, and for Array, it depends- so we may need to tack
      * on the element size to the index math.
      */
@@ -1571,12 +1571,12 @@ int main() {{
     /*
      * The outlined codegen for constants allows us to handle recursive initialization
      * for collections. We perform "allocation" by atomically incrementing dynamic
-     * shared memory and CUDA's support for dynamic is limited to a single extern 
+     * shared memory and CUDA's support for dynamic is limited to a single extern
      * array. Dynamic is required here because not all dynamic constants and therefore
-     * array sizes are known. This approach will need further work, as currently 
-     * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate` 
-     * prevents unnecessary shared memory allocations for nested product and summation 
-     * collections, since the outermost allocates everything for the full collection. 
+     * array sizes are known. This approach will need further work, as currently
+     * we keep allocating new shmem and don't reuse any old and unused. `allow_allocate`
+     * prevents unnecessary shared memory allocations for nested product and summation
+     * collections, since the outermost allocates everything for the full collection.
      * Since not initialized, array collections don't need to be recursed into.
      */
     fn codegen_constant(
@@ -1911,7 +1911,7 @@ int main() {{
 
     /*
      * Setting `ty = true` will return with type in declaration format. `make_pointer`
-     * is only considered if `ty = true` and only relevant for primitive types- 
+     * is only considered if `ty = true` and only relevant for primitive types-
      * otherwise it makes no difference because collections are already pointers.
      */
     fn get_value(&self, id: NodeID, ty: bool, make_pointer: bool) -> String {