diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index afc016a48a39a091cf6638ade071d40cbb901afa..8f186aa7e4ff6f0a7436d94604b127ff2f1f3ea2 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -622,23 +622,23 @@ extern \"C\" {} {}(",
             write!(pass_args, "ret")?;
             write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
         }
-        write!(w, "\tcudaError_t err;\n");
+        write!(w, "\tcudaError_t err;\n")?;
         write!(
             w,
             "\t{}_gpu<<<{}, {}, {}>>>({});\n",
             self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args
         )?;
-        write!(w, "\terr = cudaGetLastError();\n");
+        write!(w, "\terr = cudaGetLastError();\n")?;
         write!(
             w,
             "\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n"
-        );
+        )?;
         write!(w, "\tcudaDeviceSynchronize();\n")?;
-        write!(w, "\terr = cudaGetLastError();\n");
+        write!(w, "\terr = cudaGetLastError();\n")?;
         write!(
             w,
             "\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n"
-        );
+        )?;
         if has_ret_var {
             // Copy return from device to host, whether it's primitive value or collection pointer
             write!(w, "\t{} host_ret;\n", ret_type)?;
@@ -1150,7 +1150,8 @@ extern \"C\" {} {}(",
             // for all threads. Otherwise, it can be inside or outside block fork.
             // If inside, it's stored in shared memory so we "allocate" it once
             // and parallelize memset to 0. If outside, we initialize as offset
-            // to backing, but if multi-block grid, don't memset to avoid grid-level sync.
+            // to backing, but if multi-block grid, don't memset to avoid grid-
+            // level sync.
             Node::Constant { id: cons_id } => {
                 let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
                 let cg_tile = match state {
@@ -1192,9 +1193,7 @@ extern \"C\" {} {}(",
                 )?;
             }
             if !is_primitive
-                && (state != KernelState::OutBlock
-                    || is_block_parallel.is_none()
-                    || !is_block_parallel.unwrap())
+                && (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false))
             {
                 let data_size =
                     self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects));
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index b13c919abe4f7f99874185f110e7a145c9327d5e..65f7c2d06c1ace23e9d698387e714ade24f92c59 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -90,6 +90,7 @@ pub fn gcm(
         loops,
         fork_join_map,
         objects,
+        devices,
     );
 
     let liveness = liveness_dataflow(
@@ -174,6 +175,7 @@ fn basic_blocks(
     loops: &LoopTree,
     fork_join_map: &HashMap<NodeID, NodeID>,
     objects: &CollectionObjects,
+    devices: &Vec<Device>,
 ) -> BasicBlocks {
     let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()];
 
@@ -421,9 +423,18 @@ fn basic_blocks(
             // If the next node further up the dominator tree is in a shallower
             // loop nest or if we can get out of a reduce loop when we don't
             // need to be in one, place this data node in a higher-up location.
-            // Only do this is the node isn't a constant or undef.
+            // Only do this if the node isn't a constant or undef - if a
+            // node is a constant or undef, we want its placement to be as
+            // control dependent as possible, even inside loops. In GPU
+            // functions specifically, lift constants that may be returned
+            // outside fork-joins.
             let is_constant_or_undef = function.nodes[id.idx()].is_constant()
                 || function.nodes[id.idx()].is_undef();
+            let is_gpu_returned = devices[func_id.idx()] == Device::CUDA
+                && objects[&func_id]
+                    .objects(id)
+                    .into_iter()
+                    .any(|obj| objects[&func_id].returned_objects().contains(obj));
             let old_nest = loops
                 .header_of(location)
                 .map(|header| loops.nesting(header).unwrap());
@@ -444,7 +455,10 @@ fn basic_blocks(
             // loop use the reduce node forming the loop, so the dominator chain
             // will consist of one block, and this loop won't ever iterate.
             let currently_at_join = function.nodes[location.idx()].is_join();
-            if !is_constant_or_undef && (shallower_nest || currently_at_join) {
+
+            if (!is_constant_or_undef || is_gpu_returned)
+                && (shallower_nest || currently_at_join)
+            {
                 location = control_node;
             }
         }
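Note on the gpu.rs change: besides propagating the previously dropped `write!` results with `?` (the `Result` was silently ignored, which rustc also warns about), the condition rewrite is a pure simplification. For an `Option<bool>`, `opt.is_none() || !opt.unwrap()` agrees with `!opt.unwrap_or(false)` on all three possible values. A standalone check (`old_form`/`new_form` are illustrative names, not from the codebase):

    fn old_form(is_block_parallel: Option<bool>) -> bool {
        is_block_parallel.is_none() || !is_block_parallel.unwrap()
    }

    fn new_form(is_block_parallel: Option<bool>) -> bool {
        !is_block_parallel.unwrap_or(false)
    }

    fn main() {
        // Both forms map None => true, Some(false) => true, Some(true) => false.
        for opt in [None, Some(false), Some(true)] {
            assert_eq!(old_form(opt), new_form(opt));
        }
    }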
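Note on the gcm.rs change: constants and undefs are normally kept as control dependent as possible, but a constant collection that a CUDA function may return has to be lifted out of fork-joins so it is materialized outside the parallel region. A minimal sketch of the resulting placement predicate, using hypothetical stand-in types for the device list and the collection-objects analysis (the real `CollectionObjects` API differs):

    #[derive(Clone, Copy, PartialEq)]
    enum Device {
        LLVM,
        CUDA,
    }

    /// Hypothetical stand-in for the per-function collection-objects analysis.
    struct FunctionObjects {
        objects_per_node: Vec<Vec<usize>>, // objects each node may evaluate to
        returned: Vec<usize>,              // objects the function may return
    }

    fn lift_to_higher_block(
        device: Device,
        objs: &FunctionObjects,
        node: usize,
        is_constant_or_undef: bool,
        shallower_nest: bool,
        currently_at_join: bool,
    ) -> bool {
        // GPU-returned objects are the exception: they move up even when
        // the node is a constant that would otherwise stay put.
        let is_gpu_returned = device == Device::CUDA
            && objs.objects_per_node[node]
                .iter()
                .any(|obj| objs.returned.contains(obj));
        (!is_constant_or_undef || is_gpu_returned) && (shallower_nest || currently_at_join)
    }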