diff --git a/.gitignore b/.gitignore
index 29eb3e0464828eef0883e286d0a2c7e5396fe2f3..749cea40376eed265a3734958bb1b298e2f083de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@
 *.o
 *.a
 *.hrt
-.*.swp
+*.png
+*.swp
 .vscode
 *_env
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 31c502121e4e02ebb6ff1d121d6cd2d2de90739d..a0281795ac25fad03146e7d5301f351de90b4706 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -183,21 +183,19 @@ pub fn gpu_codegen<W: Write>(
         threads_per_warp: 32,
     };
 
-    let label_data_for_phi = || -> HashMap<NodeID, Vec<NodeID>> {
-        let mut label_data_for_phi = HashMap::new();
-        for (idx, node) in function.nodes.iter().enumerate() {
-            if let Node::Phi { control: _, data } = node {
-                for &data_id in data.iter() {
-                    label_data_for_phi
-                        .entry(data_id)
-                        .or_insert(vec![])
-                        .push(NodeID::new(idx));
-                }
+    // Map from each region predecessor (control node) to the (data, phi) pairs
+    // it must assign: for each phi, we pair its region's preds with its data inputs.
+    let control_data_phi_map: &mut HashMap<NodeID, Vec<(NodeID, NodeID)>> = &mut HashMap::new();
+    for (idx, node) in function.nodes.iter().enumerate() {
+        if let Node::Phi { control, data } = node {
+            let Node::Region { preds } = &function.nodes[control.idx()] else {
+                panic!("Phi's control must be a region node");
+            };
+            for (i, &pred) in preds.iter().enumerate() {
+                control_data_phi_map.entry(pred).or_insert(vec![]).push((data[i], NodeID::new(idx)));
             }
         }
-        label_data_for_phi
-    };
-    let label_data_for_phi = &label_data_for_phi();
+    }
 
     let def_use_map = &def_use(function);
 
@@ -215,7 +213,7 @@ pub fn gpu_codegen<W: Write>(
         join_fork_map,
         fork_reduce_map,
         reduct_reduce_map,
-        label_data_for_phi,
+        control_data_phi_map,
         return_type_id,
     };
     ctx.codegen_function(w)
@@ -241,7 +239,7 @@ struct GPUContext<'a> {
     join_fork_map: &'a HashMap<NodeID, NodeID>,
     fork_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>,
     reduct_reduce_map: &'a HashMap<NodeID, Vec<NodeID>>,
-    label_data_for_phi: &'a HashMap<NodeID, Vec<NodeID>>,
+    control_data_phi_map: &'a HashMap<NodeID, Vec<(NodeID, NodeID)>>,
     return_type_id: &'a TypeID,
 }
 
@@ -490,6 +488,9 @@ namespace cg = cooperative_groups;
                 !self.function.nodes[id.idx()].is_parameter() {
                 write!(w, "\t{};\n", self.get_value(id, true, false))?;
             }
+            if self.function.nodes[id.idx()].is_phi() {
+                write!(w, "\t{}_tmp;\n", self.get_value(id, true, false))?;
+            }
         }
         Ok(())
     }
@@ -900,9 +901,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                 for data in self.bbs.1[control.idx()].iter() {
                     self.codegen_data_node(*data, KernelState::OutBlock, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?;
                 }
-                for data in self.bbs.1[control.idx()].iter() {
-                    self.codegen_data_phi(*data, tabs, body)?;
-                }
                 Ok(())
             })
     }
@@ -935,9 +933,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
             for data in self.bbs.1[control.idx()].iter() {
                 self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?;
             }
-            for data in self.bbs.1[control.idx()].iter() {
-                self.codegen_data_phi(*data, tabs, body)?;
-            }
         }
         // Then generate data and control for the single block fork if it exists
         if block_fork.is_some() {
@@ -952,9 +947,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                 for data in self.bbs.1[control.idx()].iter() {
                     self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?;
                 }
-                for data in self.bbs.1[control.idx()].iter() {
-                    self.codegen_data_phi(*data, tabs, body)?;
-                }
             }
         }
         // Then generate for the thread fork tree through Fork node traversal.
@@ -1034,9 +1026,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                     &mut tabs,
                 )?;
             }
-            for data in self.bbs.1[control.idx()].iter() {
-                self.codegen_data_phi(*data, tabs, body)?;
-            }
         }
         for child in fork_tree.get(&curr_fork).unwrap() {
             self.codegen_data_control_traverse(
@@ -1071,12 +1060,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
         let define_variable = self.get_value(id, false, false).to_string();
         let tabs = "\t".repeat(*num_tabs);
         match &self.function.nodes[id.idx()] {
-            // Phi registers emitted at top and the data nodes it uses will
-            // update the phi
-            Node::Phi {
-                control: _,
-                data: _,
-            } => {}
+            Node::Phi { control: _, data: _ } => {
+                write!(w, "{}{} = {}_tmp;\n", tabs, define_variable, define_variable)?;
+            }
             Node::ThreadID { control, dimension } => {
                 let Node::Fork { factors, .. } = &self.function.nodes[control.idx()] else {
                     panic!("Expected ThreadID's control to be a fork node");
@@ -1413,8 +1399,8 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                 panic!("Unsupported data node type")
             }
         }
-        // Since reducts are responsible for updating Reduce nodes,
-        // we check and emit those for each data node.
+        // Since reducts are responsible for updating Reduce nodes, we check and
+        // emit those for each data node.
         if let Some(reduces) = self.reduct_reduce_map.get(&id) {
             let val = self.get_value(id, false, false);
             for reduce in reduces {
@@ -1425,22 +1411,6 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
         Ok(())
     }
 
-    /*
-     * Update Phi assignments for each data node. This is run after all data nodes
-     * for given control block have been emitted.
-     */
-    fn codegen_data_phi(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> {
-        let tabs = "\t".repeat(num_tabs);
-        if let Some(phis) = self.label_data_for_phi.get(&id) {
-            let val = self.get_value(id, false, false);
-            for phi in phis {
-                let phi_val = self.get_value(*phi, false, false);
-                write!(w, "{}{} = {};\n", tabs, phi_val, val)?;
-            }
-        }
-        Ok(())
-    }
-
     fn codegen_control_node(
         &self,
         id: NodeID,
@@ -1451,6 +1421,11 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
         w_post_init: &mut String,
         w_term: &mut String,
     ) -> Result<usize, Error> {
+        for (data, phi) in self.control_data_phi_map.get(&id).into_iter().flatten() {
+            let data = self.get_value(*data, false, false);
+            let phi = self.get_value(*phi, false, false);
+            write!(w_term, "\t{}_tmp = {};\n", phi, data)?;
+        }
         let tabs = match &self.function.nodes[id.idx()] {
             Node::Start
             | Node::Region { preds: _ }
@@ -1572,7 +1547,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                 // we write to that parameter upon return.
                 if self.types[self.typing[data.idx()].idx()].is_primitive() {
                     let return_val = self.get_value(*data, false, false);
-                    write!(w_term, "\tif (threadIdx.x == 0) {{\n")?;
+                    write!(w_term, "\tif (grid.thread_rank() == 0) {{\n")?;
                     write!(w_term, "\t\t*ret = {};\n", return_val)?;
                     write!(w_term, "\t}}\n")?;
                 }
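
Note on the phi lowering above: the backend now gives every phi a `_tmp` shadow register, each region predecessor writes the incoming value into that shadow from its terminator (see `codegen_control_node`), and the phi's own block then copies the shadow into the phi register. A minimal, runnable sketch of why the extra copy matters, in plain Rust with hypothetical names standing in for the emitted kernel locals: two loop-carried phis that swap each iteration would clobber each other if predecessors wrote the phi registers directly.

```rust
fn main() {
    // Phi "registers", declared up front like the locals the backend emits.
    // The initial zeros are dummies; they are overwritten before first use.
    let mut phi_a = 0;
    let mut phi_b = 0;
    let mut phi_a_tmp;
    let mut phi_b_tmp;

    // Entry predecessor's terminator: write the incoming values into the shadows.
    phi_a_tmp = 1;
    phi_b_tmp = 2;

    for _ in 0..3 {
        // Region block: each phi reads only its own shadow.
        phi_a = phi_a_tmp;
        phi_b = phi_b_tmp;

        // Loop-body predecessor's terminator: write only the shadows, so the
        // swap cannot clobber a value the other phi still needs.
        phi_a_tmp = phi_b;
        phi_b_tmp = phi_a;
    }

    // The swap round-trips correctly; with direct writes to phi_a/phi_b both
    // registers would end up holding the same value.
    println!("{} {} (next: {} {})", phi_a, phi_b, phi_a_tmp, phi_b_tmp);
}
```

The same file also narrows the return-value guard from `threadIdx.x == 0` (one thread per block) to `grid.thread_rank() == 0` (a single thread in the whole grid), so the primitive return value is written exactly once.
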
diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs
index fe6fee0946b4eb05548ac79c31996095585073e1..4d526366808e8b2aea39fecd81f6c00269ffb154 100644
--- a/hercules_ir/src/dot.rs
+++ b/hercules_ir/src/dot.rs
@@ -1,5 +1,5 @@
 use std::collections::HashMap;
-use std::env::{temp_dir};
+use std::env::temp_dir;
 use std::fmt::Write;
 use std::fs::File;
 use std::io::Write as _;
@@ -23,7 +23,6 @@ pub fn xdot_module(
     let mut rng = rand::thread_rng();
     let num: u64 = rng.gen();
     tmp_path.push(format!("hercules_dot_{}.dot", num));
-    let tmp_path = std::path::PathBuf::from(format!("hercules_dot.dot"));
     let mut file = File::create(&tmp_path).expect("PANIC: Unable to open output file.");
     let mut contents = String::new();
     write_dot(
diff --git a/juno_build/src/lib.rs b/juno_build/src/lib.rs
index 4066080653c75b42705ed129c0b43f41b30491df..0c676e4c1b203c53e98a8430e0f2354104540e07 100644
--- a/juno_build/src/lib.rs
+++ b/juno_build/src/lib.rs
@@ -27,7 +27,7 @@ impl JunoCompiler {
             src_path: None,
             out_path: None,
             verify: JunoVerify::None,
-            x_dot: true,
+            x_dot: false,
             schedule: JunoSchedule::None,
         }
     }
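
For reference, the shape of the `control_data_phi_map` built in the gpu.rs hunk can be reproduced on a toy IR. This is a sketch with simplified stand-in types (the `Node`/`NodeID` definitions below are not Hercules' real ones): each phi contributes one `(incoming value, phi)` pair per region predecessor, keyed by that predecessor, which is what lets codegen emit the `phi_tmp = value` copies in the predecessor's terminator instead of after every data node.

```rust
use std::collections::HashMap;

// Simplified stand-ins for the IR types; the real Hercules definitions differ.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct NodeID(usize);

enum Node {
    Region { preds: Vec<NodeID> },
    Phi { control: NodeID, data: Vec<NodeID> },
    Other,
}

// Key each (incoming value, phi) pair by the region predecessor supplying the
// value, mirroring the map the gpu backend now builds.
fn control_data_phi_map(nodes: &[Node]) -> HashMap<NodeID, Vec<(NodeID, NodeID)>> {
    let mut map: HashMap<NodeID, Vec<(NodeID, NodeID)>> = HashMap::new();
    for (idx, node) in nodes.iter().enumerate() {
        if let Node::Phi { control, data } = node {
            let Node::Region { preds } = &nodes[control.0] else {
                panic!("phi's control must be a region node");
            };
            // data[i] flows in along preds[i], so pair them positionally.
            for (&pred, &value) in preds.iter().zip(data.iter()) {
                map.entry(pred).or_default().push((value, NodeID(idx)));
            }
        }
    }
    map
}

fn main() {
    // A two-predecessor region (node 2) with one phi (node 3) selecting
    // between the values produced by nodes 4 and 5.
    let nodes = vec![
        Node::Other,                                        // 0: entry control
        Node::Other,                                        // 1: back-edge control
        Node::Region { preds: vec![NodeID(0), NodeID(1)] }, // 2
        Node::Phi { control: NodeID(2), data: vec![NodeID(4), NodeID(5)] }, // 3
        Node::Other,                                        // 4: initial value
        Node::Other,                                        // 5: loop-carried value
    ];
    // Expect: NodeID(0) -> [(NodeID(4), NodeID(3))], NodeID(1) -> [(NodeID(5), NodeID(3))]
    println!("{:?}", control_data_phi_map(&nodes));
}
```
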