diff --git a/.gitignore b/.gitignore
index 45f2e61bd195df944e7db69346bdd6b434308c6f..29eb3e0464828eef0883e286d0a2c7e5396fe2f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,11 +4,10 @@
 *.out
 *.ll
 *.c
+*.cu
 *.o
 *.a
 *.hrt
 .*.swp
 .vscode
 *_env
-
-juno_samples/matmul/src/matmul_indented.jn
diff --git a/Cargo.lock b/Cargo.lock
index a1eb77de19e2b05c2d96c44fe6579b9b71fdd62f..3f1e2e7c9c082585011c0f2b27b0b89fe2ea38c4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -374,9 +374,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.9"
+version = "1.2.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b"
+checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229"
 dependencies = [
  "jobserver",
  "libc",
@@ -1278,9 +1278,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
 
 [[package]]
 name = "miniz_oxide"
-version = "0.8.2"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ffbe83022cedc1d264172192511ae958937694cd57ce297164951b8b3568394"
+checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924"
 dependencies = [
  "adler2",
  "simd-adler32",
diff --git a/Cargo.toml b/Cargo.toml
index c57125f7f51c3a8d5c51e81d1fc482ea920fd62d..4e5826caf264974b1e50e13f0bf91083ad43e097 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,13 +5,13 @@ members = [
 	"hercules_ir",
 	"hercules_opt",
 	"hercules_rt",
-	
+
 	"hercules_tools/hercules_driver",
 
 	"juno_frontend",
 	"juno_scheduler",
 	"juno_build",
-	
+
 	#"hercules_test/hercules_interpreter",
 	#"hercules_test/hercules_tests",
 
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 731e57e3cbc8c5bd3f829c6f2aa85602cc84a001..31c502121e4e02ebb6ff1d121d6cd2d2de90739d 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -638,7 +638,10 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                 let ret_type_pnt = self.get_type(*self.return_type_id, true);
                 write!(w, "\t{} ret;\n", ret_type_pnt)?;
                 write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
-                write!(pass_args, ", ret")?;
+		if !first_param {
+                    write!(pass_args, ", ")?;
+		}
+                write!(pass_args, "ret")?;
             }
             write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?;
             write!(w, "\tcudaDeviceSynchronize();\n")?;
@@ -897,6 +900,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                 for data in self.bbs.1[control.idx()].iter() {
                     self.codegen_data_node(*data, KernelState::OutBlock, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?;
                 }
+                for data in self.bbs.1[control.idx()].iter() {
+                    self.codegen_data_phi(*data, tabs, body)?;
+                }
                 Ok(())
             })
     }
@@ -929,6 +935,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
             for data in self.bbs.1[control.idx()].iter() {
                 self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?;
             }
+            for data in self.bbs.1[control.idx()].iter() {
+                self.codegen_data_phi(*data, tabs, body)?;
+            }
         }
         // Then generate data and control for the single block fork if it exists
         if block_fork.is_some() {
@@ -943,6 +952,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                 for data in self.bbs.1[control.idx()].iter() {
                     self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?;
                 }
+                for data in self.bbs.1[control.idx()].iter() {
+                    self.codegen_data_phi(*data, tabs, body)?;
+                }
             }
         }
         // Then generate for the thread fork tree through Fork node traversal.
@@ -1022,6 +1034,9 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                     &mut tabs,
                 )?;
             }
+            for data in self.bbs.1[control.idx()].iter() {
+                self.codegen_data_phi(*data, tabs, body)?;
+            }
         }
         for child in fork_tree.get(&curr_fork).unwrap() {
             self.codegen_data_control_traverse(
@@ -1398,15 +1413,8 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                 panic!("Unsupported data node type")
             }
         }
-        // Since the data uses and reducts are responsible for updating Phi and
-        // Reduce nodes, respectively, we check and emit those for each data node.
-        if let Some(phis) = self.label_data_for_phi.get(&id) {
-            let val = self.get_value(id, false, false);
-            for phi in phis {
-                let phi_val = self.get_value(*phi, false, false);
-                write!(w, "{}{} = {};\n", tabs, phi_val, val,)?;
-            }
-        }
+        // Since reducts are responsible for updating Reduce nodes,
+        // we check and emit those for each data node.
         if let Some(reduces) = self.reduct_reduce_map.get(&id) {
             let val = self.get_value(id, false, false);
             for reduce in reduces {
@@ -1417,6 +1425,22 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
         Ok(())
     }
 
+    /*
+     * Update Phi assignments for each data node. This is run after all data nodes
+     * for given control block have been emitted.
+     */
+    fn codegen_data_phi(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> {
+        let tabs = "\t".repeat(num_tabs);
+        if let Some(phis) = self.label_data_for_phi.get(&id) {
+            let val = self.get_value(id, false, false);
+            for phi in phis {
+                let phi_val = self.get_value(*phi, false, false);
+                write!(w, "{}{} = {};\n", tabs, phi_val, val)?;
+            }
+        }
+        Ok(())
+    }
+
     fn codegen_control_node(
         &self,
         id: NodeID,
diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs
index 4d526366808e8b2aea39fecd81f6c00269ffb154..fe6fee0946b4eb05548ac79c31996095585073e1 100644
--- a/hercules_ir/src/dot.rs
+++ b/hercules_ir/src/dot.rs
@@ -1,5 +1,5 @@
 use std::collections::HashMap;
-use std::env::temp_dir;
+use std::env::{temp_dir};
 use std::fmt::Write;
 use std::fs::File;
 use std::io::Write as _;
@@ -23,6 +23,7 @@ pub fn xdot_module(
     let mut rng = rand::thread_rng();
     let num: u64 = rng.gen();
     tmp_path.push(format!("hercules_dot_{}.dot", num));
+    let tmp_path = std::path::PathBuf::from(format!("hercules_dot.dot"));
     let mut file = File::create(&tmp_path).expect("PANIC: Unable to open output file.");
     let mut contents = String::new();
     write_dot(
diff --git a/hercules_opt/src/pass.rs b/hercules_opt/src/pass.rs
index 149e4eeb84f157714a6529ba830541e54f599900..dbc24016d1888a5bdce229f48e4522f1fa41b03b 100644
--- a/hercules_opt/src/pass.rs
+++ b/hercules_opt/src/pass.rs
@@ -1079,6 +1079,12 @@ impl PassManager {
                         file.write_all(cuda_ir.as_bytes())
                             .expect("PANIC: Unable to write output CUDA IR file contents.");
 
+                        let cuda_text_path = format!("{}.cu", module_name);
+                        let mut cuda_text_file = File::create(&cuda_text_path)
+                            .expect("PANIC: Unable to open CUDA IR text file.");
+                        cuda_text_file.write_all(cuda_ir.as_bytes())
+                            .expect("PANIC: Unable to write CUDA IR text file contents.");
+
                         let mut nvcc_process = Command::new("nvcc")
                             .arg("-c")
                             .arg("-O3")
diff --git a/hercules_samples/call/Cargo.toml b/hercules_samples/call/Cargo.toml
index 52c588e7032a79bf0043a6334e818e68b727f1b3..a5a44c2e061043101e377bff1492a4eaba995927 100644
--- a/hercules_samples/call/Cargo.toml
+++ b/hercules_samples/call/Cargo.toml
@@ -4,11 +4,14 @@ version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = ["juno_build/cuda"]
+
 [build-dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 
 [dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 rand = "*"
 async-std = "*"
 with_builtin_macros = "0.1.0"
diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs
index 0b984a0f4f3581cd4cd1a3b2dcf067ef4ca4c83e..f04d48c7d0ea6df8b16d70b05cedabfc04c1f6f2 100644
--- a/hercules_samples/ccp/build.rs
+++ b/hercules_samples/ccp/build.rs
@@ -6,11 +6,4 @@ fn main() {
         .unwrap()
         .build()
         .unwrap();
-
-    #[cfg(feature = "cuda")]
-    println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
-    #[cfg(feature = "cuda")]
-    println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
-    #[cfg(feature = "cuda")]
-    println!("cargo:rustc-link-lib=cudart");
 }
diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs
index 43cd34f9300acbd21598ef5685802be1a0ffd58d..2a239bc6c3ebd3780cb15358375c59bdfb2e25ae 100644
--- a/hercules_samples/dot/build.rs
+++ b/hercules_samples/dot/build.rs
@@ -6,11 +6,4 @@ fn main() {
         .unwrap()
         .build()
         .unwrap();
-
-    #[cfg(feature = "cuda")]
-    println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
-    #[cfg(feature = "cuda")]
-    println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
-    #[cfg(feature = "cuda")]
-    println!("cargo:rustc-link-lib=cudart");
 }
diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs
index 6863b27bc4802abf8dded89759cef5f9e26fbe95..4d8226f11183d9500e6affec4c46110e8626ee69 100644
--- a/hercules_samples/fac/build.rs
+++ b/hercules_samples/fac/build.rs
@@ -6,11 +6,4 @@ fn main() {
         .unwrap()
         .build()
         .unwrap();
-
-    #[cfg(feature = "cuda")]
-    println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
-    #[cfg(feature = "cuda")]
-    println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
-    #[cfg(feature = "cuda")]
-    println!("cargo:rustc-link-lib=cudart");
 }
diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs
index b170024bd280183bc559af88e0570d90c1e0fed2..08478deaac459d9a94f79fdabce37da9a1205f89 100644
--- a/hercules_samples/matmul/build.rs
+++ b/hercules_samples/matmul/build.rs
@@ -6,11 +6,4 @@ fn main() {
         .unwrap()
         .build()
         .unwrap();
-
-    #[cfg(feature = "cuda")]
-    println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
-    #[cfg(feature = "cuda")]
-    println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
-    #[cfg(feature = "cuda")]
-    println!("cargo:rustc-link-lib=cudart");
 }
diff --git a/hercules_samples/matmul/src/matmul.hir b/hercules_samples/matmul/src/matmul.hir
index 400ab5e14e91169ce9152a79c3401456baa18241..b0c31da4b32207bf3308c4b03583bc74c61f3737 100644
--- a/hercules_samples/matmul/src/matmul.hir
+++ b/hercules_samples/matmul/src/matmul.hir
@@ -1,9 +1,9 @@
-fn matmul(a: array(i32, 16, 64), b: array(i32, 64, 32)) -> array(i32, 16, 32)
-  c = constant(array(i32, 16, 32), [])
-  i_j_ctrl = fork(start, 16, 32)
+fn matmul<3>(a: array(i32, #0, #1), b: array(i32, #1, #2)) -> array(i32, #0, #2)
+  c = constant(array(i32, #0, #2), [])
+  i_j_ctrl = fork(start, #0, #2)
   i_idx = thread_id(i_j_ctrl, 0)
   j_idx = thread_id(i_j_ctrl, 1)
-  k_ctrl = fork(i_j_ctrl, 64)
+  k_ctrl = fork(i_j_ctrl, #1)
   k_idx = thread_id(k_ctrl, 0)
   k_join_ctrl = join(k_ctrl)
   i_j_join_ctrl = join(k_join_ctrl)
@@ -15,4 +15,4 @@ fn matmul(a: array(i32, 16, 64), b: array(i32, 64, 32)) -> array(i32, 16, 32)
   add = add(mul, dot)
   dot = reduce(k_join_ctrl, zero, add)
   update_c = write(update_i_j_c, dot, position(i_idx, j_idx))
-  update_i_j_c = reduce(i_j_join_ctrl, c, update_c)
\ No newline at end of file
+  update_i_j_c = reduce(i_j_join_ctrl, c, update_c)
diff --git a/juno_build/build.rs b/juno_build/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..7ba34c8c9395926bd16a92dcf831eadda3f3af26
--- /dev/null
+++ b/juno_build/build.rs
@@ -0,0 +1,8 @@
+fn main() {
+    #[cfg(feature = "cuda")]
+    println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
+    #[cfg(feature = "cuda")]
+    println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64");
+    #[cfg(feature = "cuda")]
+    println!("cargo:rustc-link-lib=cudart");
+}
diff --git a/juno_build/src/lib.rs b/juno_build/src/lib.rs
index 0c676e4c1b203c53e98a8430e0f2354104540e07..4066080653c75b42705ed129c0b43f41b30491df 100644
--- a/juno_build/src/lib.rs
+++ b/juno_build/src/lib.rs
@@ -27,7 +27,7 @@ impl JunoCompiler {
             src_path: None,
             out_path: None,
             verify: JunoVerify::None,
-            x_dot: false,
+            x_dot: true,
             schedule: JunoSchedule::None,
         }
     }
diff --git a/juno_samples/antideps/Cargo.toml b/juno_samples/antideps/Cargo.toml
index e492e2ae7f23c5afa4f9c1ddc50e98dd93f40e4d..e6f38e095f492b2d7934b6ff969fa468137d1d2c 100644
--- a/juno_samples/antideps/Cargo.toml
+++ b/juno_samples/antideps/Cargo.toml
@@ -8,11 +8,14 @@ edition = "2021"
 name = "juno_antideps"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 
 [dependencies]
-juno_build = { path = "../../juno_build", features = [] }
-hercules_rt = { path = "../../hercules_rt", features = [] }
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/antideps/antideps.mod b/juno_samples/antideps/antideps.mod
new file mode 100644
index 0000000000000000000000000000000000000000..b4abaef48222ace6f66264d6746dd2d7924b9039
Binary files /dev/null and b/juno_samples/antideps/antideps.mod differ
diff --git a/juno_samples/casts_and_intrinsics/Cargo.toml b/juno_samples/casts_and_intrinsics/Cargo.toml
index 83d5be587125dc9c846b38c3c15089dfda5c3ff8..9fac18b77db9e01bb55ad491a93f1558a5339c54 100644
--- a/juno_samples/casts_and_intrinsics/Cargo.toml
+++ b/juno_samples/casts_and_intrinsics/Cargo.toml
@@ -8,10 +8,13 @@ edition = "2021"
 name = "juno_casts_and_intrinsics"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda"]
+
 [build-dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 
 [dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/cava/Cargo.toml b/juno_samples/cava/Cargo.toml
index dfde5978e688cc9635308f6418573cd4e4a22c65..ff375d80d637e2ac9baa1377e168f912454b5fbc 100644
--- a/juno_samples/cava/Cargo.toml
+++ b/juno_samples/cava/Cargo.toml
@@ -9,11 +9,11 @@ name = "juno_cava"
 path = "src/main.rs"
 
 [build-dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 
 [dependencies]
-juno_build = { path = "../../juno_build", features = [] }
-hercules_rt = { path = "../../hercules_rt", features = [] }
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 async-std = "*"
 clap = { version = "*", features = ["derive"] }
 image = "*"
diff --git a/juno_samples/concat/Cargo.toml b/juno_samples/concat/Cargo.toml
index 888a083fa41dd1f154d8b3e72a58d666b43860c2..24ba1acff56dc3997162c1f3afc2334fd3428155 100644
--- a/juno_samples/concat/Cargo.toml
+++ b/juno_samples/concat/Cargo.toml
@@ -9,10 +9,10 @@ name = "juno_concat"
 path = "src/main.rs"
 
 [build-dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 
 [dependencies]
-juno_build = { path = "../../juno_build", features = [] }
-hercules_rt = { path = "../../hercules_rt", features = [] }
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/implicit_clone/Cargo.toml b/juno_samples/implicit_clone/Cargo.toml
index 4f5387e7a3068536657e50b9852b9006052318f7..9612937190366bea144ade4b9ae334b228233b49 100644
--- a/juno_samples/implicit_clone/Cargo.toml
+++ b/juno_samples/implicit_clone/Cargo.toml
@@ -8,11 +8,14 @@ edition = "2021"
 name = "juno_implicit_clone"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 
 [dependencies]
-juno_build = { path = "../../juno_build", features = [] }
-hercules_rt = { path = "../../hercules_rt", features = [] }
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/matmul/Cargo.toml b/juno_samples/matmul/Cargo.toml
index 1e004dcd5cd9a56dacfe9f70cee64ae883b90695..eac83d15f430a7752d14a1ec3105c88209143477 100644
--- a/juno_samples/matmul/Cargo.toml
+++ b/juno_samples/matmul/Cargo.toml
@@ -8,12 +8,15 @@ edition = "2021"
 name = "juno_matmul"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 
 [dependencies]
-juno_build = { path = "../../juno_build", features = [] }
-hercules_rt = { path = "../../hercules_rt", features = [] }
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
 rand = "*"
diff --git a/juno_samples/nested_ccp/Cargo.toml b/juno_samples/nested_ccp/Cargo.toml
index bcf1fff8fac48b20868b5c6d84f350d98723c637..5ee3f747477c901806642818553e94b046b50242 100644
--- a/juno_samples/nested_ccp/Cargo.toml
+++ b/juno_samples/nested_ccp/Cargo.toml
@@ -8,11 +8,14 @@ edition = "2021"
 name = "juno_nested_ccp"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 
 [dependencies]
-juno_build = { path = "../../juno_build", features = [] }
-hercules_rt = { path = "../../hercules_rt", features = [] }
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"
diff --git a/juno_samples/simple3/Cargo.toml b/juno_samples/simple3/Cargo.toml
index c66dc977c705ec15958a0511b507b0edabe8b663..36d50dbd75c410b88142bb5ca5d680fbac0be3ce 100644
--- a/juno_samples/simple3/Cargo.toml
+++ b/juno_samples/simple3/Cargo.toml
@@ -8,11 +8,14 @@ edition = "2021"
 name = "juno_simple3"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
-juno_build = { path = "../../juno_build", features = [] }
+juno_build = { path = "../../juno_build" }
 
 [dependencies]
-juno_build = { path = "../../juno_build", features = [] }
-hercules_rt = { path = "../../hercules_rt", features = [] }
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
 with_builtin_macros = "0.1.0"
 async-std = "*"