diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index a6711a33ab0325afb84362e8f3e7b3827efb0db7..074281f8231739665f224e0fbd93f1cb337b0bc3 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -534,6 +534,8 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             write!(pass_args, "p{}", idx)?;
         }
         write!(w, ") {{\n")?;
+        // For case of dynamic block count
+        self.codegen_dynamic_constants(w)?;
         let has_ret_var = self.return_parameter.is_none();
         if has_ret_var {
             // Allocate return parameter and lift to kernel argument
@@ -584,7 +586,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                 self.collection_objects.origin(*object).try_parameter().is_some()
             })
         }), "All collection reduces in block fork must originate from parameters");
-        if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork)
+        if true || self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork)
         {
             let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * ");
             (root_forks, fork_size, true)
@@ -973,7 +975,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                 reduct: _,
             } => {
                 let init_val = self.get_value(*init, false, false);
-                if parallel_factor.is_none() {
+                if parallel_factor.is_none() && KernelState::InThread == state {
                     let Some(nesting_fork) = nesting_fork else {
                         panic!("Expected reduce to be nested in a fork node");
                     };
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 452c1995ee313e77d787ba7b769788c6e67b1271..d01fef736f7fcacc5d74eb06a11f2d5cb5a7a3f2 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -183,6 +183,8 @@ struct PassManager {
     pub postdoms: Option<Vec<DomTree>>,
     pub fork_join_maps: Option<Vec<HashMap<NodeID, NodeID>>>,
     pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>,
+    pub fork_control_maps: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
+    pub fork_trees: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
     pub loops: Option<Vec<LoopTree>>,
     pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
     pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>,
@@ -214,6 +216,8 @@ impl PassManager {
             postdoms: None,
             fork_join_maps: None,
             fork_join_nests: None,
+            fork_control_maps: None,
+            fork_trees: None,
             loops: None,
             reduce_cycles: None,
             data_nodes_in_fork_joins: None,
@@ -332,6 +336,29 @@ impl PassManager {
         }
     }
 
+    /// Fills `fork_control_maps` by mapping `fork_control_map` over the fork-join nests; no-op if set.
+    pub fn make_fork_control_maps(&mut self) {
+        if self.fork_control_maps.is_none() {
+            self.make_fork_join_nests();
+            let nests = self.fork_join_nests.as_ref().unwrap();
+            self.fork_control_maps = Some(nests.iter().map(fork_control_map).collect());
+        }
+    }
+
+    /// Fills `fork_trees` by calling `fork_tree` on each function paired with its
+    /// fork-join nest. Does nothing when `fork_trees` is already populated.
+    pub fn make_fork_trees(&mut self) {
+        if self.fork_trees.is_some() {
+            return;
+        }
+        self.make_fork_join_nests();
+        let nests = self.fork_join_nests.as_ref().unwrap();
+        let trees = zip(self.functions.iter(), nests.iter())
+            .map(|(func, nest)| fork_tree(func, nest))
+            .collect();
+        self.fork_trees = Some(trees);
+    }
+
     pub fn make_loops(&mut self) {
         if self.loops.is_none() {
             self.make_control_subgraphs();
@@ -464,6 +491,10 @@ impl PassManager {
         self.make_control_subgraphs();
         self.make_collection_objects();
         self.make_callgraph();
+        self.make_def_uses();
+        self.make_fork_join_maps();
+        self.make_fork_control_maps();
+        self.make_fork_trees();
 
         let PassManager {
             functions,
@@ -476,6 +507,10 @@ impl PassManager {
             bbs: Some(bbs),
             collection_objects: Some(collection_objects),
             callgraph: Some(callgraph),
+            def_uses: Some(def_uses),
+            fork_join_maps: Some(fork_join_maps),
+            fork_control_maps: Some(fork_control_maps),
+            fork_trees: Some(fork_trees),
             ..
         } = self
         else {
@@ -497,6 +532,7 @@ impl PassManager {
 
         let mut rust_rt = String::new();
         let mut llvm_ir = String::new();
+        let mut cuda_ir = String::new();
         for idx in 0..module.functions.len() {
             match devices[idx] {
                 Device::LLVM => cpu_codegen(
@@ -513,6 +549,25 @@ impl PassManager {
                     pass: "cpu codegen".to_string(),
                     error: format!("{}", e),
                 })?,
+                Device::CUDA => gpu_codegen(
+                    &module.functions[idx],
+                    &module.types,
+                    &module.constants,
+                    &module.dynamic_constants,
+                    &typing[idx],
+                    &control_subgraphs[idx],
+                    &bbs[idx],
+                    &collection_objects[&FunctionID::new(idx)],
+                    &def_uses[idx],
+                    &fork_join_maps[idx],
+                    &fork_control_maps[idx],
+                    &fork_trees[idx],
+                    &mut cuda_ir,
+                )
+                .map_err(|e| SchedulerError::PassError {
+                    pass: "cuda codegen".to_string(),
+                    error: format!("{}", e),
+                })?,
                 Device::AsyncRust => rt_codegen(
                     FunctionID::new(idx),
                     &module,
@@ -528,41 +583,76 @@ impl PassManager {
                     pass: "rust codegen".to_string(),
                     error: format!("{}", e),
                 })?,
-                _ => todo!(),
             }
         }
         println!("{}", llvm_ir);
+        println!("{}", cuda_ir);
         println!("{}", rust_rt);
 
+        let output_archive = format!("{}/lib{}.a", output_dir, module_name);
+        println!("{}", output_archive);
+
         // Write the LLVM IR into a temporary file.
         let tmp_dir = TempDir::new().unwrap();
-        let mut tmp_path = tmp_dir.path().to_path_buf();
-        tmp_path.push(format!("{}.ll", module_name));
-        println!("{}", tmp_path.display());
-        let mut file = File::create(&tmp_path).expect("PANIC: Unable to open output LLVM IR file.");
+        let mut llvm_path = tmp_dir.path().to_path_buf();
+        llvm_path.push(format!("{}.ll", module_name));
+        println!("{}", llvm_path.display());
+        let mut file = File::create(&llvm_path)
+            .expect("PANIC: Unable to open output LLVM IR file.");
         file.write_all(llvm_ir.as_bytes())
             .expect("PANIC: Unable to write output LLVM IR file contents.");
 
         // Compile LLVM IR into an ELF object file.
-        let output_archive = format!("{}/lib{}.a", output_dir, module_name);
+        let llvm_object = format!("{}/{}_cpu.o", tmp_dir.path().to_str().unwrap(), module_name);
         let mut clang_process = Command::new("clang")
-            .arg(&tmp_path)
-            .arg("--emit-static-lib")
+            .arg(&llvm_path)
+            .arg("-c")
             .arg("-O3")
             .arg("-march=native")
             .arg("-o")
-            .arg(&output_archive)
+            .arg(&llvm_object)
             .stdin(Stdio::piped())
             .stdout(Stdio::piped())
             .spawn()
             .expect("Error running clang. Is it installed?");
         assert!(clang_process.wait().unwrap().success());
 
+        let mut ar_args = vec!["crus", &output_archive, &llvm_object];
+
+        let cuda_object = format!("{}/{}_cuda.o", tmp_dir.path().to_str().unwrap(), module_name);
+        if cfg!(feature = "cuda") {
+            // Write the CUDA IR into a temporary file.
+            let mut cuda_path = tmp_dir.path().to_path_buf();
+            cuda_path.push(format!("{}.cu", module_name));
+            let mut file = File::create(&cuda_path)
+                .expect("PANIC: Unable to open output CUDA IR file.");
+            file.write_all(cuda_ir.as_bytes())
+                .expect("PANIC: Unable to write output CUDA IR file contents.");
+
+            let mut nvcc_process = Command::new("nvcc")
+                .arg("-c")
+                .arg("-O3")
+                .arg("-o")
+                .arg(&cuda_object)
+                .arg(&cuda_path)
+                .spawn()
+                .expect("Error running nvcc. Is it installed?");
+            assert!(nvcc_process.wait().unwrap().success());
+
+            ar_args.push(&cuda_object);
+        }
+
+        let mut ar_process = Command::new("ar")
+            .args(&ar_args)
+            .spawn()
+            .expect("Error running ar. Is it installed?");
+        assert!(ar_process.wait().unwrap().success());
+
         // Write the Rust runtime into a file.
         let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name);
         println!("{}", output_rt);
-        let mut file =
-            File::create(&output_rt).expect("PANIC: Unable to open output Rust runtime file.");
+        let mut file = File::create(&output_rt)
+            .expect("PANIC: Unable to open output Rust runtime file.");
         file.write_all(rust_rt.as_bytes())
             .expect("PANIC: Unable to write output Rust runtime file contents.");