diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a6711a33ab0325afb84362e8f3e7b3827efb0db7..074281f8231739665f224e0fbd93f1cb337b0bc3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -534,6 +534,8 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(pass_args, "p{}", idx)?; } write!(w, ") {{\n")?; + // For case of dynamic block count + self.codegen_dynamic_constants(w)?; let has_ret_var = self.return_parameter.is_none(); if has_ret_var { // Allocate return parameter and lift to kernel argument @@ -584,7 +586,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; self.collection_objects.origin(*object).try_parameter().is_some() }) }), "All collection reduces in block fork must originate from parameters"); - if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) + if true || self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * "); (root_forks, fork_size, true) @@ -973,7 +975,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; reduct: _, } => { let init_val = self.get_value(*init, false, false); - if parallel_factor.is_none() { + if parallel_factor.is_none() && KernelState::InThread == state { let Some(nesting_fork) = nesting_fork else { panic!("Expected reduce to be nested in a fork node"); }; diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index 452c1995ee313e77d787ba7b769788c6e67b1271..d01fef736f7fcacc5d74eb06a11f2d5cb5a7a3f2 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -183,6 +183,8 @@ struct PassManager { pub postdoms: Option<Vec<DomTree>>, pub fork_join_maps: Option<Vec<HashMap<NodeID, NodeID>>>, pub fork_join_nests: Option<Vec<HashMap<NodeID, Vec<NodeID>>>>, + pub fork_control_maps: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, + pub fork_trees: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub loops: Option<Vec<LoopTree>>, pub reduce_cycles: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, pub data_nodes_in_fork_joins: Option<Vec<HashMap<NodeID, HashSet<NodeID>>>>, @@ -214,6 +216,8 @@ impl PassManager { postdoms: None, fork_join_maps: None, fork_join_nests: None, + fork_control_maps: None, + fork_trees: None, loops: None, reduce_cycles: None, data_nodes_in_fork_joins: None, @@ -332,6 +336,29 @@ impl PassManager { } } + pub fn make_fork_control_maps(&mut self) { + if self.fork_control_maps.is_none() { + self.make_fork_join_nests(); + self.fork_control_maps = Some( + self.fork_join_nests.as_ref().unwrap().iter().map(fork_control_map).collect(), + ); + } + } + + pub fn make_fork_trees(&mut self) { + if self.fork_trees.is_none() { + self.make_fork_join_nests(); + self.fork_trees = Some( + zip( + self.functions.iter(), + self.fork_join_nests.as_ref().unwrap().iter(), + ) + .map(|(function, fork_join_nesting)| fork_tree(function, fork_join_nesting)) + .collect(), + ); + } + } + pub fn make_loops(&mut self) { if self.loops.is_none() { self.make_control_subgraphs(); @@ -464,6 +491,10 @@ impl PassManager { self.make_control_subgraphs(); self.make_collection_objects(); self.make_callgraph(); + self.make_def_uses(); + self.make_fork_join_maps(); + self.make_fork_control_maps(); + self.make_fork_trees(); let PassManager { functions, @@ -476,6 +507,10 @@ impl PassManager { bbs: Some(bbs), collection_objects: Some(collection_objects), callgraph: Some(callgraph), + def_uses: Some(def_uses), + fork_join_maps: Some(fork_join_maps), + fork_control_maps: Some(fork_control_maps), + fork_trees: Some(fork_trees), .. } = self else { @@ -497,6 +532,7 @@ impl PassManager { let mut rust_rt = String::new(); let mut llvm_ir = String::new(); + let mut cuda_ir = String::new(); for idx in 0..module.functions.len() { match devices[idx] { Device::LLVM => cpu_codegen( @@ -513,6 +549,25 @@ impl PassManager { pass: "cpu codegen".to_string(), error: format!("{}", e), })?, + Device::CUDA => gpu_codegen( + &module.functions[idx], + &module.types, + &module.constants, + &module.dynamic_constants, + &typing[idx], + &control_subgraphs[idx], + &bbs[idx], + &collection_objects[&FunctionID::new(idx)], + &def_uses[idx], + &fork_join_maps[idx], + &fork_control_maps[idx], + &fork_trees[idx], + &mut cuda_ir, + ) + .map_err(|e| SchedulerError::PassError { + pass: "cuda codegen".to_string(), + error: format!("{}", e), + })?, Device::AsyncRust => rt_codegen( FunctionID::new(idx), &module, @@ -528,41 +583,76 @@ impl PassManager { pass: "rust codegen".to_string(), error: format!("{}", e), })?, - _ => todo!(), } } println!("{}", llvm_ir); + println!("{}", cuda_ir); println!("{}", rust_rt); + let output_archive = format!("{}/lib{}.a", output_dir, module_name); + println!("{}", output_archive); + // Write the LLVM IR into a temporary file. let tmp_dir = TempDir::new().unwrap(); - let mut tmp_path = tmp_dir.path().to_path_buf(); - tmp_path.push(format!("{}.ll", module_name)); - println!("{}", tmp_path.display()); - let mut file = File::create(&tmp_path).expect("PANIC: Unable to open output LLVM IR file."); + let mut llvm_path = tmp_dir.path().to_path_buf(); + llvm_path.push(format!("{}.ll", module_name)); + println!("{}", llvm_path.display()); + let mut file = File::create(&llvm_path) + .expect("PANIC: Unable to open output LLVM IR file."); file.write_all(llvm_ir.as_bytes()) .expect("PANIC: Unable to write output LLVM IR file contents."); // Compile LLVM IR into an ELF object file. - let output_archive = format!("{}/lib{}.a", output_dir, module_name); + let llvm_object = format!("{}/{}_cpu.o", tmp_dir.path().to_str().unwrap(), module_name); let mut clang_process = Command::new("clang") - .arg(&tmp_path) - .arg("--emit-static-lib") + .arg(&llvm_path) + .arg("-c") .arg("-O3") .arg("-march=native") .arg("-o") - .arg(&output_archive) + .arg(&llvm_object) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .spawn() .expect("Error running clang. Is it installed?"); assert!(clang_process.wait().unwrap().success()); + let mut ar_args = vec!["crus", &output_archive, &llvm_object]; + + let cuda_object = format!("{}/{}_cuda.o", tmp_dir.path().to_str().unwrap(), module_name); + if cfg!(feature = "cuda") { + // Write the CUDA IR into a temporary file. + let mut cuda_path = tmp_dir.path().to_path_buf(); + cuda_path.push(format!("{}.cu", module_name)); + let mut file = File::create(&cuda_path) + .expect("PANIC: Unable to open output CUDA IR file."); + file.write_all(cuda_ir.as_bytes()) + .expect("PANIC: Unable to write output CUDA IR file contents."); + + let mut nvcc_process = Command::new("nvcc") + .arg("-c") + .arg("-O3") + .arg("-o") + .arg(&cuda_object) + .arg(&cuda_path) + .spawn() + .expect("Error running nvcc. Is it installed?"); + assert!(nvcc_process.wait().unwrap().success()); + + ar_args.push(&cuda_object); + } + + let mut ar_process = Command::new("ar") + .args(&ar_args) + .spawn() + .expect("Error running ar. Is it installed?"); + assert!(ar_process.wait().unwrap().success()); + // Write the Rust runtime into a file. let output_rt = format!("{}/rt_{}.hrt", output_dir, module_name); println!("{}", output_rt); - let mut file = - File::create(&output_rt).expect("PANIC: Unable to open output Rust runtime file."); + let mut file = File::create(&output_rt) + .expect("PANIC: Unable to open output Rust runtime file."); file.write_all(rust_rt.as_bytes()) .expect("PANIC: Unable to write output Rust runtime file contents.");