diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index afc016a48a39a091cf6638ade071d40cbb901afa..8f186aa7e4ff6f0a7436d94604b127ff2f1f3ea2 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -622,23 +622,23 @@ extern \"C\" {} {}(",
             write!(pass_args, "ret")?;
             write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
         }
-        write!(w, "\tcudaError_t err;\n");
+        write!(w, "\tcudaError_t err;\n")?;
         write!(
             w,
             "\t{}_gpu<<<{}, {}, {}>>>({});\n",
             self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args
         )?;
-        write!(w, "\terr = cudaGetLastError();\n");
+        write!(w, "\terr = cudaGetLastError();\n")?;
         write!(
             w,
             "\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n"
-        );
+        )?;
         write!(w, "\tcudaDeviceSynchronize();\n")?;
-        write!(w, "\terr = cudaGetLastError();\n");
+        write!(w, "\terr = cudaGetLastError();\n")?;
         write!(
             w,
             "\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n"
-        );
+        )?;
         if has_ret_var {
             // Copy return from device to host, whether it's primitive value or collection pointer
             write!(w, "\t{} host_ret;\n", ret_type)?;
@@ -1150,7 +1150,8 @@ extern \"C\" {} {}(",
             // for all threads. Otherwise, it can be inside or outside block fork.
             // If inside, it's stored in shared memory so we "allocate" it once
             // and parallelize memset to 0. If outside, we initialize as offset
-            // to backing, but if multi-block grid, don't memset to avoid grid-level sync.
+            // to backing, but if multi-block grid, don't memset to avoid grid-
+            // level sync.
             Node::Constant { id: cons_id } => {
                 let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
                 let cg_tile = match state {
@@ -1192,9 +1193,7 @@ extern \"C\" {} {}(",
                 )?;
             }
             if !is_primitive
-                && (state != KernelState::OutBlock
-                    || is_block_parallel.is_none()
-                    || !is_block_parallel.unwrap())
+                && (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false))
             {
                 let data_size =
                     self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects));
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index b13c919abe4f7f99874185f110e7a145c9327d5e..65f7c2d06c1ace23e9d698387e714ade24f92c59 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -90,6 +90,7 @@ pub fn gcm(
         loops,
         fork_join_map,
         objects,
+        devices,
     );
 
     let liveness = liveness_dataflow(
@@ -174,6 +175,7 @@ fn basic_blocks(
     loops: &LoopTree,
     fork_join_map: &HashMap<NodeID, NodeID>,
     objects: &CollectionObjects,
+    devices: &Vec<Device>,
 ) -> BasicBlocks {
     let mut bbs: Vec<Option<NodeID>> = vec![None; function.nodes.len()];
 
@@ -421,9 +423,18 @@ fn basic_blocks(
             // If the next node further up the dominator tree is in a shallower
             // loop nest or if we can get out of a reduce loop when we don't
             // need to be in one, place this data node in a higher-up location.
-            // Only do this is the node isn't a constant or undef.
+            // Only do this if the node isn't a constant or undef - if a
+            // node is a constant or undef, we want its placement to be as
+            // control dependent as possible, even inside loops. In GPU
+            // functions specifically, lift constants that may be returned
+            // outside fork-joins.
             let is_constant_or_undef = function.nodes[id.idx()].is_constant()
                 || function.nodes[id.idx()].is_undef();
+            let is_gpu_returned = devices[func_id.idx()] == Device::CUDA
+                && objects[&func_id]
+                    .objects(id)
+                    .into_iter()
+                    .any(|obj| objects[&func_id].returned_objects().contains(obj));
             let old_nest = loops
                 .header_of(location)
                 .map(|header| loops.nesting(header).unwrap());
@@ -444,7 +455,10 @@ fn basic_blocks(
             // loop use the reduce node forming the loop, so the dominator chain
             // will consist of one block, and this loop won't ever iterate.
             let currently_at_join = function.nodes[location.idx()].is_join();
-            if !is_constant_or_undef && (shallower_nest || currently_at_join) {
+
+            if (!is_constant_or_undef || is_gpu_returned)
+                && (shallower_nest || currently_at_join)
+            {
                 location = control_node;
             }
         }
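Note on the gpu.rs change: besides propagating the previously dropped `write!` results with `?` (the `Result` was silently ignored, which rustc also warns about), the condition rewrite is a pure simplification. For an `Option<bool>`, `opt.is_none() || !opt.unwrap()` agrees with `!opt.unwrap_or(false)` on all three possible values. A standalone check (`old_form`/`new_form` are illustrative names, not from the codebase):

    fn old_form(is_block_parallel: Option<bool>) -> bool {
        is_block_parallel.is_none() || !is_block_parallel.unwrap()
    }

    fn new_form(is_block_parallel: Option<bool>) -> bool {
        !is_block_parallel.unwrap_or(false)
    }

    fn main() {
        // Both forms map None => true, Some(false) => true, Some(true) => false.
        for opt in [None, Some(false), Some(true)] {
            assert_eq!(old_form(opt), new_form(opt));
        }
    }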
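Note on the gcm.rs change: constants and undefs are normally kept as control dependent as possible, but a constant collection that a CUDA function may return has to be lifted out of fork-joins so it is materialized outside the parallel region. A minimal sketch of the resulting placement predicate, using hypothetical stand-in types for the device list and the collection-objects analysis (the real `CollectionObjects` API differs):

    #[derive(Clone, Copy, PartialEq)]
    enum Device {
        LLVM,
        CUDA,
    }

    /// Hypothetical stand-in for the per-function collection-objects analysis.
    struct FunctionObjects {
        objects_per_node: Vec<Vec<usize>>, // objects each node may evaluate to
        returned: Vec<usize>,              // objects the function may return
    }

    fn lift_to_higher_block(
        device: Device,
        objs: &FunctionObjects,
        node: usize,
        is_constant_or_undef: bool,
        shallower_nest: bool,
        currently_at_join: bool,
    ) -> bool {
        // GPU-returned objects are the exception: they move up even when
        // the node is a constant that would otherwise stay put.
        let is_gpu_returned = device == Device::CUDA
            && objs.objects_per_node[node]
                .iter()
                .any(|obj| objs.returned.contains(obj));
        (!is_constant_or_undef || is_gpu_returned) && (shallower_nest || currently_at_join)
    }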