diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index f51479a73fd1e14334e0fdbe136c722e8538ad7a..a3a46d93943d10bbebd125df93f95685c3ad5ac0 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -555,7 +555,7 @@ int main() {{ } else { let param_size = self.get_size(*ty, None, None); write!(w, "\t{} p{};\n", param_type, idx); - write!(w, "\tif (cudaMalloc(&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?; + write!(w, "\tif (cudaMalloc((void**)&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?; write!(w, "\t\tprintf(\"Error allocating memory for parameter %d\\n\", {});\n", idx)?; write!(w, "\t\treturn -1;\n"); write!(w, "\t}}\n"); @@ -564,15 +564,14 @@ int main() {{ } // Pull primitive return to a pointer parameter if self.types[self.return_type_id.idx()].is_primitive() { - write!(pass_args, ", ")?; let ret_type_no_pnt = self.get_type(*self.return_type_id, false); let ret_type = self.get_type(*self.return_type_id, true); write!(w, "\t{} ret;\n", ret_type)?; - write!(w, "\tif (cudaMalloc(&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?; + write!(w, "\tif (cudaMalloc((void**)&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?; write!(w, "\t\tprintf(\"Error allocating memory for return value\\n\");\n")?; write!(w, "\t\treturn -1;\n")?; write!(w, "\t}}\n"); - write!(pass_args, "ret")?; + write!(pass_args, ", ret")?; } write!(w, "\t{}<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args); write!(w, "\tbool skip = false;\n")?; @@ -596,8 +595,10 @@ int main() {{ } else { + let ret_primitive = self.types[self.return_type_id.idx()].is_primitive(); + let ret_type = self.get_type(*self.return_type_id, false); write!(w, " -extern \"C\" int {}(", self.function.name)?; +extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_string() }, self.function.name)?; // The first set of parameters are dynamic constants. let mut first_param = true; for idx in 0..self.function.num_dynamic_constants { @@ -622,20 +623,23 @@ extern \"C\" int {}(", self.function.name)?; write!(w, "{} p{}", param_type, idx)?; write!(pass_args, "p{}", idx)?; } - // Pull primitive return to a pointer parameter - if self.types[self.return_type_id.idx()].is_primitive() { - write!(w, ", ")?; - write!(pass_args, ", ")?; - let ret_type = self.get_type(*self.return_type_id, true); - write!(w, "{} ret", ret_type)?; - write!(pass_args, "ret")?; - } write!(w, ") {{\n")?; + // Pull primitive return as pointer parameter for kernel + if ret_primitive { + let ret_type_pnt = self.get_type(*self.return_type_id, true); + write!(w, "\t{} ret;\n", ret_type_pnt)?; + write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; + write!(pass_args, ", ret")?; + } write!(w, "\t{}<<<{}_gpu, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; write!(w, "\tcudaDeviceSynchronize();\n")?; + if ret_primitive { + write!(w, "\t{} host_ret;\n", ret_type)?; + write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; + write!(w, "\treturn host_ret;\n")?; + } } - write!(w, "\treturn 0;\n")?; write!(w, "}}\n")?; Ok(()) }