Skip to content
Snippets Groups Projects

Forkify fixes

Merged Xavier Routh requested to merge forkify-fixes into main
1 unresolved thread
7 files
+ 66
− 14
Compare changes
  • Side-by-side
  • Inline
Files
7
+ 8
− 9
@@ -622,23 +622,23 @@ extern \"C\" {} {}(",
@@ -622,23 +622,23 @@ extern \"C\" {} {}(",
write!(pass_args, "ret")?;
write!(pass_args, "ret")?;
write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
}
}
write!(w, "\tcudaError_t err;\n");
write!(w, "\tcudaError_t err;\n")?;
write!(
write!(
w,
w,
"\t{}_gpu<<<{}, {}, {}>>>({});\n",
"\t{}_gpu<<<{}, {}, {}>>>({});\n",
self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args
self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args
)?;
)?;
write!(w, "\terr = cudaGetLastError();\n");
write!(w, "\terr = cudaGetLastError();\n")?;
write!(
write!(
w,
w,
"\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n"
"\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n"
);
)?;
write!(w, "\tcudaDeviceSynchronize();\n")?;
write!(w, "\tcudaDeviceSynchronize();\n")?;
write!(w, "\terr = cudaGetLastError();\n");
write!(w, "\terr = cudaGetLastError();\n")?;
write!(
write!(
w,
w,
"\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n"
"\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n"
);
)?;
if has_ret_var {
if has_ret_var {
// Copy return from device to host, whether it's primitive value or collection pointer
// Copy return from device to host, whether it's primitive value or collection pointer
write!(w, "\t{} host_ret;\n", ret_type)?;
write!(w, "\t{} host_ret;\n", ret_type)?;
@@ -1150,7 +1150,8 @@ extern \"C\" {} {}(",
@@ -1150,7 +1150,8 @@ extern \"C\" {} {}(",
// for all threads. Otherwise, it can be inside or outside block fork.
// for all threads. Otherwise, it can be inside or outside block fork.
// If inside, it's stored in shared memory so we "allocate" it once
// If inside, it's stored in shared memory so we "allocate" it once
// and parallelize memset to 0. If outside, we initialize as offset
// and parallelize memset to 0. If outside, we initialize as offset
// to backing, but if multi-block grid, don't memset to avoid grid-level sync.
// to backing, but if multi-block grid, don't memset to avoid grid-
// level sync.
Node::Constant { id: cons_id } => {
Node::Constant { id: cons_id } => {
let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
let cg_tile = match state {
let cg_tile = match state {
@@ -1192,9 +1193,7 @@ extern \"C\" {} {}(",
@@ -1192,9 +1193,7 @@ extern \"C\" {} {}(",
)?;
)?;
}
}
if !is_primitive
if !is_primitive
&& (state != KernelState::OutBlock
&& (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false))
|| is_block_parallel.is_none()
|| !is_block_parallel.unwrap())
{
{
let data_size =
let data_size =
self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects));
self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects));
Loading