
Large benches

Merged rarbore2 requested to merge large-benches into main
2 files changed: +12 / -8
@@ -354,6 +354,7 @@ impl GPUContext<'_> {
         write!(
             w,
             "
+#define _CG_ABI_EXPERIMENTAL
 #include <assert.h>
 #include <stdio.h>
 #include <stddef.h>
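Note on the added define: `_CG_ABI_EXPERIMENTAL` must appear before the cooperative-groups header, or the `cg::experimental::` names used later in this diff are never declared (CUDA 11.x experimental ABI). A minimal sketch of the resulting preamble ordering, with only the lines relevant here:

#define _CG_ABI_EXPERIMENTAL    // must precede the include below
#include <cooperative_groups.h>
namespace cg = cooperative_groups;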
@@ -1294,7 +1295,7 @@ namespace cg = cooperative_groups;
         }
         if !is_primitive && state != KernelState::OutBlock {
             write!(w, "{}}}\n", tabs)?;
-            write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+            //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
             *num_tabs -= 1;
         }
         if !is_primitive && state == KernelState::OutBlock {
@@ -1311,6 +1312,7 @@ namespace cg = cooperative_groups;
         }
         if !is_primitive
             && (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false))
+            && !self.function.schedules[id.idx()].contains(&Schedule::NoResetConstant)
         {
             let data_size = self.get_size(self.typing[id.idx()], None);
             write!(
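For context, the guarded block zeroes a constant's backing memory before the kernel writes into it; the new `Schedule::NoResetConstant` check lets a schedule opt a constant out of that reset. A hedged sketch of the CUDA shape this block emits when the guard passes (the helper name, `rank`/`width` parameters, and `cst` are illustrative, not the generator's actual output):

// Each of `width` cooperating threads strides over the constant's bytes and
// zeroes its share, then all threads meet at a barrier before anyone reads.
__device__ void reset_constant(unsigned rank, unsigned width,
                               char *cst, size_t data_size) {
    for (size_t i = rank; i < data_size; i += width) {
        cst[i] = 0;
    }
    __syncthreads();
}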
@@ -1320,7 +1322,8 @@ namespace cg = cooperative_groups;
             )?;
             write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
             write!(w, "{}}}\n", tabs)?;
-            write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+            //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+            write!(w, "__syncthreads();\n")?;
         }
     }
     // Dynamic constants emitted at top
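The swap from `cg_tile.sync()` to `__syncthreads()` widens the barrier from the tile partition to the whole thread block. That over-synchronizes but stays correct, and it avoids the experimental tile's sync path. A small self-contained sketch of the two scopes:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void barrier_scope_demo(int *buf) {
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
    buf[block.thread_rank()] = (int)tile.thread_rank();
    tile.sync();      // waits only on the 32 threads of this tile
    __syncthreads();  // waits on every thread in the block
}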
@@ -1595,7 +1598,7 @@ namespace cg = cooperative_groups;
             write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?;
             write!(w, "{}}}\n", tabs)?;
         }
-        write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+        //write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
         let collect_variable = self.get_value(*collect, false, false);
         write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?;
     }
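The long `write!` above emits the tail of a tile-strided copy: after the full strides, the leading threads each move one leftover element at offset `cg_tile.size() * (data_size / cg_tile.size()) + thread_rank()`. A hedged sketch of the whole pattern in plain CUDA (the helper name and signature are illustrative):

// Tile-strided copy: each of `width` cooperating threads copies every
// width-th byte, then the leading threads handle the remainder, matching
// the offset arithmetic in the emitted line above.
__device__ void tile_strided_copy(unsigned rank, unsigned width,
                                  char *dst, const char *src, size_t n) {
    size_t full = width * (n / width);
    for (size_t i = rank; i < full; i += width) {
        dst[i] = src[i];
    }
    if (full + rank < n) {
        dst[full + rank] = src[full + rank];
    }
}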
@@ -1705,20 +1708,20 @@ namespace cg = cooperative_groups;
         };
         write!(
             thread_block_tiles,
-            "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
+            "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
             use_thread_per_id, cg_tile, use_thread_per_id
         )?;
         let cg_tile_use = self.get_cg_tile(id, CGType::Use);
         write!(
             thread_block_tiles,
-            "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
+            "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
             use_thread_quota, cg_tile_use, use_thread_quota
         )?;
         let available_thread_quota = available_thread_quota.unwrap();
         let cg_tile_available = self.get_cg_tile(id, CGType::Available);
         write!(
             thread_block_tiles,
-            "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
+            "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
             available_thread_quota, cg_tile_available, available_thread_quota
         )?;
         if parallel_factor.is_none() {
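These three partitions move to `cg::experimental::tiled_partition` because the stable `cg::tiled_partition<N>` rejects tile sizes above the warp width at compile time, and the thread quotas here can exceed 32. A sketch of a wide tile under the CUDA 11 experimental ABI; the `block_tile_memory` scratch and its `<4, 256>` template arguments follow the toolkit docs' example and are assumptions, not this MR's generated code:

#define _CG_ABI_EXPERIMENTAL
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void wide_tile_demo() {
    // Experimental wide tiles need shared scratch for their communication.
    __shared__ cg::experimental::block_tile_memory<4, 256> scratch;
    cg::thread_block block = cg::experimental::this_thread_block(scratch);
    auto tile = cg::experimental::tiled_partition<64>(block);
    tile.sync();  // barrier over all 64 threads of the wide tile
}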
@@ -1780,7 +1783,8 @@ namespace cg = cooperative_groups;
         }
         let fork = self.join_fork_map.get(&id).unwrap();
         let cg_tile_available = self.get_cg_tile(*fork, CGType::Available);
-        write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
+        //write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
+        write!(w_term, "\t__syncthreads();\n")?;
     }
     // If the Fork was parallelized, each thread or UsedPerId tile of
     // threads only runs one ThreadID, so we can jump straight to the