Skip to content
Snippets Groups Projects

More optimizations

Merged rarbore2 requested to merge more_opt4 into main
11 files
+ 126
229
Compare changes
  • Side-by-side
  • Inline
Files
11
+ 28
7
@@ -354,7 +354,6 @@ impl GPUContext<'_> {
@@ -354,7 +354,6 @@ impl GPUContext<'_> {
write!(
write!(
w,
w,
"
"
#define _CG_ABI_EXPERIMENTAL
#include <assert.h>
#include <assert.h>
#include <stdio.h>
#include <stdio.h>
#include <stddef.h>
#include <stddef.h>
@@ -362,8 +361,23 @@ impl GPUContext<'_> {
@@ -362,8 +361,23 @@ impl GPUContext<'_> {
#include <cuda_runtime.h>
#include <cuda_runtime.h>
#include <math_constants.h>
#include <math_constants.h>
#include <mma.h>
#include <mma.h>
 
 
#if (CUDA_VERSION >= 12000)
 
#else
 
#define _CG_ABI_EXPERIMENTAL
 
#endif
 
#include <cooperative_groups.h>
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cooperative_groups/reduce.h>
 
 
#if (CUDA_VERSION >= 12000)
 
namespace cg = cooperative_groups;
 
namespace cge = cooperative_groups;
 
#else
 
namespace cg = cooperative_groups;
 
namespace cge = cooperative_groups::experimental;
 
#endif
 
#include <cuda_bf16.h>
#include <cuda_bf16.h>
namespace cg = cooperative_groups;
namespace cg = cooperative_groups;
@@ -562,9 +576,15 @@ namespace cg = cooperative_groups;
@@ -562,9 +576,15 @@ namespace cg = cooperative_groups;
* and writes.
* and writes.
*/
*/
fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?;
write!(
 
w,
 
"\t__shared__ cge::block_tile_memory<1024> block_sync_shared;\n"
 
)?;
write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?;
write!(
 
w,
 
"\tcg::thread_block block = cge::this_thread_block(block_sync_shared);\n"
 
)?;
Ok(())
Ok(())
}
}
@@ -1709,24 +1729,25 @@ namespace cg = cooperative_groups;
@@ -1709,24 +1729,25 @@ namespace cg = cooperative_groups;
};
};
write!(
write!(
thread_block_tiles,
thread_block_tiles,
"\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
"\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
use_thread_per_id, cg_tile, use_thread_per_id
use_thread_per_id, cg_tile, use_thread_per_id
)?;
)?;
let cg_tile_use = self.get_cg_tile(id, CGType::Use);
let cg_tile_use = self.get_cg_tile(id, CGType::Use);
write!(
write!(
thread_block_tiles,
thread_block_tiles,
"\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
"\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
use_thread_quota, cg_tile_use, use_thread_quota
use_thread_quota, cg_tile_use, use_thread_quota
)?;
)?;
let available_thread_quota = available_thread_quota.unwrap();
let available_thread_quota = available_thread_quota.unwrap();
let cg_tile_available = self.get_cg_tile(id, CGType::Available);
let cg_tile_available = self.get_cg_tile(id, CGType::Available);
write!(
write!(
thread_block_tiles,
thread_block_tiles,
"\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
"\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
available_thread_quota, cg_tile_available, available_thread_quota
available_thread_quota, cg_tile_available, available_thread_quota
)?;
)?;
if parallel_factor.is_none() {
if parallel_factor.is_none() {
write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, true))?;
write!(thread_block_tiles, "\t{};\n", self.get_fork_iter(id, true))?;
 
write!(w_init, "\t{} = 0;\n", self.get_fork_iter(id, false))?;
write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?;
write!(w_init, "\tgoto {};\n", self.get_block_name(id, true))?;
}
}
}
}
Loading