diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index dd87acbe18de297557fc8b97696928946e90c2e2..4069cb02bcf1ed0a93f5c40df432253e842ead9a 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -354,7 +354,6 @@ impl GPUContext<'_> { write!( w, " -#define _CG_ABI_EXPERIMENTAL #include <assert.h> #include <stdio.h> #include <stddef.h> @@ -362,8 +361,23 @@ impl GPUContext<'_> { #include <cuda_runtime.h> #include <math_constants.h> #include <mma.h> + +#if (CUDA_VERSION >= 12000) +#else +#define _CG_ABI_EXPERIMENTAL +#endif + #include <cooperative_groups.h> #include <cooperative_groups/reduce.h> + +#if (CUDA_VERSION >= 12000) +namespace cg = cooperative_groups; +namespace cge = cooperative_groups; +#else +namespace cg = cooperative_groups; +namespace cge = cooperative_groups::experimental; +#endif + #include <cuda_bf16.h> namespace cg = cooperative_groups; @@ -564,12 +578,12 @@ namespace cg = cooperative_groups; fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> { write!( w, - "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n" + "\t__shared__ cge::block_tile_memory<1024> block_sync_shared;\n" )?; write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?; write!( w, - "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n" + "\tcg::thread_block block = cge::this_thread_block(block_sync_shared);\n" )?; Ok(()) } @@ -1715,20 +1729,20 @@ namespace cg = cooperative_groups; }; write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id )?; let cg_tile_use = self.get_cg_tile(id, CGType::Use); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota )?; let available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota )?; if parallel_factor.is_none() {