fix gpu backend to emit namespaces properly across cuda versions

54acf3e2 · Russel Arbore · 94950efe · 54acf3e2
Commit 54acf3e2 authored 3 weeks ago by Russel Arbore
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -354,7 +354,6 @@ impl GPUContext<'_> {
        write!(
            w,
            "
-#define _CG_ABI_EXPERIMENTAL
 #include <assert.h>
 #include <stdio.h>
 #include <stddef.h>
@@ -362,8 +361,23 @@ impl GPUContext<'_> {
 #include <cuda_runtime.h>
 #include <math_constants.h>
 #include <mma.h>
+#if (CUDA_VERSION >= 12000)
+#else
+#define _CG_ABI_EXPERIMENTAL
+#endif
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+#if (CUDA_VERSION >= 12000)
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups;
+#else
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups::experimental;
+#endif
 #include <cuda_bf16.h>
 namespace cg = cooperative_groups;
@@ -564,12 +578,12 @@ namespace cg = cooperative_groups;
    fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
        write!(
            w,
-            "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n"
+            "\t__shared__ cge::block_tile_memory<1024> block_sync_shared;\n"
        )?;
        write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
        write!(
            w,
-            "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n"
+            "\tcg::thread_block block = cge::this_thread_block(block_sync_shared);\n"
        )?;
        Ok(())
    }
@@ -1715,20 +1729,20 @@ namespace cg = cooperative_groups;
                    };
                    write!(
                        thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                        use_thread_per_id, cg_tile, use_thread_per_id
                    )?;
                    let cg_tile_use = self.get_cg_tile(id, CGType::Use);
                    write!(
                        thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                        use_thread_quota, cg_tile_use, use_thread_quota
                    )?;
                    let available_thread_quota = available_thread_quota.unwrap();
                    let cg_tile_available = self.get_cg_tile(id, CGType::Available);
                    write!(
                        thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                        available_thread_quota, cg_tile_available, available_thread_quota
                    )?;
                    if parallel_factor.is_none() {