diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index dd87acbe18de297557fc8b97696928946e90c2e2..4069cb02bcf1ed0a93f5c40df432253e842ead9a 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -354,7 +354,6 @@ impl GPUContext<'_> {
         write!(
             w,
             "
-#define _CG_ABI_EXPERIMENTAL
 #include <assert.h>
 #include <stdio.h>
 #include <stddef.h>
@@ -362,8 +361,23 @@ impl GPUContext<'_> {
 #include <cuda_runtime.h>
 #include <math_constants.h>
 #include <mma.h>
+
+#if (CUDA_VERSION >= 12000)
+#else
+#define _CG_ABI_EXPERIMENTAL
+#endif
+
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+
+#if (CUDA_VERSION >= 12000)
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups;
+#else
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups::experimental;
+#endif
+
 #include <cuda_bf16.h>
 namespace cg = cooperative_groups;
 
@@ -564,12 +578,12 @@ namespace cg = cooperative_groups;
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
         write!(
             w,
-            "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n"
+            "\t__shared__ cge::block_tile_memory<1024> block_sync_shared;\n"
         )?;
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
         write!(
             w,
-            "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n"
+            "\tcg::thread_block block = cge::this_thread_block(block_sync_shared);\n"
         )?;
         Ok(())
     }
@@ -1715,20 +1729,20 @@ namespace cg = cooperative_groups;
                     };
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         use_thread_per_id, cg_tile, use_thread_per_id
                     )?;
                     let cg_tile_use = self.get_cg_tile(id, CGType::Use);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         use_thread_quota, cg_tile_use, use_thread_quota
                     )?;
                     let available_thread_quota = available_thread_quota.unwrap();
                     let cg_tile_available = self.get_cg_tile(id, CGType::Available);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
                     if parallel_factor.is_none() {