Skip to content
Snippets Groups Projects

More optimizations

Merged rarbore2 requested to merge more_opt3 into main
7 files
+72
−59
Compare changes
  • Side-by-side
  • Inline
Files
7
+11
−6
@@ -354,6 +354,7 @@ impl GPUContext<'_> {
write!(
w,
"
#define _CG_ABI_EXPERIMENTAL
#include <assert.h>
#include <stdio.h>
#include <stddef.h>
@@ -561,8 +562,9 @@ namespace cg = cooperative_groups;
* and writes.
*/
fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
write!(w, "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n")?;
write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
write!(w, "\tcg::thread_block block = cg::this_thread_block();\n")?;
write!(w, "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n")?;
Ok(())
}
@@ -1294,7 +1296,7 @@ namespace cg = cooperative_groups;
}
if !is_primitive && state != KernelState::OutBlock {
write!(w, "{}}}\n", tabs)?;
write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
//write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
*num_tabs -= 1;
}
if !is_primitive && state == KernelState::OutBlock {
@@ -1311,6 +1313,7 @@ namespace cg = cooperative_groups;
}
if !is_primitive
&& (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false))
&& !self.function.schedules[id.idx()].contains(&Schedule::NoResetConstant)
{
let data_size = self.get_size(self.typing[id.idx()], None);
write!(
@@ -1321,6 +1324,7 @@ namespace cg = cooperative_groups;
write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
write!(w, "{}}}\n", tabs)?;
write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
//write!(w, "__syncthreads\n")?;
}
}
// Dynamic constants emitted at top
@@ -1595,7 +1599,7 @@ namespace cg = cooperative_groups;
write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?;
write!(w, "{}}}\n", tabs)?;
}
write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
//write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
let collect_variable = self.get_value(*collect, false, false);
write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?;
}
@@ -1705,20 +1709,20 @@ namespace cg = cooperative_groups;
};
write!(
thread_block_tiles,
"\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
"\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
use_thread_per_id, cg_tile, use_thread_per_id
)?;
let cg_tile_use = self.get_cg_tile(id, CGType::Use);
write!(
thread_block_tiles,
"\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
"\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
use_thread_quota, cg_tile_use, use_thread_quota
)?;
let available_thread_quota = available_thread_quota.unwrap();
let cg_tile_available = self.get_cg_tile(id, CGType::Available);
write!(
thread_block_tiles,
"\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
"\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
available_thread_quota, cg_tile_available, available_thread_quota
)?;
if parallel_factor.is_none() {
@@ -1781,6 +1785,7 @@ namespace cg = cooperative_groups;
let fork = self.join_fork_map.get(&id).unwrap();
let cg_tile_available = self.get_cg_tile(*fork, CGType::Available);
write!(w_term, "\t{}.sync();\n", cg_tile_available)?;
//write!(w_term, "\t__syncthreads;\n")?;
}
// If the Fork was parallelized, each thread or UsedPerId tile of
// threads only runs one ThreadID, so we can jump straight to the
Loading