From 7b6ba729fdeaf8e76a2bb2aa2631f931a4525070 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Mon, 3 Mar 2025 13:25:13 -0600 Subject: [PATCH] more bfs opt --- hercules_cg/src/gpu.rs | 18 +++++++++++------- juno_samples/rodinia/bfs/src/gpu.sch | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 07dd3ebf..3a00e547 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -354,6 +354,7 @@ impl GPUContext<'_> { write!( w, " +#define _CG_ABI_EXPERIMENTAL #include <assert.h> #include <stdio.h> #include <stddef.h> @@ -1294,7 +1295,7 @@ namespace cg = cooperative_groups; } if !is_primitive && state != KernelState::OutBlock { write!(w, "{}}}\n", tabs)?; - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "{}{}.sync();\n", tabs, cg_tile)?; *num_tabs -= 1; } if !is_primitive && state == KernelState::OutBlock { @@ -1311,6 +1312,7 @@ namespace cg = cooperative_groups; } if !is_primitive && (state != KernelState::OutBlock || !is_block_parallel.unwrap_or(false)) + && !self.function.schedules[id.idx()].contains(&Schedule::NoResetConstant) { let data_size = self.get_size(self.typing[id.idx()], None); write!( @@ -1320,7 +1322,8 @@ namespace cg = cooperative_groups; )?; write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; write!(w, "{}}}\n", tabs)?; - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + write!(w, "__syncthreads\n")?; } } // Dynamic constants emitted at top @@ -1595,7 +1598,7 @@ namespace cg = cooperative_groups; write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, data_size, cg_tile, cg_tile, data_variable, cg_tile, data_size, cg_tile, cg_tile)?; write!(w, "{}}}\n", tabs)?; } - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + //write!(w, "{}{}.sync();\n", tabs, cg_tile)?; let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } @@ -1705,20 +1708,20 @@ namespace cg = cooperative_groups; }; write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", use_thread_per_id, cg_tile, use_thread_per_id )?; let cg_tile_use = self.get_cg_tile(id, CGType::Use); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", use_thread_quota, cg_tile_use, use_thread_quota )?; let available_thread_quota = available_thread_quota.unwrap(); let cg_tile_available = self.get_cg_tile(id, CGType::Available); write!( thread_block_tiles, - "\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n", + "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n", available_thread_quota, cg_tile_available, available_thread_quota )?; if parallel_factor.is_none() { @@ -1780,7 +1783,8 @@ namespace cg = cooperative_groups; } let fork = self.join_fork_map.get(&id).unwrap(); let cg_tile_available = self.get_cg_tile(*fork, CGType::Available); - write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + //write!(w_term, "\t{}.sync();\n", cg_tile_available)?; + write!(w_term, "\t__syncthreads;\n")?; } // If the Fork was parallelized, each thread or UsedPerId tile of // threads only runs one ThreadID, so we can jump straight to the diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch index d5c8dee6..4e5c1f74 100644 --- a/juno_samples/rodinia/bfs/src/gpu.sch +++ b/juno_samples/rodinia/bfs/src/gpu.sch @@ -38,7 +38,7 @@ fixpoint { } simpl!(collect); -fork-tile[32, 0, false, true](traverse, collect); +fork-tile[1024, 0, false, true](traverse, collect); fork-split(traverse, collect); unforkify(init); -- GitLab