Skip to content
Snippets Groups Projects

Misc. GPU improvements

Merged rarbore2 requested to merge edge_gpu into main
13 files
+ 369
60
Compare changes
  • Side-by-side
  • Inline
Files
13
+ 55
20
@@ -3,8 +3,6 @@ extern crate hercules_ir;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::fmt::{Error, Write};
use std::fs::{File, OpenOptions};
use std::io::Write as _;
use self::hercules_ir::*;
@@ -269,7 +267,6 @@ impl GPUContext<'_> {
self.codegen_dynamic_constants(&mut top)?;
self.codegen_declare_data(&mut top)?;
self.codegen_helpers(&mut top)?;
self.codegen_goto_start(&mut top)?;
write!(w, "{}", top)?;
// Setup for CUDA's "goto" for control flow between basic blocks.
@@ -281,10 +278,15 @@ impl GPUContext<'_> {
(node_id, goto)
})
.collect();
let mut thread_block_tiles = String::new();
// If there are no forks, fast forward to single-block, single-thread codegen
let (num_blocks, num_threads) = if self.fork_join_map.is_empty() {
self.codegen_data_control_no_forks(&mut dynamic_shared_offset, &mut gotos)?;
self.codegen_data_control_no_forks(
&mut dynamic_shared_offset,
&mut thread_block_tiles,
&mut gotos,
)?;
("1".to_string(), "1".to_string())
} else {
// Create structures and determine block and thread parallelization strategy
@@ -307,12 +309,15 @@ impl GPUContext<'_> {
&mut dynamic_shared_offset,
is_block_parallel,
num_threads,
&mut thread_block_tiles,
&mut gotos,
)?;
(num_blocks, num_threads.to_string())
};
// Emit all GPU kernel code from previous steps
self.codegen_goto_start(&mut thread_block_tiles)?;
write!(w, "{}", thread_block_tiles)?;
let mut kernel_body = String::new();
let rev_po = self.control_subgraph.rev_po(NodeID::new(0));
write!(w, "\n")?;
@@ -696,7 +701,7 @@ extern \"C\" {} {}(",
let Node::Fork { factors, .. } = &self.function.nodes[root_fork.idx()] else {
panic!("Expected fork node");
};
let reduces = &self.fork_reduce_map[root_fork];
let _reduces = &self.fork_reduce_map[root_fork];
if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) {
let fork_size = factors
.iter()
@@ -847,6 +852,7 @@ extern \"C\" {} {}(",
fn codegen_data_control_no_forks(
&self,
dynamic_shared_offset: &mut String,
thread_block_tiles: &mut String,
gotos: &mut BTreeMap<NodeID, CudaGoto>,
) -> Result<(), Error> {
(0..self.function.nodes.len())
@@ -858,8 +864,16 @@ extern \"C\" {} {}(",
let post_init = &mut goto.post_init;
let body = &mut goto.body;
let term = &mut goto.term;
let mut tabs =
self.codegen_control_node(control, None, None, None, init, post_init, term)?;
let mut tabs = self.codegen_control_node(
control,
None,
None,
None,
thread_block_tiles,
init,
post_init,
term,
)?;
for data in self.bbs.1[control.idx()].iter() {
self.codegen_data_node(
*data,
@@ -889,6 +903,7 @@ extern \"C\" {} {}(",
dynamic_shared_offset: &mut String,
is_block_parallel: bool,
num_threads: usize,
thread_block_tiles: &mut String,
gotos: &mut BTreeMap<NodeID, CudaGoto>,
) -> Result<(), Error> {
// First emit data and control gen for each control node outside any fork.
@@ -900,8 +915,16 @@ extern \"C\" {} {}(",
let post_init = &mut goto.post_init;
let body = &mut goto.body;
let term = &mut goto.term;
let mut tabs =
self.codegen_control_node(*control, None, None, None, init, post_init, term)?;
let mut tabs = self.codegen_control_node(
*control,
None,
None,
None,
thread_block_tiles,
init,
post_init,
term,
)?;
for data in self.bbs.1[control.idx()].iter() {
self.codegen_data_node(
*data,
@@ -931,6 +954,7 @@ extern \"C\" {} {}(",
Some(num_threads),
Some(num_threads),
Some(1),
thread_block_tiles,
init,
post_init,
term,
@@ -961,6 +985,7 @@ extern \"C\" {} {}(",
1,
num_threads,
dynamic_shared_offset,
thread_block_tiles,
gotos,
)?;
}
@@ -981,6 +1006,7 @@ extern \"C\" {} {}(",
parent_quota: usize,
num_threads: usize,
dynamic_shared_offset: &mut String,
thread_block_tiles: &mut String,
gotos: &mut BTreeMap<NodeID, CudaGoto>,
) -> Result<(), Error> {
let (available_thread_quota, use_thread_quota, parallel_factor) = fork_thread_quota_map
@@ -1017,6 +1043,7 @@ extern \"C\" {} {}(",
Some(available_thread_quota),
Some(use_thread_quota),
parallel_factor,
thread_block_tiles,
init,
post_init,
term,
@@ -1044,6 +1071,7 @@ extern \"C\" {} {}(",
use_thread_quota,
num_threads,
dynamic_shared_offset,
thread_block_tiles,
gotos,
)?;
}
@@ -1383,15 +1411,15 @@ extern \"C\" {} {}(",
let cg_tile = self.get_cg_tile(nesting_fork.unwrap(), CGType::Use);
#[allow(unreachable_patterns)]
let cg_op = match intrinsic {
Intrinsic::Max => "max",
Intrinsic::Min => "min",
Intrinsic::Max => "greater",
Intrinsic::Min => "less",
_ => unreachable!(),
};
let id_type_name = self.get_type(id_type, false);
write!(
w,
"{}{} = cg::reduce({}, {}, cg::{}<{}>());\n",
tabs, define_variable, non_reduce_arg, cg_tile, cg_op, id_type_name
tabs, define_variable, cg_tile, non_reduce_arg, cg_op, id_type_name
)?;
} else {
let ty = &self.types[id_type.idx()];
@@ -1504,6 +1532,7 @@ extern \"C\" {} {}(",
available_thread_quota: Option<usize>,
use_thread_quota: Option<usize>,
parallel_factor: Option<usize>,
thread_block_tiles: &mut String,
w_init: &mut String,
w_post_init: &mut String,
w_term: &mut String,
@@ -1579,20 +1608,20 @@ extern \"C\" {} {}(",
use_thread_quota
};
write!(
w_init,
thread_block_tiles,
"\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
use_thread_per_id, cg_tile, use_thread_per_id
)?;
let cg_tile_use = self.get_cg_tile(id, CGType::Use);
write!(
w_init,
thread_block_tiles,
"\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
use_thread_quota, cg_tile_use, use_thread_quota
)?;
let available_thread_quota = available_thread_quota.unwrap();
let cg_tile_available = self.get_cg_tile(id, CGType::Available);
write!(
w_init,
thread_block_tiles,
"\tcg::thread_block_tile<{}> {} = cg::tiled_partition<{}>(block);\n",
available_thread_quota, cg_tile_available, available_thread_quota
)?;
@@ -1797,7 +1826,7 @@ extern \"C\" {} {}(",
Constant::Integer64(val) => write!(w, "{}{} = {}ll;\n", tabs, name, val)?,
Constant::UnsignedInteger64(val) => write!(w, "{}{} = {}ull;\n", tabs, name, val)?,
Constant::Float32(val) => {
write!(w, "{}{} = {}f;\n", tabs, name, format_float(**val as f64))?
write!(w, "{}{} = {};\n", tabs, name, format_float(**val as f64))?
}
Constant::Float64(val) => write!(w, "{}{} = {};\n", tabs, name, format_float(**val))?,
// All three following collections involve align then allocate from the
@@ -2223,9 +2252,15 @@ fn convert_type(ty: &Type, make_pointer: bool) -> String {
}
/// Format an `f64` as a CUDA C++ floating-point literal.
///
/// Two concerns drive the formatting:
/// - Rust's `Display` for infinities produces `inf`/`-inf`, which is not
///   valid C++; emit the CUDA `INFINITY` macro (math.h) instead.
/// - Rust omits the decimal point for whole numbers (`1` instead of `1.0`),
///   which C++ would parse as an integer literal; append `.0` so the emitted
///   constant keeps floating-point type. The `e`/`E` checks guard against
///   exponent notation, where appending `.0` would corrupt the literal.
fn format_float(val: f64) -> String {
    if val == f64::INFINITY {
        "INFINITY".to_string()
    } else if val == f64::NEG_INFINITY {
        "-INFINITY".to_string()
    } else {
        let mut s = val.to_string();
        if !s.contains('.') && !s.contains('e') && !s.contains('E') {
            s.push_str(".0");
        }
        s
    }
}
Loading