
GPU backend

Merged prathi3 requested to merge gpu-cg into main
1 file  +240 −70
@@ -85,6 +85,7 @@ pub fn gpu_codegen<W: Write>(
};
let (fork_forward_adjacency, fork_join_map) = make_fork_structures();
// Maybe can delete
let map_join_reduce = || -> HashMap<NodeID, Vec<NodeID>> {
let reduce_nodes: Vec<NodeID> = (0..function.nodes.len())
.filter(|idx| function.nodes[*idx].is_reduce())
@@ -219,6 +220,7 @@ impl GPUContext<'_> {
"
#include <assert.h>
#include <stdio.h>
#include <stddef.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <mma.h>
@@ -293,7 +295,7 @@ impl GPUContext<'_> {
for (id, ty) in self.types.iter().enumerate() {
match ty {
Type::Product(ref product_ty_ids) => {
write!(w, "\nstruct Product_{} {{\n", id)?;
write!(w, "\ntypedef struct Product_{} {{\n", id)?;
for (i, product_ty_id) in product_ty_ids.iter().enumerate() {
write!(
w,
@@ -302,10 +304,10 @@ impl GPUContext<'_> {
i
)?;
}
write!(w, "}};\n")?;
write!(w, "}} Product_{};\n", id)?;
}
Type::Summation(ref summation_ty_ids) => {
write!(w, "\nstruct Summation_{} {{\n\t union {{\n", id)?;
write!(w, "\ntypedef struct Summation_{} {{\n\t union {{\n", id)?;
for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() {
write!(
w,
@@ -314,7 +316,7 @@ impl GPUContext<'_> {
i
)?;
}
write!(w, "\t}};\n\tuint8_t tag;\n}};\n")?;
write!(w, "\t}};\n\tuint8_t tag;\n}} Summation_{};\n", id)?;
}
_ => {}
}
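For reference, the emitted definitions come out roughly like the following sketch (the type ids, member types, and the field_<i> naming here are illustrative assumptions):

#include <stdint.h>

typedef struct Product_3 {
    float field_0;
    int32_t field_1;
} Product_3;

typedef struct Summation_4 {
    union {
        float field_0;
        int64_t field_1;
    };
    uint8_t tag;
} Summation_4;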
@@ -348,7 +350,7 @@ impl GPUContext<'_> {
write!(w, "{} p{}", self.get_type(*ty, true), idx)?;
}
// We convert originally non-void functions to void functions by adding a
// return parameter. For now we ignore the case where return was derived
// from a parameter through reads and writes, and instead always memcpy.
let return_index = self.function.nodes.iter().position(|node| node.is_return());
if let Some(return_index) = return_index {
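As a sketch of that convention, a function that originally returned a scalar is emitted as a void kernel with an extra output parameter, roughly like this (the kernel and parameter names are assumed):

__global__ void kernel_foo(int p0, float* p1, int* ret) {
    int result = p0 + 1;  // stand-in for the computed return value
    *ret = result;        // the original return value leaves through the added parameter
}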
@@ -424,7 +426,7 @@ impl GPUContext<'_> {
Ok(())
}
// Construct block forks by greedily accepting while: a) each fork join is strictly nested meaning no other neighbor fork joins, b) the join has no user reduce nodes, c) total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred.
// Construct block forks by greedily accepting while: a) each fork join is strictly nested, meaning it has no other neighboring fork joins, b) the forks are parallel reduce forks, c) the total number of blocks < max_num_blocks, d) each fork join's bounds are independent of outer fork joins, and e) each fork join's reduction has no dependency or synchronization between {hercules} threads. A smarter policy may be needed, particularly for underutilized kernels where saturating threads per block instead of blocks per kernel may be preferred.
fn codegen_block_creation(&mut self) -> Result<(Vec<NodeID>, Vec<usize>), Error> {
let mut root_forks: HashSet<NodeID> = self.fork_forward_adjacency.keys().copied().collect();
for (_, children) in self.fork_forward_adjacency.iter() {
@@ -442,7 +444,7 @@ impl GPUContext<'_> {
let mut curr_fork = root_forks[0];
while let Some(join) = self.fork_join_map.get(&curr_fork) {
let children = &self.fork_forward_adjacency[&curr_fork];
if children.len() != 1 || !self.join_reduce_map.contains_key(join) {
if children.len() != 1 || !self.function.schedules[curr_fork.idx()].contains(&Schedule::ParallelFork) {
break;
}
curr_fork = children[0];
@@ -556,7 +558,7 @@ impl GPUContext<'_> {
}
}
// 2. Emit data flow for nodes assigned to those basic blocks
// 2. Emit data flow for nodes assigned to basic blocks in block sink
// 2a. All phi registers first
self.codegen_phi_registers(w, |id| control_nodes_between.contains(&self.bbs[id.idx()]))?;
// 2b. All other data nodes
@@ -578,6 +580,7 @@ impl GPUContext<'_> {
{
self.codegen_data_node(
*id,
1,
&mut gotos.get_mut(&self.bbs[id.idx()]).unwrap().body,
)?;
visited.insert(id);
@@ -598,13 +601,10 @@ impl GPUContext<'_> {
Ok(())
}
fn codegen_data_node(
&self,
id: NodeID,
w: &mut String,
) -> Result<(), Error> {
fn codegen_data_node(&self, id: NodeID, num_tabs: usize, w: &mut String) -> Result<(), Error> {
// For now, shared memory is only used when creating an array
let declare_variable = self.get_value(id, true, false).to_string();
let tabs = "\t".repeat(num_tabs);
match &self.function.nodes[id.idx()] {
// Phi registers were already emitted.
Node::Phi {
@@ -628,7 +628,8 @@ impl GPUContext<'_> {
Type::Boolean => {
write!(
w,
"\t{} = !{};\n",
"{}{} = !{};\n",
tabs,
declare_variable,
self.get_value(*input, false, false),
)?;
@@ -636,7 +637,8 @@ impl GPUContext<'_> {
ty if ty.is_fixed() => {
write!(
w,
"\t{} = ~{};\n",
"{}{} = ~{};\n",
tabs,
declare_variable,
self.get_value(*input, false, false),
)?;
@@ -647,7 +649,8 @@ impl GPUContext<'_> {
ty if ty.is_signed() || ty.is_float() => {
write!(
w,
"\t{} = -{};\n",
"{}{} = -{};\n",
tabs,
declare_variable,
self.get_value(*input, false, false),
)?;
@@ -659,7 +662,8 @@ impl GPUContext<'_> {
UnaryOperator::Cast(dst_ty_id) => {
write!(
w,
"\t{} = static_cast<{}>({});\n",
"{}{} = static_cast<{}>({});\n",
tabs,
declare_variable,
self.get_type(*dst_ty_id, false),
self.get_value(*input, false, false),
@@ -672,36 +676,29 @@ impl GPUContext<'_> {
match (op, &self.types[self.typing[left.idx()].idx()]) {
(BinaryOperator::Rem, Type::Float32) => write!(
w,
"\t{} = fmodf({}, {});\n",
declare_variable,
left_val,
right_val,
"{}{} = fmodf({}, {});\n",
tabs, declare_variable, left_val, right_val,
)?,
(BinaryOperator::Rem, Type::Float64) => write!(
w,
"\t{} = fmod({}, {});\n",
declare_variable,
left_val,
right_val,
"{}{} = fmod({}, {});\n",
tabs, declare_variable, left_val, right_val,
)?,
// Doesn't need a special function, but applies to the bool type (|| and && rather than | and &)
(BinaryOperator::Or, Type::Boolean) => write!(
w,
"\t{} = {} || {};\n",
declare_variable,
left_val,
right_val,
"{}{} = {} || {};\n",
tabs, declare_variable, left_val, right_val,
)?,
(BinaryOperator::And, Type::Boolean) => write!(
w,
"\t{} = {} && {};\n",
declare_variable,
left_val,
right_val,
"{}{} = {} && {};\n",
tabs, declare_variable, left_val, right_val,
)?,
(op, _) => write!(
w,
"\t{} = {} {} {};\n",
"{}{} = {} {} {};\n",
tabs,
declare_variable,
left_val,
match op {
@@ -726,11 +723,17 @@ impl GPUContext<'_> {
)?,
};
}
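Illustratively, these special cases emit statements along the following lines (value names are made up and declarations are elided):

v12 = fmodf(v7, v8);   // Rem on Float32
v13 = fmod(v9, v10);   // Rem on Float64
v14 = v4 || v5;        // Or on Boolean
v15 = v2 + v3;         // any other op falls through to the generic form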
Node::Ternary {op, first, second, third} => match op {
Node::Ternary {
op,
first,
second,
third,
} => match op {
TernaryOperator::Select => {
write!(
w,
"\t{} = {} ? {} : {};\n",
"{}{} = {} ? {} : {};\n",
tabs,
declare_variable,
self.get_value(*first, false, false),
self.get_value(*second, false, false),
@@ -743,30 +746,53 @@ impl GPUContext<'_> {
let func_name = self.codegen_intrinsic(intrinsic, ty);
write!(
w,
"\t{} = {}({});\n",
"{}{} = {}({});\n",
tabs,
declare_variable,
func_name,
self.get_value(args[0], false, false),
)?;
}
Node::Read { collect, indices } => {
let index_ptr_name = self.codegen_indices(*collect, indices);
// If it's a parameter node then copy from global memory, else
// reference from shared memory or registers.
// If it's a parameter node then copy from global memory, else
// from shared memory or registers.
if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] {
// We parallelize copies from global memory across threads for
// array types, either immediate or nested in the collection.
if self.types[self.typing[id.idx()].idx()].is_primitive() {
write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?;
} else {
self.codegen_global_to_shared(id, declare_variable, index_ptr_name, indices.len(), true, w)?;
}
let index_ptr_name = self.codegen_indices_global(*collect, indices);
self.codegen_copy_to_from_global(
true,
self.typing[id.idx()],
&declare_variable,
&index_ptr_name,
Some(indices.len()),
true,
num_tabs,
w,
)?;
} else {
write!(w, "\t{} = {};\n", declare_variable, index_ptr_name)?;
let index_ptr_name = self.codegen_indices_local(*collect, indices);
write!(w, "{}{} = {};\n", tabs, declare_variable, index_ptr_name)?;
}
}
Node::Write {collect: _, data: _, indices: _} => {
// TODO
Node::Write { collect, data, indices } => {
let data_variable = self.get_value(*data, false, false);
// If it's a parameter node then copy to global memory, else
// to shared memory or registers
if let Node::Parameter { index: _ } = &self.function.nodes[collect.idx()] {
let index_ptr_name = self.codegen_indices_global(*collect, indices);
self.codegen_copy_to_from_global(
false,
self.typing[id.idx()],
&data_variable,
&index_ptr_name,
Some(indices.len()),
true,
num_tabs,
w,
)?;
} else {
let index_ptr_name = self.codegen_indices_local(*collect, indices);
write!(w, "{}{} = {};\n", tabs, index_ptr_name, data_variable)?;
}
}
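For a non-parameter collection, the write is thus a plain store through the locally indexed reference, e.g. (names assumed):

v14.field_1 = v9;   // write into a product held in registers
a3[v5][v6] = v10;   // write into a shared-memory array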
_ => {
panic!("Unsupported node type")
@@ -848,7 +874,7 @@ impl GPUContext<'_> {
let Type::Array(element_type, extents) = &self.types[type_id.idx()] else {
panic!("Expected array type")
};
// For now we do element-wise alignment, later could consider (n-1)d array
// alignment. Then we "allocate" from the single dynamic shared memory buffer
// by using and updating the offset.
let element_size = format!("sizeof({})", self.get_type(*element_type, false));
@@ -857,18 +883,41 @@ impl GPUContext<'_> {
.map(|id| format!("dc{}", id.idx()))
.collect::<Vec<_>>()
.join("*");
write!(w, ";\n\talignment = {};\n\tdynamic_shared_offset =
write!(
w,
";\n\talignment = {};\n\tdynamic_shared_offset =
(dynamic_shared_offset + alignment - 1) / alignment * alignment;\n\t{} =
reinterpret_cast<{}>(&dynamic_shared[dynamic_shared_offset]);\n\t
dynamic_shared_offset += {}", element_size, name, self.get_type(*element_type, false), array_size)?;
dynamic_shared_offset += {}",
element_size,
name,
self.get_type(*element_type, false),
array_size
)?;
}
}
Ok(())
}
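Per array, the emitted allocation then reads roughly as follows inside the kernel (a sketch: the a4 name and float element type are assumed, and alignment / dynamic_shared_offset are taken to be declared earlier in the generated preamble):

extern __shared__ char dynamic_shared[];
// ...
alignment = sizeof(float);
dynamic_shared_offset = (dynamic_shared_offset + alignment - 1) / alignment * alignment;
a4 = reinterpret_cast<float*>(&dynamic_shared[dynamic_shared_offset]);
dynamic_shared_offset += sizeof(float) * dc0 * dc1;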
fn codegen_global_to_shared(&self, id: NodeID, declare_variable: String, index_ptr_name: String, array_depth: Option<usize>, outermost: bool, w: &mut String) -> Result<(), Error> {
match &self.types[self.typing[id.idx()].idx()] {
Type::Array(_, extents) => {
// Used for reads and writes due to identical logic. data_variable is the
// resulting reference for reads, and is the source for writes. Writes don't
// emit a new reference.
fn codegen_copy_to_from_global(
&self,
is_read: bool,
type_id: TypeID,
data_variable: &String,
index_ptr_name: &String,
array_depth: Option<usize>,
parallelize: bool,
num_tabs: usize,
w: &mut String,
) -> Result<(), Error> {
let tabs = "\t".repeat(num_tabs);
let lhs = if is_read { data_variable } else { index_ptr_name };
let rhs = if is_read { index_ptr_name } else { data_variable };
match &self.types[type_id.idx()] {
Type::Array(element_type_id, extents) => {
let array_depth = array_depth.unwrap();
let rem_array_size = extents
.iter()
@@ -878,23 +927,95 @@ impl GPUContext<'_> {
.collect::<Vec<_>>()
.join("*");
let mut running_div_factor = "1".to_string();
write!(w, "\tfor (int i = threadIdx.x; i < {}; i += {}) {{\n", rem_array_size, self.kernel_attrs.num_threads)?;
let mut indices = vec![];
for i in (array_depth..extents.len()).rev() {
indices.push(format!("[(({}) / ({})) % dc{}]", rem_array_size, running_div_factor, extents[i].idx()));
running_div_factor = format!("{} * {}", running_div_factor, format!("dc{}", extents[i].idx()));
// Decompose the flat copy-loop index i into this dimension's index.
indices.push(format!(
"[((i) / ({})) % dc{}]",
running_div_factor,
extents[i].idx()
));
running_div_factor = format!(
"{} * {}",
running_div_factor,
format!("dc{}", extents[i].idx())
);
}
let indices_str = indices.join("");
// TODO: condition by primitive vs collection, if latter then recurse
// with outermost = false
write!(w, "\t\t{}{} = {}{};\n", declare_variable, indices_str, index_ptr_name, indices_str)?;
// Parallelizing only affects loop bounds
let begin_copy = if parallelize {
format!(
"{}for (int i = threadIdx.x; i < {}; i += {}) {{\n",
tabs, rem_array_size, self.kernel_attrs.num_threads
)
} else {
format!("{}for (int i = 0; i < {}; i++) {{\n", tabs, rem_array_size)
};
write!(w, "{}", begin_copy)?;
self.codegen_copy_to_from_global(
is_read,
*element_type_id,
&format!("{}{}", data_variable, indices_str),
&format!("{}{}", index_ptr_name, indices_str),
None,
false,
num_tabs + 1,
w,
)?;
let end_copy = if parallelize {
format!("{}}}\n{}__syncthreads();\n", tabs, tabs)
} else {
format!("{}}}\n", tabs)
};
write!(w, "{}", end_copy)?;
}
Type::Product(fields) => {
for (field_idx, field) in fields.iter().enumerate() {
self.codegen_copy_to_from_global(
is_read,
*field,
&format!("{}.field_{}", data_variable, field_idx),
&format!("{}.field_{}", index_ptr_name, field_idx),
None,
false,
num_tabs + 1,
w,
)?;
}
}
Type::Summation(fields) => {
// First copy the tag
write!(w, "{}{}.tag = {}.tag;\n", tabs, lhs, rhs)?;
// Then copy the active field based on the tag
write!(w, "{}switch({}.tag) {{\n", tabs, rhs)?;
for (variant_idx, field) in fields.iter().enumerate() {
write!(w, "{}\tcase {}: {{\n", tabs, variant_idx)?;
// Recursively copy the field's contents
self.codegen_copy_to_from_global(
is_read,
*field,
&format!("{}.field_{}", data_variable, variant_idx),
&format!("{}.field_{}", index_ptr_name, variant_idx),
None,
false,
num_tabs + 2,
w
)?;
write!(w, "{}\t\tbreak;\n", tabs)?;
write!(w, "{}\t}}\n", tabs)?;
}
write!(w, "{}}}\n", tabs)?;
}
// Primitive types
_ => {
write!(w, "{}{} = {};\n", tabs, lhs, rhs)?;
}
// TODO: handle product and summation collections
}
Ok(())
}
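For a 1-D float array parameter read, the generated copy comes out along these lines (a sketch: names, the thread count of 128, and the simplified index arithmetic are assumptions):

for (int i = threadIdx.x; i < dc0; i += 128) {
    a2[i] = p1[i];
}
__syncthreads();

When parallelize is false (the recursive calls for nested collections), the same loop is emitted as for (int i = 0; i < dc0; i++) and the trailing __syncthreads() is dropped.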
fn codegen_indices(&self, collect: NodeID, indices: &[Index]) -> String {
// Use normal indexing for local collections
fn codegen_indices_local(&self, collect: NodeID, indices: &[Index]) -> String {
let mut index_ptr_name = format!("{}", self.get_value(collect, false, false));
for index in indices {
match index {
@@ -905,15 +1026,54 @@ impl GPUContext<'_> {
index_ptr_name.push_str(&format!(".field_{}", variant));
}
Index::Position(indices) => {
index_ptr_name.push_str(&indices
index_ptr_name.push_str(
&indices
.iter()
.map(|index| format!("[{}]", self.get_value(*index, false, false)))
.collect::<Vec<_>>()
.join(""),
);
}
}
}
index_ptr_name
}
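For example, reading field 2 of a local product and then indexing into a nested array produces a reference string like (names assumed):

v3.field_2[v10][v11]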
// Use arithmetic for global collections as they're accessed as pointers
fn codegen_indices_global(&self, collect: NodeID, indices: &[Index]) -> String {
let mut index_ptr_name = format!("{}[0", self.get_value(collect, false, false));
let type_id = self.typing[collect.idx()];
for index in indices {
match index {
Index::Field(field) => {
let offset = (0..*field)
.map(|i| format!("offsetof({}, field_{})", self.get_type(type_id, false), i))
.collect::<Vec<_>>()
.join(" + ");
index_ptr_name.push_str(&format!(" + {}", offset));
}
// Variants of summations have zero offset
Index::Variant(_) => {}
Index::Position(array_indices) => {
let Type::Array(_, extents) = &self.types[self.typing[collect.idx()].idx()] else {
panic!("Expected array type")
};
let mut cumulative_offset = "1 * ".to_string()
+ extents
.iter()
.enumerate()
.filter(|(i, _)| *i >= array_indices.len())
.map(|(_, id)| format!("dc{}", id.idx()))
.collect::<Vec<_>>()
.join(" * ")
.as_str();
for index in array_indices.iter().rev() {
cumulative_offset = format!("{} * ({} + ", cumulative_offset, self.get_value(*index, false, false));
}
index_ptr_name.push_str(&format!(" + {}{}", cumulative_offset, ")".repeat(array_indices.len())));
}
}
}
index_ptr_name
format!("{}]", index_ptr_name)
}
fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String {
@@ -1050,7 +1210,17 @@ impl GPUContext<'_> {
panic!("Parameters shouldn't be re-initialized")
}
format!("p{}", index)
} else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()] {
} else if let Node::Write { collect, data: _, indices: _ } = &self.function.nodes[id.idx()] {
if ty {
panic!("Writes shouldn't be initialized, they're replaced with the referenced collection")
}
if make_pointer {
panic!("Writes shouldn't be called as pointer")
}
self.get_value(*collect, false, false)
} else if ty && let Type::Array(element_type, extents) = &self.types[self.typing[id.idx()].idx()]
{
// Shmem/register arrays have special formatting
let mut declare_array = format!(
"{} (*{}{})",
self.get_type(*element_type, false),
@@ -1077,7 +1247,7 @@ impl GPUContext<'_> {
}
}
// make_pointer enforces static pointer and not recursive or array pointer:
// multi-d arrays are single pointers with custom indexing.
fn get_type(&self, id: TypeID, make_pointer: bool) -> String {
match &self.types[id.idx()] {