
GPU backend

Merged prathi3 requested to merge gpu-cg into main
1 file  +181  −0
extern crate bitvec;
extern crate hercules_ir;
use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::fmt::{Error, Write};
use std::iter::{zip, FromIterator};
use std::sync::atomic::{AtomicUsize, Ordering};
use self::bitvec::prelude::*;
use self::hercules_ir::*;
use crate::*;
static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0);
/*
* The top-level function to compile a Hercules IR function into a CUDA kernel for
* execution on the GPU. We generate CUDA C++ textually, mirroring the textual
* approach of the CPU LLVM backend.
*/
pub fn gpu_codegen<W: Write>(
function: &Function,
types: &Vec<Type>,
constants: &Vec<Constant>,
dynamic_constants: &Vec<DynamicConstant>,
reverse_postorder: &Vec<NodeID>,
typing: &Vec<TypeID>,
control_subgraph: &Subgraph,
data_nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>,
bbs: &Vec<NodeID>,
w: &mut W,
) -> Result<(), Error> {
// temporary hardcoded values
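// 1024 threads per block and a 32-thread warp match current NVIDIA limits;
// max_num_blocks caps how many outermost fork-joins are mapped to blocks below.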
let kernel = GPUKernel {
max_num_blocks: 1024,
max_num_threads: 1024,
threads_per_warp: 32,
};
let mut ctx = GPUContext {
function,
types,
constants,
dynamic_constants,
reverse_postorder,
typing,
control_subgraph,
bbs,
structs: HashSet::new(),
w,
kernel,
};
ctx.codegen_function()
}
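/*
* A minimal sketch of how a caller might drive this backend. The analysis
* inputs are assumed to come from the surrounding pass manager; the variable
* names below are illustrative, not part of this change:
*
*     let mut cuda_src = String::new();
*     gpu_codegen(
*         &function,
*         &types,
*         &constants,
*         &dynamic_constants,
*         &reverse_postorder,
*         &typing,
*         &control_subgraph,
*         &data_nodes_in_fork_joins,
*         &bbs,
*         &mut cuda_src,
*     )?;
*/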
struct GPUContext<'a, W: Write> {
function: &'a Function,
types: &'a Vec<Type>,
constants: &'a Vec<Constant>,
dynamic_constants: &'a Vec<DynamicConstant>,
reverse_postorder: &'a Vec<NodeID>,
typing: &'a Vec<TypeID>,
control_subgraph: &'a Subgraph,
bbs: &'a Vec<NodeID>,
structs: HashSet<usize>,
w: &'a mut W,
kernel: GPUKernel,
}
struct GPUKernel {
max_num_blocks: usize,
max_num_threads: usize,
threads_per_warp: usize,
}
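// A label plus the CUDA statements emitted under it; scaffolding that is not
// yet referenced elsewhere in this file.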
#[derive(Default, Debug)]
struct CudaBlock {
label: String,
body: String,
}
impl<'a, W: Write> GPUContext<'a, W> {
fn codegen_function(&mut self) -> Result<(), Error> {
// Static content and function signature
write!(
self.w,
"
#include <assert.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <helper_cuda.h>
",
)?;
let mut function_signature = String::new();
write!(&mut function_signature, "template <")?;
// The dynamic constants become template parameters.
let mut first_template_param = true;
for idx in 0..self.function.num_dynamic_constants {
if first_template_param {
first_template_param = false;
} else {
write!(&mut function_signature, ", ")?;
}
write!(&mut function_signature, "long long int dc_p{}", idx)?;
}
write!(&mut function_signature, ">\n")?;
write!(&mut function_signature, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_blocks, self.function.name)?;
// The second set of parameters are normal arguments.
let mut first_param = true;
for (idx, ty) in self.function.param_types.iter().enumerate() {
if first_param {
first_param = false;
} else {
write!(&mut function_signature, ", ")?;
}
write!(&mut function_signature, "{} p{}", self.get_type(*ty)?, idx)?;
}
write!(&mut function_signature, ") {\n")?;
// TODO: emit the kernel body.
// Step 1: determine the number of outermost fork-joins to map to the block
// level. We greedily accept a fork-join while: a) the total number of blocks
// stays under max_num_blocks, b) the fork-join is strictly nested, i.e. it has
// no neighboring fork-joins, and c) each fork-join's
// Finish the kernel and flush the accumulated text.
write!(&mut function_signature, "}}\n")?;
write!(self.w, "{}", function_signature)?;
Ok(())
}
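/*
* For illustration (assumed output shape, read off the writes above): a function
* `foo` with one dynamic constant and a single u32 parameter should produce a
* signature resembling:
*
*     template <long long int dc_p0>
*     __global__ void __launch_bounds__(1024) foo(unsigned int p0) {
*     ...
*     }
*/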
// Matmul detection; only called if an einsum is detected.
fn matmul_detection(&self, _w: &mut W) -> Result<(), Error> {
// TODO: not yet implemented.
Ok(())
}
// Convolution detection; only called if an einsum is detected.
fn convolution_detection(&self, _w: &mut W) -> Result<(), Error> {
// TODO: not yet implemented.
Ok(())
}
fn get_type(&mut self, id: TypeID) -> Result<String, Error> {
match self.types[id.idx()] {
Type::Product(ref product_ty_ids) => {
if !self.structs.contains(&id.idx()) {
write!(self.w, "\nstruct Product_{} {{\n", id.idx())?;
for (i, product_ty_id) in product_ty_ids.iter().enumerate() {
// Resolve the field type first to avoid borrowing self mutably twice.
let field_ty = self.get_type(*product_ty_id)?;
write!(self.w, "\t{} field_{};\n", field_ty, i)?;
}
write!(self.w, "}};\n")?;
self.structs.insert(id.idx());
}
Ok(format!("Product_{}", id.idx()))
}
Type::Summation(ref summation_ty_ids) => {
if !self.structs.contains(&id.idx()) {
write!(self.w, "\nstruct Summation_{} {{\n\t union {{\n", id.idx())?;
for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() {
write!(self.w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id)?, i)?;
}
write!(self.w, "\t}};\n\tuint8_t tag;\n}};\n")?;
self.structs.insert(id.idx());
}
Ok(format!("Summation_{}", id.idx()))
}
_ => Ok(Self::convert_type(&self.types[id.idx()])),
}
}
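/*
* For example, a product type (u32, f32) with TypeID 7 is emitted once as:
*
*     struct Product_7 {
*         unsigned int field_0;
*         float field_1;
*     };
*
* and referred to as `Product_7` afterwards. Summation types additionally wrap
* their variants in an anonymous union alongside a `uint8_t tag`.
*/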
// TODO: run this at the end and add the const qualifier where applicable;
// support more dtypes (float8, float16, bfloat16).
fn convert_type(ty: &Type) -> String {
match ty {
Type::Boolean => "bool".to_string(),
Type::Integer8 => "int8_t".to_string(),
Type::UnsignedInteger8 => "uint8_t".to_string(),
Type::Integer16 => "short".to_string(),
Type::UnsignedInteger16 => "unsigned short".to_string(),
Type::Integer32 => "int".to_string(),
Type::UnsignedInteger32 => "unsigned int".to_string(),
Type::Integer64 => "long long".to_string(),
Type::UnsignedInteger64 => "unsigned long long".to_string(),
Type::Float32 => "float".to_string(),
Type::Float64 => "double".to_string(),
_ => panic!("Unsupported type"),
}
}
}