
GPU backend

Merged prathi3 requested to merge gpu-cg into main
1 file  +181  −0
extern crate bitvec;
extern crate hercules_ir;
use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
use std::fmt::{Error, Write};
use std::iter::{zip, FromIterator};
use std::sync::atomic::{AtomicUsize, Ordering};
use self::bitvec::prelude::*;
use self::hercules_ir::*;
use crate::*;
static NUM_FILLER_REGS: AtomicUsize = AtomicUsize::new(0);
/*
* The top-level function to compile a Hercules IR function into a CUDA kernel for
* execution on the GPU. We generate CUDA C++ textually, mirroring the textual
* approach of the CPU LLVM backend.
*/
pub fn gpu_codegen<W: Write>(
function: &Function,
types: &Vec<Type>,
constants: &Vec<Constant>,
dynamic_constants: &Vec<DynamicConstant>,
reverse_postorder: &Vec<NodeID>,
typing: &Vec<TypeID>,
control_subgraph: &Subgraph,
data_nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>,
bbs: &Vec<NodeID>,
w: &mut W,
) -> Result<(), Error> {
// temporary hardcoded values
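// 1024 threads per block and a 32-thread warp match current NVIDIA limits;
// max_num_blocks caps how many outermost fork-joins are mapped to blocks below.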
let kernel = GPUKernel {
max_num_blocks: 1024,
max_num_threads: 1024,
threads_per_warp: 32,
};
let mut ctx = GPUContext {
function,
types,
constants,
dynamic_constants,
reverse_postorder,
typing,
control_subgraph,
bbs,
structs: HashSet::new(),
w,
kernel,
};
ctx.codegen_function()
}
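/*
* A minimal sketch of how a caller might drive this backend. The analysis
* inputs are assumed to come from the surrounding pass manager; the variable
* names below are illustrative, not part of this change:
*
*     let mut cuda_src = String::new();
*     gpu_codegen(
*         &function,
*         &types,
*         &constants,
*         &dynamic_constants,
*         &reverse_postorder,
*         &typing,
*         &control_subgraph,
*         &data_nodes_in_fork_joins,
*         &bbs,
*         &mut cuda_src,
*     )?;
*/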
struct GPUContext<'a, W: Write> {
function: &'a Function,
types: &'a Vec<Type>,
constants: &'a Vec<Constant>,
dynamic_constants: &'a Vec<DynamicConstant>,
reverse_postorder: &'a Vec<NodeID>,
typing: &'a Vec<TypeID>,
control_subgraph: &'a Subgraph,
bbs: &'a Vec<NodeID>,
structs: HashSet<usize>,
w: &'a mut W,
kernel: GPUKernel,
}
struct GPUKernel {
max_num_blocks: usize,
max_num_threads: usize,
threads_per_warp: usize,
}
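// A label plus the CUDA statements emitted under it; scaffolding that is not
// yet referenced elsewhere in this file.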
#[derive(Default, Debug)]
struct CudaBlock {
label: String,
body: String,
}
impl<'a, W: Write> GPUContext<'a, W> {
fn codegen_function(&mut self) -> Result<(), Error> {
// Static content and function signature
write!(
self.w,
"
#include <assert.h>
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <helper_cuda.h>
",
)?;
let mut function_signature = String::new();
write!(&mut function_signature, "template <")?;
// The dynamic constants become template parameters.
let mut first_template_param = true;
for idx in 0..self.function.num_dynamic_constants {
if first_template_param {
first_template_param = false;
} else {
write!(&mut function_signature, ", ")?;
}
write!(&mut function_signature, "long long int dc_p{}", idx)?;
}
write!(&mut function_signature, ">\n")?;
write!(&mut function_signature, "__global__ void __launch_bounds__({}) {}(", self.kernel.max_num_blocks, self.function.name)?;
// The second set of parameters are normal arguments.
let mut first_param = true;
for (idx, ty) in self.function.param_types.iter().enumerate() {
if first_param {
first_param = false;
} else {
write!(&mut function_signature, ", ")?;
}
write!(&mut function_signature, "{} p{}", self.get_type(*ty)?, idx)?;
}
write!(&mut function_signature, ") {\n")?;
// TODO: emit the kernel body.
// Step 1: determine the number of outermost fork-joins to map to the block
// level. We greedily accept a fork-join while: a) the total number of blocks
// stays under max_num_blocks, b) the fork-join is strictly nested, i.e. it has
// no neighboring fork-joins, and c) each fork-join's
// Finish the kernel and flush the accumulated text.
write!(&mut function_signature, "}}\n")?;
write!(self.w, "{}", function_signature)?;
Ok(())
}
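/*
* For illustration (assumed output shape, read off the writes above): a function
* `foo` with one dynamic constant and a single u32 parameter should produce a
* signature resembling:
*
*     template <long long int dc_p0>
*     __global__ void __launch_bounds__(1024) foo(unsigned int p0) {
*     ...
*     }
*/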
// Matmul detection; only called if an einsum is detected.
fn matmul_detection(&self, _w: &mut W) -> Result<(), Error> {
// TODO: not yet implemented.
Ok(())
}
// Convolution detection; only called if an einsum is detected.
fn convolution_detection(&self, _w: &mut W) -> Result<(), Error> {
// TODO: not yet implemented.
Ok(())
}
fn get_type(&mut self, id: TypeID) -> Result<String, Error> {
match self.types[id.idx()] {
Type::Product(ref product_ty_ids) => {
if !self.structs.contains(&id.idx()) {
write!(self.w, "\nstruct Product_{} {{\n", id.idx())?;
for (i, product_ty_id) in product_ty_ids.iter().enumerate() {
// Resolve the field type first to avoid borrowing self mutably twice.
let field_ty = self.get_type(*product_ty_id)?;
write!(self.w, "\t{} field_{};\n", field_ty, i)?;
}
write!(self.w, "}};\n")?;
self.structs.insert(id.idx());
}
Ok(format!("Product_{}", id.idx()))
}
Type::Summation(ref summation_ty_ids) => {
if !self.structs.contains(&id.idx()) {
write!(self.w, "\nstruct Summation_{} {{\n\t union {{\n", id.idx())?;
for (i, summation_ty_id) in summation_ty_ids.iter().enumerate() {
write!(self.w, "\t\t{} field_{};\n", self.get_type(*summation_ty_id)?, i)?;
}
write!(self.w, "\t}};\n\tuint8_t tag;\n}};\n")?;
self.structs.insert(id.idx());
}
Ok(format!("Summation_{}", id.idx()))
}
_ => Ok(Self::convert_type(&self.types[id.idx()])),
}
}
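/*
* For example, a product type (u32, f32) with TypeID 7 is emitted once as:
*
*     struct Product_7 {
*         unsigned int field_0;
*         float field_1;
*     };
*
* and referred to as `Product_7` afterwards. Summation types additionally wrap
* their variants in an anonymous union alongside a `uint8_t tag`.
*/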
// TODO: run this at the end and add the const qualifier where applicable;
// support more dtypes (float8, float16, bfloat16).
fn convert_type(ty: &Type) -> String {
match ty {
Type::Boolean => "bool".to_string(),
Type::Integer8 => "int8_t".to_string(),
Type::UnsignedInteger8 => "uint8_t".to_string(),
Type::Integer16 => "short".to_string(),
Type::UnsignedInteger16 => "unsigned short".to_string(),
Type::Integer32 => "int".to_string(),
Type::UnsignedInteger32 => "unsigned int".to_string(),
Type::Integer64 => "long long".to_string(),
Type::UnsignedInteger64 => "unsigned long long".to_string(),
Type::Float32 => "float".to_string(),
Type::Float64 => "double".to_string(),
_ => panic!("Unsupported type"),
}
}
}