Compare revisions

902f4dfc · 902f4dfc · 902f4dfc · 902f4dfc · 902f4dfc · 902f4dfc
--- a/hercules_rt/build.rs
+++ b/hercules_rt/build.rs
@@ -28,6 +28,7 @@ fn main() {
        println!("cargo::rustc-link-search=native=/opt/cuda/lib/");
        println!("cargo::rustc-link-lib=static=rtdefs");
        println!("cargo::rustc-link-lib=cudart");
+        println!("cargo::rustc-link-lib=cublas");
        println!("cargo::rerun-if-changed=src/rtdefs.cu");
    }
 }
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -29,7 +29,10 @@ pub unsafe fn __cpu_dealloc(ptr: *mut u8, size: usize) {
        eprintln!("__cpu_dealloc: {:?}, {}", ptr, size);
        assert!(!ptr.is_null() || size == 0);
    }
-    dealloc(ptr, Layout::from_size_align(size, LARGEST_ALIGNMENT).unwrap())
+    dealloc(
+        ptr,
+        Layout::from_size_align(size, LARGEST_ALIGNMENT).unwrap(),
+    )
 }

 pub unsafe fn __cpu_zero_mem(ptr: *mut u8, size: usize) {
@@ -103,14 +106,48 @@ pub unsafe fn __copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize) {
    ___copy_cuda_to_cuda(dst, src, size);
 }

+#[derive(Debug, Copy, Clone)]
+pub enum PrimTy {
+    Bool,
+    U8,
+    U16,
+    U32,
+    U64,
+    I8,
+    I16,
+    I32,
+    I64,
+    F8,
+    BF16,
+    F32,
+    F64,
+}
+
+#[cfg(feature = "cuda")]
+pub unsafe fn __library_cuda_gemm(
+    i: u64,
+    j: u64,
+    k: u64,
+    c: *mut u8,
+    a: *const u8,
+    b: *const u8,
+    ty: PrimTy,
+) {
+    match ty {
+        PrimTy::F32 => ___cublas_sgemm(i, j, k, c, a, b),
+        _ => todo!(),
+    }
+}
+
 #[cfg(feature = "cuda")]
 extern "C" {
-    pub fn ___cuda_alloc(size: usize) -> *mut u8;
-    pub fn ___cuda_dealloc(ptr: *mut u8, size: usize);
-    pub fn ___cuda_zero_mem(ptr: *mut u8, size: usize);
-    pub fn ___copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
-    pub fn ___copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
-    pub fn ___copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
+    fn ___cuda_alloc(size: usize) -> *mut u8;
+    fn ___cuda_dealloc(ptr: *mut u8, size: usize);
+    fn ___cuda_zero_mem(ptr: *mut u8, size: usize);
+    fn ___copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
+    fn ___copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
+    fn ___copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
+    fn ___cublas_sgemm(i: u64, j: u64, k: u64, c: *mut u8, a: *const u8, b: *const u8);
 }

 #[derive(Clone, Debug)]
@@ -284,14 +321,6 @@ impl<'a> HerculesCUDARefMut<'a> {
        }
    }

-    pub fn dup(&'a mut self) -> Self {
-        HerculesCUDARefMut {
-            ptr: self.ptr,
-            size: self.size,
-            _phantom: PhantomData,
-        }
-    }
-
    pub unsafe fn __ptr(&self) -> *mut u8 {
        self.ptr.as_ptr()
    }
@@ -309,6 +338,17 @@ impl<'a> HerculesCUDARefMut<'a> {
    }
 }

+#[cfg(feature = "cuda")]
+impl<'a, 'b: 'a> HerculesCUDARefMut<'b> {
+    pub fn dup(&'a mut self) -> HerculesCUDARefMut<'a> {
+        HerculesCUDARefMut {
+            ptr: self.ptr,
+            size: self.size,
+            _phantom: PhantomData,
+        }
+    }
+}
+
 #[cfg(feature = "cuda")]
 impl CUDABox {
    pub fn from_cpu_ref(cpu_ref: HerculesCPURef) -> Self {
@@ -662,7 +702,7 @@ impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
    }
 }

-impl<'a, T> HerculesMutBox<'a, T>
+impl<'a, 'b: 'a, T> HerculesMutBox<'b, T>
 where
    T: Default + Clone,
 {
@@ -688,7 +728,7 @@ where
                let elements = unsafe { cuda_ref.__size() / size_of::<T>() };

                // Allocate host memory (if needed)
-                let cpu_alloc: Allocation<&'a mut [T], Vec<T>> = match self.cpu_alloc.take() {
+                let cpu_alloc: Allocation<&'b mut [T], Vec<T>> = match self.cpu_alloc.take() {
                    Allocation::Reference(val) if val.len() == elements => {
                        Allocation::Reference(val)
                    }
@@ -793,7 +833,7 @@ pub trait HerculesMutBoxTo<'a, T> {
    fn to(&'a mut self) -> T;
 }

-impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
+impl<'a, 'b: 'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'b, T>
 where
    T: Default + Clone,
 {
@@ -803,7 +843,7 @@ where
 }

 #[cfg(feature = "cuda")]
-impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
+impl<'a, 'b: 'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'b, T>
 where
    T: Default + Clone,
 {

--- a/hercules_rt/src/rtdefs.cu
+++ b/hercules_rt/src/rtdefs.cu
-extern "C" {
-	void *___cuda_alloc(size_t size) {
-		void *ptr = NULL;
-		cudaError_t res = cudaMalloc(&ptr, size);
-		if (res != cudaSuccess) {
-			ptr = NULL;
-		}
-		return ptr;
-	}
+#include <stdint.h>
+#include <cublas_v2.h>

-	void ___cuda_dealloc(void *ptr, size_t size) {
-		(void) size;
-		cudaFree(ptr);
-	}
-
-	void ___cuda_zero_mem(void *ptr, size_t size) {
-		cudaMemset(ptr, 0, size);
-	}
+static cublasHandle_t cublas_handle = 0;

-	void ___copy_cpu_to_cuda(void *dst, void *src, size_t size) {
-		cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
-	}
-
-	void ___copy_cuda_to_cpu(void *dst, void *src, size_t size) {
-		cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
+extern "C" {
+    void *___cuda_alloc(size_t size) {
+	void *ptr = NULL;
+	cudaError_t res = cudaMalloc(&ptr, size);
+	if (res != cudaSuccess) {
+	    ptr = NULL;
 	}
-
-	void ___copy_cuda_to_cuda(void *dst, void *src, size_t size) {
-		cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice);
+	return ptr;
+    }
+    
+    void ___cuda_dealloc(void *ptr, size_t size) {
+	(void) size;
+	cudaFree(ptr);
+    }
+    
+    void ___cuda_zero_mem(void *ptr, size_t size) {
+	cudaMemset(ptr, 0, size);
+    }
+    
+    void ___copy_cpu_to_cuda(void *dst, void *src, size_t size) {
+	cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
+    }
+    
+    void ___copy_cuda_to_cpu(void *dst, void *src, size_t size) {
+	cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
+    }
+    
+    void ___copy_cuda_to_cuda(void *dst, void *src, size_t size) {
+	cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice);
+    }
+    
+    void ___cublas_sgemm(uint64_t i, uint64_t j, uint64_t k, float *c, float *a, float *b) {
+	if (!cublas_handle) {
+	    cublasCreate(&cublas_handle);
 	}
+	float alf = 1.0;
+	float beta = 0.0;
+	cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
+		    k, i, j,
+		    &alf, b, k, a, j,
+		    &beta, c, k);
+	cudaDeviceSynchronize();
+    }
 }
--- a/juno_frontend/src/codegen.rs
+++ b/juno_frontend/src/codegen.rs
@@ -540,25 +540,31 @@ impl CodeGenerator<'_> {
                block = after_call_region;

                // Read each of the "inout values" and perform the SSA update
-                let inouts_index = self.builder.builder.create_field_index(1);
+                let has_inouts = !inouts.is_empty();
+                // TODO: We should omit unit returns, if we do so the + 1 below is not needed
                for (idx, var) in inouts.into_iter().enumerate() {
-                    let index = self.builder.builder.create_field_index(idx);
+                    let index = self.builder.builder.create_field_index(idx + 1);
                    let mut read = self.builder.allocate_node();
                    let read_id = read.id();
-                    read.build_read(call_id, vec![inouts_index.clone(), index].into());
+                    read.build_read(call_id, vec![index].into());
                    self.builder.add_node(read);

                    ssa.write_variable(var, block, read_id);
                }

                // Read the "actual return" value and return it
-                let value_index = self.builder.builder.create_field_index(0);
-                let mut read = self.builder.allocate_node();
-                let read_id = read.id();
-                read.build_read(call_id, vec![value_index].into());
-                self.builder.add_node(read);
+                let result = if !has_inouts {
+                    call_id
+                } else {
+                    let value_index = self.builder.builder.create_field_index(0);
+                    let mut read = self.builder.allocate_node();
+                    let read_id = read.id();
+                    read.build_read(call_id, vec![value_index].into());
+                    self.builder.add_node(read);
+                    read_id
+                };

-                (read_id, block)
+                (result, block)
            }
            Expr::Intrinsic {
                id,

--- a/juno_frontend/src/lang.l
+++ b/juno_frontend/src/lang.l
@@ -28,6 +28,7 @@ for      "for"
 if       "if"
 inout    "inout"
 integer  "integer"
+in       "in"
 let      "let"
 match    "match"
 mod      "mod"
@@ -128,7 +129,7 @@ _        "_"
 0x[0-9a-fA-F]+            "HEX_INT"
 0b[0-1]+                  "BIN_INT"
 0o[0-7]+                  "OCT_INT"
-[0-9]+\.[0-9]*(|e[0-9]+)  "FLOAT_LIT"
+[0-9]+\.[0-9]+(|e[0-9]+)  "FLOAT_LIT"
 @[a-zA-Z0-9_]+            "LABEL"

 . "UNMATCHED"

--- a/juno_frontend/src/lang.y
+++ b/juno_frontend/src/lang.y
@@ -99,13 +99,18 @@ TypeDef -> Result<TyDef, ()>
  ;

 ObjFields -> Result<Vec<ObjField>, ()>
-  :                     { Ok(vec![]) }
-  | ObjFields ObjField  { flatten($1, $2) }
+  : ObjFieldList { Ok($1?.into_iter().collect()) }
+  ;
+ObjFieldList -> Result<VecDeque<ObjField>, ()>
+  :                           { Ok(VecDeque::new()) }
+  | ObjField                  { Ok(VecDeque::from([$1?])) }
+  | ObjField ',' ObjFieldList { let mut lst = $3?; lst.push_front($1?); Ok(lst) }
+  | ObjField ';' ObjFieldList { let mut lst = $3?; lst.push_front($1?); Ok(lst) }
  ;
 ObjField -> Result<ObjField, ()>
-  : PubOption 'ID' ';'
+  : PubOption 'ID'
        { Ok(ObjField{ span : $span, public : $1?, name : span_of_tok($2)?, typ : None }) }
-  | PubOption 'ID' ':' Type ';'
+  | PubOption 'ID' ':' Type
        { Ok(ObjField{ span : $span, public : $1?, name : span_of_tok($2)?, typ : Some($4?) }) }
  ;

@@ -287,11 +292,17 @@ Stmt -> Result<Stmt, ()>
  | 'match' NonStructExpr Cases
      { Ok(Stmt::MatchStmt{ span : $span, expr : $2?, body : $3? }) }
  | 'for' VarBind '=' NonStructExpr 'to' NonStructExpr Stmts
-      { Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?, step : None,
-                          body : Box::new($7?) }) }
+      { Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?,
+                          inclusive: false, step : None, body : Box::new($7?) }) }
  | 'for' VarBind '=' NonStructExpr 'to' NonStructExpr 'by' SignedIntLit Stmts
-      { Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?, step : Some($8?),
-                          body : Box::new($9?) }) }
+      { Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?,
+                          inclusive: false, step : Some($8?), body : Box::new($9?) }) }
+  | 'for' VarBind 'in' NonStructExpr '..' NonStructExpr Stmts
+      { Ok(Stmt::ForStmt{ span: $span, var: $2?, init: $4?, bound: $6?,
+                          inclusive: false, step: None, body: Box::new($7?) }) }
+  | 'for' VarBind 'in' NonStructExpr '..' '=' NonStructExpr Stmts
+      { Ok(Stmt::ForStmt{ span: $span, var: $2?, init: $4?, bound: $7?,
+                          inclusive: true, step: None, body: Box::new($8?) }) }
  | 'while' NonStructExpr Stmts
      { Ok(Stmt::WhileStmt{ span : $span, cond : $2?, body : Box::new($3?) }) }
  | 'return' ';'
@@ -457,12 +468,16 @@ Expr -> Result<Expr, ()>
      { Ok(Expr::IntrinsicExpr{ span : $span, name : $1?, ty_args : Some($5?), args: $8? }) }
  ;
 IdExprs -> Result<Vec<(Id, Expr)>, ()>
-  : 'ID' '=' Expr               { Ok(vec![(span_of_tok($1)?, $3?)]) }
-  | IdExprsS ',' 'ID' '=' Expr  { flatten($1, res_pair(span_of_tok($3), $5)) }
+  : IdExprList  { Ok($1?.into_iter().collect()) }
+  ;
+IdExprList -> Result<VecDeque<(Id, Expr)>, ()>
+  :                       { Ok(VecDeque::new()) }
+  | IdExpr                { Ok(VecDeque::from([$1?])) }
+  | IdExpr ',' IdExprList { let mut lst = $3?; lst.push_front($1?); Ok(lst) }
  ;
-IdExprsS -> Result<Vec<(Id, Expr)>, ()>
-  : 'ID' '=' Expr               { Ok(vec![(span_of_tok($1)?, $3?)]) }
-  | IdExprsS ',' 'ID' '=' Expr  { flatten($1, res_pair(span_of_tok($3), $5)) }
+IdExpr -> Result<(Id, Expr), ()>
+  : 'ID' ':' Expr               { Ok((span_of_tok($1)?, $3?)) }
+  | 'ID' '=' Expr               { Ok((span_of_tok($1)?, $3?)) }
  ;
 Params -> Result<Vec<(bool, Expr)>, ()>
  :                       { Ok(vec![]) }
@@ -678,9 +693,9 @@ pub enum Stmt {
  AssignStmt { span : Span, lhs : LExpr, assign : AssignOp, assign_span : Span, rhs : Expr },
  IfStmt     { span : Span, cond : Expr, thn : Box<Stmt>, els : Option<Box<Stmt>> },
  MatchStmt  { span : Span, expr : Expr, body : Vec<Case> },
-  // The step records: negative, number, base
-  ForStmt    { span : Span, var : VarBind, init : Expr, bound : Expr, step : Option<(bool, Span, IntBase)>,
-               body : Box<Stmt> },
+  // The step records: negative, number, base, inclusive records whether the bound is included in the range
+  ForStmt    { span : Span, var : VarBind, init : Expr, bound : Expr,
+               inclusive: bool, step : Option<(bool, Span, IntBase)>, body : Box<Stmt> },
  WhileStmt  { span : Span, cond : Expr, body : Box<Stmt> },
  ReturnStmt { span : Span, expr : Option<Expr> },
  BreakStmt  { span : Span },

--- a/juno_frontend/src/semant.rs
+++ b/juno_frontend/src/semant.rs
@@ -808,8 +808,14 @@ fn analyze_program(
                // Compute the proper type accounting for the inouts (which become returns)
                let mut inout_types = inouts.iter().map(|e| e.get_type()).collect::<Vec<_>>();

-                let inout_tuple = types.new_tuple(inout_types);
-                let pure_return_type = types.new_tuple(vec![return_type, inout_tuple]);
+                let mut return_types = vec![return_type];
+                return_types.extend(inout_types);
+                // TODO: Ideally we would omit unit returns
+                let pure_return_type = if return_types.len() == 1 {
+                    return_types.pop().unwrap()
+                } else {
+                    types.new_tuple(return_types)
+                };

                // Finally, we have a properly built environment and we can
                // start processing the body
@@ -1993,6 +1999,7 @@ fn process_stmt(
                },
            init,
            bound,
+            inclusive,
            step,
            body,
        } => {
@@ -2124,10 +2131,19 @@ fn process_stmt(
                val: bound_val,
            };

-            // The condition of the loop is var < bound, unless the step is negative in which case
-            // it is var > bound
+            // There are four cases for the condition that we generate, though it always takes the
+            // form var OP bound:
+            // 1. The step is positive and the range is exclusive of the bound, OP = <
+            // 2. The step is positive and the range is inclusive of the bound, OP = <=
+            // 3. The step is negative and the range is exclusive of the bound, OP = >
+            // 4. The step is negative and the range is inclusive of the bound, OP = >=
            let condition = Expr::BinaryExp {
-                op: if step_pos { BinaryOp::Lt } else { BinaryOp::Gt },
+                op: match (step_pos, inclusive) {
+                    (true, false) => BinaryOp::Lt,
+                    (true, true) => BinaryOp::Le,
+                    (false, false) => BinaryOp::Gt,
+                    (false, true) => BinaryOp::Ge,
+                },
                lhs: Box::new(Expr::Variable {
                    var: var,
                    typ: var_type,
@@ -4809,7 +4825,7 @@ fn process_expr(
                    };

                    // Now, process the arguments to ensure they has the type needed by this
-                    // constructor
+                    // function
                    let mut arg_vals: Vec<Either<Expr, usize>> = vec![];
                    let mut errors = LinkedList::new();

@@ -5009,19 +5025,21 @@ fn process_expr(
 }

 fn generate_return(expr: Expr, inouts: &Vec<Expr>, types: &mut TypeSolver) -> Stmt {
-    let inout_types = inouts.iter().map(|e| e.get_type()).collect();
-    let inout_type = types.new_tuple(inout_types);
+    let inout_types = inouts.iter().map(|e| e.get_type()).collect::<Vec<_>>();

-    let inout_vals = Expr::Tuple {
-        vals: inouts.clone(),
-        typ: inout_type,
-    };
+    let mut return_types = vec![expr.get_type()];
+    return_types.extend(inout_types);

-    let expr_type = expr.get_type();
+    let mut return_vals = vec![expr];
+    return_vals.extend_from_slice(inouts);

-    let val = Expr::Tuple {
-        vals: vec![expr, inout_vals],
-        typ: types.new_tuple(vec![expr_type, inout_type]),
+    let val = if return_vals.len() == 1 {
+        return_vals.pop().unwrap()
+    } else {
+        Expr::Tuple {
+            vals: return_vals,
+            typ: types.new_tuple(return_types),
+        }
    };

    Stmt::ReturnStmt { expr: val }

--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
@@ -12,9 +12,12 @@ fixpoint {
 fork-coalesce(*);
 infer-schedules(*);
 dce(*);
+rewrite(*);
+fixpoint {
+  simplify-cfg(*);
+  dce(*);
+}

-let out = auto-outline(*);
-gpu(out.matmul);
 ip-sroa(*);
 sroa(*);
 dce(*);

--- a/juno_samples/matmul/src/main.rs
+++ b/juno_samples/matmul/src/main.rs
 #![feature(concat_idents)]
+use std::iter::zip;

 use rand::random;

@@ -13,9 +14,9 @@ fn main() {
        const I: usize = 256;
        const J: usize = 64;
        const K: usize = 128;
-        let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect();
-        let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect();
-        let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
+        let a: Box<[f32]> = (0..I * J).map(|_| random::<f32>()).collect();
+        let b: Box<[f32]> = (0..J * K).map(|_| random::<f32>()).collect();
+        let mut correct_c: Box<[f32]> = (0..I * K).map(|_| 0.0).collect();
        for i in 0..I {
            for k in 0..K {
                for j in 0..J {
@@ -27,7 +28,8 @@ fn main() {
        {
            let mut r = runner!(matmul);
            let c = r.run(I as u64, J as u64, K as u64, a.to(), b.to()).await;
-            assert_eq!(c.as_slice::<i32>(), &*correct_c);
+            let c = c.as_slice::<f32>();
+            assert_eq!(c, &*correct_c);
        }
        #[cfg(feature = "cuda")]
        {
@@ -37,9 +39,9 @@ fn main() {
            let c = r
                .run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
                .await;
-            let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
+            let mut c_cpu: Box<[f32]> = vec![0.0; correct_c.len()].into_boxed_slice();
            c.to_cpu_ref(&mut c_cpu);
-            assert_eq!(&*c_cpu, &*correct_c);
+            assert!(zip(c_cpu, correct_c).all(|(calc, correct)| (calc - correct).abs() < 0.00001));
        }
    });
 }

--- a/juno_samples/matmul/src/matmul.jn
+++ b/juno_samples/matmul/src/matmul.jn
 #[entry]
-fn matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]) -> i32[n, l] {
-  let res : i32[n, l];
+fn matmul<n : usize, m : usize, l : usize>(a : f32[n, m], b : f32[m, l]) -> f32[n, l] {
+  let res : f32[n, l];

  @outer for i = 0 to n {
    @middle for j = 0 to l {

--- a/juno_samples/rodinia/README.md
+++ b/juno_samples/rodinia/README.md
+# Rodinia Benchmarks
+This directory contains several of the benchmarks from the [Rodinia Benchmark Suite](http://www.cs.virginia.edu/rodinia/doku.php) ported into Juno.
+The implementations are based on those provided with Rodinia version 3.1.
--- a/juno_samples/rodinia/backprop/Cargo.toml
+++ b/juno_samples/rodinia/backprop/Cargo.toml
+[package]
+name = "juno_backprop"
+version = "0.1.0"
+authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
+edition = "2021"
+
+[[bin]]
+name = "juno_backprop"
+path = "src/main.rs"
+
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
+[build-dependencies]
+juno_build = { path = "../../../juno_build" }
+
+[dependencies]
+juno_build = { path = "../../../juno_build" }
+hercules_rt = { path = "../../../hercules_rt" }
+async-std = "*"
+clap = { version = "*", features = ["derive"] }
+with_builtin_macros = "0.1.0"
+nom = "*"
+rand = "0.9.0"
--- a/juno_samples/rodinia/backprop/build.rs
+++ b/juno_samples/rodinia/backprop/build.rs
+use juno_build::JunoCompiler;
+
+fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("backprop.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    JunoCompiler::new()
+        .file_in_src("backprop.jn")
+        .unwrap()
+        .schedule_in_src("cpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+}
--- a/juno_samples/rodinia/backprop/src/backprop.jn
+++ b/juno_samples/rodinia/backprop/src/backprop.jn
+fn squash(x: f32) -> f32 {
+  // Sigmoid
+  return 1.0 / (1.0 + exp!(-x));
+}
+
+fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] {
+  let result : f32[m + 1];
+  result[0] = 1.0;
+
+  for j in 1..=m {
+    let sum = 0.0;
+    for k in 0..=n {
+      sum += weights[k, j] * vals[k];
+    }
+    result[j] = squash(sum);
+  }
+
+  return result;
+}
+
+fn output_error<n: usize>(target: f32[n + 1], actual: f32[n + 1]) -> (f32, f32[n + 1]) {
+  let errsum = 0.0;
+  let delta : f32[n + 1];
+
+  for j in 1..=n {
+    let a = actual[j];
+    let t = target[j];
+    delta[j] = a * (1.0 - a) * (t - a);
+    errsum += abs!(delta[j]);
+  }
+
+  return (errsum, delta);
+}
+
+fn hidden_error<hidden_n, output_n: usize>(
+  out_delta: f32[output_n + 1],
+  hidden_weights: f32[hidden_n + 1, output_n + 1],
+  hidden_vals: f32[hidden_n + 1],
+) -> (f32, f32[hidden_n + 1]) {
+  let errsum = 0.0;
+  let delta : f32[hidden_n + 1];
+
+  for j in 1..=hidden_n {
+    let h = hidden_vals[j];
+
+    let sum = 0.0;
+    for k in 1..=output_n {
+      sum += out_delta[k] * hidden_weights[j, k];
+    }
+
+    delta[j] = h * (1.0 - h) * sum;
+    errsum += abs!(delta[j]);
+  }
+
+  return (errsum, delta);
+}
+
+const ETA : f32 = 0.3;
+const MOMENTUM : f32 = 0.3;
+
+fn adjust_weights<n, m: usize>(
+  delta: f32[m + 1],
+  vals: f32[n + 1],
+  weights: f32[n + 1, m + 1],
+  prev_weights: f32[n + 1, m + 1]
+) -> (f32[n + 1, m + 1], f32[n + 1, m + 1]) {
+  for j in 1..=m {
+    for k in 0..=n {
+      let new_dw = ETA * delta[j] * vals[k] + MOMENTUM * prev_weights[k, j];
+      weights[k, j] += new_dw;
+      prev_weights[k, j] = new_dw;
+    }
+  }
+
+  return (weights, prev_weights);
+}
+
+#[entry]
+fn backprop<input_n, hidden_n, output_n: usize>(
+  input_vals: f32[input_n + 1],
+  input_weights: f32[input_n + 1, hidden_n + 1],
+  hidden_weights: f32[hidden_n + 1, output_n + 1],
+  target: f32[output_n + 1],
+  input_prev_weights: f32[input_n + 1, hidden_n + 1],
+  hidden_prev_weights: f32[hidden_n + 1, output_n + 1],
+//) -> (f32, f32,
+//      f32[input_n + 1, hidden_n + 1], f32[input_n + 1, hidden_n + 1],
+//      f32[hidden_n + 1, output_n + 1], f32[hidden_n + 1, output_n + 1]) {
+) -> (f32, f32, f32) {
+  let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights);
+  let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights);
+
+  let (out_err, out_delta) = output_error::<output_n>(target, output_vals);
+  let (hid_err, hid_delta) = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals);
+
+  let (hidden_weights, hidden_prev_weights)
+    = adjust_weights::<hidden_n, output_n>(out_delta, hidden_vals, hidden_weights, hidden_prev_weights);
+  let (input_weights, input_prev_weights)
+    = adjust_weights::<input_n, hidden_n>(hid_delta, input_vals, input_weights, input_prev_weights);
+
+  return (out_err, hid_err, input_weights[0, 0] + input_prev_weights[0, 0] + hidden_weights[0, 0] + hidden_prev_weights[0, 0]);
+  //return (input_weights, input_prev_weights, hidden_weights, hidden_prev_weights);
+}
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
+gvn(*);
+dce(*);
+phi-elim(*);
+dce(*);
+crc(*);
+dce(*);
+slf(*);
+dce(*);
+
+let auto = auto-outline(backprop);
+cpu(auto.backprop);
+
+inline(auto.backprop);
+inline(auto.backprop);
+delete-uncalled(*);
+
+sroa[true](*);
+dce(*);
+float-collections(*);
+reuse-products(*);
+dce(*);
+
+gcm(*);
+
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
+gvn(*);
+dce(*);
+phi-elim(*);
+dce(*);
+crc(*);
+dce(*);
+slf(*);
+dce(*);
+
+let auto = auto-outline(backprop);
+gpu(auto.backprop);
+
+inline(auto.backprop);
+inline(auto.backprop);
+delete-uncalled(*);
+
+sroa[true](*);
+dce(*);
+float-collections(*);
+reuse-products(*);
+dce(*);
+
+gcm(*);
+
--- a/juno_samples/rodinia/backprop/src/main.rs
+++ b/juno_samples/rodinia/backprop/src/main.rs
+#![feature(concat_idents)]
+
+juno_build::juno!("backprop");
+
+mod rust_backprop;
+
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox, HerculesMutBoxTo};
+
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+
+use clap::Parser;
+
+#[derive(Parser)]
+#[clap(author, version, about, long_about = None)]
+struct BackpropInputs {
+    layer_size: usize,
+}
+
+fn run_backprop(
+    input_n: u64,
+    hidden_n: u64,
+    output_n: u64,
+    input_vals: &[f32],
+    input_weights: &[f32],
+    hidden_weights: &[f32],
+    target: &[f32],
+    input_prev_weights: &[f32],
+    hidden_prev_weights: &[f32],
+) -> (f32, f32, Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
+    let input_vals = HerculesImmBox::from(input_vals);
+    let target = HerculesImmBox::from(target);
+
+    let mut input_weights = HerculesMutBox::from(input_weights.to_vec());
+    let mut hidden_weights = HerculesMutBox::from(hidden_weights.to_vec());
+    let mut input_prev_weights = HerculesMutBox::from(input_prev_weights.to_vec());
+    let mut hidden_prev_weights = HerculesMutBox::from(hidden_prev_weights.to_vec());
+
+    let mut runner = runner!(backprop);
+    let res = HerculesMutBox::from(async_std::task::block_on(async {
+        runner
+            .run(
+                input_n,
+                hidden_n,
+                output_n,
+                input_vals.to(),
+                input_weights.to(),
+                hidden_weights.to(),
+                target.to(),
+                input_prev_weights.to(),
+                hidden_prev_weights.to(),
+            )
+            .await
+    }))
+    .as_slice()
+    .to_vec();
+    let out_err = res[0];
+    let hid_err = res[1];
+
+    (
+        out_err,
+        hid_err,
+        input_weights.as_slice().to_vec(),
+        hidden_weights.as_slice().to_vec(),
+        input_prev_weights.as_slice().to_vec(),
+        hidden_prev_weights.as_slice().to_vec(),
+    )
+}
+
+fn compare_float(x: f32, y: f32) -> bool {
+    (x - y).abs() < 1e-5
+}
+
+fn compare_floats(xs: &[f32], ys: &[f32]) -> bool {
+    xs.len() == ys.len() && xs.iter().zip(ys.iter()).all(|(x, y)| compare_float(*x, *y))
+}
+
+fn backprop_harness(args: BackpropInputs) {
+    let BackpropInputs { layer_size } = args;
+
+    let mut rng = StdRng::seed_from_u64(7);
+
+    let input_n = layer_size;
+    let hidden_n = 16;
+    let output_n = 1;
+
+    let mut input_vals = vec![0.0; input_n + 1];
+    input_vals[0] = 1.0;
+
+    // For some reason the bpnn_randomize_row function used on target just sets it to 0.1
+    let target = vec![0.1; output_n + 1];
+
+    let input_weights = (0..(input_n + 1) * (hidden_n + 1))
+        .map(|_| rng.random::<f32>())
+        .collect::<Vec<_>>();
+    let hidden_weights = (0..(hidden_n + 1) * (output_n + 1))
+        .map(|_| rng.random::<f32>())
+        .collect::<Vec<_>>();
+
+    let input_prev_weights = vec![0.0; (input_n + 1) * (hidden_n + 1)];
+    let hidden_prev_weights = vec![0.0; (hidden_n + 1) * (output_n + 1)];
+
+    let (
+        juno_out_err,
+        juno_hid_err,
+        juno_input_weights,
+        juno_hidden_weights,
+        juno_input_prev_weights,
+        juno_hidden_prev_weights,
+    ) = run_backprop(
+        input_n as u64,
+        hidden_n as u64,
+        output_n as u64,
+        &input_vals,
+        &input_weights,
+        &hidden_weights,
+        &target,
+        &input_prev_weights,
+        &hidden_prev_weights,
+    );
+
+    let (
+        rust_out_err,
+        rust_hid_err,
+        rust_input_weights,
+        rust_hidden_weights,
+        rust_input_prev_weights,
+        rust_hidden_prev_weights,
+    ) = rust_backprop::backprop(
+        input_n,
+        hidden_n,
+        output_n,
+        &input_vals,
+        input_weights,
+        hidden_weights,
+        &target,
+        input_prev_weights,
+        hidden_prev_weights,
+    );
+
+    assert!(compare_float(juno_out_err, rust_out_err));
+    assert!(compare_float(juno_hid_err, rust_hid_err));
+    if !compare_floats(&juno_input_weights, &rust_input_weights) {
+        panic!("Input weights do not match after training");
+    }
+    if !compare_floats(&juno_hidden_weights, &rust_hidden_weights) {
+        panic!("Hidden weights do not match after training");
+    }
+    if !compare_floats(&juno_input_prev_weights, &rust_input_prev_weights) {
+        panic!("Input prev_weights do not match after training");
+    }
+    if !compare_floats(&juno_hidden_prev_weights, &rust_hidden_prev_weights) {
+        panic!("Hidden prev_weights do not match after training");
+    }
+}
+
+fn main() {
+    let args = BackpropInputs::parse();
+    backprop_harness(args);
+}
+
+#[test]
+fn backprop_test() {
+    backprop_harness(BackpropInputs { layer_size: 65536 });
+}
--- a/juno_samples/rodinia/backprop/src/rust_backprop.rs
+++ b/juno_samples/rodinia/backprop/src/rust_backprop.rs
+fn layer_forward(n: usize, m: usize, vals: &[f32], weights: &[f32]) -> Vec<f32> {
+    let mut result = vec![0.0; m + 1];
+    result[0] = 1.0;
+
+    for j in 1..=m {
+        let mut sum = 0.0;
+        for k in 0..=n {
+            sum += weights[k * (m + 1) + j] * vals[k];
+        }
+        result[j] = 1.0 / (1.0 + (-sum).exp());
+    }
+
+    result
+}
+
+fn output_error(n: usize, target: &[f32], actual: &[f32]) -> (f32, Vec<f32>) {
+    let mut result = vec![0.0; n + 1];
+    let mut error = 0.0;
+
+    for j in 1..=n {
+        let o = actual[j];
+        let t = target[j];
+        result[j] = o * (1.0 - o) * (t - o);
+        error += result[j].abs();
+    }
+
+    (error, result)
+}
+
+fn hidden_error(
+    n: usize,
+    m: usize,
+    delta: &[f32],
+    weights: &[f32],
+    actual: &[f32],
+) -> (f32, Vec<f32>) {
+    let mut result = vec![0.0; n + 1];
+    let mut error = 0.0;
+
+    for j in 1..=n {
+        let h = actual[j];
+        let mut sum = 0.0;
+        for k in 1..=m {
+            sum += delta[k] * weights[j * (m + 1) + k];
+        }
+        result[j] = h * (1.0 - h) * sum;
+        error += result[j].abs();
+    }
+
+    (error, result)
+}
+
+fn adjust_weights(
+    n: usize,
+    m: usize,
+    delta: &[f32],
+    vals: &[f32],
+    mut weights: Vec<f32>,
+    mut prev_weights: Vec<f32>,
+) -> (Vec<f32>, Vec<f32>) {
+    for j in 1..=m {
+        for k in 0..=n {
+            let new_dw = (0.3 * delta[j] * vals[k]) + (0.3 * prev_weights[k * (m + 1) + j]);
+            weights[k * (m + 1) + j] += new_dw;
+            prev_weights[k * (m + 1) + j] = new_dw;
+        }
+    }
+
+    (weights, prev_weights)
+}
+
+pub fn backprop(
+    input_n: usize,
+    hidden_n: usize,
+    output_n: usize,
+    input_vals: &[f32],
+    input_weights: Vec<f32>,
+    hidden_weights: Vec<f32>,
+    target: &[f32],
+    input_prev_weights: Vec<f32>,
+    hidden_prev_weights: Vec<f32>,
+) -> (f32, f32, Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
+    let hidden_vals = layer_forward(input_n, hidden_n, input_vals, &input_weights);
+    let output_vals = layer_forward(hidden_n, output_n, &hidden_vals, &hidden_weights);
+
+    let (out_err, out_delta) = output_error(output_n, target, &output_vals);
+    let (hid_err, hid_delta) = hidden_error(
+        hidden_n,
+        output_n,
+        &out_delta,
+        &hidden_weights,
+        &hidden_vals,
+    );
+
+    let (hidden_weights, hidden_prev_weights) = adjust_weights(
+        hidden_n,
+        output_n,
+        &out_delta,
+        &hidden_vals,
+        hidden_weights,
+        hidden_prev_weights,
+    );
+    let (input_weights, input_prev_weights) = adjust_weights(
+        input_n,
+        hidden_n,
+        &hid_delta,
+        &input_vals,
+        input_weights,
+        input_prev_weights,
+    );
+
+    (
+        out_err,
+        hid_err,
+        input_weights,
+        hidden_weights,
+        input_prev_weights,
+        hidden_prev_weights,
+    )
+}
--- a/juno_samples/rodinia/bfs/Cargo.toml
+++ b/juno_samples/rodinia/bfs/Cargo.toml
+[package]
+name = "juno_bfs"
+version = "0.1.0"
+authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
+edition = "2021"
+
+[[bin]]
+name = "juno_bfs"
+path = "src/main.rs"
+
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
+[build-dependencies]
+juno_build = { path = "../../../juno_build" }
+
+[dependencies]
+juno_build = { path = "../../../juno_build" }
+hercules_rt = { path = "../../../hercules_rt" }
+async-std = "*"
+clap = { version = "*", features = ["derive"] }
+with_builtin_macros = "0.1.0"
+nom = "*"
--- a/juno_samples/rodinia/bfs/build.rs
+++ b/juno_samples/rodinia/bfs/build.rs
+use juno_build::JunoCompiler;
+
+fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("bfs.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    JunoCompiler::new()
+        .file_in_src("bfs.jn")
+        .unwrap()
+        .build()
+        .unwrap();
+}
No results found