Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • llvm/hercules
1 result
Show changes
Showing
with 725 additions and 97 deletions
......@@ -28,6 +28,7 @@ fn main() {
println!("cargo::rustc-link-search=native=/opt/cuda/lib/");
println!("cargo::rustc-link-lib=static=rtdefs");
println!("cargo::rustc-link-lib=cudart");
println!("cargo::rustc-link-lib=cublas");
println!("cargo::rerun-if-changed=src/rtdefs.cu");
}
}
......@@ -29,7 +29,10 @@ pub unsafe fn __cpu_dealloc(ptr: *mut u8, size: usize) {
eprintln!("__cpu_dealloc: {:?}, {}", ptr, size);
assert!(!ptr.is_null() || size == 0);
}
dealloc(ptr, Layout::from_size_align(size, LARGEST_ALIGNMENT).unwrap())
dealloc(
ptr,
Layout::from_size_align(size, LARGEST_ALIGNMENT).unwrap(),
)
}
pub unsafe fn __cpu_zero_mem(ptr: *mut u8, size: usize) {
......@@ -103,14 +106,48 @@ pub unsafe fn __copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize) {
___copy_cuda_to_cuda(dst, src, size);
}
#[derive(Debug, Copy, Clone)]
pub enum PrimTy {
Bool,
U8,
U16,
U32,
U64,
I8,
I16,
I32,
I64,
F8,
BF16,
F32,
F64,
}
#[cfg(feature = "cuda")]
pub unsafe fn __library_cuda_gemm(
i: u64,
j: u64,
k: u64,
c: *mut u8,
a: *const u8,
b: *const u8,
ty: PrimTy,
) {
match ty {
PrimTy::F32 => ___cublas_sgemm(i, j, k, c, a, b),
_ => todo!(),
}
}
#[cfg(feature = "cuda")]
extern "C" {
pub fn ___cuda_alloc(size: usize) -> *mut u8;
pub fn ___cuda_dealloc(ptr: *mut u8, size: usize);
pub fn ___cuda_zero_mem(ptr: *mut u8, size: usize);
pub fn ___copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
pub fn ___copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
pub fn ___copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
fn ___cuda_alloc(size: usize) -> *mut u8;
fn ___cuda_dealloc(ptr: *mut u8, size: usize);
fn ___cuda_zero_mem(ptr: *mut u8, size: usize);
fn ___copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
fn ___copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
fn ___copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
fn ___cublas_sgemm(i: u64, j: u64, k: u64, c: *mut u8, a: *const u8, b: *const u8);
}
#[derive(Clone, Debug)]
......@@ -284,14 +321,6 @@ impl<'a> HerculesCUDARefMut<'a> {
}
}
pub fn dup(&'a mut self) -> Self {
HerculesCUDARefMut {
ptr: self.ptr,
size: self.size,
_phantom: PhantomData,
}
}
pub unsafe fn __ptr(&self) -> *mut u8 {
self.ptr.as_ptr()
}
......@@ -309,6 +338,17 @@ impl<'a> HerculesCUDARefMut<'a> {
}
}
#[cfg(feature = "cuda")]
impl<'a, 'b: 'a> HerculesCUDARefMut<'b> {
pub fn dup(&'a mut self) -> HerculesCUDARefMut<'a> {
HerculesCUDARefMut {
ptr: self.ptr,
size: self.size,
_phantom: PhantomData,
}
}
}
#[cfg(feature = "cuda")]
impl CUDABox {
pub fn from_cpu_ref(cpu_ref: HerculesCPURef) -> Self {
......@@ -662,7 +702,7 @@ impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
}
}
impl<'a, T> HerculesMutBox<'a, T>
impl<'a, 'b: 'a, T> HerculesMutBox<'b, T>
where
T: Default + Clone,
{
......@@ -688,7 +728,7 @@ where
let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
// Allocate host memory (if needed)
let cpu_alloc: Allocation<&'a mut [T], Vec<T>> = match self.cpu_alloc.take() {
let cpu_alloc: Allocation<&'b mut [T], Vec<T>> = match self.cpu_alloc.take() {
Allocation::Reference(val) if val.len() == elements => {
Allocation::Reference(val)
}
......@@ -793,7 +833,7 @@ pub trait HerculesMutBoxTo<'a, T> {
fn to(&'a mut self) -> T;
}
impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
impl<'a, 'b: 'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'b, T>
where
T: Default + Clone,
{
......@@ -803,7 +843,7 @@ where
}
#[cfg(feature = "cuda")]
impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
impl<'a, 'b: 'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'b, T>
where
T: Default + Clone,
{
......
extern "C" {
void *___cuda_alloc(size_t size) {
void *ptr = NULL;
cudaError_t res = cudaMalloc(&ptr, size);
if (res != cudaSuccess) {
ptr = NULL;
}
return ptr;
}
#include <stdint.h>
#include <cublas_v2.h>
void ___cuda_dealloc(void *ptr, size_t size) {
(void) size;
cudaFree(ptr);
}
void ___cuda_zero_mem(void *ptr, size_t size) {
cudaMemset(ptr, 0, size);
}
static cublasHandle_t cublas_handle = 0;
void ___copy_cpu_to_cuda(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
}
void ___copy_cuda_to_cpu(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
extern "C" {
void *___cuda_alloc(size_t size) {
void *ptr = NULL;
cudaError_t res = cudaMalloc(&ptr, size);
if (res != cudaSuccess) {
ptr = NULL;
}
void ___copy_cuda_to_cuda(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice);
return ptr;
}
void ___cuda_dealloc(void *ptr, size_t size) {
(void) size;
cudaFree(ptr);
}
void ___cuda_zero_mem(void *ptr, size_t size) {
cudaMemset(ptr, 0, size);
}
void ___copy_cpu_to_cuda(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
}
void ___copy_cuda_to_cpu(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
}
void ___copy_cuda_to_cuda(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice);
}
void ___cublas_sgemm(uint64_t i, uint64_t j, uint64_t k, float *c, float *a, float *b) {
if (!cublas_handle) {
cublasCreate(&cublas_handle);
}
float alf = 1.0;
float beta = 0.0;
cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
k, i, j,
&alf, b, k, a, j,
&beta, c, k);
cudaDeviceSynchronize();
}
}
......@@ -540,25 +540,31 @@ impl CodeGenerator<'_> {
block = after_call_region;
// Read each of the "inout values" and perform the SSA update
let inouts_index = self.builder.builder.create_field_index(1);
let has_inouts = !inouts.is_empty();
// TODO: We should omit unit returns, if we do so the + 1 below is not needed
for (idx, var) in inouts.into_iter().enumerate() {
let index = self.builder.builder.create_field_index(idx);
let index = self.builder.builder.create_field_index(idx + 1);
let mut read = self.builder.allocate_node();
let read_id = read.id();
read.build_read(call_id, vec![inouts_index.clone(), index].into());
read.build_read(call_id, vec![index].into());
self.builder.add_node(read);
ssa.write_variable(var, block, read_id);
}
// Read the "actual return" value and return it
let value_index = self.builder.builder.create_field_index(0);
let mut read = self.builder.allocate_node();
let read_id = read.id();
read.build_read(call_id, vec![value_index].into());
self.builder.add_node(read);
let result = if !has_inouts {
call_id
} else {
let value_index = self.builder.builder.create_field_index(0);
let mut read = self.builder.allocate_node();
let read_id = read.id();
read.build_read(call_id, vec![value_index].into());
self.builder.add_node(read);
read_id
};
(read_id, block)
(result, block)
}
Expr::Intrinsic {
id,
......
......@@ -28,6 +28,7 @@ for "for"
if "if"
inout "inout"
integer "integer"
in "in"
let "let"
match "match"
mod "mod"
......@@ -128,7 +129,7 @@ _ "_"
0x[0-9a-fA-F]+ "HEX_INT"
0b[0-1]+ "BIN_INT"
0o[0-7]+ "OCT_INT"
[0-9]+\.[0-9]*(|e[0-9]+) "FLOAT_LIT"
[0-9]+\.[0-9]+(|e[0-9]+) "FLOAT_LIT"
@[a-zA-Z0-9_]+ "LABEL"
. "UNMATCHED"
......
......@@ -99,13 +99,18 @@ TypeDef -> Result<TyDef, ()>
;
ObjFields -> Result<Vec<ObjField>, ()>
: { Ok(vec![]) }
| ObjFields ObjField { flatten($1, $2) }
: ObjFieldList { Ok($1?.into_iter().collect()) }
;
ObjFieldList -> Result<VecDeque<ObjField>, ()>
: { Ok(VecDeque::new()) }
| ObjField { Ok(VecDeque::from([$1?])) }
| ObjField ',' ObjFieldList { let mut lst = $3?; lst.push_front($1?); Ok(lst) }
| ObjField ';' ObjFieldList { let mut lst = $3?; lst.push_front($1?); Ok(lst) }
;
ObjField -> Result<ObjField, ()>
: PubOption 'ID' ';'
: PubOption 'ID'
{ Ok(ObjField{ span : $span, public : $1?, name : span_of_tok($2)?, typ : None }) }
| PubOption 'ID' ':' Type ';'
| PubOption 'ID' ':' Type
{ Ok(ObjField{ span : $span, public : $1?, name : span_of_tok($2)?, typ : Some($4?) }) }
;
......@@ -287,11 +292,17 @@ Stmt -> Result<Stmt, ()>
| 'match' NonStructExpr Cases
{ Ok(Stmt::MatchStmt{ span : $span, expr : $2?, body : $3? }) }
| 'for' VarBind '=' NonStructExpr 'to' NonStructExpr Stmts
{ Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?, step : None,
body : Box::new($7?) }) }
{ Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?,
inclusive: false, step : None, body : Box::new($7?) }) }
| 'for' VarBind '=' NonStructExpr 'to' NonStructExpr 'by' SignedIntLit Stmts
{ Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?, step : Some($8?),
body : Box::new($9?) }) }
{ Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?,
inclusive: false, step : Some($8?), body : Box::new($9?) }) }
| 'for' VarBind 'in' NonStructExpr '..' NonStructExpr Stmts
{ Ok(Stmt::ForStmt{ span: $span, var: $2?, init: $4?, bound: $6?,
inclusive: false, step: None, body: Box::new($7?) }) }
| 'for' VarBind 'in' NonStructExpr '..' '=' NonStructExpr Stmts
{ Ok(Stmt::ForStmt{ span: $span, var: $2?, init: $4?, bound: $7?,
inclusive: true, step: None, body: Box::new($8?) }) }
| 'while' NonStructExpr Stmts
{ Ok(Stmt::WhileStmt{ span : $span, cond : $2?, body : Box::new($3?) }) }
| 'return' ';'
......@@ -457,12 +468,16 @@ Expr -> Result<Expr, ()>
{ Ok(Expr::IntrinsicExpr{ span : $span, name : $1?, ty_args : Some($5?), args: $8? }) }
;
IdExprs -> Result<Vec<(Id, Expr)>, ()>
: 'ID' '=' Expr { Ok(vec![(span_of_tok($1)?, $3?)]) }
| IdExprsS ',' 'ID' '=' Expr { flatten($1, res_pair(span_of_tok($3), $5)) }
: IdExprList { Ok($1?.into_iter().collect()) }
;
IdExprList -> Result<VecDeque<(Id, Expr)>, ()>
: { Ok(VecDeque::new()) }
| IdExpr { Ok(VecDeque::from([$1?])) }
| IdExpr ',' IdExprList { let mut lst = $3?; lst.push_front($1?); Ok(lst) }
;
IdExprsS -> Result<Vec<(Id, Expr)>, ()>
: 'ID' '=' Expr { Ok(vec![(span_of_tok($1)?, $3?)]) }
| IdExprsS ',' 'ID' '=' Expr { flatten($1, res_pair(span_of_tok($3), $5)) }
IdExpr -> Result<(Id, Expr), ()>
: 'ID' ':' Expr { Ok((span_of_tok($1)?, $3?)) }
| 'ID' '=' Expr { Ok((span_of_tok($1)?, $3?)) }
;
Params -> Result<Vec<(bool, Expr)>, ()>
: { Ok(vec![]) }
......@@ -678,9 +693,9 @@ pub enum Stmt {
AssignStmt { span : Span, lhs : LExpr, assign : AssignOp, assign_span : Span, rhs : Expr },
IfStmt { span : Span, cond : Expr, thn : Box<Stmt>, els : Option<Box<Stmt>> },
MatchStmt { span : Span, expr : Expr, body : Vec<Case> },
// The step records: negative, number, base
ForStmt { span : Span, var : VarBind, init : Expr, bound : Expr, step : Option<(bool, Span, IntBase)>,
body : Box<Stmt> },
// The step records: negative, number, base, inclusive records whether the bound is included in the range
ForStmt { span : Span, var : VarBind, init : Expr, bound : Expr,
inclusive: bool, step : Option<(bool, Span, IntBase)>, body : Box<Stmt> },
WhileStmt { span : Span, cond : Expr, body : Box<Stmt> },
ReturnStmt { span : Span, expr : Option<Expr> },
BreakStmt { span : Span },
......
......@@ -808,8 +808,14 @@ fn analyze_program(
// Compute the proper type accounting for the inouts (which become returns)
let mut inout_types = inouts.iter().map(|e| e.get_type()).collect::<Vec<_>>();
let inout_tuple = types.new_tuple(inout_types);
let pure_return_type = types.new_tuple(vec![return_type, inout_tuple]);
let mut return_types = vec![return_type];
return_types.extend(inout_types);
// TODO: Ideally we would omit unit returns
let pure_return_type = if return_types.len() == 1 {
return_types.pop().unwrap()
} else {
types.new_tuple(return_types)
};
// Finally, we have a properly built environment and we can
// start processing the body
......@@ -1993,6 +1999,7 @@ fn process_stmt(
},
init,
bound,
inclusive,
step,
body,
} => {
......@@ -2124,10 +2131,19 @@ fn process_stmt(
val: bound_val,
};
// The condition of the loop is var < bound, unless the step is negative in which case
// it is var > bound
// There are four cases for the condition that we generate, though it always takes the
// form var OP bound:
// 1. The step is positive and the range is exclusive of the bound, OP = <
// 2. The step is positive and the range is inclusive of the bound, OP = <=
// 3. The step is negative and the range is exclusive of the bound, OP = >
// 4. The step is negative and the range is inclusive of the bound, OP = >=
let condition = Expr::BinaryExp {
op: if step_pos { BinaryOp::Lt } else { BinaryOp::Gt },
op: match (step_pos, inclusive) {
(true, false) => BinaryOp::Lt,
(true, true) => BinaryOp::Le,
(false, false) => BinaryOp::Gt,
(false, true) => BinaryOp::Ge,
},
lhs: Box::new(Expr::Variable {
var: var,
typ: var_type,
......@@ -4809,7 +4825,7 @@ fn process_expr(
};
// Now, process the arguments to ensure they has the type needed by this
// constructor
// function
let mut arg_vals: Vec<Either<Expr, usize>> = vec![];
let mut errors = LinkedList::new();
......@@ -5009,19 +5025,21 @@ fn process_expr(
}
fn generate_return(expr: Expr, inouts: &Vec<Expr>, types: &mut TypeSolver) -> Stmt {
let inout_types = inouts.iter().map(|e| e.get_type()).collect();
let inout_type = types.new_tuple(inout_types);
let inout_types = inouts.iter().map(|e| e.get_type()).collect::<Vec<_>>();
let inout_vals = Expr::Tuple {
vals: inouts.clone(),
typ: inout_type,
};
let mut return_types = vec![expr.get_type()];
return_types.extend(inout_types);
let expr_type = expr.get_type();
let mut return_vals = vec![expr];
return_vals.extend_from_slice(inouts);
let val = Expr::Tuple {
vals: vec![expr, inout_vals],
typ: types.new_tuple(vec![expr_type, inout_type]),
let val = if return_vals.len() == 1 {
return_vals.pop().unwrap()
} else {
Expr::Tuple {
vals: return_vals,
typ: types.new_tuple(return_types),
}
};
Stmt::ReturnStmt { expr: val }
......
......@@ -12,9 +12,12 @@ fixpoint {
fork-coalesce(*);
infer-schedules(*);
dce(*);
rewrite(*);
fixpoint {
simplify-cfg(*);
dce(*);
}
let out = auto-outline(*);
gpu(out.matmul);
ip-sroa(*);
sroa(*);
dce(*);
......
#![feature(concat_idents)]
use std::iter::zip;
use rand::random;
......@@ -13,9 +14,9 @@ fn main() {
const I: usize = 256;
const J: usize = 64;
const K: usize = 128;
let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect();
let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect();
let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
let a: Box<[f32]> = (0..I * J).map(|_| random::<f32>()).collect();
let b: Box<[f32]> = (0..J * K).map(|_| random::<f32>()).collect();
let mut correct_c: Box<[f32]> = (0..I * K).map(|_| 0.0).collect();
for i in 0..I {
for k in 0..K {
for j in 0..J {
......@@ -27,7 +28,8 @@ fn main() {
{
let mut r = runner!(matmul);
let c = r.run(I as u64, J as u64, K as u64, a.to(), b.to()).await;
assert_eq!(c.as_slice::<i32>(), &*correct_c);
let c = c.as_slice::<f32>();
assert_eq!(c, &*correct_c);
}
#[cfg(feature = "cuda")]
{
......@@ -37,9 +39,9 @@ fn main() {
let c = r
.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
.await;
let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
let mut c_cpu: Box<[f32]> = vec![0.0; correct_c.len()].into_boxed_slice();
c.to_cpu_ref(&mut c_cpu);
assert_eq!(&*c_cpu, &*correct_c);
assert!(zip(c_cpu, correct_c).all(|(calc, correct)| (calc - correct).abs() < 0.00001));
}
});
}
......
#[entry]
fn matmul<n : usize, m : usize, l : usize>(a : i32[n, m], b : i32[m, l]) -> i32[n, l] {
let res : i32[n, l];
fn matmul<n : usize, m : usize, l : usize>(a : f32[n, m], b : f32[m, l]) -> f32[n, l] {
let res : f32[n, l];
@outer for i = 0 to n {
@middle for j = 0 to l {
......
# Rodinia Benchmarks
This directory contains several of the benchmarks from the [Rodinia Benchmark Suite](http://www.cs.virginia.edu/rodinia/doku.php) ported into Juno.
The implementations are based on those provided with Rodinia version 3.1.
[package]
name = "juno_backprop"
version = "0.1.0"
authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
edition = "2021"
[[bin]]
name = "juno_backprop"
path = "src/main.rs"
[features]
cuda = ["juno_build/cuda", "hercules_rt/cuda"]
[build-dependencies]
juno_build = { path = "../../../juno_build" }
[dependencies]
juno_build = { path = "../../../juno_build" }
hercules_rt = { path = "../../../hercules_rt" }
async-std = "*"
clap = { version = "*", features = ["derive"] }
with_builtin_macros = "0.1.0"
nom = "*"
rand = "0.9.0"
use juno_build::JunoCompiler;
fn main() {
#[cfg(feature = "cuda")]
JunoCompiler::new()
.file_in_src("backprop.jn")
.unwrap()
.schedule_in_src("gpu.sch")
.unwrap()
.build()
.unwrap();
#[cfg(not(feature = "cuda"))]
JunoCompiler::new()
.file_in_src("backprop.jn")
.unwrap()
.schedule_in_src("cpu.sch")
.unwrap()
.build()
.unwrap();
}
fn squash(x: f32) -> f32 {
// Sigmoid
return 1.0 / (1.0 + exp!(-x));
}
fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] {
let result : f32[m + 1];
result[0] = 1.0;
for j in 1..=m {
let sum = 0.0;
for k in 0..=n {
sum += weights[k, j] * vals[k];
}
result[j] = squash(sum);
}
return result;
}
fn output_error<n: usize>(target: f32[n + 1], actual: f32[n + 1]) -> (f32, f32[n + 1]) {
let errsum = 0.0;
let delta : f32[n + 1];
for j in 1..=n {
let a = actual[j];
let t = target[j];
delta[j] = a * (1.0 - a) * (t - a);
errsum += abs!(delta[j]);
}
return (errsum, delta);
}
fn hidden_error<hidden_n, output_n: usize>(
out_delta: f32[output_n + 1],
hidden_weights: f32[hidden_n + 1, output_n + 1],
hidden_vals: f32[hidden_n + 1],
) -> (f32, f32[hidden_n + 1]) {
let errsum = 0.0;
let delta : f32[hidden_n + 1];
for j in 1..=hidden_n {
let h = hidden_vals[j];
let sum = 0.0;
for k in 1..=output_n {
sum += out_delta[k] * hidden_weights[j, k];
}
delta[j] = h * (1.0 - h) * sum;
errsum += abs!(delta[j]);
}
return (errsum, delta);
}
const ETA : f32 = 0.3;
const MOMENTUM : f32 = 0.3;
fn adjust_weights<n, m: usize>(
delta: f32[m + 1],
vals: f32[n + 1],
weights: f32[n + 1, m + 1],
prev_weights: f32[n + 1, m + 1]
) -> (f32[n + 1, m + 1], f32[n + 1, m + 1]) {
for j in 1..=m {
for k in 0..=n {
let new_dw = ETA * delta[j] * vals[k] + MOMENTUM * prev_weights[k, j];
weights[k, j] += new_dw;
prev_weights[k, j] = new_dw;
}
}
return (weights, prev_weights);
}
#[entry]
fn backprop<input_n, hidden_n, output_n: usize>(
input_vals: f32[input_n + 1],
input_weights: f32[input_n + 1, hidden_n + 1],
hidden_weights: f32[hidden_n + 1, output_n + 1],
target: f32[output_n + 1],
input_prev_weights: f32[input_n + 1, hidden_n + 1],
hidden_prev_weights: f32[hidden_n + 1, output_n + 1],
//) -> (f32, f32,
// f32[input_n + 1, hidden_n + 1], f32[input_n + 1, hidden_n + 1],
// f32[hidden_n + 1, output_n + 1], f32[hidden_n + 1, output_n + 1]) {
) -> (f32, f32, f32) {
let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights);
let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights);
let (out_err, out_delta) = output_error::<output_n>(target, output_vals);
let (hid_err, hid_delta) = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals);
let (hidden_weights, hidden_prev_weights)
= adjust_weights::<hidden_n, output_n>(out_delta, hidden_vals, hidden_weights, hidden_prev_weights);
let (input_weights, input_prev_weights)
= adjust_weights::<input_n, hidden_n>(hid_delta, input_vals, input_weights, input_prev_weights);
return (out_err, hid_err, input_weights[0, 0] + input_prev_weights[0, 0] + hidden_weights[0, 0] + hidden_prev_weights[0, 0]);
//return (input_weights, input_prev_weights, hidden_weights, hidden_prev_weights);
}
gvn(*);
dce(*);
phi-elim(*);
dce(*);
crc(*);
dce(*);
slf(*);
dce(*);
let auto = auto-outline(backprop);
cpu(auto.backprop);
inline(auto.backprop);
inline(auto.backprop);
delete-uncalled(*);
sroa[true](*);
dce(*);
float-collections(*);
reuse-products(*);
dce(*);
gcm(*);
gvn(*);
dce(*);
phi-elim(*);
dce(*);
crc(*);
dce(*);
slf(*);
dce(*);
let auto = auto-outline(backprop);
gpu(auto.backprop);
inline(auto.backprop);
inline(auto.backprop);
delete-uncalled(*);
sroa[true](*);
dce(*);
float-collections(*);
reuse-products(*);
dce(*);
gcm(*);
#![feature(concat_idents)]
juno_build::juno!("backprop");
mod rust_backprop;
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox, HerculesMutBoxTo};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use clap::Parser;
#[derive(Parser)]
#[clap(author, version, about, long_about = None)]
struct BackpropInputs {
layer_size: usize,
}
fn run_backprop(
input_n: u64,
hidden_n: u64,
output_n: u64,
input_vals: &[f32],
input_weights: &[f32],
hidden_weights: &[f32],
target: &[f32],
input_prev_weights: &[f32],
hidden_prev_weights: &[f32],
) -> (f32, f32, Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
let input_vals = HerculesImmBox::from(input_vals);
let target = HerculesImmBox::from(target);
let mut input_weights = HerculesMutBox::from(input_weights.to_vec());
let mut hidden_weights = HerculesMutBox::from(hidden_weights.to_vec());
let mut input_prev_weights = HerculesMutBox::from(input_prev_weights.to_vec());
let mut hidden_prev_weights = HerculesMutBox::from(hidden_prev_weights.to_vec());
let mut runner = runner!(backprop);
let res = HerculesMutBox::from(async_std::task::block_on(async {
runner
.run(
input_n,
hidden_n,
output_n,
input_vals.to(),
input_weights.to(),
hidden_weights.to(),
target.to(),
input_prev_weights.to(),
hidden_prev_weights.to(),
)
.await
}))
.as_slice()
.to_vec();
let out_err = res[0];
let hid_err = res[1];
(
out_err,
hid_err,
input_weights.as_slice().to_vec(),
hidden_weights.as_slice().to_vec(),
input_prev_weights.as_slice().to_vec(),
hidden_prev_weights.as_slice().to_vec(),
)
}
fn compare_float(x: f32, y: f32) -> bool {
(x - y).abs() < 1e-5
}
fn compare_floats(xs: &[f32], ys: &[f32]) -> bool {
xs.len() == ys.len() && xs.iter().zip(ys.iter()).all(|(x, y)| compare_float(*x, *y))
}
fn backprop_harness(args: BackpropInputs) {
let BackpropInputs { layer_size } = args;
let mut rng = StdRng::seed_from_u64(7);
let input_n = layer_size;
let hidden_n = 16;
let output_n = 1;
let mut input_vals = vec![0.0; input_n + 1];
input_vals[0] = 1.0;
// For some reason the bpnn_randomize_row function used on target just sets it to 0.1
let target = vec![0.1; output_n + 1];
let input_weights = (0..(input_n + 1) * (hidden_n + 1))
.map(|_| rng.random::<f32>())
.collect::<Vec<_>>();
let hidden_weights = (0..(hidden_n + 1) * (output_n + 1))
.map(|_| rng.random::<f32>())
.collect::<Vec<_>>();
let input_prev_weights = vec![0.0; (input_n + 1) * (hidden_n + 1)];
let hidden_prev_weights = vec![0.0; (hidden_n + 1) * (output_n + 1)];
let (
juno_out_err,
juno_hid_err,
juno_input_weights,
juno_hidden_weights,
juno_input_prev_weights,
juno_hidden_prev_weights,
) = run_backprop(
input_n as u64,
hidden_n as u64,
output_n as u64,
&input_vals,
&input_weights,
&hidden_weights,
&target,
&input_prev_weights,
&hidden_prev_weights,
);
let (
rust_out_err,
rust_hid_err,
rust_input_weights,
rust_hidden_weights,
rust_input_prev_weights,
rust_hidden_prev_weights,
) = rust_backprop::backprop(
input_n,
hidden_n,
output_n,
&input_vals,
input_weights,
hidden_weights,
&target,
input_prev_weights,
hidden_prev_weights,
);
assert!(compare_float(juno_out_err, rust_out_err));
assert!(compare_float(juno_hid_err, rust_hid_err));
if !compare_floats(&juno_input_weights, &rust_input_weights) {
panic!("Input weights do not match after training");
}
if !compare_floats(&juno_hidden_weights, &rust_hidden_weights) {
panic!("Hidden weights do not match after training");
}
if !compare_floats(&juno_input_prev_weights, &rust_input_prev_weights) {
panic!("Input prev_weights do not match after training");
}
if !compare_floats(&juno_hidden_prev_weights, &rust_hidden_prev_weights) {
panic!("Hidden prev_weights do not match after training");
}
}
fn main() {
let args = BackpropInputs::parse();
backprop_harness(args);
}
#[test]
fn backprop_test() {
backprop_harness(BackpropInputs { layer_size: 65536 });
}
fn layer_forward(n: usize, m: usize, vals: &[f32], weights: &[f32]) -> Vec<f32> {
let mut result = vec![0.0; m + 1];
result[0] = 1.0;
for j in 1..=m {
let mut sum = 0.0;
for k in 0..=n {
sum += weights[k * (m + 1) + j] * vals[k];
}
result[j] = 1.0 / (1.0 + (-sum).exp());
}
result
}
fn output_error(n: usize, target: &[f32], actual: &[f32]) -> (f32, Vec<f32>) {
let mut result = vec![0.0; n + 1];
let mut error = 0.0;
for j in 1..=n {
let o = actual[j];
let t = target[j];
result[j] = o * (1.0 - o) * (t - o);
error += result[j].abs();
}
(error, result)
}
fn hidden_error(
n: usize,
m: usize,
delta: &[f32],
weights: &[f32],
actual: &[f32],
) -> (f32, Vec<f32>) {
let mut result = vec![0.0; n + 1];
let mut error = 0.0;
for j in 1..=n {
let h = actual[j];
let mut sum = 0.0;
for k in 1..=m {
sum += delta[k] * weights[j * (m + 1) + k];
}
result[j] = h * (1.0 - h) * sum;
error += result[j].abs();
}
(error, result)
}
fn adjust_weights(
n: usize,
m: usize,
delta: &[f32],
vals: &[f32],
mut weights: Vec<f32>,
mut prev_weights: Vec<f32>,
) -> (Vec<f32>, Vec<f32>) {
for j in 1..=m {
for k in 0..=n {
let new_dw = (0.3 * delta[j] * vals[k]) + (0.3 * prev_weights[k * (m + 1) + j]);
weights[k * (m + 1) + j] += new_dw;
prev_weights[k * (m + 1) + j] = new_dw;
}
}
(weights, prev_weights)
}
pub fn backprop(
input_n: usize,
hidden_n: usize,
output_n: usize,
input_vals: &[f32],
input_weights: Vec<f32>,
hidden_weights: Vec<f32>,
target: &[f32],
input_prev_weights: Vec<f32>,
hidden_prev_weights: Vec<f32>,
) -> (f32, f32, Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
let hidden_vals = layer_forward(input_n, hidden_n, input_vals, &input_weights);
let output_vals = layer_forward(hidden_n, output_n, &hidden_vals, &hidden_weights);
let (out_err, out_delta) = output_error(output_n, target, &output_vals);
let (hid_err, hid_delta) = hidden_error(
hidden_n,
output_n,
&out_delta,
&hidden_weights,
&hidden_vals,
);
let (hidden_weights, hidden_prev_weights) = adjust_weights(
hidden_n,
output_n,
&out_delta,
&hidden_vals,
hidden_weights,
hidden_prev_weights,
);
let (input_weights, input_prev_weights) = adjust_weights(
input_n,
hidden_n,
&hid_delta,
&input_vals,
input_weights,
input_prev_weights,
);
(
out_err,
hid_err,
input_weights,
hidden_weights,
input_prev_weights,
hidden_prev_weights,
)
}
[package]
name = "juno_bfs"
version = "0.1.0"
authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
edition = "2021"
[[bin]]
name = "juno_bfs"
path = "src/main.rs"
[features]
cuda = ["juno_build/cuda", "hercules_rt/cuda"]
[build-dependencies]
juno_build = { path = "../../../juno_build" }
[dependencies]
juno_build = { path = "../../../juno_build" }
hercules_rt = { path = "../../../hercules_rt" }
async-std = "*"
clap = { version = "*", features = ["derive"] }
with_builtin_macros = "0.1.0"
nom = "*"
use juno_build::JunoCompiler;
fn main() {
#[cfg(feature = "cuda")]
JunoCompiler::new()
.file_in_src("bfs.jn")
.unwrap()
.schedule_in_src("gpu.sch")
.unwrap()
.build()
.unwrap();
#[cfg(not(feature = "cuda"))]
JunoCompiler::new()
.file_in_src("bfs.jn")
.unwrap()
.build()
.unwrap();
}