Compare revisions

Aaron Councilman · Aaron Councilman · 9e99a426 · 9e99a426 · 9e99a426 · 9e99a426
--- a/.gitattributes
+++ b/.gitattributes
+# Highlight Juno source files as if they were Rust source files
+*.jn gitlab-language=rust
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1119,6 +1119,31 @@ dependencies = [
 "with_builtin_macros",
 ]

+[[package]]
+name = "juno_backprop"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "clap",
+ "hercules_rt",
+ "juno_build",
+ "nom 6.2.2",
+ "rand 0.9.0",
+ "with_builtin_macros",
+]
+
+[[package]]
+name = "juno_bfs"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "clap",
+ "hercules_rt",
+ "juno_build",
+ "nom 6.2.2",
+ "with_builtin_macros",
+]
+
 [[package]]
 name = "juno_build"
 version = "0.1.0"
@@ -1150,6 +1175,18 @@ dependencies = [
 "with_builtin_macros",
 ]

+[[package]]
+name = "juno_cfd"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "clap",
+ "hercules_rt",
+ "juno_build",
+ "nom 6.2.2",
+ "with_builtin_macros",
+]
+
 [[package]]
 name = "juno_concat"
 version = "0.1.0"
@@ -1321,6 +1358,18 @@ dependencies = [
 "with_builtin_macros",
 ]

+[[package]]
+name = "juno_srad"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "clap",
+ "hercules_rt",
+ "juno_build",
+ "nom 6.2.2",
+ "with_builtin_macros",
+]
+
 [[package]]
 name = "juno_utils"
 version = "0.1.0"

--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,6 +28,10 @@ members = [
 	"juno_samples/multi_device",
 	"juno_samples/patterns",
 	"juno_samples/products",
+	"juno_samples/rodinia/backprop",
+	"juno_samples/rodinia/bfs",
+	"juno_samples/rodinia/cfd",
+	"juno_samples/rodinia/srad",
 	"juno_samples/schedule_test",
 	"juno_samples/simple3",
 	"juno_scheduler",

--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -964,7 +964,17 @@ fn convert_type(ty: &Type) -> &'static str {

 fn convert_intrinsic(intrinsic: &Intrinsic, ty: &Type) -> String {
    let intrinsic = match intrinsic {
-        Intrinsic::Abs => "abs",
+        Intrinsic::Abs => {
+            if ty.is_float() {
+                "fabs"
+            } else if ty.is_signed() {
+                "abs"
+            } else if ty.is_unsigned() {
+                panic!("llvm doesn't define abs for unsigned integers")
+            } else {
+                panic!()
+            }
+        },
        Intrinsic::ACos => "acos",
        Intrinsic::ASin => "asin",
        Intrinsic::ATan => "atan",

--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -2004,7 +2004,7 @@ extern \"C\" {} {}(",
    fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String {
        let func_name = match intrinsic {
            Intrinsic::Abs => match ty {
-                Type::Float32 => "__fabsf",
+                Type::Float32 => "fabsf",
                Type::Float64 => "__fabs",
                ty if ty.is_signed() => "abs",
                ty if ty.is_unsigned() => "uabs",

--- a/hercules_opt/src/outline.rs
+++ b/hercules_opt/src/outline.rs
@@ -180,6 +180,13 @@ pub fn outline(
    editor.edit(|mut edit| {
        // Step 2: assemble the outlined function.
        let u32_ty = edit.add_type(Type::UnsignedInteger32);
+        let return_types: Box<[_]> = return_idx_to_inside_id
+            .iter()
+            .map(|id| typing[id.idx()])
+            .chain(callee_succ_return_idx.map(|_| u32_ty))
+            .collect();
+        let single_return = return_types.len() == 1;
+
        let mut outlined = Function {
            name: format!(
                "{}_{}",
@@ -191,13 +198,11 @@ pub fn outline(
                .map(|id| typing[id.idx()])
                .chain(callee_pred_param_idx.map(|_| u32_ty))
                .collect(),
-            return_type: edit.add_type(Type::Product(
-                return_idx_to_inside_id
-                    .iter()
-                    .map(|id| typing[id.idx()])
-                    .chain(callee_succ_return_idx.map(|_| u32_ty))
-                    .collect(),
-            )),
+            return_type: if single_return {
+                return_types[0]
+            } else {
+                edit.add_type(Type::Product(return_types))
+            },
            num_dynamic_constants: edit.get_num_dynamic_constant_params(),
            entry: false,
            nodes: vec![],
@@ -393,18 +398,24 @@ pub fn outline(
                data_ids.push(cons_node_id);
            }

-            // Build the return product.
-            let mut construct_id = NodeID::new(outlined.nodes.len());
-            outlined.nodes.push(Node::Constant { id: cons_id });
-            for (idx, data) in data_ids.into_iter().enumerate() {
-                let write = Node::Write {
-                    collect: construct_id,
-                    data: data,
-                    indices: Box::new([Index::Field(idx)]),
-                };
-                construct_id = NodeID::new(outlined.nodes.len());
-                outlined.nodes.push(write);
-            }
+            // Build the return value
+            let construct_id = if single_return {
+                assert!(data_ids.len() == 1);
+                data_ids.pop().unwrap()
+            } else {
+                let mut construct_id = NodeID::new(outlined.nodes.len());
+                outlined.nodes.push(Node::Constant { id: cons_id });
+                for (idx, data) in data_ids.into_iter().enumerate() {
+                    let write = Node::Write {
+                        collect: construct_id,
+                        data: data,
+                        indices: Box::new([Index::Field(idx)]),
+                    };
+                    construct_id = NodeID::new(outlined.nodes.len());
+                    outlined.nodes.push(write);
+                }
+                construct_id
+            };

            // Return the constructed value (a product only when there are multiple returns).
            outlined.nodes.push(Node::Return {
@@ -505,16 +516,20 @@ pub fn outline(
        };

        // Create the read nodes from the call node to get the outputs of the
-        // outlined function.
-        let output_reads: Vec<_> = (0..return_idx_to_inside_id.len())
-            .map(|idx| {
-                let read = Node::Read {
-                    collect: call_id,
-                    indices: Box::new([Index::Field(idx)]),
-                };
-                edit.add_node(read)
-            })
-            .collect();
+        // outlined function (if there are multiple returned values)
+        let output_reads: Vec<_> = if single_return {
+            vec![call_id]
+        } else {
+            (0..return_idx_to_inside_id.len())
+                .map(|idx| {
+                    let read = Node::Read {
+                        collect: call_id,
+                        indices: Box::new([Index::Field(idx)]),
+                    };
+                    edit.add_node(read)
+                })
+                .collect()
+        };
        let indicator_read = callee_succ_return_idx.map(|idx| {
            let read = Node::Read {
                collect: call_id,

--- a/hercules_opt/src/sroa.rs
+++ b/hercules_opt/src/sroa.rs
@@ -38,14 +38,33 @@ use crate::*;
 *
 * - Write: the write node writes primitive fields in product values - these get
 *   replaced by a direct def of the field value
+ *
+ * The allow_sroa_arrays parameter controls whether products that contain arrays
+ * will be broken into pieces. This option is useful because breaking these
+ * products up can be expensive if it requires destructuring and reconstructing
+ * the product at any point.
+ *
+ * TODO: Handle partial selections (i.e. immutable nodes). This will involve
+ * actually tracking each source and use of a product and verifying that all of
+ * the nodes involved are mutable.
 */
-pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types: &Vec<TypeID>) {
+pub fn sroa(
+    editor: &mut FunctionEditor,
+    reverse_postorder: &Vec<NodeID>,
+    types: &Vec<TypeID>,
+    allow_sroa_arrays: bool,
+) {
    let mut types: HashMap<NodeID, TypeID> = types
        .iter()
        .enumerate()
        .map(|(i, t)| (NodeID::new(i), *t))
        .collect();

+    let can_sroa_type = |editor: &FunctionEditor, typ: TypeID| {
+        editor.get_type(typ).is_product()
+            && (allow_sroa_arrays || !type_contains_array(editor, typ))
+    };
+
    // This map stores a map from NodeID to an index tree which can be used to lookup the NodeID
    // that contains the corresponding fields of the original value
    let mut field_map: HashMap<NodeID, IndexTree<NodeID>> = HashMap::new();
@@ -67,7 +86,7 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:
                second: _,
                third: _,
                op: TernaryOperator::Select,
-            } if editor.get_type(types[&node]).is_product() => product_nodes.push(*node),
+            } if can_sroa_type(editor, types[&node]) => product_nodes.push(*node),

            Node::Write {
                collect,
@@ -83,19 +102,23 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:
                    let mut fields = vec![];
                    let mut remainder = vec![];

-                    let mut indices = indices.iter();
-                    while let Some(idx) = indices.next() {
-                        if idx.is_field() {
-                            fields.push(idx.clone());
-                        } else {
-                            remainder.push(idx.clone());
-                            remainder.extend(indices.cloned());
-                            break;
+                    if can_sroa_type(editor, types[&node]) {
+                        let mut indices = indices.iter();
+                        while let Some(idx) = indices.next() {
+                            if idx.is_field() {
+                                fields.push(idx.clone());
+                            } else {
+                                remainder.push(idx.clone());
+                                remainder.extend(indices.cloned());
+                                break;
+                            }
                        }
+                    } else {
+                        remainder.extend_from_slice(indices);
                    }

                    if fields.is_empty() {
-                        if editor.get_type(types[&data]).is_product() {
+                        if can_sroa_type(editor, types[&data]) {
                            (None, Some((*node, collect, remainder)))
                        } else {
                            (None, None)
@@ -205,9 +228,13 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:
                // that information to the node map for the rest of SROA (this produces some reads
                // that mix types of indices, since we only read leaves but that's okay since those
                // reads are not handled by SROA)
-                let indices = indices
-                    .chunk_by(|i, j| i.is_field() && j.is_field())
-                    .collect::<Vec<_>>();
+                let indices = if can_sroa_type(editor, types[collect]) {
+                    indices
+                        .chunk_by(|i, j| i.is_field() == j.is_field())
+                        .collect::<Vec<_>>()
+                } else {
+                    vec![indices.as_ref()]
+                };

                let (field_reads, non_fields_produce_prod) = {
                    if indices.len() == 0 {
@@ -217,9 +244,9 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:
                    } else if indices.len() == 1 {
                        // If once we perform chunking there's only one set of indices, we can just
                        // use the original node
-                        if indices[0][0].is_field() {
+                        if can_sroa_type(editor, types[collect]) {
                            (vec![*node], vec![])
-                        } else if editor.get_type(types[node]).is_product() {
+                        } else if can_sroa_type(editor, types[node]) {
                            (vec![], vec![*node])
                        } else {
                            (vec![], vec![])
@@ -278,7 +305,7 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:

            // We add all calls to the call/return list and check their arguments later
            Node::Call { .. } => call_return_nodes.push(*node),
-            Node::Return { control: _, data } if editor.get_type(types[&data]).is_product() => {
+            Node::Return { control: _, data } if can_sroa_type(editor, types[&data]) => {
                call_return_nodes.push(*node)
            }

@@ -296,7 +323,7 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:
    for node in call_return_nodes {
        match &editor.func().nodes[node.idx()] {
            Node::Return { control, data } => {
-                assert!(editor.get_type(types[&data]).is_product());
+                assert!(can_sroa_type(editor, types[&data]));
                let control = *control;
                let new_data = reconstruct_product(editor, types[&data], *data, &mut product_nodes);
                editor.edit(|mut edit| {
@@ -319,8 +346,8 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:
                let dynamic_constants = dynamic_constants.clone();
                let args = args.clone();

-                // If the call returns a product, we generate reads for each field
-                let fields = if editor.get_type(types[&node]).is_product() {
+                // If the call returns a product that we can sroa, we generate reads for each field
+                let fields = if can_sroa_type(editor, types[&node]) {
                    Some(generate_reads(editor, types[&node], node))
                } else {
                    None
@@ -328,7 +355,7 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:

                let mut new_args = vec![];
                for arg in args {
-                    if editor.get_type(types[&arg]).is_product() {
+                    if can_sroa_type(editor, types[&arg]) {
                        new_args.push(reconstruct_product(
                            editor,
                            types[&arg],
@@ -489,7 +516,7 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:
                    indices,
                } => {
                    if let Some(index_map) = field_map.get(collect) {
-                        if editor.get_type(types[&data]).is_product() {
+                        if can_sroa_type(editor, types[&data]) {
                            if let Some(data_idx) = field_map.get(data) {
                                field_map.insert(
                                    node,
@@ -698,6 +725,16 @@ pub fn sroa(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, types:
    });
 }

+/// Reports whether the type `typ` is an array, or is a product or summation
+/// type with an array somewhere among its (transitively nested) members.
+fn type_contains_array(editor: &FunctionEditor, typ: TypeID) -> bool {
+    let ty = editor.get_type(typ);
+    // Only products and summations have member types to recurse into; arrays
+    // answer immediately and every other type cannot contain an array.
+    let members = match &*ty {
+        Type::Array(_, _) => return true,
+        Type::Product(members) | Type::Summation(members) => members,
+        _ => return false,
+    };
+    members
+        .iter()
+        .copied()
+        .any(|member| type_contains_array(editor, member))
+}
+
 // An index tree is used to store results at many index lists
 #[derive(Clone, Debug)]
 pub enum IndexTree<T> {

--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -284,14 +284,6 @@ impl<'a> HerculesCUDARefMut<'a> {
        }
    }

-    pub fn dup(&'a mut self) -> Self {
-        HerculesCUDARefMut {
-            ptr: self.ptr,
-            size: self.size,
-            _phantom: PhantomData,
-        }
-    }
-
    pub unsafe fn __ptr(&self) -> *mut u8 {
        self.ptr.as_ptr()
    }
@@ -309,6 +301,17 @@ impl<'a> HerculesCUDARefMut<'a> {
    }
 }

+#[cfg(feature = "cuda")]
+impl<'a, 'b: 'a> HerculesCUDARefMut<'b> {
+    /// Creates another `HerculesCUDARefMut` with the same pointer and size as
+    /// `self`. The result carries lifetime `'a`, the lifetime of the mutable
+    /// borrow of `self`, which the `'b: 'a` bound allows to be shorter than
+    /// the lifetime `'b` of the reference being duplicated.
+    pub fn dup(&'a mut self) -> HerculesCUDARefMut<'a> {
+        HerculesCUDARefMut {
+            ptr: self.ptr,
+            size: self.size,
+            _phantom: PhantomData,
+        }
+    }
+}
+
 #[cfg(feature = "cuda")]
 impl CUDABox {
    pub fn from_cpu_ref(cpu_ref: HerculesCPURef) -> Self {
@@ -662,7 +665,7 @@ impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
    }
 }

-impl<'a, T> HerculesMutBox<'a, T>
+impl<'a, 'b: 'a, T> HerculesMutBox<'b, T>
 where
    T: Default + Clone,
 {
@@ -688,7 +691,7 @@ where
                let elements = unsafe { cuda_ref.__size() / size_of::<T>() };

                // Allocate host memory (if needed)
-                let cpu_alloc: Allocation<&'a mut [T], Vec<T>> = match self.cpu_alloc.take() {
+                let cpu_alloc: Allocation<&'b mut [T], Vec<T>> = match self.cpu_alloc.take() {
                    Allocation::Reference(val) if val.len() == elements => {
                        Allocation::Reference(val)
                    }
@@ -793,7 +796,7 @@ pub trait HerculesMutBoxTo<'a, T> {
    fn to(&'a mut self) -> T;
 }

-impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
+impl<'a, 'b: 'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'b, T>
 where
    T: Default + Clone,
 {
@@ -803,7 +806,7 @@ where
 }

 #[cfg(feature = "cuda")]
-impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
+impl<'a, 'b: 'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'b, T>
 where
    T: Default + Clone,
 {

--- a/juno_frontend/src/codegen.rs
+++ b/juno_frontend/src/codegen.rs
@@ -540,25 +540,31 @@ impl CodeGenerator<'_> {
                block = after_call_region;

                // Read each of the "inout values" and perform the SSA update
-                let inouts_index = self.builder.builder.create_field_index(1);
+                let has_inouts = !inouts.is_empty();
+                // TODO: We should omit unit returns; once we do, the + 1 below is no longer needed
                for (idx, var) in inouts.into_iter().enumerate() {
-                    let index = self.builder.builder.create_field_index(idx);
+                    let index = self.builder.builder.create_field_index(idx + 1);
                    let mut read = self.builder.allocate_node();
                    let read_id = read.id();
-                    read.build_read(call_id, vec![inouts_index.clone(), index].into());
+                    read.build_read(call_id, vec![index].into());
                    self.builder.add_node(read);

                    ssa.write_variable(var, block, read_id);
                }

                // Read the "actual return" value and return it
-                let value_index = self.builder.builder.create_field_index(0);
-                let mut read = self.builder.allocate_node();
-                let read_id = read.id();
-                read.build_read(call_id, vec![value_index].into());
-                self.builder.add_node(read);
+                let result = if !has_inouts {
+                    call_id
+                } else {
+                    let value_index = self.builder.builder.create_field_index(0);
+                    let mut read = self.builder.allocate_node();
+                    let read_id = read.id();
+                    read.build_read(call_id, vec![value_index].into());
+                    self.builder.add_node(read);
+                    read_id
+                };

-                (read_id, block)
+                (result, block)
            }
            Expr::Intrinsic {
                id,

--- a/juno_frontend/src/lang.l
+++ b/juno_frontend/src/lang.l
@@ -28,6 +28,7 @@ for      "for"
 if       "if"
 inout    "inout"
 integer  "integer"
+in       "in"
 let      "let"
 match    "match"
 mod      "mod"
@@ -128,7 +129,7 @@ _        "_"
 0x[0-9a-fA-F]+            "HEX_INT"
 0b[0-1]+                  "BIN_INT"
 0o[0-7]+                  "OCT_INT"
-[0-9]+\.[0-9]*(|e[0-9]+)  "FLOAT_LIT"
+[0-9]+\.[0-9]+(|e[0-9]+)  "FLOAT_LIT"
 @[a-zA-Z0-9_]+            "LABEL"

 . "UNMATCHED"

--- a/juno_frontend/src/lang.y
+++ b/juno_frontend/src/lang.y
@@ -99,13 +99,18 @@ TypeDef -> Result<TyDef, ()>
  ;

 ObjFields -> Result<Vec<ObjField>, ()>
-  :                     { Ok(vec![]) }
-  | ObjFields ObjField  { flatten($1, $2) }
+  : ObjFieldList { Ok($1?.into_iter().collect()) }
+  ;
+ObjFieldList -> Result<VecDeque<ObjField>, ()>
+  :                           { Ok(VecDeque::new()) }
+  | ObjField                  { Ok(VecDeque::from([$1?])) }
+  | ObjField ',' ObjFieldList { let mut lst = $3?; lst.push_front($1?); Ok(lst) }
+  | ObjField ';' ObjFieldList { let mut lst = $3?; lst.push_front($1?); Ok(lst) }
  ;
 ObjField -> Result<ObjField, ()>
-  : PubOption 'ID' ';'
+  : PubOption 'ID'
        { Ok(ObjField{ span : $span, public : $1?, name : span_of_tok($2)?, typ : None }) }
-  | PubOption 'ID' ':' Type ';'
+  | PubOption 'ID' ':' Type
        { Ok(ObjField{ span : $span, public : $1?, name : span_of_tok($2)?, typ : Some($4?) }) }
  ;

@@ -287,11 +292,17 @@ Stmt -> Result<Stmt, ()>
  | 'match' NonStructExpr Cases
      { Ok(Stmt::MatchStmt{ span : $span, expr : $2?, body : $3? }) }
  | 'for' VarBind '=' NonStructExpr 'to' NonStructExpr Stmts
-      { Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?, step : None,
-                          body : Box::new($7?) }) }
+      { Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?,
+                          inclusive: false, step : None, body : Box::new($7?) }) }
  | 'for' VarBind '=' NonStructExpr 'to' NonStructExpr 'by' SignedIntLit Stmts
-      { Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?, step : Some($8?),
-                          body : Box::new($9?) }) }
+      { Ok(Stmt::ForStmt{ span : $span, var : $2?, init : $4?, bound : $6?,
+                          inclusive: false, step : Some($8?), body : Box::new($9?) }) }
+  | 'for' VarBind 'in' NonStructExpr '..' NonStructExpr Stmts
+      { Ok(Stmt::ForStmt{ span: $span, var: $2?, init: $4?, bound: $6?,
+                          inclusive: false, step: None, body: Box::new($7?) }) }
+  | 'for' VarBind 'in' NonStructExpr '..' '=' NonStructExpr Stmts
+      { Ok(Stmt::ForStmt{ span: $span, var: $2?, init: $4?, bound: $7?,
+                          inclusive: true, step: None, body: Box::new($8?) }) }
  | 'while' NonStructExpr Stmts
      { Ok(Stmt::WhileStmt{ span : $span, cond : $2?, body : Box::new($3?) }) }
  | 'return' ';'
@@ -457,12 +468,16 @@ Expr -> Result<Expr, ()>
      { Ok(Expr::IntrinsicExpr{ span : $span, name : $1?, ty_args : Some($5?), args: $8? }) }
  ;
 IdExprs -> Result<Vec<(Id, Expr)>, ()>
-  : 'ID' '=' Expr               { Ok(vec![(span_of_tok($1)?, $3?)]) }
-  | IdExprsS ',' 'ID' '=' Expr  { flatten($1, res_pair(span_of_tok($3), $5)) }
+  : IdExprList  { Ok($1?.into_iter().collect()) }
+  ;
+IdExprList -> Result<VecDeque<(Id, Expr)>, ()>
+  :                       { Ok(VecDeque::new()) }
+  | IdExpr                { Ok(VecDeque::from([$1?])) }
+  | IdExpr ',' IdExprList { let mut lst = $3?; lst.push_front($1?); Ok(lst) }
  ;
-IdExprsS -> Result<Vec<(Id, Expr)>, ()>
-  : 'ID' '=' Expr               { Ok(vec![(span_of_tok($1)?, $3?)]) }
-  | IdExprsS ',' 'ID' '=' Expr  { flatten($1, res_pair(span_of_tok($3), $5)) }
+IdExpr -> Result<(Id, Expr), ()>
+  : 'ID' ':' Expr               { Ok((span_of_tok($1)?, $3?)) }
+  | 'ID' '=' Expr               { Ok((span_of_tok($1)?, $3?)) }
  ;
 Params -> Result<Vec<(bool, Expr)>, ()>
  :                       { Ok(vec![]) }
@@ -678,9 +693,9 @@ pub enum Stmt {
  AssignStmt { span : Span, lhs : LExpr, assign : AssignOp, assign_span : Span, rhs : Expr },
  IfStmt     { span : Span, cond : Expr, thn : Box<Stmt>, els : Option<Box<Stmt>> },
  MatchStmt  { span : Span, expr : Expr, body : Vec<Case> },
-  // The step records: negative, number, base
-  ForStmt    { span : Span, var : VarBind, init : Expr, bound : Expr, step : Option<(bool, Span, IntBase)>,
-               body : Box<Stmt> },
+  // The step records: negative, number, base. The inclusive flag records whether the bound is included in the range.
+  ForStmt    { span : Span, var : VarBind, init : Expr, bound : Expr,
+               inclusive: bool, step : Option<(bool, Span, IntBase)>, body : Box<Stmt> },
  WhileStmt  { span : Span, cond : Expr, body : Box<Stmt> },
  ReturnStmt { span : Span, expr : Option<Expr> },
  BreakStmt  { span : Span },

--- a/juno_frontend/src/semant.rs
+++ b/juno_frontend/src/semant.rs
@@ -808,8 +808,14 @@ fn analyze_program(
                // Compute the proper type accounting for the inouts (which become returns)
                let mut inout_types = inouts.iter().map(|e| e.get_type()).collect::<Vec<_>>();

-                let inout_tuple = types.new_tuple(inout_types);
-                let pure_return_type = types.new_tuple(vec![return_type, inout_tuple]);
+                let mut return_types = vec![return_type];
+                return_types.extend(inout_types);
+                // TODO: Ideally we would omit unit returns
+                let pure_return_type = if return_types.len() == 1 {
+                    return_types.pop().unwrap()
+                } else {
+                    types.new_tuple(return_types)
+                };

                // Finally, we have a properly built environment and we can
                // start processing the body
@@ -1993,6 +1999,7 @@ fn process_stmt(
                },
            init,
            bound,
+            inclusive,
            step,
            body,
        } => {
@@ -2124,10 +2131,19 @@ fn process_stmt(
                val: bound_val,
            };

-            // The condition of the loop is var < bound, unless the step is negative in which case
-            // it is var > bound
+            // There are four cases for the condition that we generate, though it always takes the
+            // form var OP bound:
+            // 1. The step is positive and the range is exclusive of the bound, OP = <
+            // 2. The step is positive and the range is inclusive of the bound, OP = <=
+            // 3. The step is negative and the range is exclusive of the bound, OP = >
+            // 4. The step is negative and the range is inclusive of the bound, OP = >=
            let condition = Expr::BinaryExp {
-                op: if step_pos { BinaryOp::Lt } else { BinaryOp::Gt },
+                op: match (step_pos, inclusive) {
+                    (true, false) => BinaryOp::Lt,
+                    (true, true) => BinaryOp::Le,
+                    (false, false) => BinaryOp::Gt,
+                    (false, true) => BinaryOp::Ge,
+                },
                lhs: Box::new(Expr::Variable {
                    var: var,
                    typ: var_type,
@@ -4809,7 +4825,7 @@ fn process_expr(
                    };

                    // Now, process the arguments to ensure they have the type needed by this
-                    // constructor
+                    // function
                    let mut arg_vals: Vec<Either<Expr, usize>> = vec![];
                    let mut errors = LinkedList::new();

@@ -5009,19 +5025,21 @@ fn process_expr(
 }

 fn generate_return(expr: Expr, inouts: &Vec<Expr>, types: &mut TypeSolver) -> Stmt {
-    let inout_types = inouts.iter().map(|e| e.get_type()).collect();
-    let inout_type = types.new_tuple(inout_types);
+    let inout_types = inouts.iter().map(|e| e.get_type()).collect::<Vec<_>>();

-    let inout_vals = Expr::Tuple {
-        vals: inouts.clone(),
-        typ: inout_type,
-    };
+    let mut return_types = vec![expr.get_type()];
+    return_types.extend(inout_types);

-    let expr_type = expr.get_type();
+    let mut return_vals = vec![expr];
+    return_vals.extend_from_slice(inouts);

-    let val = Expr::Tuple {
-        vals: vec![expr, inout_vals],
-        typ: types.new_tuple(vec![expr_type, inout_type]),
+    let val = if return_vals.len() == 1 {
+        return_vals.pop().unwrap()
+    } else {
+        Expr::Tuple {
+            vals: return_vals,
+            typ: types.new_tuple(return_types),
+        }
    };

    Stmt::ReturnStmt { expr: val }

--- a/juno_samples/rodinia/README.md
+++ b/juno_samples/rodinia/README.md
+# Rodinia Benchmarks
+This directory contains several of the benchmarks from the [Rodinia Benchmark Suite](http://www.cs.virginia.edu/rodinia/doku.php) ported to Juno.
+The implementations are based on those provided with Rodinia version 3.1.
--- a/juno_samples/rodinia/backprop/Cargo.toml
+++ b/juno_samples/rodinia/backprop/Cargo.toml
+[package]
+name = "juno_backprop"
+version = "0.1.0"
+authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
+edition = "2021"
+
+[[bin]]
+name = "juno_backprop"
+path = "src/main.rs"
+
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
+[build-dependencies]
+juno_build = { path = "../../../juno_build" }
+
+[dependencies]
+juno_build = { path = "../../../juno_build" }
+hercules_rt = { path = "../../../hercules_rt" }
+async-std = "*"
+clap = { version = "*", features = ["derive"] }
+with_builtin_macros = "0.1.0"
+nom = "*"
+rand = "0.9.0"
--- a/juno_samples/rodinia/backprop/build.rs
+++ b/juno_samples/rodinia/backprop/build.rs
+use juno_build::JunoCompiler;
+
+/// Build script: builds `backprop.jn` with the schedule matching the enabled backend.
+fn main() {
+    // The `cuda` feature selects the GPU schedule; every other build uses the CPU one.
+    let schedule = if cfg!(feature = "cuda") {
+        "gpu.sch"
+    } else {
+        "cpu.sch"
+    };
+
+    JunoCompiler::new()
+        .file_in_src("backprop.jn")
+        .unwrap()
+        .schedule_in_src(schedule)
+        .unwrap()
+        .build()
+        .unwrap();
+}
--- a/juno_samples/rodinia/backprop/src/backprop.jn
+++ b/juno_samples/rodinia/backprop/src/backprop.jn
+// Activation function: returns 1 / (1 + exp(-x)).
+fn squash(x: f32) -> f32 {
+  // Sigmoid
+  return 1.0 / (1.0 + exp!(-x));
+}
+
+// Forward pass through one layer.
+// For each j in 1..=m, result[j] = squash(sum over k in 0..=n of weights[k, j] * vals[k]).
+// result[0] is fixed to 1.0 (presumably the bias unit for the next layer — confirm).
+fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] {
+  let result : f32[m + 1];
+  result[0] = 1.0;
+
+  for j in 1..=m {
+    // Weighted sum of every incoming value, including index 0.
+    let sum = 0.0;
+    for k in 0..=n {
+      sum += weights[k, j] * vals[k];
+    }
+    result[j] = squash(sum);
+  }
+
+  return result;
+}
+
+// Output-layer error.
+// For each j in 1..=n, delta[j] = a * (1 - a) * (t - a) with a = actual[j], t = target[j].
+// Returns the sum of |delta[j]| together with the delta array; delta[0] is never written here.
+fn output_error<n: usize>(target: f32[n + 1], actual: f32[n + 1]) -> (f32, f32[n + 1]) {
+  let errsum = 0.0;
+  let delta : f32[n + 1];
+
+  for j in 1..=n {
+    let a = actual[j];
+    let t = target[j];
+    delta[j] = a * (1.0 - a) * (t - a);
+    errsum += abs!(delta[j]);
+  }
+
+  return (errsum, delta);
+}
+
+// Hidden-layer error.
+// For each j in 1..=hidden_n, delta[j] = h * (1 - h) * sum over k in 1..=output_n of
+// out_delta[k] * hidden_weights[j, k], with h = hidden_vals[j].
+// Returns the sum of |delta[j]| together with the delta array; delta[0] is never written here.
+fn hidden_error<hidden_n, output_n: usize>(
+  out_delta: f32[output_n + 1],
+  hidden_weights: f32[hidden_n + 1, output_n + 1],
+  hidden_vals: f32[hidden_n + 1],
+) -> (f32, f32[hidden_n + 1]) {
+  let errsum = 0.0;
+  let delta : f32[hidden_n + 1];
+
+  for j in 1..=hidden_n {
+    let h = hidden_vals[j];
+
+    // Output deltas weighted by the connections leaving hidden unit j (index 0 is skipped).
+    let sum = 0.0;
+    for k in 1..=output_n {
+      sum += out_delta[k] * hidden_weights[j, k];
+    }
+
+    delta[j] = h * (1.0 - h) * sum;
+    errsum += abs!(delta[j]);
+  }
+
+  return (errsum, delta);
+}
+
+// Coefficients used by adjust_weights: ETA scales delta[j] * vals[k] and MOMENTUM scales the
+// previous weight change.
+const ETA : f32 = 0.3;
+const MOMENTUM : f32 = 0.3;
+
+// Weight update for one layer.
+// For each j in 1..=m and k in 0..=n, the change is
+//   new_dw = ETA * delta[j] * vals[k] + MOMENTUM * prev_weights[k, j]
+// which is added to weights[k, j] and stored in prev_weights[k, j].
+// Column 0 of both matrices is left untouched. Returns (weights, prev_weights).
+fn adjust_weights<n, m: usize>(
+  delta: f32[m + 1],
+  vals: f32[n + 1],
+  weights: f32[n + 1, m + 1],
+  prev_weights: f32[n + 1, m + 1]
+) -> (f32[n + 1, m + 1], f32[n + 1, m + 1]) {
+  for j in 1..=m {
+    for k in 0..=n {
+      let new_dw = ETA * delta[j] * vals[k] + MOMENTUM * prev_weights[k, j];
+      weights[k, j] += new_dw;
+      prev_weights[k, j] = new_dw;
+    }
+  }
+
+  return (weights, prev_weights);
+}
+
+// Entry point: one training step on a network with one hidden layer.
+// Steps: forward pass through both layers, output and hidden error computation, then weight
+// adjustment for the hidden->output weights followed by the input->hidden weights.
+// Returns (output error sum, hidden error sum, sum of element [0, 0] of the four updated
+// weight matrices).
+#[entry]
+fn backprop<input_n, hidden_n, output_n: usize>(
+  input_vals: f32[input_n + 1],
+  input_weights: f32[input_n + 1, hidden_n + 1],
+  hidden_weights: f32[hidden_n + 1, output_n + 1],
+  target: f32[output_n + 1],
+  input_prev_weights: f32[input_n + 1, hidden_n + 1],
+  hidden_prev_weights: f32[hidden_n + 1, output_n + 1],
+// NOTE(review): the commented-out signature below returns the four weight matrices directly;
+// the scalar third return value looks like a stand-in for that — confirm why it is needed.
+//) -> (f32, f32,
+//      f32[input_n + 1, hidden_n + 1], f32[input_n + 1, hidden_n + 1],
+//      f32[hidden_n + 1, output_n + 1], f32[hidden_n + 1, output_n + 1]) {
+) -> (f32, f32, f32) {
+  let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights);
+  let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights);
+
+  let (out_err, out_delta) = output_error::<output_n>(target, output_vals);
+  // The hidden error is computed from hidden_weights before they are adjusted below.
+  let (hid_err, hid_delta) = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals);
+
+  let (hidden_weights, hidden_prev_weights)
+    = adjust_weights::<hidden_n, output_n>(out_delta, hidden_vals, hidden_weights, hidden_prev_weights);
+  let (input_weights, input_prev_weights)
+    = adjust_weights::<input_n, hidden_n>(hid_delta, input_vals, input_weights, input_prev_weights);
+
+  return (out_err, hid_err, input_weights[0, 0] + input_prev_weights[0, 0] + hidden_weights[0, 0] + hidden_prev_weights[0, 0]);
+  //return (input_weights, input_prev_weights, hidden_weights, hidden_prev_weights);
+}
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
+gvn(*);
+dce(*);
+phi-elim(*);
+dce(*);
+crc(*);
+dce(*);
+slf(*);
+dce(*);
+
+let auto = auto-outline(backprop);
+cpu(auto.backprop);
+
+inline(auto.backprop);
+inline(auto.backprop);
+delete-uncalled(*);
+
+sroa[true](*);
+dce(*);
+float-collections(*);
+reuse-products(*);
+dce(*);
+
+gcm(*);
+
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
+gvn(*);
+dce(*);
+phi-elim(*);
+dce(*);
+crc(*);
+dce(*);
+slf(*);
+dce(*);
+
+let auto = auto-outline(backprop);
+gpu(auto.backprop);
+
+inline(auto.backprop);
+inline(auto.backprop);
+delete-uncalled(*);
+
+sroa[true](*);
+dce(*);
+float-collections(*);
+reuse-products(*);
+dce(*);
+
+gcm(*);
+
--- a/juno_samples/rodinia/backprop/src/main.rs
+++ b/juno_samples/rodinia/backprop/src/main.rs
+#![feature(concat_idents)]
+
+juno_build::juno!("backprop");
+
+mod rust_backprop;
+
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox, HerculesMutBoxTo};
+
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+
+use clap::Parser;
+
+// Command-line arguments of the benchmark, parsed by clap.
+// (Plain `//` comments are used on purpose: clap turns `///` doc comments into help text.)
+#[derive(Parser)]
+#[clap(author, version, about, long_about = None)]
+struct BackpropInputs {
+    // Number of input units; the harness uses it as `input_n`.
+    layer_size: usize,
+}
+
+/// Runs the Juno `backprop` entry point once.
+///
+/// Returns `(out_err, hid_err, input_weights, hidden_weights, input_prev_weights,
+/// hidden_prev_weights)`, where the four vectors are copies of the weight buffers read back
+/// after the call.
+fn run_backprop(
+    input_n: u64,
+    hidden_n: u64,
+    output_n: u64,
+    input_vals: &[f32],
+    input_weights: &[f32],
+    hidden_weights: &[f32],
+    target: &[f32],
+    input_prev_weights: &[f32],
+    hidden_prev_weights: &[f32],
+) -> (f32, f32, Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
+    // Read-only inputs are wrapped as immutable boxes.
+    let input_vals = HerculesImmBox::from(input_vals);
+    let target = HerculesImmBox::from(target);
+
+    // The weight buffers are copied first so the caller's slices are left untouched.
+    let mut input_weights = HerculesMutBox::from(input_weights.to_vec());
+    let mut hidden_weights = HerculesMutBox::from(hidden_weights.to_vec());
+    let mut input_prev_weights = HerculesMutBox::from(input_prev_weights.to_vec());
+    let mut hidden_prev_weights = HerculesMutBox::from(hidden_prev_weights.to_vec());
+
+    let mut runner = runner!(backprop);
+    // NOTE(review): this reads the entry point's `(f32, f32, f32)` result back as a flat f32
+    // slice — confirm against hercules_rt that this layout is guaranteed.
+    let res = HerculesMutBox::from(async_std::task::block_on(async {
+        runner
+            .run(
+                input_n,
+                hidden_n,
+                output_n,
+                input_vals.to(),
+                input_weights.to(),
+                hidden_weights.to(),
+                target.to(),
+                input_prev_weights.to(),
+                hidden_prev_weights.to(),
+            )
+            .await
+    }))
+    .as_slice()
+    .to_vec();
+    // Elements 0 and 1 are the output- and hidden-layer error sums; element 2 is not used here.
+    let out_err = res[0];
+    let hid_err = res[1];
+
+    // NOTE(review): the weights are read from the boxes that were passed to `run`, which
+    // assumes the Juno code updated them in place — confirm.
+    (
+        out_err,
+        hid_err,
+        input_weights.as_slice().to_vec(),
+        hidden_weights.as_slice().to_vec(),
+        input_prev_weights.as_slice().to_vec(),
+        hidden_prev_weights.as_slice().to_vec(),
+    )
+}
+
+/// Returns `true` when `x` and `y` differ by strictly less than `1e-5` (absolute tolerance).
+/// Any comparison involving NaN yields `false`.
+fn compare_float(x: f32, y: f32) -> bool {
+    let gap = (x - y).abs();
+    gap < 1e-5
+}
+
+/// Returns `true` when both slices have the same length and every pair of corresponding
+/// elements satisfies `compare_float`.
+fn compare_floats(xs: &[f32], ys: &[f32]) -> bool {
+    if xs.len() != ys.len() {
+        return false;
+    }
+
+    xs.iter().zip(ys).all(|(&x, &y)| compare_float(x, y))
+}
+
+/// Runs one backprop step through both the Juno implementation and the Rust reference
+/// implementation on identical, deterministically seeded inputs.
+///
+/// # Panics
+/// Panics if the two implementations disagree on either error value or on any of the four
+/// weight buffers (see `compare_float` for the tolerance).
+fn backprop_harness(args: BackpropInputs) {
+    let BackpropInputs { layer_size } = args;
+
+    // Fixed seed so every run sees the same weights.
+    let mut rng = StdRng::seed_from_u64(7);
+
+    let input_n = layer_size;
+    let hidden_n = 16;
+    let output_n = 1;
+
+    // Every array has one extra slot at index 0; for the inputs it is set to 1.0 and all
+    // remaining entries stay 0.0.
+    let mut input_vals = vec![0.0; input_n + 1];
+    input_vals[0] = 1.0;
+
+    // For some reason the bpnn_randomize_row function used on target just sets it to 0.1
+    let target = vec![0.1; output_n + 1];
+
+    // Draw order matters for reproducibility: input weights first, then hidden weights.
+    let input_weights = (0..(input_n + 1) * (hidden_n + 1))
+        .map(|_| rng.random::<f32>())
+        .collect::<Vec<_>>();
+    let hidden_weights = (0..(hidden_n + 1) * (output_n + 1))
+        .map(|_| rng.random::<f32>())
+        .collect::<Vec<_>>();
+
+    let input_prev_weights = vec![0.0; (input_n + 1) * (hidden_n + 1)];
+    let hidden_prev_weights = vec![0.0; (hidden_n + 1) * (output_n + 1)];
+
+    // The Juno run borrows the buffers; the reference run below then takes them by value.
+    let (
+        juno_out_err,
+        juno_hid_err,
+        juno_input_weights,
+        juno_hidden_weights,
+        juno_input_prev_weights,
+        juno_hidden_prev_weights,
+    ) = run_backprop(
+        input_n as u64,
+        hidden_n as u64,
+        output_n as u64,
+        &input_vals,
+        &input_weights,
+        &hidden_weights,
+        &target,
+        &input_prev_weights,
+        &hidden_prev_weights,
+    );
+
+    let (
+        rust_out_err,
+        rust_hid_err,
+        rust_input_weights,
+        rust_hidden_weights,
+        rust_input_prev_weights,
+        rust_hidden_prev_weights,
+    ) = rust_backprop::backprop(
+        input_n,
+        hidden_n,
+        output_n,
+        &input_vals,
+        input_weights,
+        hidden_weights,
+        &target,
+        input_prev_weights,
+        hidden_prev_weights,
+    );
+
+    // Every check carries a message so a failure says which result diverged.
+    assert!(
+        compare_float(juno_out_err, rust_out_err),
+        "Output error does not match after training: juno = {juno_out_err}, rust = {rust_out_err}"
+    );
+    assert!(
+        compare_float(juno_hid_err, rust_hid_err),
+        "Hidden error does not match after training: juno = {juno_hid_err}, rust = {rust_hid_err}"
+    );
+    assert!(
+        compare_floats(&juno_input_weights, &rust_input_weights),
+        "Input weights do not match after training"
+    );
+    assert!(
+        compare_floats(&juno_hidden_weights, &rust_hidden_weights),
+        "Hidden weights do not match after training"
+    );
+    assert!(
+        compare_floats(&juno_input_prev_weights, &rust_input_prev_weights),
+        "Input prev_weights do not match after training"
+    );
+    assert!(
+        compare_floats(&juno_hidden_prev_weights, &rust_hidden_prev_weights),
+        "Hidden prev_weights do not match after training"
+    );
+}
+
+/// Entry point: parses the command-line arguments and runs the comparison harness.
+fn main() {
+    backprop_harness(BackpropInputs::parse());
+}
+
+/// Runs the comparison harness with an input layer of 65536 units.
+#[test]
+fn backprop_test() {
+    backprop_harness(BackpropInputs { layer_size: 65536 });
+}
--- a/juno_samples/rodinia/backprop/src/rust_backprop.rs
+++ b/juno_samples/rodinia/backprop/src/rust_backprop.rs
+/// Reference forward pass for one layer.
+///
+/// `vals` holds `n + 1` values and `weights` is a row-major `(n + 1) x (m + 1)` matrix.
+/// Element 0 of the result is fixed to `1.0`; element `j` (for `j` in `1..=m`) is the sigmoid
+/// of the sum over `k` in `0..=n` of `weights[k * (m + 1) + j] * vals[k]`.
+///
+/// # Panics
+/// Panics if `vals` or `weights` is too short for the given dimensions.
+fn layer_forward(n: usize, m: usize, vals: &[f32], weights: &[f32]) -> Vec<f32> {
+    let stride = m + 1;
+    let mut activations = Vec::with_capacity(stride);
+    activations.push(1.0);
+
+    activations.extend((1..=m).map(|j| {
+        // Accumulate in increasing `k` order, starting from 0.0.
+        let weighted = (0..=n).fold(0.0_f32, |acc, k| acc + weights[k * stride + j] * vals[k]);
+        1.0 / (1.0 + (-weighted).exp())
+    }));
+
+    activations
+}
+
+fn output_error(n: usize, target: &[f32], actual: &[f32]) -> (f32, Vec<f32>) {
+    let mut result = vec![0.0; n + 1];
+    let mut error = 0.0;
+
+    for j in 1..=n {
+        let o = actual[j];
+        let t = target[j];
+        result[j] = o * (1.0 - o) * (t - o);
+        error += result[j].abs();
+    }
+
+    (error, result)
+}
+
+fn hidden_error(
+    n: usize,
+    m: usize,
+    delta: &[f32],
+    weights: &[f32],
+    actual: &[f32],
+) -> (f32, Vec<f32>) {
+    let mut result = vec![0.0; n + 1];
+    let mut error = 0.0;
+
+    for j in 1..=n {
+        let h = actual[j];
+        let mut sum = 0.0;
+        for k in 1..=m {
+            sum += delta[k] * weights[j * (m + 1) + k];
+        }
+        result[j] = h * (1.0 - h) * sum;
+        error += result[j].abs();
+    }
+
+    (error, result)
+}
+
+/// Reference weight update for one layer.
+///
+/// `weights` and `prev_weights` are row-major `(n + 1) x (m + 1)` matrices. For each `j` in
+/// `1..=m` and `k` in `0..=n`, the change
+/// `ETA * delta[j] * vals[k] + MOMENTUM * prev_weights[k][j]` is added to `weights[k][j]` and
+/// stored in `prev_weights[k][j]`. Column 0 of both matrices is left untouched.
+///
+/// Returns the updated `(weights, prev_weights)`.
+///
+/// # Panics
+/// Panics if any slice or vector is too short for the given dimensions.
+fn adjust_weights(
+    n: usize,
+    m: usize,
+    delta: &[f32],
+    vals: &[f32],
+    mut weights: Vec<f32>,
+    mut prev_weights: Vec<f32>,
+) -> (Vec<f32>, Vec<f32>) {
+    // Same values as the ETA / MOMENTUM constants in backprop.jn.
+    const ETA: f32 = 0.3;
+    const MOMENTUM: f32 = 0.3;
+
+    for j in 1..=m {
+        for k in 0..=n {
+            let idx = k * (m + 1) + j;
+            let new_dw = ETA * delta[j] * vals[k] + MOMENTUM * prev_weights[idx];
+            weights[idx] += new_dw;
+            prev_weights[idx] = new_dw;
+        }
+    }
+
+    (weights, prev_weights)
+}
+
+/// Reference (plain Rust) implementation of one backprop training step.
+///
+/// Steps: forward pass through both layers, output and hidden error computation, then weight
+/// adjustment for the hidden->output weights followed by the input->hidden weights. The
+/// hidden error is computed from `hidden_weights` before they are adjusted.
+///
+/// Returns `(out_err, hid_err, input_weights, hidden_weights, input_prev_weights,
+/// hidden_prev_weights)` with the four weight vectors updated.
+///
+/// # Panics
+/// Panics if any slice or vector is too short for the given layer sizes.
+pub fn backprop(
+    input_n: usize,
+    hidden_n: usize,
+    output_n: usize,
+    input_vals: &[f32],
+    input_weights: Vec<f32>,
+    hidden_weights: Vec<f32>,
+    target: &[f32],
+    input_prev_weights: Vec<f32>,
+    hidden_prev_weights: Vec<f32>,
+) -> (f32, f32, Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
+    let hidden_vals = layer_forward(input_n, hidden_n, input_vals, &input_weights);
+    let output_vals = layer_forward(hidden_n, output_n, &hidden_vals, &hidden_weights);
+
+    let (out_err, out_delta) = output_error(output_n, target, &output_vals);
+    let (hid_err, hid_delta) = hidden_error(
+        hidden_n,
+        output_n,
+        &out_delta,
+        &hidden_weights,
+        &hidden_vals,
+    );
+
+    let (hidden_weights, hidden_prev_weights) = adjust_weights(
+        hidden_n,
+        output_n,
+        &out_delta,
+        &hidden_vals,
+        hidden_weights,
+        hidden_prev_weights,
+    );
+    // `input_vals` is already a `&[f32]`, so it is passed through without another borrow.
+    let (input_weights, input_prev_weights) = adjust_weights(
+        input_n,
+        hidden_n,
+        &hid_delta,
+        input_vals,
+        input_weights,
+        input_prev_weights,
+    );
+
+    (
+        out_err,
+        hid_err,
+        input_weights,
+        hidden_weights,
+        input_prev_weights,
+        hidden_prev_weights,
+    )
+}
No results found