Compare revisions — llvm/hercules
Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (2)
Showing with 209 additions and 125 deletions
......@@ -9,11 +9,16 @@ use crate::*;
* c) no domination by any other fork that's also dominated by F, where we do count self-domination
* Here too we include the non-fork start node, as the key for all controls outside any fork.
*/
pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> {
pub fn fork_control_map(
fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>,
) -> HashMap<NodeID, HashSet<NodeID>> {
let mut fork_control_map = HashMap::new();
for (control, forks) in fork_join_nesting {
let fork = forks.first().copied().unwrap_or(NodeID::new(0));
fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(*control);
fork_control_map
.entry(fork)
.or_insert_with(HashSet::new)
.insert(*control);
}
fork_control_map
}
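
For orientation, a minimal sketch of the mapping this computes (NodeID values are illustrative; the real nesting map comes from the fork-join analysis, and HashMap, NodeID, and fork_control_map are assumed in scope via use crate::*):

// fork_join_nesting maps each control node to the forks containing it, innermost first.
let mut nesting: HashMap<NodeID, Vec<NodeID>> = HashMap::new();
nesting.insert(NodeID::new(2), vec![NodeID::new(2)]); // fork 2 counts as dominating itself
nesting.insert(NodeID::new(5), vec![NodeID::new(2)]); // control 5 sits inside fork 2
nesting.insert(NodeID::new(7), vec![]);               // control 7 lies outside any fork
let map = fork_control_map(&nesting);
assert!(map[&NodeID::new(2)].contains(&NodeID::new(5)));
assert!(map[&NodeID::new(0)].contains(&NodeID::new(7))); // start node keys top-level controls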
......@@ -24,13 +29,19 @@ pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> Has
* c) no domination by any other fork that's also dominated by F, where we don't count self-domination
* Note that the fork_tree also includes the non-fork start node, as the unique root node.
*/
pub fn fork_tree(function: &Function, fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> {
pub fn fork_tree(
function: &Function,
fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>,
) -> HashMap<NodeID, HashSet<NodeID>> {
let mut fork_tree = HashMap::new();
for (control, forks) in fork_join_nesting {
if function.nodes[control.idx()].is_fork() {
fork_tree.entry(*control).or_insert_with(HashSet::new);
let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0));
fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(*control);
fork_tree
.entry(nesting_fork)
.or_insert_with(HashSet::new)
.insert(*control);
}
}
fork_tree
......
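
Correspondingly, a worked sketch of fork_tree's shape (again with illustrative NodeIDs), where fork 3 nests directly inside fork 2:

// fork_join_nesting[3] = [3, 2]  // fork 3 first (self-domination), then its parent
// fork_join_nesting[2] = [2]     // fork 2 is top-level
// fork_tree then holds 2 -> {3} and NodeID::new(0) -> {2}: forks.get(1) picks the
// parent fork, and the start node (NodeID 0) serves as the unique root.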
......@@ -1521,6 +1521,8 @@ extern \"C\" {} {}(",
let collect_variable = self.get_value(*collect, false, false);
write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?;
}
// Undef nodes never need to be assigned to.
Node::Undef { ty: _ } => {}
_ => {
panic!(
"Unsupported data node type: {:?}",
......
......@@ -77,13 +77,16 @@ fn guarded_fork(
};
// Filter out any terms which are just 1s
let non_ones = xs.iter().filter(|i| {
if let DynamicConstant::Constant(1) = editor.get_dynamic_constant(**i).deref() {
false
} else {
true
}
}).collect::<Vec<_>>();
let non_ones = xs
.iter()
.filter(|i| {
if let DynamicConstant::Constant(1) = editor.get_dynamic_constant(**i).deref() {
false
} else {
true
}
})
.collect::<Vec<_>>();
// If we're left with just one term x, we had max { 1, x }
if non_ones.len() == 1 {
Factor::Max(idx, *non_ones[0])
......
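
As a concrete instance of the simplification above: max { 1, x } filters down to non_ones = [x], so the guard factor becomes Factor::Max(idx, x); terms equal to 1 never constrain the maximum.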
......@@ -181,8 +181,13 @@ fn preliminary_fixups(
reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>,
) -> bool {
let nodes = &editor.func().nodes;
let schedules = &editor.func().schedules;
// Sequentialize non-parallel forks that contain problematic reduce cycles.
for (reduce, cycle) in reduce_cycles {
if cycle.into_iter().any(|id| nodes[id.idx()].is_reduce()) {
if !schedules[reduce.idx()].contains(&Schedule::ParallelReduce)
&& cycle.into_iter().any(|id| nodes[id.idx()].is_reduce())
{
let join = nodes[reduce.idx()].try_reduce().unwrap().0;
let fork = fork_join_map
.into_iter()
......@@ -198,6 +203,31 @@ fn preliminary_fixups(
return true;
}
}
// Get rid of the backward edge on parallel reduces in fork-joins.
for (_, join) in fork_join_map {
let parallel_reduces: Vec<_> = editor
.get_users(*join)
.filter(|id| {
nodes[id.idx()].is_reduce()
&& schedules[id.idx()].contains(&Schedule::ParallelReduce)
})
.collect();
for reduce in parallel_reduces {
if reduce_cycles[&reduce].is_empty() {
continue;
}
let (_, init, _) = nodes[reduce.idx()].try_reduce().unwrap();
// Replace uses of the reduce in its cycle with the init.
let success = editor.edit(|edit| {
edit.replace_all_uses_where(reduce, init, |id| reduce_cycles[&reduce].contains(id))
});
assert!(success);
return true;
}
}
false
}
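
A hedged before/after sketch of the second fixup (names illustrative): for a reduce r = reduce(join, init, reduct) carrying the ParallelReduce schedule, every node in r's cycle that reads r is rewritten to read init instead, so the backward edge disappears:

// before:  reduct = op(r, x)     // cycle reads the running reduction
// after:   reduct = op(init, x)  // cycle reads the init; r still collects at the join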
......@@ -511,7 +541,8 @@ fn basic_blocks(
// outside of reduce loops. Nodes that do need to be in a reduce
// loop use the reduce node forming the loop, so the dominator chain
// will consist of one block, and this loop won't ever iterate.
let currently_at_join = function.nodes[location.idx()].is_join();
let currently_at_join = function.nodes[location.idx()].is_join()
&& !function.nodes[control_node.idx()].is_join();
if (!is_constant_or_undef || is_gpu_returned)
&& (shallower_nest || currently_at_join)
......@@ -811,7 +842,14 @@ fn spill_clones(
.into_iter()
.any(|u| *u == *b)
&& (editor.func().nodes[a.idx()].is_phi()
|| editor.func().nodes[a.idx()].is_reduce()))
|| editor.func().nodes[a.idx()].is_reduce())
&& !editor.func().nodes[a.idx()]
.try_reduce()
.map(|(_, init, _)| {
init == *b
&& editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce)
})
.unwrap_or(false))
});
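// Note on the added clause: a ParallelReduce's init -> reduce edge is exempted from
// spilling, matching the liveness change below where a parallel reduce no longer
// keeps its init live around the loop.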
// Step 3: if there is a spill edge, spill it and return true. Otherwise,
......@@ -989,15 +1027,16 @@ fn liveness_dataflow(
}
let mut num_phis_reduces = vec![0; function.nodes.len()];
let mut has_phi = vec![false; function.nodes.len()];
let mut has_reduce = vec![false; function.nodes.len()];
let mut has_seq_reduce = vec![false; function.nodes.len()];
for (node_idx, bb) in bbs.0.iter().enumerate() {
let node = &function.nodes[node_idx];
if node.is_phi() || node.is_reduce() {
num_phis_reduces[bb.idx()] += 1;
}
has_phi[bb.idx()] = node.is_phi();
has_reduce[bb.idx()] = node.is_reduce();
assert!(!has_phi[bb.idx()] || !has_reduce[bb.idx()]);
has_seq_reduce[bb.idx()] =
node.is_reduce() && !function.schedules[node_idx].contains(&Schedule::ParallelReduce);
assert!(!node.is_phi() || !node.is_reduce());
}
let is_obj = |id: NodeID| !objects[&func_id].objects(id).is_empty();
......@@ -1009,11 +1048,14 @@ fn liveness_dataflow(
let last_pt = bbs.1[bb.idx()].len();
let old_value = &liveness[&bb][last_pt];
let mut new_value = BTreeSet::new();
for succ in control_subgraph.succs(*bb).chain(if has_reduce[bb.idx()] {
Either::Left(once(*bb))
} else {
Either::Right(empty())
}) {
for succ in control_subgraph
.succs(*bb)
.chain(if has_seq_reduce[bb.idx()] {
Either::Left(once(*bb))
} else {
Either::Right(empty())
})
{
// The liveness at the bottom of a basic block is the union of:
// 1. The liveness of each successor right after its phis and
// reduces.
......@@ -1041,7 +1083,9 @@ fn liveness_dataflow(
assert_eq!(control, succ);
if succ == *bb {
new_value.insert(reduct);
} else {
} else if !function.schedules[id.idx()]
.contains(&Schedule::ParallelReduce)
{
new_value.insert(init);
}
}
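
In dataflow terms, the meet implemented above is, as a sketch:

live_bot(b) = U over s in succ(b) ∪ self(b) of live_after_phis(s)

where self(b) is {b} exactly when b contains a sequential reduce (the implicit back edge), and where a reduce at s contributes its reduct when s = b and, unless scheduled ParallelReduce, its init when s != b.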
......@@ -1058,6 +1102,7 @@ fn liveness_dataflow(
let mut new_value = liveness[&bb][pt + 1].clone();
let id = bbs.1[bb.idx()][pt];
let uses = get_uses(&function.nodes[id.idx()]);
let is_obj = |id: &NodeID| is_obj(*id);
new_value.remove(&id);
new_value.extend(
if let Node::Write {
......@@ -1070,14 +1115,19 @@ fn liveness_dataflow(
// If this write is a cloning write, the `collect` input
// isn't actually live, because its value doesn't
// matter.
Either::Left(once(data).filter(|id| is_obj(*id)))
Either::Left(once(data).filter(is_obj))
} else if let Node::Reduce {
control: _,
init: _,
reduct,
} = function.nodes[id.idx()]
&& function.schedules[id.idx()].contains(&Schedule::ParallelReduce)
{
// If this reduce is a parallel reduce, the `init` input
// isn't actually live.
Either::Left(once(reduct).filter(is_obj))
} else {
Either::Right(
uses.as_ref()
.into_iter()
.map(|id| *id)
.filter(|id| is_obj(*id)),
)
Either::Right(uses.as_ref().into_iter().map(|id| *id).filter(is_obj))
},
);
changed |= *old_value != new_value;
......
......@@ -4,7 +4,11 @@ fn main() {
JunoCompiler::new()
.ir_in_src("dot.hir")
.unwrap()
.schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
.schedule_in_src(if cfg!(feature = "cuda") {
"gpu.sch"
} else {
"cpu.sch"
})
.unwrap()
.build()
.unwrap();
......
#![feature(concat_idents)]
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
juno_build::juno!("dot");
......
......@@ -4,7 +4,11 @@ fn main() {
JunoCompiler::new()
.ir_in_src("matmul.hir")
.unwrap()
.schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
.schedule_in_src(if cfg!(feature = "cuda") {
"gpu.sch"
} else {
"cpu.sch"
})
.unwrap()
.build()
.unwrap();
......
......@@ -2,9 +2,9 @@
use rand::random;
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
juno_build::juno!("matmul");
......@@ -36,7 +36,9 @@ fn main() {
let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
let mut r = runner!(matmul);
let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
let c = r
.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
.await;
let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
c.to_cpu_ref(&mut c_cpu);
assert_eq!(&*c_cpu, &*correct_c);
......
......@@ -69,18 +69,18 @@ pub fn dyn_const_value(
match dc {
DynamicConstant::Constant(v) => *v,
DynamicConstant::Parameter(v) => dyn_const_params[*v],
DynamicConstant::Add(xs) => {
xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(0, |s, v| s + v)
}
DynamicConstant::Add(xs) => xs
.iter()
.map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(0, |s, v| s + v),
DynamicConstant::Sub(a, b) => {
dyn_const_value(a, dyn_const_values, dyn_const_params)
- dyn_const_value(b, dyn_const_values, dyn_const_params)
}
DynamicConstant::Mul(xs) => {
xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(1, |p, v| p * v)
}
DynamicConstant::Mul(xs) => xs
.iter()
.map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(1, |p, v| p * v),
DynamicConstant::Div(a, b) => {
dyn_const_value(a, dyn_const_values, dyn_const_params)
/ dyn_const_value(b, dyn_const_values, dyn_const_params)
......@@ -89,28 +89,28 @@ pub fn dyn_const_value(
dyn_const_value(a, dyn_const_values, dyn_const_params)
% dyn_const_value(b, dyn_const_values, dyn_const_params)
}
DynamicConstant::Max(xs) => {
xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(None, |m, v| {
if let Some(m) = m {
Some(max(m, v))
} else {
Some(v)
}
})
.unwrap()
}
DynamicConstant::Min(xs) => {
xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(None, |m, v| {
if let Some(m) = m {
Some(min(m, v))
} else {
Some(v)
}
})
.unwrap()
}
DynamicConstant::Max(xs) => xs
.iter()
.map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(None, |m, v| {
if let Some(m) = m {
Some(max(m, v))
} else {
Some(v)
}
})
.unwrap(),
DynamicConstant::Min(xs) => xs
.iter()
.map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
.fold(None, |m, v| {
if let Some(m) = m {
Some(min(m, v))
} else {
Some(v)
}
})
.unwrap(),
}
}
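
A worked sketch of the evaluator on small inputs, with dyn_const_params = [4], i.e. one parameter bound to 4:

// Add [ Constant(3), Parameter(0) ]  ==> 3 + 4 = 7    (fold starts at 0)
// Mul [ Constant(2), Constant(5) ]   ==> 2 * 5 = 10   (fold starts at 1)
// Max [ Constant(7), Constant(9) ]   ==> 9            (fold over Option, then unwrap)
// Min [] would panic on the unwrap, so empty Min/Max lists are assumed not to occur.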
......
......@@ -35,9 +35,7 @@ fn alternate_bounds_use_after_loop_no_tid() {
println!("result: {:?}", result_1);
let schedule = default_schedule![
Forkify,
];
let schedule = default_schedule![Forkify,];
let module = run_schedule_on_hercules(module, Some(schedule)).unwrap();
......@@ -61,9 +59,7 @@ fn alternate_bounds_use_after_loop() {
println!("result: {:?}", result_1);
let schedule = Some(default_schedule![
Forkify,
]);
let schedule = Some(default_schedule![Forkify,]);
let module = run_schedule_on_hercules(module, schedule).unwrap();
......@@ -108,10 +104,7 @@ fn do_while_separate_body() {
println!("result: {:?}", result_1);
let schedule = Some(default_schedule![
PhiElim,
Forkify,
]);
let schedule = Some(default_schedule![PhiElim, Forkify,]);
let module = run_schedule_on_hercules(module, schedule).unwrap();
......@@ -131,10 +124,7 @@ fn alternate_bounds_internal_control() {
println!("result: {:?}", result_1);
let schedule = Some(default_schedule![
PhiElim,
Forkify,
]);
let schedule = Some(default_schedule![PhiElim, Forkify,]);
let module = run_schedule_on_hercules(module, schedule).unwrap();
......@@ -155,10 +145,7 @@ fn alternate_bounds_internal_control2() {
println!("result: {:?}", result_1);
let schedule = Some(default_schedule![
PhiElim,
Forkify,
]);
let schedule = Some(default_schedule![PhiElim, Forkify,]);
let module = run_schedule_on_hercules(module, schedule).unwrap();
......@@ -366,16 +353,13 @@ fn look_at_local() {
"/home/xavierrouth/dev/hercules/hercules_test/hercules_tests/save_me.hbin",
);
let schedule = Some(default_schedule![
]);
let schedule = Some(default_schedule![]);
let result_1 = interp_module!(module, 0, dyn_consts, a.clone(), b.clone());
let module = run_schedule_on_hercules(module.clone(), schedule).unwrap();
let schedule = Some(default_schedule![
Unforkify, Verify,
]);
let schedule = Some(default_schedule![Unforkify, Verify,]);
let module = run_schedule_on_hercules(module.clone(), schedule).unwrap();
......
......@@ -752,7 +752,16 @@ fn analyze_program(
}
arg_info.push((ty, inout.is_some(), var));
match process_irrefutable_pattern(pattern, false, var, ty, lexer, &mut stringtab, &mut env, &mut types) {
match process_irrefutable_pattern(
pattern,
false,
var,
ty,
lexer,
&mut stringtab,
&mut env,
&mut types,
) {
Ok(prep) => {
stmts.extend(prep);
}
......
......@@ -8,9 +8,9 @@ use self::camera_model::*;
use self::cava_rust::CHAN;
use self::image_proc::*;
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
use image::ImageError;
......@@ -31,7 +31,6 @@ fn run_cava(
coefs: &[f32],
tonemap: &[f32],
) -> Box<[u8]> {
assert_eq!(image.len(), CHAN * rows * cols);
assert_eq!(tstw.len(), CHAN * CHAN);
assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN);
......@@ -47,21 +46,24 @@ fn run_cava(
let weights = HerculesCPURef::from_slice(weights);
let coefs = HerculesCPURef::from_slice(coefs);
let tonemap = HerculesCPURef::from_slice(tonemap);
let mut r = runner!(cava);
async_std::task::block_on(async {
r.run(
rows as u64,
cols as u64,
num_ctrl_pts as u64,
image,
tstw,
ctrl_pts,
weights,
coefs,
tonemap,
)
.await
}).as_slice::<u8>().to_vec().into_boxed_slice()
let mut r = runner!(cava);
async_std::task::block_on(async {
r.run(
rows as u64,
cols as u64,
num_ctrl_pts as u64,
image,
tstw,
ctrl_pts,
weights,
coefs,
tonemap,
)
.await
})
.as_slice::<u8>()
.to_vec()
.into_boxed_slice()
}
#[cfg(feature = "cuda")]
......@@ -72,8 +74,8 @@ fn run_cava(
let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
let mut r = runner!(cava);
let res = async_std::task::block_on(async {
let mut r = runner!(cava);
let res = async_std::task::block_on(async {
r.run(
rows as u64,
cols as u64,
......@@ -86,7 +88,7 @@ fn run_cava(
tonemap.get_ref(),
)
.await
});
});
let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() };
let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
res.to_cpu_ref(&mut res_cpu);
......@@ -204,7 +206,8 @@ fn cava_harness(args: CavaInputs) {
.expect("Error saving verification image");
}
let max_diff = result.iter()
let max_diff = result
.iter()
.zip(cpu_result.iter())
.map(|(a, b)| (*a as i16 - *b as i16).abs())
.max()
......
#![feature(concat_idents)]
use hercules_rt::runner;
use hercules_rt::HerculesCPURef;
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::HerculesCPURef;
juno_build::juno!("concat");
......@@ -20,7 +20,7 @@ fn main() {
assert_eq!(output, 42);
const N: usize = 3;
let arr : Box<[i32]> = (2..=4).collect();
let arr: Box<[i32]> = (2..=4).collect();
let arr = HerculesCPURef::from_slice(&arr);
let mut r = runner!(concat_switch);
......
......@@ -2,9 +2,9 @@
mod edge_detection_rust;
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
use std::slice::from_raw_parts;
......@@ -228,9 +228,9 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
});
#[cfg(not(feature = "cuda"))]
let result : Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
let result: Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
#[cfg(feature = "cuda")]
let result : Box<[f32]> = {
let result: Box<[f32]> = {
let num_out = unsafe { result.__size() / std::mem::size_of::<f32>() };
let mut res_cpu: Box<[f32]> = vec![0.0; num_out].into_boxed_slice();
result.to_cpu_ref(&mut res_cpu);
......@@ -261,7 +261,10 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
theta,
);
assert_eq!(result.as_ref(), <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result));
assert_eq!(
result.as_ref(),
<Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result)
);
println!("Frames {} match", i);
if display_verify {
......
......@@ -62,7 +62,7 @@ fn test4(input : i32) -> i32[4, 4] {
#[entry]
fn test5(input : i32) -> i32[4] {
let arr1 : i32[4];
@cons let arr1 : i32[4];
for i = 0 to 4 {
let red = arr1[i];
for k = 0 to 3 {
......
no-memset(test5@cons);
parallel-reduce(test5@reduce);
gvn(*);
......
......@@ -46,6 +46,6 @@ fn main() {
}
#[test]
fn implicit_clone_test() {
fn fork_join_test() {
main();
}
......@@ -2,9 +2,9 @@
use rand::random;
use hercules_rt::{runner, HerculesCPURef};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef};
juno_build::juno!("matmul");
......@@ -28,10 +28,14 @@ fn main() {
let a = HerculesCPURef::from_slice(&a);
let b = HerculesCPURef::from_slice(&b);
let mut r = runner!(matmul);
let c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
let c = r
.run(I as u64, J as u64, K as u64, a.clone(), b.clone())
.await;
assert_eq!(c.as_slice::<i32>(), &*correct_c);
let mut r = runner!(tiled_64_matmul);
let tiled_c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
let tiled_c = r
.run(I as u64, J as u64, K as u64, a.clone(), b.clone())
.await;
assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
}
#[cfg(feature = "cuda")]
......@@ -39,12 +43,16 @@ fn main() {
let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
let mut r = runner!(matmul);
let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
let c = r
.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
.await;
let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
c.to_cpu_ref(&mut c_cpu);
assert_eq!(&*c_cpu, &*correct_c);
let mut r = runner!(tiled_64_matmul);
let tiled_c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
let tiled_c = r
.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
.await;
let mut tiled_c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
tiled_c.to_cpu_ref(&mut tiled_c_cpu);
assert_eq!(&*tiled_c_cpu, &*correct_c);
......
#![feature(concat_idents)]
use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut};
#[cfg(feature = "cuda")]
use hercules_rt::CUDABox;
use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut};
juno_build::juno!("nested_ccp");
......
#![feature(concat_idents)]
use hercules_rt::{runner};
use hercules_rt::runner;
juno_build::juno!("patterns");
......