diff --git a/hercules_cg/src/fork_tree.rs b/hercules_cg/src/fork_tree.rs
index 64a93160aabc5564c784bf0ff8b14e82045b81bb..c048f7e388a7eb4f73c0c866896fadfa5d2556a1 100644
--- a/hercules_cg/src/fork_tree.rs
+++ b/hercules_cg/src/fork_tree.rs
@@ -9,11 +9,16 @@ use crate::*;
  * c) no domination by any other fork that's also dominated by F, where we do count self-domination
  * Here too we include the non-fork start node, as key for all controls outside any fork.
  */
-pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> {
+pub fn fork_control_map(
+    fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>,
+) -> HashMap<NodeID, HashSet<NodeID>> {
     let mut fork_control_map = HashMap::new();
     for (control, forks) in fork_join_nesting {
         let fork = forks.first().copied().unwrap_or(NodeID::new(0));
-        fork_control_map.entry(fork).or_insert_with(HashSet::new).insert(*control);
+        fork_control_map
+            .entry(fork)
+            .or_insert_with(HashSet::new)
+            .insert(*control);
     }
     fork_control_map
 }
@@ -24,13 +29,19 @@ pub fn fork_control_map(fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> Has
  * c) no domination by any other fork that's also dominated by F, where we don't count self-domination
  * Note that the fork_tree also includes the non-fork start node, as unique root node.
  */
-pub fn fork_tree(function: &Function, fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>) -> HashMap<NodeID, HashSet<NodeID>> {
+pub fn fork_tree(
+    function: &Function,
+    fork_join_nesting: &HashMap<NodeID, Vec<NodeID>>,
+) -> HashMap<NodeID, HashSet<NodeID>> {
     let mut fork_tree = HashMap::new();
     for (control, forks) in fork_join_nesting {
         if function.nodes[control.idx()].is_fork() {
             fork_tree.entry(*control).or_insert_with(HashSet::new);
             let nesting_fork = forks.get(1).copied().unwrap_or(NodeID::new(0));
-            fork_tree.entry(nesting_fork).or_insert_with(HashSet::new).insert(*control);
+            fork_tree
+                .entry(nesting_fork)
+                .or_insert_with(HashSet::new)
+                .insert(*control);
         }
     }
     fork_tree
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 1086d1aa3f3c662e12da9f28102d4fc89f321bec..f20bfeb727dc6233dbb212a72c3a25b0c720cf36 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1521,6 +1521,8 @@ extern \"C\" {} {}(",
                 let collect_variable = self.get_value(*collect, false, false);
                 write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?;
             }
+            // Undef nodes never need to be assigned to.
+            Node::Undef { ty: _ } => {}
             _ => {
                 panic!(
                     "Unsupported data node type: {:?}",
diff --git a/hercules_opt/src/fork_guard_elim.rs b/hercules_opt/src/fork_guard_elim.rs
index 052fd0e493327fceb3bd1b1918d4d4aafc93bf79..eb755509e48eccb02a311fb6004abd37d263c7f8 100644
--- a/hercules_opt/src/fork_guard_elim.rs
+++ b/hercules_opt/src/fork_guard_elim.rs
@@ -77,13 +77,16 @@ fn guarded_fork(
         };
 
         // Filter out any terms which are just 1s
-        let non_ones = xs.iter().filter(|i| {
-            if let DynamicConstant::Constant(1) = editor.get_dynamic_constant(**i).deref() {
-                false
-            } else {
-                true
-            }
-        }).collect::<Vec<_>>();
+        let non_ones = xs
+            .iter()
+            .filter(|i| {
+                if let DynamicConstant::Constant(1) = editor.get_dynamic_constant(**i).deref() {
+                    false
+                } else {
+                    true
+                }
+            })
+            .collect::<Vec<_>>();
         // If we're left with just one term x, we had max { 1, x }
         if non_ones.len() == 1 {
             Factor::Max(idx, *non_ones[0])
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index d9505fde5e5f7ec19be86ded7cbd4af6e467869a..b2f9767c9cd2eb56b698f05d7b93dbba59c8681e 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -181,8 +181,13 @@ fn preliminary_fixups(
     reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>,
 ) -> bool {
     let nodes = &editor.func().nodes;
+    let schedules = &editor.func().schedules;
+
+    // Sequentialize non-parallel forks that contain problematic reduce cycles.
     for (reduce, cycle) in reduce_cycles {
-        if cycle.into_iter().any(|id| nodes[id.idx()].is_reduce()) {
+        if !schedules[reduce.idx()].contains(&Schedule::ParallelReduce)
+            && cycle.into_iter().any(|id| nodes[id.idx()].is_reduce())
+        {
             let join = nodes[reduce.idx()].try_reduce().unwrap().0;
             let fork = fork_join_map
                 .into_iter()
@@ -198,6 +203,31 @@ fn preliminary_fixups(
             return true;
         }
     }
+
+    // Get rid of the backward edge on parallel reduces in fork-joins.
+    for (_, join) in fork_join_map {
+        let parallel_reduces: Vec<_> = editor
+            .get_users(*join)
+            .filter(|id| {
+                nodes[id.idx()].is_reduce()
+                    && schedules[id.idx()].contains(&Schedule::ParallelReduce)
+            })
+            .collect();
+        for reduce in parallel_reduces {
+            if reduce_cycles[&reduce].is_empty() {
+                continue;
+            }
+            let (_, init, _) = nodes[reduce.idx()].try_reduce().unwrap();
+
+            // Replace uses of the reduce in its cycle with the init.
+            let success = editor.edit(|edit| {
+                edit.replace_all_uses_where(reduce, init, |id| reduce_cycles[&reduce].contains(id))
+            });
+            assert!(success);
+            return true;
+        }
+    }
+
     false
 }
 
@@ -511,7 +541,8 @@ fn basic_blocks(
                 // outside of reduce loops. Nodes that do need to be in a reduce
                 // loop use the reduce node forming the loop, so the dominator chain
                 // will consist of one block, and this loop won't ever iterate.
-                let currently_at_join = function.nodes[location.idx()].is_join();
+                let currently_at_join = function.nodes[location.idx()].is_join()
+                    && !function.nodes[control_node.idx()].is_join();
 
                 if (!is_constant_or_undef || is_gpu_returned)
                     && (shallower_nest || currently_at_join)
@@ -811,7 +842,14 @@ fn spill_clones(
                 .into_iter()
                 .any(|u| *u == *b)
                 && (editor.func().nodes[a.idx()].is_phi()
-                    || editor.func().nodes[a.idx()].is_reduce()))
+                    || editor.func().nodes[a.idx()].is_reduce())
+                && !editor.func().nodes[a.idx()]
+                    .try_reduce()
+                    .map(|(_, init, _)| {
+                        init == *b
+                            && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce)
+                    })
+                    .unwrap_or(false))
     });
 
     // Step 3: if there is a spill edge, spill it and return true. Otherwise,
@@ -989,15 +1027,16 @@ fn liveness_dataflow(
     }
     let mut num_phis_reduces = vec![0; function.nodes.len()];
     let mut has_phi = vec![false; function.nodes.len()];
-    let mut has_reduce = vec![false; function.nodes.len()];
+    let mut has_seq_reduce = vec![false; function.nodes.len()];
     for (node_idx, bb) in bbs.0.iter().enumerate() {
         let node = &function.nodes[node_idx];
         if node.is_phi() || node.is_reduce() {
             num_phis_reduces[bb.idx()] += 1;
         }
         has_phi[bb.idx()] = node.is_phi();
-        has_reduce[bb.idx()] = node.is_reduce();
-        assert!(!has_phi[bb.idx()] || !has_reduce[bb.idx()]);
+        has_seq_reduce[bb.idx()] =
+            node.is_reduce() && !function.schedules[node_idx].contains(&Schedule::ParallelReduce);
+        assert!(!node.is_phi() || !node.is_reduce());
     }
     let is_obj = |id: NodeID| !objects[&func_id].objects(id).is_empty();
 
@@ -1009,11 +1048,14 @@ fn liveness_dataflow(
             let last_pt = bbs.1[bb.idx()].len();
             let old_value = &liveness[&bb][last_pt];
             let mut new_value = BTreeSet::new();
-            for succ in control_subgraph.succs(*bb).chain(if has_reduce[bb.idx()] {
-                Either::Left(once(*bb))
-            } else {
-                Either::Right(empty())
-            }) {
+            for succ in control_subgraph
+                .succs(*bb)
+                .chain(if has_seq_reduce[bb.idx()] {
+                    Either::Left(once(*bb))
+                } else {
+                    Either::Right(empty())
+                })
+            {
                 // The liveness at the bottom of a basic block is the union of:
                 // 1. The liveness of each succecessor right after its phis and
                 //    reduces.
@@ -1041,7 +1083,9 @@ fn liveness_dataflow(
                             assert_eq!(control, succ);
                             if succ == *bb {
                                 new_value.insert(reduct);
-                            } else {
+                            } else if !function.schedules[id.idx()]
+                                .contains(&Schedule::ParallelReduce)
+                            {
                                 new_value.insert(init);
                             }
                         }
@@ -1058,6 +1102,7 @@ fn liveness_dataflow(
                 let mut new_value = liveness[&bb][pt + 1].clone();
                 let id = bbs.1[bb.idx()][pt];
                 let uses = get_uses(&function.nodes[id.idx()]);
+                let is_obj = |id: &NodeID| is_obj(*id);
                 new_value.remove(&id);
                 new_value.extend(
                     if let Node::Write {
@@ -1070,14 +1115,19 @@ fn liveness_dataflow(
                         // If this write is a cloning write, the `collect` input
                         // isn't actually live, because its value doesn't
                         // matter.
-                        Either::Left(once(data).filter(|id| is_obj(*id)))
+                        Either::Left(once(data).filter(is_obj))
+                    } else if let Node::Reduce {
+                        control: _,
+                        init: _,
+                        reduct,
+                    } = function.nodes[id.idx()]
+                        && function.schedules[id.idx()].contains(&Schedule::ParallelReduce)
+                    {
+                        // If this reduce is a parallel reduce, the `init` input
+                        // isn't actually live.
+                        Either::Left(once(reduct).filter(is_obj))
                     } else {
-                        Either::Right(
-                            uses.as_ref()
-                                .into_iter()
-                                .map(|id| *id)
-                                .filter(|id| is_obj(*id)),
-                        )
+                        Either::Right(uses.as_ref().into_iter().map(|id| *id).filter(is_obj))
                     },
                 );
                 changed |= *old_value != new_value;
diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs
index 8657fdc166fe68ad2565a8a0736984c7991be0a7..c8de7e9009a2a4a30831f8826d3f60b46d3e536a 100644
--- a/hercules_samples/dot/build.rs
+++ b/hercules_samples/dot/build.rs
@@ -4,7 +4,11 @@ fn main() {
     JunoCompiler::new()
         .ir_in_src("dot.hir")
         .unwrap()
-        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
+        .schedule_in_src(if cfg!(feature = "cuda") {
+            "gpu.sch"
+        } else {
+            "cpu.sch"
+        })
         .unwrap()
         .build()
         .unwrap();
diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs
index 8862c11a9273f9808f2148f1067dcf3f5953c11f..7f5b453ab426f1ce0ab220682ce6be89bf851305 100644
--- a/hercules_samples/dot/src/main.rs
+++ b/hercules_samples/dot/src/main.rs
@@ -1,8 +1,8 @@
 #![feature(concat_idents)]
 
-use hercules_rt::{runner, HerculesCPURef};
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
+use hercules_rt::{runner, HerculesCPURef};
 
 juno_build::juno!("dot");
 
diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs
index 735458c0c8be76bdae6cd7b3b308e38ccae78edd..ed92e02249a754cb0cbc41b8e39e97bfcf93c9ed 100644
--- a/hercules_samples/matmul/build.rs
+++ b/hercules_samples/matmul/build.rs
@@ -4,7 +4,11 @@ fn main() {
     JunoCompiler::new()
         .ir_in_src("matmul.hir")
         .unwrap()
-        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
+        .schedule_in_src(if cfg!(feature = "cuda") {
+            "gpu.sch"
+        } else {
+            "cpu.sch"
+        })
         .unwrap()
         .build()
         .unwrap();
diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs
index abd25ec9cddbd4be508b3f484cffc1df1365dc4d..5c87991569ab59e8d978f17d74f1d18423679669 100644
--- a/hercules_samples/matmul/src/main.rs
+++ b/hercules_samples/matmul/src/main.rs
@@ -2,9 +2,9 @@
 
 use rand::random;
 
-use hercules_rt::{runner, HerculesCPURef};
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
+use hercules_rt::{runner, HerculesCPURef};
 
 juno_build::juno!("matmul");
 
@@ -36,7 +36,9 @@ fn main() {
             let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
             let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
             let mut r = runner!(matmul);
-            let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
+            let c = r
+                .run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
+                .await;
             let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
             c.to_cpu_ref(&mut c_cpu);
             assert_eq!(&*c_cpu, &*correct_c);
diff --git a/hercules_test/hercules_interpreter/src/interpreter.rs b/hercules_test/hercules_interpreter/src/interpreter.rs
index 871e304a2f8fb285cc9d8c64d4aa62ec5eef3a1d..709c64fb0f8b45d7903d6b1e7f1a5d5ee2f28185 100644
--- a/hercules_test/hercules_interpreter/src/interpreter.rs
+++ b/hercules_test/hercules_interpreter/src/interpreter.rs
@@ -69,18 +69,18 @@ pub fn dyn_const_value(
     match dc {
         DynamicConstant::Constant(v) => *v,
         DynamicConstant::Parameter(v) => dyn_const_params[*v],
-        DynamicConstant::Add(xs) => {
-            xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
-                .fold(0, |s, v| s + v)
-        }
+        DynamicConstant::Add(xs) => xs
+            .iter()
+            .map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
+            .fold(0, |s, v| s + v),
         DynamicConstant::Sub(a, b) => {
             dyn_const_value(a, dyn_const_values, dyn_const_params)
                 - dyn_const_value(b, dyn_const_values, dyn_const_params)
         }
-        DynamicConstant::Mul(xs) => {
-            xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
-                .fold(1, |p, v| p * v)
-        }
+        DynamicConstant::Mul(xs) => xs
+            .iter()
+            .map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
+            .fold(1, |p, v| p * v),
         DynamicConstant::Div(a, b) => {
             dyn_const_value(a, dyn_const_values, dyn_const_params)
                 / dyn_const_value(b, dyn_const_values, dyn_const_params)
@@ -89,28 +89,28 @@ pub fn dyn_const_value(
             dyn_const_value(a, dyn_const_values, dyn_const_params)
                 % dyn_const_value(b, dyn_const_values, dyn_const_params)
         }
-        DynamicConstant::Max(xs) => {
-            xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
-                .fold(None, |m, v| {
-                    if let Some(m) = m {
-                        Some(max(m, v))
-                    } else {
-                        Some(v)
-                    }
-                })
-                .unwrap()
-        }
-        DynamicConstant::Min(xs) => {
-            xs.iter().map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
-                .fold(None, |m, v| {
-                    if let Some(m) = m {
-                        Some(min(m, v))
-                    } else {
-                        Some(v)
-                    }
-                })
-                .unwrap()
-        }
+        DynamicConstant::Max(xs) => xs
+            .iter()
+            .map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
+            .fold(None, |m, v| {
+                if let Some(m) = m {
+                    Some(max(m, v))
+                } else {
+                    Some(v)
+                }
+            })
+            .unwrap(),
+        DynamicConstant::Min(xs) => xs
+            .iter()
+            .map(|x| dyn_const_value(x, dyn_const_values, dyn_const_params))
+            .fold(None, |m, v| {
+                if let Some(m) = m {
+                    Some(min(m, v))
+                } else {
+                    Some(v)
+                }
+            })
+            .unwrap(),
     }
 }
 
diff --git a/hercules_test/hercules_tests/tests/loop_tests.rs b/hercules_test/hercules_tests/tests/loop_tests.rs
index 5832a161a18f18ea43860c1c5d6364385d0f187f..7e40f602294243bd93e5ba3d075c10cfffa5d614 100644
--- a/hercules_test/hercules_tests/tests/loop_tests.rs
+++ b/hercules_test/hercules_tests/tests/loop_tests.rs
@@ -35,9 +35,7 @@ fn alternate_bounds_use_after_loop_no_tid() {
 
     println!("result: {:?}", result_1);
 
-    let schedule = default_schedule![
-        Forkify,
-    ];
+    let schedule = default_schedule![Forkify,];
 
     let module = run_schedule_on_hercules(module, Some(schedule)).unwrap();
 
@@ -61,9 +59,7 @@ fn alternate_bounds_use_after_loop() {
 
     println!("result: {:?}", result_1);
 
-    let schedule = Some(default_schedule![
-        Forkify,
-    ]);
+    let schedule = Some(default_schedule![Forkify,]);
 
     let module = run_schedule_on_hercules(module, schedule).unwrap();
 
@@ -108,10 +104,7 @@ fn do_while_separate_body() {
 
     println!("result: {:?}", result_1);
 
-    let schedule = Some(default_schedule![
-        PhiElim,
-        Forkify,
-    ]);
+    let schedule = Some(default_schedule![PhiElim, Forkify,]);
 
     let module = run_schedule_on_hercules(module, schedule).unwrap();
 
@@ -131,10 +124,7 @@ fn alternate_bounds_internal_control() {
 
     println!("result: {:?}", result_1);
 
-    let schedule = Some(default_schedule![
-        PhiElim,
-        Forkify,
-    ]);
+    let schedule = Some(default_schedule![PhiElim, Forkify,]);
 
     let module = run_schedule_on_hercules(module, schedule).unwrap();
 
@@ -155,10 +145,7 @@ fn alternate_bounds_internal_control2() {
 
     println!("result: {:?}", result_1);
 
-    let schedule = Some(default_schedule![
-        PhiElim,
-        Forkify,
-    ]);
+    let schedule = Some(default_schedule![PhiElim, Forkify,]);
 
     let module = run_schedule_on_hercules(module, schedule).unwrap();
 
@@ -366,16 +353,13 @@ fn look_at_local() {
         "/home/xavierrouth/dev/hercules/hercules_test/hercules_tests/save_me.hbin",
     );
 
-    let schedule = Some(default_schedule![
-    ]);
+    let schedule = Some(default_schedule![]);
 
     let result_1 = interp_module!(module, 0, dyn_consts, a.clone(), b.clone());
 
     let module = run_schedule_on_hercules(module.clone(), schedule).unwrap();
 
-    let schedule = Some(default_schedule![
-        Unforkify, Verify,
-    ]);
+    let schedule = Some(default_schedule![Unforkify, Verify,]);
 
     let module = run_schedule_on_hercules(module.clone(), schedule).unwrap();
 
diff --git a/juno_frontend/src/semant.rs b/juno_frontend/src/semant.rs
index e133e3c20b7590eb372756cfdbce1f732d57d4f6..8668d1b45f7d68cff90ec90f7b2b55953b8d5a9c 100644
--- a/juno_frontend/src/semant.rs
+++ b/juno_frontend/src/semant.rs
@@ -752,7 +752,16 @@ fn analyze_program(
                             }
                             arg_info.push((ty, inout.is_some(), var));
 
-                            match process_irrefutable_pattern(pattern, false, var, ty, lexer, &mut stringtab, &mut env, &mut types) {
+                            match process_irrefutable_pattern(
+                                pattern,
+                                false,
+                                var,
+                                ty,
+                                lexer,
+                                &mut stringtab,
+                                &mut env,
+                                &mut types,
+                            ) {
                                 Ok(prep) => {
                                     stmts.extend(prep);
                                 }
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index 482bbf8deb0af323255b004a7e33e70202acb886..e8a7e4e94393c8684c2a10a1e040e3be3f2600cb 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -8,9 +8,9 @@ use self::camera_model::*;
 use self::cava_rust::CHAN;
 use self::image_proc::*;
 
-use hercules_rt::{runner, HerculesCPURef};
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
+use hercules_rt::{runner, HerculesCPURef};
 
 use image::ImageError;
 
@@ -31,7 +31,6 @@ fn run_cava(
     coefs: &[f32],
     tonemap: &[f32],
 ) -> Box<[u8]> {
-
     assert_eq!(image.len(), CHAN * rows * cols);
     assert_eq!(tstw.len(), CHAN * CHAN);
     assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN);
@@ -47,21 +46,24 @@ fn run_cava(
         let weights = HerculesCPURef::from_slice(weights);
         let coefs = HerculesCPURef::from_slice(coefs);
         let tonemap = HerculesCPURef::from_slice(tonemap);
-	    let mut r = runner!(cava);
-	    async_std::task::block_on(async {
-		r.run(
-		    rows as u64,
-		    cols as u64,
-		    num_ctrl_pts as u64,
-		    image,
-		    tstw,
-		    ctrl_pts,
-		    weights,
-		    coefs,
-		    tonemap,
-		)
-		.await
-	    }).as_slice::<u8>().to_vec().into_boxed_slice()
+        let mut r = runner!(cava);
+        async_std::task::block_on(async {
+            r.run(
+                rows as u64,
+                cols as u64,
+                num_ctrl_pts as u64,
+                image,
+                tstw,
+                ctrl_pts,
+                weights,
+                coefs,
+                tonemap,
+            )
+            .await
+        })
+        .as_slice::<u8>()
+        .to_vec()
+        .into_boxed_slice()
     }
 
     #[cfg(feature = "cuda")]
@@ -72,8 +74,8 @@ fn run_cava(
         let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
         let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
         let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
-	    let mut r = runner!(cava);
-	    let res = async_std::task::block_on(async {
+        let mut r = runner!(cava);
+        let res = async_std::task::block_on(async {
             r.run(
                 rows as u64,
                 cols as u64,
@@ -86,7 +88,7 @@ fn run_cava(
                 tonemap.get_ref(),
             )
             .await
-	    });
+        });
         let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() };
         let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
         res.to_cpu_ref(&mut res_cpu);
@@ -204,7 +206,8 @@ fn cava_harness(args: CavaInputs) {
                 .expect("Error saving verification image");
         }
 
-        let max_diff = result.iter()
+        let max_diff = result
+            .iter()
             .zip(cpu_result.iter())
             .map(|(a, b)| (*a as i16 - *b as i16).abs())
             .max()
diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs
index 9674c2c54b328aefcb4e670dc7e9ec482f8b2508..547dee08b118c475e1905d0fe93e9aaebfdca535 100644
--- a/juno_samples/concat/src/main.rs
+++ b/juno_samples/concat/src/main.rs
@@ -1,9 +1,9 @@
 #![feature(concat_idents)]
 
 use hercules_rt::runner;
-use hercules_rt::HerculesCPURef;
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
+use hercules_rt::HerculesCPURef;
 
 juno_build::juno!("concat");
 
@@ -20,7 +20,7 @@ fn main() {
             assert_eq!(output, 42);
 
             const N: usize = 3;
-            let arr : Box<[i32]> = (2..=4).collect();
+            let arr: Box<[i32]> = (2..=4).collect();
             let arr = HerculesCPURef::from_slice(&arr);
 
             let mut r = runner!(concat_switch);
diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
index eda65016c60de26a5cd1fe21d8d95dcaff826cf9..3b067ebd0c74ba4fe4b1cd4a39cf4f0b0c8b46cd 100644
--- a/juno_samples/edge_detection/src/main.rs
+++ b/juno_samples/edge_detection/src/main.rs
@@ -2,9 +2,9 @@
 
 mod edge_detection_rust;
 
-use hercules_rt::{runner, HerculesCPURef};
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
+use hercules_rt::{runner, HerculesCPURef};
 
 use std::slice::from_raw_parts;
 
@@ -228,9 +228,9 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
         });
 
         #[cfg(not(feature = "cuda"))]
-        let result : Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
+        let result: Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
         #[cfg(feature = "cuda")]
-        let result : Box<[f32]> = {
+        let result: Box<[f32]> = {
             let num_out = unsafe { result.__size() / std::mem::size_of::<f32>() };
             let mut res_cpu: Box<[f32]> = vec![0.0; num_out].into_boxed_slice();
             result.to_cpu_ref(&mut res_cpu);
@@ -261,7 +261,10 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
                 theta,
             );
 
-            assert_eq!(result.as_ref(), <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result));
+            assert_eq!(
+                result.as_ref(),
+                <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result)
+            );
             println!("Frames {} match", i);
 
             if display_verify {
diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn
index bcfa1d25d0281b5c842d62142976c7518fe194e6..fdfa51a87a36f0def2c3b24701469a56fe498440 100644
--- a/juno_samples/fork_join_tests/src/fork_join_tests.jn
+++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn
@@ -62,7 +62,7 @@ fn test4(input : i32) -> i32[4, 4] {
 
 #[entry]
 fn test5(input : i32) -> i32[4] {
-  let arr1 : i32[4];
+  @cons let arr1 : i32[4];
   for i = 0 to 4 {
     let red = arr1[i];
     for k = 0 to 3 {
diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch
index 6e2d6845423d13e0c2208b0169e3437a4735ba8f..0987083ec8c40565d8c079fbc5535177d68353b7 100644
--- a/juno_samples/fork_join_tests/src/gpu.sch
+++ b/juno_samples/fork_join_tests/src/gpu.sch
@@ -1,3 +1,4 @@
+no-memset(test5@cons);
 parallel-reduce(test5@reduce);
 
 gvn(*);
diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs
index 4b6aba655a45a2fe2f844efe75f4e8a738a17b96..5e848adeeb3ddde17a5794afd783559b8f0f8c16 100644
--- a/juno_samples/fork_join_tests/src/main.rs
+++ b/juno_samples/fork_join_tests/src/main.rs
@@ -46,6 +46,6 @@ fn main() {
 }
 
 #[test]
-fn implicit_clone_test() {
+fn fork_join_test() {
     main();
 }
diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs
index 50fe1760eeeedc946d510a6d5285d76e1346f3cc..2892cd3473251b60b0a2d3d381ae0848c8b7fadb 100644
--- a/juno_samples/matmul/src/main.rs
+++ b/juno_samples/matmul/src/main.rs
@@ -2,9 +2,9 @@
 
 use rand::random;
 
-use hercules_rt::{runner, HerculesCPURef};
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
+use hercules_rt::{runner, HerculesCPURef};
 
 juno_build::juno!("matmul");
 
@@ -28,10 +28,14 @@ fn main() {
             let a = HerculesCPURef::from_slice(&a);
             let b = HerculesCPURef::from_slice(&b);
             let mut r = runner!(matmul);
-            let c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
+            let c = r
+                .run(I as u64, J as u64, K as u64, a.clone(), b.clone())
+                .await;
             assert_eq!(c.as_slice::<i32>(), &*correct_c);
             let mut r = runner!(tiled_64_matmul);
-            let tiled_c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
+            let tiled_c = r
+                .run(I as u64, J as u64, K as u64, a.clone(), b.clone())
+                .await;
             assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
         }
         #[cfg(feature = "cuda")]
@@ -39,12 +43,16 @@ fn main() {
             let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
             let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
             let mut r = runner!(matmul);
-            let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
+            let c = r
+                .run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
+                .await;
             let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
             c.to_cpu_ref(&mut c_cpu);
             assert_eq!(&*c_cpu, &*correct_c);
             let mut r = runner!(tiled_64_matmul);
-            let tiled_c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
+            let tiled_c = r
+                .run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref())
+                .await;
             let mut tiled_c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
             tiled_c.to_cpu_ref(&mut tiled_c_cpu);
             assert_eq!(&*tiled_c_cpu, &*correct_c);
diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs
index bc99a4bdd071ff19c70977e29241b76e3e249014..b364c03c4cddcc1fc94171ff55eb3c1ff00e5b3e 100644
--- a/juno_samples/nested_ccp/src/main.rs
+++ b/juno_samples/nested_ccp/src/main.rs
@@ -1,8 +1,8 @@
 #![feature(concat_idents)]
 
-use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut};
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
+use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut};
 
 juno_build::juno!("nested_ccp");
 
diff --git a/juno_samples/patterns/src/main.rs b/juno_samples/patterns/src/main.rs
index 5cc2e7c874c590ab4f7ce313b697ea9ed3ae3a30..a5586c8b0ff3ce4e1ebddcf8d1a592701dfbbcee 100644
--- a/juno_samples/patterns/src/main.rs
+++ b/juno_samples/patterns/src/main.rs
@@ -1,6 +1,6 @@
 #![feature(concat_idents)]
 
-use hercules_rt::{runner};
+use hercules_rt::runner;
 
 juno_build::juno!("patterns");
 
diff --git a/juno_samples/schedule_test/build.rs b/juno_samples/schedule_test/build.rs
index 749a660c551e8b231f63287898adb2863aef826e..0129c4de3d2ceb99b060b0c3d61523b5188639dd 100644
--- a/juno_samples/schedule_test/build.rs
+++ b/juno_samples/schedule_test/build.rs
@@ -4,7 +4,11 @@ fn main() {
     JunoCompiler::new()
         .file_in_src("code.jn")
         .unwrap()
-        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
+        .schedule_in_src(if cfg!(feature = "cuda") {
+            "gpu.sch"
+        } else {
+            "cpu.sch"
+        })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/schedule_test/src/main.rs b/juno_samples/schedule_test/src/main.rs
index 1505d4e5ff620a53d1095cdc4185a5a6d665e71e..f769e750a5ac3fae2594b2d4285646efd544416e 100644
--- a/juno_samples/schedule_test/src/main.rs
+++ b/juno_samples/schedule_test/src/main.rs
@@ -2,9 +2,9 @@
 
 use rand::random;
 
-use hercules_rt::{runner, HerculesCPURef};
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
+use hercules_rt::{runner, HerculesCPURef};
 
 juno_build::juno!("code");
 
@@ -43,7 +43,16 @@ fn main() {
             let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b));
             let c = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&c));
             let mut r = runner!(test);
-            let res = r.run(N as u64, M as u64, K as u64, a.get_ref(), b.get_ref(), c.get_ref()).await;
+            let res = r
+                .run(
+                    N as u64,
+                    M as u64,
+                    K as u64,
+                    a.get_ref(),
+                    b.get_ref(),
+                    c.get_ref(),
+                )
+                .await;
             let mut res_cpu: Box<[i32]> = vec![0; correct_res.len()].into_boxed_slice();
             res.to_cpu_ref(&mut res_cpu);
             assert_eq!(&*res_cpu, &*correct_res);
diff --git a/juno_samples/simple3/src/main.rs b/juno_samples/simple3/src/main.rs
index 8eb78f7c93b0f195ffee1be120376dbe3f9a2a62..687ff414a399679fd27aae5c7b81555c1e1d5e2c 100644
--- a/juno_samples/simple3/src/main.rs
+++ b/juno_samples/simple3/src/main.rs
@@ -1,8 +1,8 @@
 #![feature(concat_idents)]
 
-use hercules_rt::{runner, HerculesCPURef};
 #[cfg(feature = "cuda")]
 use hercules_rt::CUDABox;
+use hercules_rt::{runner, HerculesCPURef};
 
 juno_build::juno!("simple3");