From 3f8acd3bb4d0b7388a74c7dc24e3d1e4ad9771b1 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Thu, 27 Feb 2025 22:44:17 -0600
Subject: [PATCH 1/9] misc. progress for bfs gpu

---
 hercules_cg/src/rt.rs                |  2 +-
 hercules_opt/src/gcm.rs              | 70 ++++++++++++++++------------
 juno_samples/rodinia/bfs/src/gpu.sch | 52 ++++++++++++++-------
 3 files changed, 76 insertions(+), 48 deletions(-)

diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 6981a3da..d94f0e19 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -938,7 +938,7 @@ impl<'a> RTContext<'a> {
                     let dst_device = self.node_colors.0[&collect];
                     write!(
                         block,
-                        "::hercules_rt::__copy_{}_to_{}({}.byte_add({} as usize).0, {}.0, {});",
+                        "::hercules_rt::__copy_{}_to_{}({}.byte_add({} as usize).0, {}.0, {} as usize);",
                         src_device.name(),
                         dst_device.name(),
                         self.get_value(collect, bb, false),
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index c612acac..3f326051 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -1304,37 +1304,49 @@ enum UTerm {
 
 fn unify(
     mut equations: VecDeque<(UTerm, UTerm)>,
-) -> Result<BTreeMap<NodeID, Device>, BTreeMap<NodeID, Device>> {
+) -> Result<BTreeMap<NodeID, Device>, (NodeID, NodeID)> {
     let mut theta = BTreeMap::new();
 
+    // First, assign devices to nodes when a rule directly says to.
+    for _ in 0..equations.len() {
+        let (l, r) = equations.pop_front().unwrap();
+        match (l, r) {
+            (UTerm::Node(n), UTerm::Device(d)) | (UTerm::Device(d), UTerm::Node(n)) => {
+                theta.insert(n, d);
+            }
+            _ => equations.push_back((l, r)),
+        }
+    }
+
+    // Second, iterate the rest of the rules until...
+    // 1. The rules are exhausted. All the nodes have device assignments.
+    // 2. No progress is being made. Some nodes may not have device assignments.
+    // 3. An inconsistency has been found. The inconsistency is returned.
     let mut no_progress_iters = 0;
     while no_progress_iters <= equations.len()
         && let Some((l, r)) = equations.pop_front()
     {
-        match (l, r) {
-            (UTerm::Node(_), UTerm::Node(_)) => {
-                if l != r {
-                    equations.push_back((l, r));
-                }
-                no_progress_iters += 1;
-            }
-            (UTerm::Node(n), UTerm::Device(d)) | (UTerm::Device(d), UTerm::Node(n)) => {
-                theta.insert(n, d);
-                for (l, r) in equations.iter_mut() {
-                    if *l == UTerm::Node(n) {
-                        *l = UTerm::Device(d);
-                    }
-                    if *r == UTerm::Node(n) {
-                        *r = UTerm::Device(d);
-                    }
+        let (UTerm::Node(l), UTerm::Node(r)) = (l, r) else {
+            panic!();
+        };
+
+        match (theta.get(&l), theta.get(&r)) {
+            (Some(ld), Some(rd)) => {
+                if ld != rd {
+                    return Err((l, r));
+                } else {
+                    no_progress_iters = 0;
                 }
-                no_progress_iters = 0;
             }
-            (UTerm::Device(d1), UTerm::Device(d2)) if d1 == d2 => {
+            (Some(d), None) | (None, Some(d)) => {
+                let d = *d;
+                theta.insert(l, d);
+                theta.insert(r, d);
                 no_progress_iters = 0;
             }
-            _ => {
-                return Err(theta);
+            (None, None) => {
+                equations.push_back((UTerm::Node(l), UTerm::Node(r)));
+                no_progress_iters += 1;
             }
         }
     }
@@ -1377,8 +1389,8 @@ fn color_nodes(
             } if !editor.get_type(typing[id.idx()]).is_primitive() => {
                 // Every input to a phi needs to be on the same device. The
                 // phi itself is also on this device.
-                for (l, r) in zip(data.into_iter(), data.into_iter().skip(1).chain(once(&id))) {
-                    equations.push((UTerm::Node(*l), UTerm::Node(*r)));
+                for data in data {
+                    equations.push((UTerm::Node(*data), UTerm::Node(id)));
                 }
             }
             Node::Reduce {
@@ -1394,7 +1406,7 @@ fn color_nodes(
             } if !editor.get_type(typing[id.idx()]).is_primitive() => {
                 // Every input to the reduce, and the reduce itself, are on
                 // the same device.
-                equations.push((UTerm::Node(first), UTerm::Node(second)));
+                equations.push((UTerm::Node(first), UTerm::Node(id)));
                 equations.push((UTerm::Node(second), UTerm::Node(id)));
             }
             Node::Constant { id: _ }
@@ -1533,12 +1545,12 @@ fn color_nodes(
             }
             Some(func_colors)
         }
-        Err(progress) => {
+        Err(inconsistency) => {
             // If unification failed, then there's some node using a node in
-            // `progress` that's expecting a different type than what it got.
-            // Pick one and add potentially inter-device copies on each def-use
-            // edge. We'll clean these up later.
-            let (id, _) = progress.into_iter().next().unwrap();
+            // that's expecting a different type than what it got. Pick one and
+            // add potentially inter-device copies on each def-use edge. We'll
+            // clean these up later.
+            let id = inconsistency.0;
             let users: Vec<_> = editor.get_users(id).collect();
             let success = editor.edit(|mut edit| {
                 let cons = edit.add_zero_constant(typing[id.idx()]);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 0a3f4d77..49d44b98 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -1,23 +1,39 @@
-gvn(*);
-phi-elim(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  infer-schedules(X);
+}
 
-let outline = auto-outline(bfs);
-gpu(outline.bfs);
+phi-elim(bfs);
+no-memset(bfs@cost);
+let cost_init = outline(bfs@cost_init);
+let loop1 = outline(bfs@loop1);
+let loop2 = outline(bfs@loop2);
+gpu(cost_init, loop1, loop2);
 
-ip-sroa(*);
-sroa(*);
-dce(*);
-gvn(*);
-phi-elim(*);
-dce(*);
+simpl!(*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
+predication(*);
+simpl!(*);
 
-//forkify(*);
-infer-schedules(*);
+parallel-reduce(loop1);
+forkify(*);
+fork-guard-elim(*);
+simpl!(*);
+predication(*);
+reduce-slf(*);
+simpl!(*);
 
 gcm(*);
-fixpoint {
-  float-collections(*);
-  dce(*);
-  gcm(*);
-}
+xdot[true](*);
-- 
GitLab


From 2654f19895ca35203f8431d46613c5566e194e40 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Thu, 27 Feb 2025 22:50:22 -0600
Subject: [PATCH 2/9] bfs gpu.sch: drop cost_init from gpu(), unforkify it, remove xdot

---
 juno_samples/rodinia/bfs/src/gpu.sch | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 49d44b98..5be2d903 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -13,7 +13,7 @@ no-memset(bfs@cost);
 let cost_init = outline(bfs@cost_init);
 let loop1 = outline(bfs@loop1);
 let loop2 = outline(bfs@loop2);
-gpu(cost_init, loop1, loop2);
+gpu(loop1, loop2);
 
 simpl!(*);
 predication(*);
@@ -35,5 +35,5 @@ predication(*);
 reduce-slf(*);
 simpl!(*);
 
+unforkify(cost_init);
 gcm(*);
-xdot[true](*);
-- 
GitLab


From ab1eafd464dd73b2e9f81633e42ede950766f600 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Fri, 28 Feb 2025 09:52:12 -0600
Subject: [PATCH 3/9] fix unify

---
 hercules_opt/src/gcm.rs | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index 3f326051..1dcedd97 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -1302,9 +1302,7 @@ enum UTerm {
     Device(Device),
 }
 
-fn unify(
-    mut equations: VecDeque<(UTerm, UTerm)>,
-) -> Result<BTreeMap<NodeID, Device>, (NodeID, NodeID)> {
+fn unify(mut equations: VecDeque<(UTerm, UTerm)>) -> Result<BTreeMap<NodeID, Device>, NodeID> {
     let mut theta = BTreeMap::new();
 
     // First, assign devices to nodes when a rule directly says to.
@@ -1312,7 +1310,11 @@ fn unify(
         let (l, r) = equations.pop_front().unwrap();
         match (l, r) {
             (UTerm::Node(n), UTerm::Device(d)) | (UTerm::Device(d), UTerm::Node(n)) => {
-                theta.insert(n, d);
+                if let Some(old_d) = theta.insert(n, d)
+                    && old_d != d
+                {
+                    return Err(n);
+                }
             }
             _ => equations.push_back((l, r)),
         }
@@ -1333,7 +1335,7 @@ fn unify(
         match (theta.get(&l), theta.get(&r)) {
             (Some(ld), Some(rd)) => {
                 if ld != rd {
-                    return Err((l, r));
+                    return Err(l);
                 } else {
                     no_progress_iters = 0;
                 }
@@ -1545,12 +1547,11 @@ fn color_nodes(
             }
             Some(func_colors)
         }
-        Err(inconsistency) => {
+        Err(id) => {
             // If unification failed, then there's some node using a node in
-            // that's expecting a different type than what it got. Pick one and
-            // add potentially inter-device copies on each def-use edge. We'll
-            // clean these up later.
-            let id = inconsistency.0;
+            // question (`id`) that's expecting a different type than what it
+            // got. Add potentially inter-device copies on each def-use edge.
+            // We'll clean these up later.
             let users: Vec<_> = editor.get_users(id).collect();
             let success = editor.edit(|mut edit| {
                 let cons = edit.add_zero_constant(typing[id.idx()]);
-- 
GitLab


From 96c8dc524fe30a271e57a2f9d02f3759411767ad Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Fri, 28 Feb 2025 11:14:12 -0600
Subject: [PATCH 4/9] sequentializing channel loop in gamut in cava is
 beneficial

---
 juno_samples/cava/src/gpu.sch | 1 +
 1 file changed, 1 insertion(+)

diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index c8db124e..bacfd3ab 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -120,6 +120,7 @@ simpl!(fuse4);
 //fork-tile[2, 0, false, true](fuse4@channel_loop);
 //fork-split(fuse4@channel_loop);
 //clean-monoid-reduces(fuse4);
+unforkify(fuse4@channel_loop);
 
 no-memset(fuse5@res1);
 no-memset(fuse5@res2);
-- 
GitLab


From 28ea01a1afcbb630b570e360693defb3e76c70d9 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Fri, 28 Feb 2025 13:24:48 -0600
Subject: [PATCH 5/9] Infer more indices as parallel

---
 hercules_opt/src/utils.rs                     | 18 ++++++++++++++++++
 juno_samples/rodinia/backprop/src/backprop.jn |  4 ++--
 juno_samples/rodinia/backprop/src/cpu.sch     | 10 +++-------
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/hercules_opt/src/utils.rs b/hercules_opt/src/utils.rs
index e962b81d..b910a128 100644
--- a/hercules_opt/src/utils.rs
+++ b/hercules_opt/src/utils.rs
@@ -532,6 +532,24 @@ where
     let fork_thread_id_pairs = node_indices(indices).filter_map(|id| {
         if let Node::ThreadID { control, dimension } = nodes[id.idx()] {
             Some((control, dimension))
+        } else if let Node::Binary {
+            op: BinaryOperator::Add,
+            left: tid,
+            right: cons,
+        } = nodes[id.idx()]
+            && let Node::ThreadID { control, dimension } = nodes[tid.idx()]
+            && (nodes[cons.idx()].is_constant() || nodes[cons.idx()].is_dynamic_constant())
+        {
+            Some((control, dimension))
+        } else if let Node::Binary {
+            op: BinaryOperator::Add,
+            left: cons,
+            right: tid,
+        } = nodes[id.idx()]
+            && let Node::ThreadID { control, dimension } = nodes[tid.idx()]
+            && (nodes[cons.idx()].is_constant() || nodes[cons.idx()].is_dynamic_constant())
+        {
+            Some((control, dimension))
         } else {
             None
         }
diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn
index 356bb3d9..94c4334c 100644
--- a/juno_samples/rodinia/backprop/src/backprop.jn
+++ b/juno_samples/rodinia/backprop/src/backprop.jn
@@ -7,9 +7,9 @@ fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f
   @res let result : f32[m + 1];
   result[0] = 1.0;
 
-  for j in 1..=m {
+  @outer_loop for j in 1..=m {
     let sum = 0.0;
-    for k in 0..=n {
+    @inner_loop for k in 0..=n {
       sum += weights[k, j] * vals[k];
     }
     result[j] = squash(sum);
diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index d1fe8953..d59fd5f5 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -15,20 +15,16 @@ delete-uncalled(*);
 no-memset(layer_forward@res);
 lift-dc-math(*);
 loop-bound-canon(*);
-dce(*);
+simpl!(*);
 lift-dc-math(*);
+slf(*);
 fixpoint {
   forkify(*);
   fork-guard-elim(*);
   fork-coalesce(*);
 }
+simpl!(*);
 
 fork-split(*);
-gvn(*);
-phi-elim(*);
-dce(*);
 unforkify(*);
-gvn(*);
-phi-elim(*);
-dce(*);
 gcm(*);
-- 
GitLab


From 3565d96d31fa97f32892548167b359843fafb6dc Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Fri, 28 Feb 2025 14:28:50 -0600
Subject: [PATCH 6/9] Tweak edge multi-core

---
 juno_samples/edge_detection/src/cpu.sch | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/juno_samples/edge_detection/src/cpu.sch b/juno_samples/edge_detection/src/cpu.sch
index 4bd3254b..ec9e423d 100644
--- a/juno_samples/edge_detection/src/cpu.sch
+++ b/juno_samples/edge_detection/src/cpu.sch
@@ -86,7 +86,7 @@ fixpoint {
 simpl!(max_gradient);
 fork-dim-merge(max_gradient);
 simpl!(max_gradient);
-fork-tile[8, 0, false, false](max_gradient);
+fork-tile[16, 0, false, false](max_gradient);
 let split = fork-split(max_gradient);
 clean-monoid-reduces(max_gradient);
 let out = outline(split._4_max_gradient.fj1);
@@ -104,11 +104,18 @@ fixpoint {
 }
 predication(reject_zero_crossings);
 simpl!(reject_zero_crossings);
+fork-tile[4, 1, false, false](reject_zero_crossings);
+fork-tile[4, 0, false, false](reject_zero_crossings);
+fork-interchange[1, 2](reject_zero_crossings);
+let split = fork-split(reject_zero_crossings);
+let reject_zero_crossings_body = outline(split._5_reject_zero_crossings.fj2);
+fork-coalesce(reject_zero_crossings, reject_zero_crossings_body);
+simpl!(reject_zero_crossings, reject_zero_crossings_body);
 
 async-call(edge_detection@le, edge_detection@zc);
 
-fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings);
-unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings);
+fork-split(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body);
+unforkify(gaussian_smoothing_body, laplacian_estimate_body, zero_crossings_body, gradient, reject_zero_crossings_body);
 
 simpl!(*);
 
-- 
GitLab


From da2cf7557ad1b74556df070c76d16094b9f20256 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Fri, 28 Feb 2025 14:45:40 -0600
Subject: [PATCH 7/9] gcm moment to allow reduce-slf in CPU code without worry

---
 hercules_opt/src/gcm.rs               | 7 +++++++
 juno_samples/rodinia/srad/src/cpu.sch | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index 1dcedd97..2579e73e 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -883,6 +883,13 @@ fn spill_clones(
                         init == *b
                             && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce)
                     })
+                    .unwrap_or(false)
+                && !editor.func().nodes[a.idx()]
+                    .try_phi()
+                    .map(|(_, data)| {
+                        data.contains(b)
+                            && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce)
+                    })
                     .unwrap_or(false))
     });
 
diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index 2b45e8c9..a4cd4956 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -29,6 +29,8 @@ fixpoint {
 }
 simpl!(*);
 fork-interchange[0, 1](loop1);
+reduce-slf(*);
+simpl!(*);
 
 fork-split(*);
 unforkify(*);
-- 
GitLab


From 1b78bc2e9d2acce3e9b1eec8aed25489eb9aced5 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Fri, 28 Feb 2025 15:59:10 -0600
Subject: [PATCH 8/9] Refactor infer parallelreduce

---
 hercules_opt/src/schedule.rs              | 46 ++++++-----------------
 juno_samples/rodinia/backprop/src/cpu.sch |  1 +
 2 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/hercules_opt/src/schedule.rs b/hercules_opt/src/schedule.rs
index 9bc7823e..10eca72e 100644
--- a/hercules_opt/src/schedule.rs
+++ b/hercules_opt/src/schedule.rs
@@ -1,6 +1,6 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
+use std::iter::once;
 
-use hercules_ir::def_use::*;
 use hercules_ir::ir::*;
 
 use crate::*;
@@ -42,6 +42,10 @@ pub fn infer_parallel_reduce(
     fork_join_map: &HashMap<NodeID, NodeID>,
     reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>,
 ) {
+    let join_fork_map: HashMap<_, _> = fork_join_map
+        .into_iter()
+        .map(|(fork, join)| (*join, *fork))
+        .collect();
     for id in editor.node_ids() {
         let func = editor.func();
         if !func.nodes[id.idx()].is_reduce() {
@@ -98,40 +102,11 @@ pub fn infer_parallel_reduce(
             && *collect == last_reduce
             && !reduce_cycles[&last_reduce].contains(data)
         {
-            // If there is a Write-Reduce tight cycle, get the position indices.
-            let positions = indices
-                .iter()
-                .filter_map(|index| {
-                    if let Index::Position(indices) = index {
-                        Some(indices)
-                    } else {
-                        None
-                    }
-                })
-                .flat_map(|pos| pos.iter());
-
-            // Get the Forks corresponding to uses of bare ThreadIDs.
-            let fork_thread_id_pairs = positions.filter_map(|id| {
-                if let Node::ThreadID { control, dimension } = func.nodes[id.idx()] {
-                    Some((control, dimension))
-                } else {
-                    None
-                }
-            });
-            let mut forks = HashMap::<NodeID, Vec<usize>>::new();
-            for (fork, dim) in fork_thread_id_pairs {
-                forks.entry(fork).or_default().push(dim);
-            }
-
-            // Check if one of the Forks correspond to the Join associated with
-            // the Reduce being considered, and has all of its dimensions
-            // represented in the indexing.
-            let is_parallel = forks.into_iter().any(|(id, mut rep_dims)| {
-                rep_dims.sort();
-                rep_dims.dedup();
-                fork_join_map[&id] == first_control.unwrap()
-                    && func.nodes[id.idx()].try_fork().unwrap().1.len() == rep_dims.len()
-            });
+            let is_parallel = indices_parallel_over_forks(
+                editor,
+                indices,
+                once(join_fork_map[&first_control.unwrap()]),
+            );
 
             if is_parallel {
                 editor.edit(|edit| edit.add_schedule(id, Schedule::ParallelReduce));
@@ -145,6 +120,7 @@ pub fn infer_parallel_reduce(
  * operands must be the Reduce node, and all other operands must not be in the
  * Reduce node's cycle.
  */
+#[rustfmt::skip]
 pub fn infer_monoid_reduce(
     editor: &mut FunctionEditor,
     reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>,
diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index d59fd5f5..de34d660 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -23,6 +23,7 @@ fixpoint {
   fork-guard-elim(*);
   fork-coalesce(*);
 }
+reduce-slf(*);
 simpl!(*);
 
 fork-split(*);
-- 
GitLab


From eaf09eb1b58e1d159839aa0c4b474e7d62d5dd09 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Fri, 28 Feb 2025 16:31:57 -0600
Subject: [PATCH 9/9] gcm: match reduct in ParallelReduce check; unforkify cost_init earlier

---
 hercules_opt/src/gcm.rs              | 4 ++--
 juno_samples/rodinia/bfs/src/gpu.sch | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index 2579e73e..d950941a 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -879,8 +879,8 @@ fn spill_clones(
                     || editor.func().nodes[a.idx()].is_reduce())
                 && !editor.func().nodes[a.idx()]
                     .try_reduce()
-                    .map(|(_, init, _)| {
-                        init == *b
+                    .map(|(_, init, reduct)| {
+                        (init == *b || reduct == *b)
                             && editor.func().schedules[a.idx()].contains(&Schedule::ParallelReduce)
                     })
                     .unwrap_or(false)
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 5be2d903..6c4d027b 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -27,6 +27,7 @@ simpl!(*);
 predication(*);
 simpl!(*);
 
+unforkify(cost_init);
 parallel-reduce(loop1);
 forkify(*);
 fork-guard-elim(*);
@@ -35,5 +36,4 @@ predication(*);
 reduce-slf(*);
 simpl!(*);
 
-unforkify(cost_init);
 gcm(*);
-- 
GitLab