diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 6981a3da7e59176f73d6fecdde07fe636cc6aecf..d94f0e19191a028dfaded785c460164513b712a4 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -938,7 +938,7 @@ impl<'a> RTContext<'a> {
                     let dst_device = self.node_colors.0[&collect];
                     write!(
                         block,
-                        "::hercules_rt::__copy_{}_to_{}({}.byte_add({} as usize).0, {}.0, {});",
+                        "::hercules_rt::__copy_{}_to_{}({}.byte_add({} as usize).0, {}.0, {} as usize);",
                         src_device.name(),
                         dst_device.name(),
                         self.get_value(collect, bb, false),
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index c612acac1e114fcb3a73edcf0468d2c6f7c84acd..3f326051a228d731c47e5ae72aac4a3892c363e8 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -1304,37 +1304,49 @@ enum UTerm {
 
 fn unify(
     mut equations: VecDeque<(UTerm, UTerm)>,
-) -> Result<BTreeMap<NodeID, Device>, BTreeMap<NodeID, Device>> {
+) -> Result<BTreeMap<NodeID, Device>, (NodeID, NodeID)> {
     let mut theta = BTreeMap::new();
 
+    // First, assign devices to nodes when a rule directly says to.
+    for _ in 0..equations.len() {
+        let (l, r) = equations.pop_front().unwrap();
+        match (l, r) {
+            (UTerm::Node(n), UTerm::Device(d)) | (UTerm::Device(d), UTerm::Node(n)) => {
+                theta.insert(n, d);
+            }
+            _ => equations.push_back((l, r)),
+        }
+    }
+
+    // Second, iterate the rest of the rules until...
+    // 1. The rules are exhausted. All the nodes have device assignments.
+    // 2. No progress is being made. Some nodes may not have device assignments.
+    // 3. An inconsistency has been found. The inconsistency is returned.
     let mut no_progress_iters = 0;
     while no_progress_iters <= equations.len()
         && let Some((l, r)) = equations.pop_front()
     {
-        match (l, r) {
-            (UTerm::Node(_), UTerm::Node(_)) => {
-                if l != r {
-                    equations.push_back((l, r));
-                }
-                no_progress_iters += 1;
-            }
-            (UTerm::Node(n), UTerm::Device(d)) | (UTerm::Device(d), UTerm::Node(n)) => {
-                theta.insert(n, d);
-                for (l, r) in equations.iter_mut() {
-                    if *l == UTerm::Node(n) {
-                        *l = UTerm::Device(d);
-                    }
-                    if *r == UTerm::Node(n) {
-                        *r = UTerm::Device(d);
-                    }
+        let (UTerm::Node(l), UTerm::Node(r)) = (l, r) else {
+            panic!();
+        };
+
+        match (theta.get(&l), theta.get(&r)) {
+            (Some(ld), Some(rd)) => {
+                if ld != rd {
+                    return Err((l, r));
+                } else {
+                    no_progress_iters = 0;
                 }
-                no_progress_iters = 0;
             }
-            (UTerm::Device(d1), UTerm::Device(d2)) if d1 == d2 => {
+            (Some(d), None) | (None, Some(d)) => {
+                let d = *d;
+                theta.insert(l, d);
+                theta.insert(r, d);
                 no_progress_iters = 0;
             }
-            _ => {
-                return Err(theta);
+            (None, None) => {
+                equations.push_back((UTerm::Node(l), UTerm::Node(r)));
+                no_progress_iters += 1;
             }
         }
     }
@@ -1377,8 +1389,8 @@ fn color_nodes(
             } if !editor.get_type(typing[id.idx()]).is_primitive() => {
                 // Every input to a phi needs to be on the same device. The
                 // phi itself is also on this device.
-                for (l, r) in zip(data.into_iter(), data.into_iter().skip(1).chain(once(&id))) {
-                    equations.push((UTerm::Node(*l), UTerm::Node(*r)));
+                for data in data {
+                    equations.push((UTerm::Node(*data), UTerm::Node(id)));
                 }
             }
             Node::Reduce {
@@ -1394,7 +1406,7 @@ fn color_nodes(
             } if !editor.get_type(typing[id.idx()]).is_primitive() => {
                 // Every input to the reduce, and the reduce itself, are on
                 // the same device.
-                equations.push((UTerm::Node(first), UTerm::Node(second)));
+                equations.push((UTerm::Node(first), UTerm::Node(id)));
                 equations.push((UTerm::Node(second), UTerm::Node(id)));
             }
             Node::Constant { id: _ }
@@ -1533,12 +1545,12 @@ fn color_nodes(
             }
             Some(func_colors)
         }
-        Err(progress) => {
+        Err(inconsistency) => {
             // If unification failed, then there's some node using a node in
-            // `progress` that's expecting a different type than what it got.
-            // Pick one and add potentially inter-device copies on each def-use
-            // edge. We'll clean these up later.
-            let (id, _) = progress.into_iter().next().unwrap();
+            // that's expecting a different type than what it got. Pick one and
+            // add potentially inter-device copies on each def-use edge. We'll
+            // clean these up later.
+            let id = inconsistency.0;
             let users: Vec<_> = editor.get_users(id).collect();
             let success = editor.edit(|mut edit| {
                 let cons = edit.add_zero_constant(typing[id.idx()]);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 0a3f4d7737ba22b77443e1f8e7b35ded35040e4c..49d44b98b3fe0ca48d57708a99c9c0763aa09b25 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -1,23 +1,39 @@
-gvn(*);
-phi-elim(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  infer-schedules(X);
+}
 
-let outline = auto-outline(bfs);
-gpu(outline.bfs);
+phi-elim(bfs);
+no-memset(bfs@cost);
+let cost_init = outline(bfs@cost_init);
+let loop1 = outline(bfs@loop1);
+let loop2 = outline(bfs@loop2);
+gpu(cost_init, loop1, loop2);
 
-ip-sroa(*);
-sroa(*);
-dce(*);
-gvn(*);
-phi-elim(*);
-dce(*);
+simpl!(*);
+predication(*);
+const-inline(*);
+simpl!(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+}
+simpl!(*);
+predication(*);
+simpl!(*);
 
-//forkify(*);
-infer-schedules(*);
+parallel-reduce(loop1);
+forkify(*);
+fork-guard-elim(*);
+simpl!(*);
+predication(*);
+reduce-slf(*);
+simpl!(*);
 
 gcm(*);
-fixpoint {
-  float-collections(*);
-  dce(*);
-  gcm(*);
-}
+xdot[true](*);