diff --git a/hercules_ir/src/dot.rs b/hercules_ir/src/dot.rs
index 9dd2cb1e4c0ae76ac2d60bb9e638d95cca646016..22cd0beb7e5b6946c1116422441a0777f21f064b 100644
--- a/hercules_ir/src/dot.rs
+++ b/hercules_ir/src/dot.rs
@@ -219,7 +219,13 @@ fn write_subgraph_header<W: Write>(
     } else {
         write!(w, "label=\"{}\"\n", function.name)?;
     }
-    write!(w, "bgcolor=ivory4\n")?;
+    let color = match function.device {
+        Some(Device::LLVM) => "paleturquoise1",
+        Some(Device::CUDA) => "darkseagreen1",
+        Some(Device::AsyncRust) => "peachpuff1",
+        None => "ivory2",
+    };
+    write!(w, "bgcolor={}\n", color)?;
     write!(w, "cluster=true\n")?;
     Ok(())
 }
diff --git a/hercules_ir/src/loops.rs b/hercules_ir/src/loops.rs
index 13e935e0dd151ba3a29c4d07c9f9ee50341d5091..1d706c7834cf30fa3bf5e556d812917942a48d8b 100644
--- a/hercules_ir/src/loops.rs
+++ b/hercules_ir/src/loops.rs
@@ -152,16 +152,7 @@ pub fn loops(
         })
         .collect();
 
-    // Step 6: compute the inverse loop map - this maps control nodes to which
-    // loop they are in (keyed by header), if they are in one.
-    let mut inverse_loops = HashMap::new();
-    for (header, (contents, _)) in loops.iter() {
-        for idx in contents.iter_ones() {
-            inverse_loops.insert(NodeID::new(idx), *header);
-        }
-    }
-
-    // Step 7: compute loop tree nesting.
+    // Step 6: compute loop tree nesting.
     let mut nesting = HashMap::new();
     let mut worklist: VecDeque<NodeID> = loops.keys().map(|id| *id).collect();
     while let Some(header) = worklist.pop_front() {
@@ -175,6 +166,24 @@ pub fn loops(
         }
     }
 
+    // Step 7: compute the inverse loop map - this maps control nodes to which
+    // loop they are in (identified by header), if they are in one. Pick the
+    // most nested loop as the loop they are in.
+    let mut inverse_loops = HashMap::new();
+    for (header, (contents, _)) in loops.iter() {
+        for idx in contents.iter_ones() {
+            let id = NodeID::new(idx);
+            if let Some(old_header) = inverse_loops.get(&id) {
+                // Two different loops holding the same node must differ in depth.
+                assert!(nesting[old_header] != nesting[header] || old_header == header);
+                if nesting[old_header] > nesting[header] {
+                    continue; // The recorded header is more deeply nested; keep it.
+                }
+            }
+            inverse_loops.insert(id, *header);
+        }
+    }
+
     LoopTree {
         root,
         loops,
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index 5ea9485d108ea6454d856bf164d990ea5d7895f8..1323d5a05a784e76d4d3b040f014acd216c710c0 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -5,7 +5,6 @@ use bitvec::prelude::*;
 use either::Either;
 use union_find::{QuickFindUf, UnionBySize, UnionFind};
 
-use hercules_cg::*;
 use hercules_ir::*;
 
 use crate::*;
@@ -837,19 +836,16 @@ fn liveness_dataflow(
         liveness.insert(NodeID::new(bb_idx), vec![BTreeSet::new(); insts.len() + 1]);
     }
     let mut num_phis_reduces = vec![0; function.nodes.len()];
-    let mut reducing = vec![false; function.nodes.len()];
+    let mut has_phi = vec![false; function.nodes.len()];
+    let mut has_reduce = vec![false; function.nodes.len()];
     for (node_idx, bb) in bbs.0.iter().enumerate() {
         let node = &function.nodes[node_idx];
         if node.is_phi() || node.is_reduce() {
             num_phis_reduces[bb.idx()] += 1;
-            // Phis and reduces can't be in the same basic block.
-            if node.is_reduce() {
-                assert!(num_phis_reduces[bb.idx()] == 0 || reducing[bb.idx()]);
-                reducing[bb.idx()] = true;
-            } else {
-                assert!(!reducing[bb.idx()]);
-            }
         }
+        has_phi[bb.idx()] |= node.is_phi(); // OR in: many nodes map to one block.
+        has_reduce[bb.idx()] |= node.is_reduce(); // `=` would let the last node scanned win.
+        assert!(!has_phi[bb.idx()] || !has_reduce[bb.idx()]); // Never both in one block.
     }
     let is_obj = |id: NodeID| !objects[&func_id].objects(id).is_empty();
 
@@ -861,7 +857,7 @@ fn liveness_dataflow(
             let last_pt = bbs.1[bb.idx()].len();
             let old_value = &liveness[&bb][last_pt];
             let mut new_value = BTreeSet::new();
-            for succ in control_subgraph.succs(*bb).chain(if reducing[bb.idx()] {
+            for succ in control_subgraph.succs(*bb).chain(if has_reduce[bb.idx()] {
                 Either::Left(once(*bb))
             } else {
                 Either::Right(empty())
diff --git a/hercules_samples/dot/Cargo.toml b/hercules_samples/dot/Cargo.toml
index 69cd39e388661b3f7f6dca53cf9210ab7050902c..99a48115197ce853941223ed360079ec5376583e 100644
--- a/hercules_samples/dot/Cargo.toml
+++ b/hercules_samples/dot/Cargo.toml
@@ -4,6 +4,9 @@ version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = ["hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/hercules_samples/dot/build.rs b/hercules_samples/dot/build.rs
index 2a239bc6c3ebd3780cb15358375c59bdfb2e25ae..4cfd2a87fba14d3c542bb54806a65da2d1a9b8f5 100644
--- a/hercules_samples/dot/build.rs
+++ b/hercules_samples/dot/build.rs
@@ -4,6 +4,9 @@ fn main() {
     JunoCompiler::new()
         .ir_in_src("dot.hir")
         .unwrap()
+        // TODO(review): CUDA builds presumably should use .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - confirm.
+        .schedule_in_src("cpu.sch")
+        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/hercules_samples/dot/src/cpu.sch b/hercules_samples/dot/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..58a7266df5c71232aae41a969dcf286ec3a98385
--- /dev/null
+++ b/hercules_samples/dot/src/cpu.sch
@@ -0,0 +1,12 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+unforkify(*);
+dce(*);
+
+gcm(*);
diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..956eb99628a03a3efb3d77e97d93a8cb677bbd6a
--- /dev/null
+++ b/hercules_samples/dot/src/gpu.sch
@@ -0,0 +1,13 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+gpu(*);
+host(dot);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+
+gcm(*);
diff --git a/hercules_samples/matmul/Cargo.toml b/hercules_samples/matmul/Cargo.toml
index 9066c1535e2c40400bdb3b5ca20a3e38237ef597..89e46dd682024012942e6a5014cc5f2f6ec12b83 100644
--- a/hercules_samples/matmul/Cargo.toml
+++ b/hercules_samples/matmul/Cargo.toml
@@ -4,6 +4,9 @@ version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = ["hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/hercules_samples/matmul/build.rs b/hercules_samples/matmul/build.rs
index 08478deaac459d9a94f79fdabce37da9a1205f89..f895af867a019dfd23381a4df2d9a02f80a032f8 100644
--- a/hercules_samples/matmul/build.rs
+++ b/hercules_samples/matmul/build.rs
@@ -4,6 +4,9 @@ fn main() {
     JunoCompiler::new()
         .ir_in_src("matmul.hir")
         .unwrap()
+        // TODO(review): CUDA builds presumably should use .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) - confirm.
+        .schedule_in_src("cpu.sch")
+        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/hercules_samples/matmul/src/cpu.sch b/hercules_samples/matmul/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..42dda6e3fc02b23e72ca31ef89a83f020bc9bebc
--- /dev/null
+++ b/hercules_samples/matmul/src/cpu.sch
@@ -0,0 +1,14 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+fork-split(*);
+unforkify(*);
+dce(*);
+float-collections(*);
+
+gcm(*);
diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..9067a1908c6615a56f917cb4eb435ace93e9ba3a
--- /dev/null
+++ b/hercules_samples/matmul/src/gpu.sch
@@ -0,0 +1,15 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+gpu(*);
+host(matmul);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+
+gcm(*);
+xdot[true](*);