diff --git a/Cargo.lock b/Cargo.lock
index 61cde7f161b7c4177cb781addeb2f484af3b7477..acc9783042434d0569ed2fe7f1f577a43fc597d2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -23,12 +23,6 @@ version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1"
 
-[[package]]
-name = "allocator-api2"
-version = "0.2.21"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
-
 [[package]]
 name = "anes"
 version = "0.1.6"
@@ -684,27 +678,6 @@ version = "1.0.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
 
-[[package]]
-name = "egg"
-version = "0.10.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abb749745461743bb477fba3ef87c663d5965876155c676c9489cfe0963de5ab"
-dependencies = [
- "env_logger",
- "hashbrown",
- "indexmap",
- "log",
- "num-bigint",
- "num-traits",
- "quanta",
- "rustc-hash",
- "saturating",
- "smallvec",
- "symbol_table",
- "symbolic_expressions",
- "thiserror",
-]
-
 [[package]]
 name = "either"
 version = "1.13.0"
@@ -723,15 +696,6 @@ version = "0.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d"
 
-[[package]]
-name = "env_logger"
-version = "0.9.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7"
-dependencies = [
- "log",
-]
-
 [[package]]
 name = "equivalent"
 version = "1.0.2"
@@ -845,12 +809,6 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
 
-[[package]]
-name = "foldhash"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
-
 [[package]]
 name = "funty"
 version = "2.0.0"
@@ -975,11 +933,6 @@ name = "hashbrown"
 version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
-dependencies = [
- "allocator-api2",
- "equivalent",
- "foldhash",
-]
 
 [[package]]
 name = "heapless"
@@ -1047,7 +1000,6 @@ version = "0.1.0"
 dependencies = [
  "bimap",
  "bitvec",
- "egg",
  "either",
  "hercules_cg",
  "hercules_ir",
@@ -2117,21 +2069,6 @@ dependencies = [
  "bytemuck",
 ]
 
-[[package]]
-name = "quanta"
-version = "0.12.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
-dependencies = [
- "crossbeam-utils",
- "libc",
- "once_cell",
- "raw-cpuid",
- "wasi 0.11.0+wasi-snapshot-preview1",
- "web-sys",
- "winapi",
-]
-
 [[package]]
 name = "quick-error"
 version = "2.0.1"
@@ -2266,15 +2203,6 @@ dependencies = [
  "rgb",
 ]
 
-[[package]]
-name = "raw-cpuid"
-version = "11.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "529468c1335c1c03919960dfefdb1b3648858c20d7ec2d0663e728e4a717efbc"
-dependencies = [
- "bitflags 2.8.0",
-]
-
 [[package]]
 name = "rayon"
 version = "1.10.0"
@@ -2339,12 +2267,6 @@ version = "0.8.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a"
 
-[[package]]
-name = "rustc-hash"
-version = "2.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
-
 [[package]]
 name = "rustc_version"
 version = "0.4.1"
@@ -2388,12 +2310,6 @@ dependencies = [
  "winapi-util",
 ]
 
-[[package]]
-name = "saturating"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ece8e78b2f38ec51c51f5d475df0a7187ba5111b2a28bdc761ee05b075d40a71"
-
 [[package]]
 name = "scopeguard"
 version = "1.2.0"
@@ -2537,23 +2453,6 @@ version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
-[[package]]
-name = "symbol_table"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f19bffd69fb182e684d14e3c71d04c0ef33d1641ac0b9e81c712c734e83703bc"
-dependencies = [
- "crossbeam-utils",
- "foldhash",
- "hashbrown",
-]
-
-[[package]]
-name = "symbolic_expressions"
-version = "5.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c68d531d83ec6c531150584c42a4290911964d5f0d79132b193b67252a23b71"
-
 [[package]]
 name = "syn"
 version = "1.0.109"
@@ -2938,22 +2837,6 @@ version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082"
 
-[[package]]
-name = "winapi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
-dependencies = [
- "winapi-i686-pc-windows-gnu",
- "winapi-x86_64-pc-windows-gnu",
-]
-
-[[package]]
-name = "winapi-i686-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-
 [[package]]
 name = "winapi-util"
 version = "0.1.9"
@@ -2963,12 +2846,6 @@ dependencies = [
  "windows-sys",
 ]
 
-[[package]]
-name = "winapi-x86_64-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
-
 [[package]]
 name = "windows"
 version = "0.59.0"
diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index 37bf814d85569a5aa8dd827e2fce17894da5a397..c5ffe76034775fccab6b3353a9a6a83625a4ab4b 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -146,7 +146,7 @@ impl<'a> CPUContext<'a> {
                 self.function.name,
             )?;
         }
-        write!(w, ") {{\n")?;
+        write!(w, ") nounwind nosync willreturn norecurse {{\n")?;
 
         let mut blocks: BTreeMap<_, _> = (0..self.function.nodes.len())
             .filter(|idx| self.function.nodes[*idx].is_control())
@@ -346,7 +346,7 @@ impl<'a> CPUContext<'a> {
                     let offset = offsets[&id].0;
                     write!(
                         body,
-                        "  {} = getelementptr i8, ptr %backing, i64 %dc{}\n",
+                        "  {} = getelementptr inbounds i8, ptr %backing, i64 %dc{}\n",
                         self.get_value(id, false),
                         offset.idx()
                     )?;
@@ -473,11 +473,14 @@ impl<'a> CPUContext<'a> {
 
                 let opcode = match (op, op_ty) {
                     (BinaryOperator::Add, OpTy::Float) => "fadd",
-                    (BinaryOperator::Add, _) => "add",
+                    (BinaryOperator::Add, OpTy::Unsigned) => "add nuw",
+                    (BinaryOperator::Add, OpTy::Signed) => "add nsw",
                     (BinaryOperator::Sub, OpTy::Float) => "fsub",
-                    (BinaryOperator::Sub, _) => "sub",
+                    (BinaryOperator::Sub, OpTy::Unsigned) => "sub nuw",
+                    (BinaryOperator::Sub, OpTy::Signed) => "sub nsw",
                     (BinaryOperator::Mul, OpTy::Float) => "fmul",
-                    (BinaryOperator::Mul, _) => "mul",
+                    (BinaryOperator::Mul, OpTy::Unsigned) => "mul nuw",
+                    (BinaryOperator::Mul, OpTy::Signed) => "mul nsw",
                     (BinaryOperator::Div, OpTy::Float) => "fdiv",
                     (BinaryOperator::Div, OpTy::Unsigned) => "udiv",
                     (BinaryOperator::Div, OpTy::Signed) => "sdiv",
@@ -1005,7 +1008,7 @@ impl<'a> CPUContext<'a> {
         let name = format!("%gep.{}", Self::gen_filler_id());
         write!(
             body,
-            "  {} = getelementptr i8, ptr {}, i64 {}\n",
+            "  {} = getelementptr inbounds i8, ptr {}, i64 {}\n",
             name, ptr, size
         )?;
         Ok(name)
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index dd87acbe18de297557fc8b97696928946e90c2e2..4069cb02bcf1ed0a93f5c40df432253e842ead9a 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -354,7 +354,6 @@ impl GPUContext<'_> {
         write!(
             w,
             "
-#define _CG_ABI_EXPERIMENTAL
 #include <assert.h>
 #include <stdio.h>
 #include <stddef.h>
@@ -362,8 +361,23 @@ impl GPUContext<'_> {
 #include <cuda_runtime.h>
 #include <math_constants.h>
 #include <mma.h>
+
+#if (CUDA_VERSION >= 12000)
+#else
+#define _CG_ABI_EXPERIMENTAL
+#endif
+
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+
+#if (CUDA_VERSION >= 12000)
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups;
+#else
+namespace cg = cooperative_groups;
+namespace cge = cooperative_groups::experimental;
+#endif
+
 #include <cuda_bf16.h>
 namespace cg = cooperative_groups;
 
@@ -564,12 +578,12 @@ namespace cg = cooperative_groups;
     fn codegen_helpers(&self, w: &mut String) -> Result<(), Error> {
         write!(
             w,
-            "\t__shared__ cg::experimental::block_tile_memory<1024> block_sync_shared;\n"
+            "\t__shared__ cge::block_tile_memory<1024> block_sync_shared;\n"
         )?;
         write!(w, "\tcg::grid_group grid = cg::this_grid();\n")?;
         write!(
             w,
-            "\tcg::thread_block block = cg::experimental::this_thread_block(block_sync_shared);\n"
+            "\tcg::thread_block block = cge::this_thread_block(block_sync_shared);\n"
         )?;
         Ok(())
     }
@@ -1715,20 +1729,20 @@ namespace cg = cooperative_groups;
                     };
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         use_thread_per_id, cg_tile, use_thread_per_id
                     )?;
                     let cg_tile_use = self.get_cg_tile(id, CGType::Use);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         use_thread_quota, cg_tile_use, use_thread_quota
                     )?;
                     let available_thread_quota = available_thread_quota.unwrap();
                     let cg_tile_available = self.get_cg_tile(id, CGType::Available);
                     write!(
                         thread_block_tiles,
-                        "\tcg::thread_block_tile<{}> {} = cg::experimental::tiled_partition<{}>(block);\n",
+                        "\tcg::thread_block_tile<{}> {} = cge::tiled_partition<{}>(block);\n",
                         available_thread_quota, cg_tile_available, available_thread_quota
                     )?;
                     if parallel_factor.is_none() {
diff --git a/hercules_opt/Cargo.toml b/hercules_opt/Cargo.toml
index 92e0533938eab0cc76fe4b69a205f4dceae2f244..892e4c1539f34a91087472b052f015f542a351a6 100644
--- a/hercules_opt/Cargo.toml
+++ b/hercules_opt/Cargo.toml
@@ -22,4 +22,4 @@ hercules_cg = { path = "../hercules_cg" }
 hercules_ir = { path = "../hercules_ir" }
 nestify = "*"
 bimap = "*"
-egg = "*"
+#egg = "*"
diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index 4a6365c8c68bd7714b529744be1cc2bd0259071a..4cee5e83a227d3c868bc7607146cdebfaeb0b4d5 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -221,6 +221,31 @@ fn preliminary_fixups(
         }
     }
 
+    // Add region nodes between join nodes and loop headers to aid in block
+    // placement.
+    for (_, join) in fork_join_map {
+        let control_user = editor
+            .get_users(*join)
+            .filter(|id| nodes[id.idx()].is_control())
+            .next()
+            .unwrap();
+        if nodes[control_user.idx()].is_fork()
+            || nodes[control_user.idx()]
+                .try_region()
+                .map(|preds| preds.len() > 1)
+                .unwrap_or(false)
+        {
+            let success = editor.edit(|mut edit| {
+                let region = edit.add_node(Node::Region {
+                    preds: Box::new([*join]),
+                });
+                edit.replace_all_uses_where(*join, region, |id| *id == control_user)
+            });
+            assert!(success);
+            return true;
+        }
+    }
+
     false
 }
 
diff --git a/hercules_opt/src/rewrite_math_expressions.rs b/hercules_opt/src/rewrite_math_expressions.rs
index 6f52dc58ee32751b96190b42a0add01d42ce3b1e..55e695ba75704395c5fde371449fad940310ad43 100644
--- a/hercules_opt/src/rewrite_math_expressions.rs
+++ b/hercules_opt/src/rewrite_math_expressions.rs
@@ -3,10 +3,22 @@ use std::fmt::{Error, Write};
 
 use hercules_ir::*;
 
-use egg::*;
+//use egg::*;
 
 use crate::*;
 
+pub fn rewrite_math_expressions(
+    editor: &mut FunctionEditor,
+    device: Device,
+    typing: &Vec<TypeID>,
+    fork_join_map: &HashMap<NodeID, NodeID>,
+    nodes_in_fork_joins: &HashMap<NodeID, HashSet<NodeID>>,
+    reduce_einsums: &(MathEnv, HashMap<NodeID, MathID>),
+) {
+    panic!("PANIC: The rewrite math expressions pass is currently disabled, as including egg increases compile times and we're not using it currently.");
+}
+
+/*
 define_language! {
     enum MathLanguage {
         "zero" = Zero,
@@ -164,3 +176,4 @@ fn egg_print_math_expr<W: Write>(id: MathID, env: &MathEnv, w: &mut W) -> Result
         _ => Err(Error::default()),
     }
 }
+*/
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index a5954ca03c32441709a2ce5452e9b2584f9c6155..9265808be132b0137293332fea7ba4a378bd1bd9 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -5,6 +5,7 @@ use std::future::Future;
 use std::marker::PhantomData;
 use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
 use std::slice::{from_raw_parts, from_raw_parts_mut};
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::OnceLock;
 
 /*
@@ -928,3 +929,30 @@ unsafe impl GlobalAlloc for AlignedAlloc {
 
 #[global_allocator]
 static A: AlignedAlloc = AlignedAlloc;
+
+pub struct SpinBarrier {
+    num: usize,
+    waiting: AtomicUsize,
+    gen: AtomicUsize,
+}
+
+impl SpinBarrier {
+    pub const fn new(num: usize) -> Self {
+        SpinBarrier {
+            num,
+            waiting: AtomicUsize::new(0),
+            gen: AtomicUsize::new(0),
+        }
+    }
+
+    pub fn wait(&self) {
+        let old_gen = self.gen.load(Ordering::Acquire);
+        let old_waiting = self.waiting.fetch_add(1, Ordering::Relaxed);
+        if old_waiting + 1 == self.num {
+            self.waiting.store(0, Ordering::Relaxed);
+            self.gen.fetch_add(1, Ordering::Release);
+        } else {
+            while old_gen == self.gen.load(Ordering::Acquire) {}
+        }
+    }
+}
diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch
index 306997f58eb217f9ce301dc18e418c412e6df621..6867576e4a3a81f327d23eca6dccbcd6a6183eb4 100644
--- a/juno_samples/matmul/src/matmul.sch
+++ b/juno_samples/matmul/src/matmul.sch
@@ -51,7 +51,9 @@ if feature("cuda") {
   fork-coalesce(*);
   infer-schedules(*);
   dce(*);
-  rewrite(*);
+  //rewrite(*);
+  let out = outline(matmul@outer);
+  gpu(out);
   fixpoint {
     simplify-cfg(*);
     dce(*);
diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn
index d6ec25f26b997205d4dcdf116bce462fa228fc88..75190300e4d8be463176c4c346d89715cece0ced 100644
--- a/juno_samples/rodinia/bfs/src/bfs.jn
+++ b/juno_samples/rodinia/bfs/src/bfs.jn
@@ -24,7 +24,7 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n]
   @cost_init for i in 0..n {
     cost[i] = -1;
   }
-  cost[source as u64] = 0;
+  @cost_init cost[source as u64] = 0;
 
   // Nodes that were updated in the current iteration
   let updated: bool[n];
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index 550dce319d3c5fe2c14168af825ad6417d6f8a6f..63b2e4fed6db89a091488117cdb6c17a14582c30 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -47,16 +47,15 @@ if !feature("seq") {
   let (outer, inner) = fork-reshape[[1], [0]](collect);
   collect = outline(inner);
 
-  // Following code seems to generate breaking RT code
-  //fork-tile[32, 0, false, true](init);
-  //let (outer, inner) = fork-reshape[[1], [0]](init);
-  //init = outline(inner);
-  //inline(bfs@cost_init);
+  fork-tile[32, 0, false, true](init);
+  let (outer, inner) = fork-reshape[[1], [0]](init);
+  let init_body = outline(inner);
 
-  inline(bfs@loop1, bfs@loop2);
+  inline(bfs@cost_init, bfs@loop1, bfs@loop2);
 }
 delete-uncalled(*);
 const-inline(*);
+simpl!(*);
 
 unforkify(init, traverse, collect);
 simpl!(*);
diff --git a/juno_samples/rodinia/bfs/src/gpu.sch b/juno_samples/rodinia/bfs/src/gpu.sch
index 4e5c1f74e34cedacd48ddf6c33b631a08079b995..0253a0210f6cc2451d38399601ad39ff3ab9465a 100644
--- a/juno_samples/rodinia/bfs/src/gpu.sch
+++ b/juno_samples/rodinia/bfs/src/gpu.sch
@@ -38,8 +38,12 @@ fixpoint {
 }
 simpl!(collect);
 
+fork-tile[32, 0, false, true](init);
+let (outer, inner) = fork-reshape[[1], [0]](init);
+let init_body = outline(inner);
+
 fork-tile[1024, 0, false, true](traverse, collect);
 fork-split(traverse, collect);
 
-unforkify(init);
+unforkify(init_body);
 gcm(*);