diff --git a/hercules_opt/src/gcm.rs b/hercules_opt/src/gcm.rs
index 4a6365c8c68bd7714b529744be1cc2bd0259071a..4cee5e83a227d3c868bc7607146cdebfaeb0b4d5 100644
--- a/hercules_opt/src/gcm.rs
+++ b/hercules_opt/src/gcm.rs
@@ -221,6 +221,31 @@ fn preliminary_fixups(
         }
     }
 
+    // Add region nodes between join nodes and a following fork or
+    // multi-predecessor region (e.g. a loop header) to aid in block placement.
+    for (_, join) in fork_join_map {
+        let control_user = editor
+            .get_users(*join)
+            .filter(|id| nodes[id.idx()].is_control())
+            .next()
+            .unwrap();
+        if nodes[control_user.idx()].is_fork()
+            || nodes[control_user.idx()]
+                .try_region()
+                .map(|preds| preds.len() > 1)
+                .unwrap_or(false)
+        {
+            let success = editor.edit(|mut edit| {
+                let region = edit.add_node(Node::Region {
+                    preds: Box::new([*join]),
+                });
+                edit.replace_all_uses_where(*join, region, |id| *id == control_user)
+            });
+            assert!(success);
+            return true;
+        }
+    }
+
     false
 }
 
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index a5954ca03c32441709a2ce5452e9b2584f9c6155..9265808be132b0137293332fea7ba4a378bd1bd9 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -5,6 +5,7 @@ use std::future::Future;
 use std::marker::PhantomData;
 use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
 use std::slice::{from_raw_parts, from_raw_parts_mut};
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::OnceLock;
 
 /*
@@ -928,3 +929,58 @@ unsafe impl GlobalAlloc for AlignedAlloc {
 
 #[global_allocator]
 static A: AlignedAlloc = AlignedAlloc;
+
+/// A reusable busy-waiting barrier for a fixed number of participants.
+///
+/// Every participant calls [`SpinBarrier::wait`]; no call returns until `num`
+/// calls have arrived, after which the barrier can be used for another round.
+/// Waiters spin instead of parking, so this is only appropriate when every
+/// participant is expected to arrive promptly.
+#[derive(Debug)]
+pub struct SpinBarrier {
+    /// Number of `wait` calls needed to release one round.
+    num: usize,
+    /// Number of participants that have arrived in the current round.
+    waiting: AtomicUsize,
+    /// Round counter; bumped by the last arriver to release the spinners.
+    generation: AtomicUsize,
+}
+
+impl SpinBarrier {
+    /// Creates a barrier that releases once `num` participants are waiting.
+    ///
+    /// With `num == 0` the release condition is effectively never met, so
+    /// callers of [`SpinBarrier::wait`] keep spinning; pass at least 1.
+    pub const fn new(num: usize) -> Self {
+        SpinBarrier {
+            num,
+            waiting: AtomicUsize::new(0),
+            generation: AtomicUsize::new(0),
+        }
+    }
+
+    /// Blocks (by spinning) until `num` participants have called `wait`.
+    ///
+    /// Writes made by any participant before its call are visible to every
+    /// participant after the call returns.
+    pub fn wait(&self) {
+        // Snapshot the round before announcing arrival: once the counter is
+        // bumped the round may be released at any moment.
+        let round = self.generation.load(Ordering::Acquire);
+        // AcqRel: the release half publishes this thread's earlier writes,
+        // the acquire half lets the last arriver observe everyone else's, so
+        // they are forwarded to the spinners by the release below.
+        let arrived = self.waiting.fetch_add(1, Ordering::AcqRel) + 1;
+        if arrived == self.num {
+            // Reset before releasing so the next round starts from zero; the
+            // release increment orders this store before any re-entry.
+            self.waiting.store(0, Ordering::Relaxed);
+            self.generation.fetch_add(1, Ordering::Release);
+        } else {
+            while self.generation.load(Ordering::Acquire) == round {
+                // Tell the CPU we are busy-waiting.
+                std::hint::spin_loop();
+            }
+        }
+    }
+}
diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch
index 339782d6a84b6a43f31d81f5c9b23240dd10a292..2bd762b2ce8d3821a2971d8c7ad849fd722e5c5c 100644
--- a/juno_samples/rodinia/bfs/src/cpu.sch
+++ b/juno_samples/rodinia/bfs/src/cpu.sch
@@ -50,7 +50,7 @@ fork-tile[32, 0, false, true](init);
 let (outer, inner) = fork-reshape[[1], [0]](init);
 let init_body = outline(inner);
 
-inline(bfs@loop1, bfs@loop2);
+inline(bfs@cost_init, bfs@loop1, bfs@loop2);
 delete-uncalled(*);
 const-inline(*);
 simpl!(*);