diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index cbf70eec682be6dff91016e0f1b3722ce1fea0fa..beb83f51691a91ac8e9c377709f55de8f7be117d 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -138,7 +138,6 @@ struct RTContext<'a> {
 #[derive(Debug, Clone, Default)]
 struct RustBlock {
     prologue: String,
-    arc_clones: String,
     data: String,
     phi_tmp_assignments: String,
     phi_assignments: String,
@@ -269,9 +268,8 @@ impl<'a> RTContext<'a> {
             } else {
                 write!(
                     w,
-                    "{}{}{}{}{}{}",
+                    "{}{}{}{}{}",
                     block.prologue,
-                    block.arc_clones,
                     block.data,
                     block.phi_tmp_assignments,
                     block.phi_assignments,
@@ -361,6 +359,16 @@ impl<'a> RTContext<'a> {
                     write!(prologue, " {{")?;
                 }
 
+                // Emit a clone of every arc that has at least one user inside this fork-join.
+                for other_id in (0..func.nodes.len()).map(NodeID::new) {
+                    if self.def_use.get_users(other_id).into_iter().any(|user_id| {
+                        self.nodes_in_fork_joins[&id].contains(&self.bbs.0[user_id.idx()])
+                    }) && let Some(arc) = self.clone_arc(other_id)
+                    {
+                        write!(prologue, "{}", arc)?;
+                    }
+                }
+
                 // Spawn an async closure and push its future to a Vec.
                 write!(
                     prologue,
@@ -569,13 +577,12 @@ impl<'a> RTContext<'a> {
                 // The device backends ensure that device functions have the
                 // same interface as AsyncRust functions.
                 let block = &mut blocks.get_mut(&bb).unwrap();
-                let arc_clones = &mut block.arc_clones;
                 let block = &mut block.data;
                 let is_async = func.schedules[id.idx()].contains(&Schedule::AsyncCall);
                 if is_async {
                     for arg in args {
                         if let Some(arc) = self.clone_arc(*arg) {
-                            write!(arc_clones, "{}", arc)?;
+                            write!(block, "{}", arc)?;
                         }
                     }
                 }
@@ -585,7 +592,7 @@ impl<'a> RTContext<'a> {
                         format!("{} = ", self.get_value(id, bb, true))
                     }
                     (_, true) => {
-                        write!(arc_clones, "{}", self.clone_arc(id).unwrap())?;
+                        write!(block, "{}", self.clone_arc(id).unwrap())?;
                         format!(
                             "*async_call_{}.lock().await = Some(::async_std::task::spawn(async move {{ ",
                             id.idx(),
diff --git a/juno_samples/fork_join_tests/src/cpu.sch b/juno_samples/fork_join_tests/src/cpu.sch
index c71aec111f42aad90c383f8b42f622037efac60c..76dcbdf68ce0ac88f8a2a22bda364f60a88e78bb 100644
--- a/juno_samples/fork_join_tests/src/cpu.sch
+++ b/juno_samples/fork_join_tests/src/cpu.sch
@@ -1,5 +1,3 @@
-no-memset(test6@const);
-
 ccp(*);
 gvn(*);
 phi-elim(*);
@@ -63,7 +61,9 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-fork-tile[32, 0, false, true](test6@loop);
+async-call(test6@call);
+no-memset(test6@const);
+fork-tile[2, 0, false, false](test6@loop);
 let out = fork-split(test6@loop);
 let out = outline(out.test6.fj1);
 cpu(out);
diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn
index 51115f1576edd1d555395717bdc1dcb4e82a2529..bfb5564be29e444d065c3caaa40afb760d678aa7 100644
--- a/juno_samples/fork_join_tests/src/fork_join_tests.jn
+++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn
@@ -73,11 +73,16 @@ fn test5(input : i32) -> i32[4] {
   return arr1;
 }
 
+fn test6_helper(input: i32) -> i32 {
+  return input;
+}
+
 #[entry]
 fn test6(input: i32) -> i32[1024] {
+  @call let x = test6_helper(input);
   @const let arr : i32[1024];
   @loop for i = 0 to 1024 {
-    arr[i] = i as i32 + input;
+    arr[i] = i as i32 + x;
   }
   return arr;
 }