diff --git a/juno_samples/fork_join_tests/src/cpu.sch b/juno_samples/fork_join_tests/src/cpu.sch
index 76dcbdf68ce0ac88f8a2a22bda364f60a88e78bb..f46c91d6a84a08b2258332af1dc5d6a662d86639 100644
--- a/juno_samples/fork_join_tests/src/cpu.sch
+++ b/juno_samples/fork_join_tests/src/cpu.sch
@@ -3,7 +3,7 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8);
+let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9);
 cpu(auto.test1);
 cpu(auto.test2);
 cpu(auto.test3);
@@ -11,6 +11,7 @@ cpu(auto.test4);
 cpu(auto.test5);
 cpu(auto.test7);
 cpu(auto.test8);
+cpu(auto.test9);
 
 let test1_cpu = auto.test1;
 rename["test1_cpu"](test1_cpu);
@@ -51,11 +52,11 @@ fixpoint panic after 20 {
   unroll(auto.test1);
 }
 
-fork-split(auto.test2, auto.test3, auto.test4, auto.test5);
+fork-split(auto.test2, auto.test3, auto.test4, auto.test5, auto.test9);
 gvn(*);
 phi-elim(*);
 dce(*);
-unforkify(auto.test2, auto.test3, auto.test4, auto.test5);
+unforkify(auto.test2, auto.test3, auto.test4, auto.test5, auto.test9);
 ccp(*);
 gvn(*);
 phi-elim(*);
@@ -93,4 +94,6 @@ dce(auto.test8);
 simplify-cfg(auto.test8);
 dce(auto.test8);
 
+no-memset(test9@const);
+
 gcm(*);
diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn
index bfb5564be29e444d065c3caaa40afb760d678aa7..3b7c783308cd1949651afb1a7f0cdd708f3f71d7 100644
--- a/juno_samples/fork_join_tests/src/fork_join_tests.jn
+++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn
@@ -122,3 +122,27 @@ fn test8(input : i32) -> i32[8] {
   }
   return out;
 }
+
+#[entry]
+fn test9<r, c, z : usize>(input : i32[r, c]) -> i32[r, c] { // out[ir, ic] = sum of the z-by-z window of taps around input[ir, ic]; taps outside the array add 0
+  const rad = z / 2; // window offset: tap (sr, sc) reads input[ir + sr - rad, ic + sc - rad]
+  @const let out : i32[r, c]; // labeled `const`; the cpu/gpu schedules refer to it as test9@const in no-memset
+
+  for ir = 0 to r { // output row
+    for ic = 0 to c { // output column
+      let acc = 0; // running window sum for cell (ir, ic)
+      @filter_loop for sr = 0 to z { // window row tap
+        for sc = 0 to z { // window column tap; the chain below yields 0 when the shifted row/column index is below 0 or above r - 1 / c - 1
+	  acc += if ir + sr < rad then 0
+	         else if ir + sr - rad > r - 1 then 0
+	         else if ic + sc < rad then 0
+	         else if ic + sc - rad > c - 1 then 0
+		 else input[ir + sr - rad, ic + sc - rad];
+	}
+      }
+      out[ir, ic] = acc; // every cell of out is assigned exactly once by the two outer loops
+    }
+  }
+
+  return out;
+}
diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch
index 364673cd2aedf323ca7727dc1ea8515188373a8d..c554fd502292c040a405e0ffafb548844d0f1d5f 100644
--- a/juno_samples/fork_join_tests/src/gpu.sch
+++ b/juno_samples/fork_join_tests/src/gpu.sch
@@ -7,12 +7,13 @@ no-memset(test3@const3);
 no-memset(test6@const);
 no-memset(test8@const1);
 no-memset(test8@const2);
+no-memset(test9@const);
 
 gvn(*);
 phi-elim(*);
 dce(*);
 
-let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8);
+let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9);
 gpu(auto.test1);
 gpu(auto.test2);
 gpu(auto.test3);
@@ -20,6 +21,7 @@ gpu(auto.test4);
 gpu(auto.test5);
 gpu(auto.test7);
 gpu(auto.test8);
+gpu(auto.test9);
 
 ip-sroa(*);
 sroa(*);
@@ -34,7 +36,7 @@ fixpoint panic after 20 {
 }
 
 fixpoint panic after 20 {
-  fork-coalesce(auto.test1, auto.test3, auto.test4, auto.test5, auto.test7, auto.test8);
+  fork-coalesce(auto.test1, auto.test3, auto.test4, auto.test5, auto.test7, auto.test8, auto.test9);
 }
 
 gvn(*);
diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs
index cd715cace80ba3e58d0971b7956ec1e957b839e2..fa99f759ca08814215d1b138758f833f83c38c5f 100644
--- a/juno_samples/fork_join_tests/src/main.rs
+++ b/juno_samples/fork_join_tests/src/main.rs
@@ -1,6 +1,6 @@
 #![feature(concat_idents)]
 
-use hercules_rt::runner;
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
 
 juno_build::juno!("fork_join_tests");
 
@@ -57,6 +57,23 @@ fn main() {
         let output = r.run(0).await;
         let correct = vec![10, 17, 24, 31, 38, 45, 52, 59];
         assert(&correct, output);
+
+        let mut r = runner!(test9);
+        let input = vec![1, 2, 3, 4, 5, 6, 7, 8, 9];
+        let input = HerculesImmBox::from(&input as &[i32]);
+        let output = r.run(3, 3, 3, input.to()).await;
+        let correct = vec![
+            1 + 2 + 4 + 5,
+            1 + 2 + 3 + 4 + 5 + 6,
+            2 + 3 + 5 + 6,
+            1 + 2 + 4 + 5 + 7 + 8,
+            1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
+            2 + 3 + 5 + 6 + 8 + 9,
+            4 + 5 + 7 + 8,
+            4 + 5 + 6 + 7 + 8 + 9,
+            5 + 6 + 8 + 9,
+        ];
+        assert(&correct, output);
     });
 }