diff --git a/hercules_ir/src/einsum.rs b/hercules_ir/src/einsum.rs
index 6c2ca31b48dc7f362d60404718ee3d2e00bd27fb..3922506b5815ee82655898ad0bc294e2282daf66 100644
--- a/hercules_ir/src/einsum.rs
+++ b/hercules_ir/src/einsum.rs
@@ -435,10 +435,9 @@ pub fn debug_print_math_expr(id: MathID, env: &MathEnv) {
         }
         MathExpr::IntrinsicFunc(intrinsic, ref args) => {
             print!("{}(", intrinsic.lower_case_name());
-            debug_print_math_expr(id, env);
             for arg in args {
-                print!(", ");
                 debug_print_math_expr(*arg, env);
+                print!(", ");
             }
             print!(")");
         }
diff --git a/hercules_ir/src/typecheck.rs b/hercules_ir/src/typecheck.rs
index b2567b8f1313e97aeb70268b11847b6d70b1a2f5..1ecebf117548b0ceeecc2ca2883a28129a8a5184 100644
--- a/hercules_ir/src/typecheck.rs
+++ b/hercules_ir/src/typecheck.rs
@@ -822,7 +822,7 @@ fn typeflow(
             // We also return the return type from here
             match intrinsic {
                 // Intrinsics that take any numeric type and return the same
-                Intrinsic::Abs => {
+                Intrinsic::Abs | Intrinsic::Max | Intrinsic::Min => {
                     if let Concrete(id) = inputs[0] {
                         if types[id.idx()].is_arithmetic() {
                             Concrete(*id)
@@ -856,8 +856,6 @@ fn typeflow(
                 | Intrinsic::Ln1P
                 | Intrinsic::Log10
                 | Intrinsic::Log2
-                | Intrinsic::Max
-                | Intrinsic::Min
                 | Intrinsic::Round
                 | Intrinsic::Sin
                 | Intrinsic::Sinh
diff --git a/hercules_opt/src/forkify.rs b/hercules_opt/src/forkify.rs
index a6308d66eae3bd47b3d607fb8970679843599168..189670deed876b5fd7d9d9991e2d7a4aba9cad76 100644
--- a/hercules_opt/src/forkify.rs
+++ b/hercules_opt/src/forkify.rs
@@ -106,7 +106,7 @@ pub fn forkify_loop(
     else {
         return false;
     };
-
+    
     // Compute loop variance
     let loop_variance = compute_loop_variance(editor, l);
     let ivs = compute_induction_vars(editor.func(), l, &loop_variance);
@@ -530,10 +530,11 @@ pub fn analyze_phis<'a>(
         let intersection: HashSet<_> = set1.intersection(&set2).cloned().collect();
 
         // If this phi uses any other phis the node is loop dependant,
-        // we use `phis` because this phi can actually contain the loop iv and its fine.
-        if uses_for_dependance.any(|node| phis.contains(&node) && node != *phi) {
-            LoopPHI::LoopDependant(*phi)
-        } else if intersection.clone().iter().next().is_some() {
+        // // we use `phis` because this phi can actually contain the loop iv and its fine.
+        // if uses_for_dependance.any(|node| phis.contains(&node) && node != *phi) {
+        //     LoopPHI::LoopDependant(*phi)
+        // } else 
+        if intersection.clone().iter().next().is_some() {
             // PHIs on the frontier of the uses by the candidate phi, i.e in uses_for_dependance need
             // to have headers that postdominate the loop continue latch. The value of the PHI used needs to be defined
             // by the time the reduce is triggered (at the end of the loop's internal control).
diff --git a/hercules_opt/src/loop_bound_canon.rs b/hercules_opt/src/loop_bound_canon.rs
index f1ce872c9f8b31e8e1c1f3ba72a0df21d5858b26..3e194e2dda3319dd6c2c4ece4aea1af41e7e701b 100644
--- a/hercules_opt/src/loop_bound_canon.rs
+++ b/hercules_opt/src/loop_bound_canon.rs
@@ -275,7 +275,7 @@ pub fn canonicalize_single_loop_bounds(
                     let new_binop_node = edit.add_node(Node::Binary { left, right: blah, op: BinaryOperator::LT });
 
                     edit = edit.replace_all_uses_where(binop_node, new_binop_node, |usee| *usee == if_node)?;
-                    Some((init_id, bound_id, new_binop_node, if_node))
+                    Some((init_id, blah, new_binop_node, if_node))
                 } else {guard_info};
 
                 edit = edit.replace_all_uses_where(dc_bound_node, new_dc_bound_node, |usee| *usee == new_bop)?;
@@ -289,7 +289,6 @@ pub fn canonicalize_single_loop_bounds(
         };
         Ok(edit)
     });
-
     
     let update_expr_users: Vec<_> = editor
         .get_users(*update_expression)
diff --git a/juno_samples/fork_join_tests/src/cpu.sch b/juno_samples/fork_join_tests/src/cpu.sch
index a3bb146b3f8d456083daeb4ba62b8bb5369b7a71..ff30b277499d5d672e45a59a21317807fba64a8a 100644
--- a/juno_samples/fork_join_tests/src/cpu.sch
+++ b/juno_samples/fork_join_tests/src/cpu.sch
@@ -3,7 +3,7 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10);
+let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10, test11);
 cpu(auto.test1);
 cpu(auto.test2);
 cpu(auto.test3);
@@ -13,6 +13,7 @@ cpu(auto.test7);
 cpu(auto.test8);
 cpu(auto.test9);
 cpu(auto.test10);
+cpu(auto.test11);
 
 let test1_cpu = auto.test1;
 rename["test1_cpu"](test1_cpu);
@@ -95,8 +96,6 @@ dce(auto.test8);
 simplify-cfg(auto.test8);
 dce(auto.test8);
 
-no-memset(test9@const);
-
 fork-split(auto.test10);
 let fission_out = fork-fission-reduces[test10@loop3](auto.test10);
 fork-fusion(fission_out.test10_8.fj0, fission_out.test10_8.fj1);
@@ -104,8 +103,13 @@ dce(auto.test10);
 simplify-cfg(auto.test10);
 dce(auto.test10);
 fork-split(auto.test10);
-
 unforkify(auto.test10);
 
+array-slf(auto.test11);
+ccp(auto.test11);
+dce(auto.test11);
+simplify-cfg(auto.test11);
+dce(auto.test11);
+unforkify(auto.test11);
 
 gcm(*);
diff --git a/juno_samples/fork_join_tests/src/fork_join_tests.jn b/juno_samples/fork_join_tests/src/fork_join_tests.jn
index 51284c29b159559a4c27dbe77521bf7bb725eaae..dfc81cd2f3e9b169424e3619118de7426e0782ac 100644
--- a/juno_samples/fork_join_tests/src/fork_join_tests.jn
+++ b/juno_samples/fork_join_tests/src/fork_join_tests.jn
@@ -163,3 +163,16 @@ fn test10(i0 : u64, i1 : u64, input : i32) -> i32 {
   
   return arr0[i0, i1] + arr1[i0, i1] + arr2[i0, i1];
 }
+
+#[entry]
+fn test11(k1 : i32[8], k2 : i32[8], v : i32[8]) -> i32 {
+  @const let s : i32[8];
+  for i = 0 to 8 {
+    s[i] = v[k1[i] as u64];
+  }
+  let sum = 0;
+  for i = 0 to 8 {
+    sum += s[k2[i] as u64];
+  }
+  return sum;
+}
diff --git a/juno_samples/fork_join_tests/src/gpu.sch b/juno_samples/fork_join_tests/src/gpu.sch
index 18caa8e528e2562498252c51bb5ddea888298f67..aa8861d5eda51dc95783d6affbc757c398204e5c 100644
--- a/juno_samples/fork_join_tests/src/gpu.sch
+++ b/juno_samples/fork_join_tests/src/gpu.sch
@@ -8,12 +8,14 @@ no-memset(test6@const);
 no-memset(test8@const1);
 no-memset(test8@const2);
 no-memset(test9@const);
+no-memset(test10@const);
+no-memset(test11@const);
 
 gvn(*);
 phi-elim(*);
 dce(*);
 
-let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10);
+let auto = auto-outline(test1, test2, test3, test4, test5, test7, test8, test9, test10, test11);
 gpu(auto.test1);
 gpu(auto.test2);
 gpu(auto.test3);
@@ -23,6 +25,7 @@ gpu(auto.test7);
 gpu(auto.test8);
 gpu(auto.test9);
 gpu(auto.test10);
+gpu(auto.test11);
 
 ip-sroa(*);
 sroa(*);
diff --git a/juno_samples/fork_join_tests/src/main.rs b/juno_samples/fork_join_tests/src/main.rs
index 277484f6bc094dbee05155fb7fee63fb3dcf3319..29b62beaf24645d68b95271438685073e01d63a6 100644
--- a/juno_samples/fork_join_tests/src/main.rs
+++ b/juno_samples/fork_join_tests/src/main.rs
@@ -79,6 +79,20 @@ fn main() {
         let output = r.run(1, 2, 5).await;
         let correct = 17i32;
         assert_eq!(correct, output);
+
+        let mut r = runner!(test11);
+        let k1 = vec![0, 4, 3, 7, 3, 4, 2, 1];
+        let k2 = vec![6, 4, 3, 2, 4, 1, 0, 5];
+        let v = vec![3, -499, 4, 32, -2, 55, -74, 10];
+        let mut correct = 0;
+        for i in 0..8 {
+            correct += v[k1[k2[i] as usize] as usize];
+        }
+        let k1 = HerculesImmBox::from(&k1 as &[i32]);
+        let k2 = HerculesImmBox::from(&k2 as &[i32]);
+        let v = HerculesImmBox::from(&v as &[i32]);
+        let output = r.run(k1.to(), k2.to(), v.to()).await;
+        assert_eq!(output, correct);
     });
 }
 
diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs
index d2813388e0e7a1d7bd1696ffbb641e629096e2c2..7bc2083cf07c063eccda5855fa4fed3bfca91f87 100644
--- a/juno_samples/matmul/build.rs
+++ b/juno_samples/matmul/build.rs
@@ -1,24 +1,11 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(not(feature = "cuda"))]
-    {
-        JunoCompiler::new()
-            .file_in_src("matmul.jn")
-            .unwrap()
-            .schedule_in_src("cpu.sch")
-            .unwrap()
-            .build()
-            .unwrap();
-    }
-    #[cfg(feature = "cuda")]
-    {
-        JunoCompiler::new()
-            .file_in_src("matmul.jn")
-            .unwrap()
-            .schedule_in_src("gpu.sch")
-            .unwrap()
-            .build()
-            .unwrap();
-    }
+    JunoCompiler::new()
+        .file_in_src("matmul.jn")
+        .unwrap()
+        .schedule_in_src("matmul.sch")
+        .unwrap()
+        .build()
+        .unwrap();
 }
diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
deleted file mode 100644
index 69f1811d08a5691de6dc10fa86f0720e416fb85a..0000000000000000000000000000000000000000
--- a/juno_samples/matmul/src/cpu.sch
+++ /dev/null
@@ -1,61 +0,0 @@
-macro optimize!(X) {
-  gvn(X);
-  phi-elim(X);
-  dce(X);
-  ip-sroa(X);
-  sroa(X);
-  dce(X);
-  gvn(X);
-  phi-elim(X);
-  dce(X);
-}
-
-macro codegen-prep!(X) {
-  optimize!(X);
-  gcm(X);
-  float-collections(X);
-  dce(X);
-  gcm(X);
-}
-
-macro forkify!(X) {
-  fixpoint {
-    forkify(X);
-    fork-guard-elim(X);
-  }
-}
-
-macro fork-tile![n](X) {
-  fork-tile[n, 0, false, true](X);
-}
-
-macro parallelize!(X) {
-  parallel-fork(X);
-  parallel-reduce(X);
-}
-
-macro unforkify!(X) {
-  fork-split(X);
-  unforkify(X);
-}
-
-optimize!(*);
-forkify!(*);
-associative(matmul@outer);
-
-// Parallelize by computing output array as 16 chunks
-let par = matmul@outer \ matmul@inner;
-fork-tile![4](par);
-let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
-parallelize!(outer \ inner);
-
-let body = outline(inner);
-cpu(body);
-
-// Tile for cache, assuming 64B cache lines
-fork-tile![16](body);
-let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
-
-reduce-slf(inner);
-unforkify!(body);
-codegen-prep!(*);
diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
deleted file mode 100644
index 76808149e7b99dce97e43f0e936536d3d13b7417..0000000000000000000000000000000000000000
--- a/juno_samples/matmul/src/gpu.sch
+++ /dev/null
@@ -1,26 +0,0 @@
-phi-elim(*);
-
-forkify(*);
-fork-guard-elim(*);
-dce(*);
-
-fixpoint {
-  reduce-slf(*);
-  slf(*);
-  infer-schedules(*);
-}
-fork-coalesce(*);
-infer-schedules(*);
-dce(*);
-rewrite(*);
-fixpoint {
-  simplify-cfg(*);
-  dce(*);
-}
-
-ip-sroa(*);
-sroa(*);
-dce(*);
-
-float-collections(*);
-gcm(*);
diff --git a/juno_samples/matmul/src/matmul.sch b/juno_samples/matmul/src/matmul.sch
new file mode 100644
index 0000000000000000000000000000000000000000..306997f58eb217f9ce301dc18e418c412e6df621
--- /dev/null
+++ b/juno_samples/matmul/src/matmul.sch
@@ -0,0 +1,81 @@
+macro optimize!(X) {
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  ip-sroa(X);
+  sroa(X);
+  dce(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+}
+
+macro codegen-prep!(X) {
+  optimize!(X);
+  gcm(X);
+  float-collections(X);
+  dce(X);
+  gcm(X);
+}
+
+macro forkify!(X) {
+  fixpoint {
+    forkify(X);
+    fork-guard-elim(X);
+  }
+}
+
+macro fork-tile![n](X) {
+  fork-tile[n, 0, false, true](X);
+}
+
+macro parallelize!(X) {
+  parallel-fork(X);
+  parallel-reduce(X);
+}
+
+macro unforkify!(X) {
+  fork-split(X);
+  unforkify(X);
+}
+
+optimize!(*);
+forkify!(*);
+
+if feature("cuda") {
+  fixpoint {
+    reduce-slf(*);
+    slf(*);
+    infer-schedules(*);
+  }
+  fork-coalesce(*);
+  infer-schedules(*);
+  dce(*);
+  rewrite(*);
+  fixpoint {
+    simplify-cfg(*);
+    dce(*);
+  }
+
+  optimize!(*);
+  codegen-prep!(*);
+} else {
+  associative(matmul@outer);
+
+  // Parallelize by computing output array as 16 chunks
+  let par = matmul@outer \ matmul@inner;
+  fork-tile![4](par);
+  let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
+  parallelize!(outer \ inner);
+
+  let body = outline(inner);
+  cpu(body);
+
+  // Tile for cache, assuming 64B cache lines
+  fork-tile![16](body);
+  let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);
+
+  reduce-slf(inner);
+  unforkify!(body);
+  codegen-prep!(*);
+}
diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs
index 728702d9bcc18405ef291945f81413f49f5715af..6af13aae5d9093bd59cf6299dfa64ac3e73209a6 100644
--- a/juno_samples/rodinia/srad/benches/srad_bench.rs
+++ b/juno_samples/rodinia/srad/benches/srad_bench.rs
@@ -24,19 +24,6 @@ fn srad_bench(c: &mut Criterion) {
     } = read_graphics(image);
     let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
     let mut image_h = HerculesMutBox::from(image.clone());
-    let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>();
-    let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>();
-    let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>();
-    let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>();
-    // Fix boundary conditions
-    iN[0] = 0;
-    iS[nrows - 1] = (nrows - 1) as i32;
-    jW[0] = 0;
-    jE[ncols - 1] = (ncols - 1) as i32;
-    let iN_h = HerculesImmBox::from(iN.as_slice());
-    let iS_h = HerculesImmBox::from(iS.as_slice());
-    let jW_h = HerculesImmBox::from(jW.as_slice());
-    let jE_h = HerculesImmBox::from(jE.as_slice());
     group.bench_function("srad bench", |b| {
         b.iter(|| {
             async_std::task::block_on(async {
@@ -45,10 +32,6 @@ fn srad_bench(c: &mut Criterion) {
                     ncols as u64,
                     niter as u64,
                     image_h.to(),
-                    iN_h.to(),
-                    iS_h.to(),
-                    jW_h.to(),
-                    jE_h.to(),
                     max,
                     lambda,
                 )
diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch
index a4cd49569aedadf42d3965bee040391f4d56cef9..7b7a6c9e2203b8e280f323ed9e6589ab149537c0 100644
--- a/juno_samples/rodinia/srad/src/cpu.sch
+++ b/juno_samples/rodinia/srad/src/cpu.sch
@@ -8,6 +8,7 @@ macro simpl!(X) {
   infer-schedules(X);
 }
 
+no-memset(srad@scratch);
 phi-elim(*);
 let loop1 = outline(srad@loop1);
 let loop2 = outline(srad@loop2);
@@ -31,8 +32,18 @@ simpl!(*);
 fork-interchange[0, 1](loop1);
 reduce-slf(*);
 simpl!(*);
+slf(*);
+simpl!(*);
+
+fork-tile[32, 0, false, false](loop2);
+let split = fork-split(loop2);
+let loop2_body = outline(split.srad_1.fj1);
+simpl!(loop2, loop2_body);
+
+inline(srad@loop2);
+delete-uncalled(*);
 
-fork-split(*);
-unforkify(*);
+fork-split(extract, compress, loop1, loop2_body, loop3);
+unforkify(extract, compress, loop1, loop2_body, loop3);
 
 gcm(*);
diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch
index f736c0b776759596cfc93a5025fb4f280c3fc932..c5a305272de767b0fb894e57e9f5896f12f0a21f 100644
--- a/juno_samples/rodinia/srad/src/gpu.sch
+++ b/juno_samples/rodinia/srad/src/gpu.sch
@@ -8,6 +8,7 @@ macro simpl!(X) {
   infer-schedules(X);
 }
 
+no-memset(srad@scratch);
 phi-elim(*);
 let sum_loop = outline(srad@loop1);
 let main_loops = outline(srad@loop2 | srad@loop3);
@@ -41,15 +42,26 @@ fork-tile[32, 0, false, true](sum_loop);
 let out = fork-split(sum_loop);
 clean-monoid-reduces(sum_loop);
 simpl!(sum_loop);
-let fission = fork-fission[out.srad_0.fj0](sum_loop);
+
+let fission1 = fork-fission[out.srad_0.fj0](sum_loop);
+simpl!(sum_loop);
+fork-tile[32, 0, false, true](fission1.srad_0.fj_bottom);
+let out = fork-split(fission1.srad_0.fj_bottom);
+clean-monoid-reduces(sum_loop);
+simpl!(sum_loop);
+
+let fission2 = fork-fission[out.srad_0.fj0](sum_loop);
 simpl!(sum_loop);
-fork-tile[32, 0, false, true](fission.srad_0.fj_bottom);
-let out = fork-split(fission.srad_0.fj_bottom);
+fork-tile[32, 0, false, true](fission2.srad_0.fj_bottom);
+let out = fork-split(fission2.srad_0.fj_bottom);
 clean-monoid-reduces(sum_loop);
 simpl!(sum_loop);
-let top = outline(fission.srad_0.fj_top);
-let bottom = outline(out.srad_0.fj0);
-gpu(top, bottom);
+
+let first = outline(fission1.srad_0.fj_top);
+let second = outline(fission2.srad_0.fj_top);
+let third = outline(out.srad_0.fj0);
+gpu(first, second, third);
+const-inline[false](*);
 ip-sroa(*);
 sroa(*);
 simpl!(*);
@@ -60,4 +72,16 @@ dce(main_loops);
 fork-split(main_loops);
 simpl!(main_loops);
 
+fork-dim-merge(extract);
+fork-tile[32, 0, false, true](extract);
+dce(extract);
+fork-split(extract);
+simpl!(extract);
+
+fork-dim-merge(compress);
+fork-tile[32, 0, false, true](compress);
+dce(compress);
+fork-split(compress);
+simpl!(compress);
+
 gcm(*);
diff --git a/juno_samples/rodinia/srad/src/lib.rs b/juno_samples/rodinia/srad/src/lib.rs
index a647b94a5ffc8aad3bab91badc1bd58a305e7e75..cb156d9db712155ad9e5b3f9775e3c6c29dbf22a 100644
--- a/juno_samples/rodinia/srad/src/lib.rs
+++ b/juno_samples/rodinia/srad/src/lib.rs
@@ -48,22 +48,6 @@ pub fn srad_harness(args: SRADInputs) {
         let image = resize(&image_ori, image_ori_rows, image_ori_cols, nrows, ncols);
         let mut image_h = HerculesMutBox::from(image.clone());
 
-        let mut iN = (0..nrows).map(|i| i as i32 - 1).collect::<Vec<_>>();
-        let mut iS = (0..nrows).map(|i| i as i32 + 1).collect::<Vec<_>>();
-        let mut jW = (0..ncols).map(|j| j as i32 - 1).collect::<Vec<_>>();
-        let mut jE = (0..ncols).map(|j| j as i32 + 1).collect::<Vec<_>>();
-
-        // Fix boundary conditions
-        iN[0] = 0;
-        iS[nrows - 1] = (nrows - 1) as i32;
-        jW[0] = 0;
-        jE[ncols - 1] = (ncols - 1) as i32;
-
-        let iN_h = HerculesImmBox::from(iN.as_slice());
-        let iS_h = HerculesImmBox::from(iS.as_slice());
-        let jW_h = HerculesImmBox::from(jW.as_slice());
-        let jE_h = HerculesImmBox::from(jE.as_slice());
-
         let mut runner = runner!(srad);
         let result: Vec<f32> = HerculesMutBox::from(
             runner
@@ -72,10 +56,6 @@ pub fn srad_harness(args: SRADInputs) {
                     ncols as u64,
                     niter as u64,
                     image_h.to(),
-                    iN_h.to(),
-                    iS_h.to(),
-                    jW_h.to(),
-                    jE_h.to(),
                     max,
                     lambda,
                 )
@@ -90,18 +70,7 @@ pub fn srad_harness(args: SRADInputs) {
 
         if verify {
             let mut rust_result = image;
-            rust_srad::srad(
-                nrows,
-                ncols,
-                niter,
-                &mut rust_result,
-                &iN,
-                &iS,
-                &jW,
-                &jE,
-                max,
-                lambda,
-            );
+            rust_srad::srad(nrows, ncols, niter, &mut rust_result, max, lambda);
 
             if let Some(output) = output_verify {
                 write_graphics(output, &rust_result, nrows, ncols, max);
diff --git a/juno_samples/rodinia/srad/src/rust_srad.rs b/juno_samples/rodinia/srad/src/rust_srad.rs
index 3226e35faf2410f94ab887019d54abafe0219454..f25d382aff3c4f42c4478d911532a5850b509a26 100644
--- a/juno_samples/rodinia/srad/src/rust_srad.rs
+++ b/juno_samples/rodinia/srad/src/rust_srad.rs
@@ -1,15 +1,4 @@
-pub fn srad(
-    nrows: usize,
-    ncols: usize,
-    niter: usize,
-    image: &mut Vec<f32>,
-    iN: &[i32],
-    iS: &[i32],
-    jW: &[i32],
-    jE: &[i32],
-    max: f32,
-    lambda: f32,
-) {
+pub fn srad(nrows: usize, ncols: usize, niter: usize, image: &mut Vec<f32>, max: f32, lambda: f32) {
     let nelems = nrows * ncols;
 
     // EXTRACT
@@ -44,11 +33,15 @@ pub fn srad(
             for i in 0..nrows {
                 let k = i + nrows * j;
                 let Jc = image[k];
+                let iN = std::cmp::max(i, 1) - 1;
+                let iS = std::cmp::min(i, nrows - 2) + 1;
+                let jW = std::cmp::max(j, 1) - 1;
+                let jE = std::cmp::min(j, ncols - 2) + 1;
 
-                dN[k] = image[iN[i] as usize + nrows * j] - Jc;
-                dS[k] = image[iS[i] as usize + nrows * j] - Jc;
-                dW[k] = image[i + nrows * jW[j] as usize] - Jc;
-                dE[k] = image[i + nrows * jE[j] as usize] - Jc;
+                dN[k] = image[iN as usize + nrows * j] - Jc;
+                dS[k] = image[iS as usize + nrows * j] - Jc;
+                dW[k] = image[i + nrows * jW as usize] - Jc;
+                dE[k] = image[i + nrows * jE as usize] - Jc;
 
                 let G2 =
                     (dN[k] * dN[k] + dS[k] * dS[k] + dW[k] * dW[k] + dE[k] * dE[k]) / (Jc * Jc);
@@ -72,11 +65,13 @@ pub fn srad(
         for j in 0..ncols {
             for i in 0..nrows {
                 let k = i + nrows * j;
+                let iS = std::cmp::min(i, nrows - 2) + 1;
+                let jE = std::cmp::min(j, ncols - 2) + 1;
 
                 let cN = c[k];
-                let cS = c[iS[i] as usize + nrows * j];
+                let cS = c[iS as usize + nrows * j];
                 let cW = c[k];
-                let cE = c[i + nrows * jE[j] as usize];
+                let cE = c[i + nrows * jE as usize];
 
                 let D = cN * dN[k] + cS * dS[k] + cW * dW[k] + cE * dE[k];
 
diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn
index 6074bf8cb12ccc2ad29c1086d7620b3ef98bcf59..024be598becdabb9ae9c32380a2b584871240cf7 100644
--- a/juno_samples/rodinia/srad/src/srad.jn
+++ b/juno_samples/rodinia/srad/src/srad.jn
@@ -21,10 +21,6 @@ fn compress<nrows, ncols: usize>(inout image: f32[ncols, nrows], max: f32) {
 fn srad<nrows, ncols: usize>(
   niter: usize,
   inout image: f32[ncols, nrows],
-  iN: i32[nrows],
-  iS: i32[nrows],
-  jW: i32[ncols],
-  jE: i32[ncols],
   max: f32,
   lambda: f32,
 ) {
@@ -50,20 +46,25 @@ fn srad<nrows, ncols: usize>(
     let varROI  = (sum2 / nelems as f32) - meanROI * meanROI;
     let q0sqr   = varROI / (meanROI * meanROI);
 
-    @dirs let dN : f32[ncols, nrows];
-    @dirs let dS : f32[ncols, nrows];
-    @dirs let dE : f32[ncols, nrows];
-    @dirs let dW : f32[ncols, nrows];
+    @scratch let dN : f32[ncols, nrows];
+    @scratch let dS : f32[ncols, nrows];
+    @scratch let dE : f32[ncols, nrows];
+    @scratch let dW : f32[ncols, nrows];
 
-    let c : f32[ncols, nrows];
+    @scratch let c : f32[ncols, nrows];
 
     @loop2 for j in 0..ncols {
       for i in 0..nrows {
         let Jc = image[j, i];
-        dN[j, i] = image[j, iN[i] as u64] - Jc;
-        dS[j, i] = image[j, iS[i] as u64] - Jc;
-        dW[j, i] = image[jW[j] as u64, i] - Jc;
-        dE[j, i] = image[jE[j] as u64, i] - Jc;
+	let iN = max!(i, 1) - 1;
+	let iS = min!(i, nrows - 2) + 1;
+	let jW = max!(j, 1) - 1;
+	let jE = min!(j, ncols - 2) + 1;
+
+        dN[j, i] = image[j, iN as u64] - Jc;
+        dS[j, i] = image[j, iS as u64] - Jc;
+        dW[j, i] = image[jW as u64, i] - Jc;
+        dE[j, i] = image[jE as u64, i] - Jc;
 
         let G2 = (dN[j, i] * dN[j, i] + dS[j, i] * dS[j, i]
                 + dW[j, i] * dW[j, i] + dE[j, i] * dE[j, i]) / (Jc * Jc);
@@ -85,10 +86,13 @@ fn srad<nrows, ncols: usize>(
 
     @loop3 for j in 0..ncols {
       for i in 0..nrows {
+	let iS = min!(i, nrows - 2) + 1;
+	let jE = min!(j, ncols - 2) + 1;
+
         let cN = c[j, i];
-        let cS = c[j, iS[i] as u64];
+        let cS = c[j, iS as u64];
         let cW = c[j, i];
-        let cE = c[jE[j] as u64, i];
+        let cE = c[jE as u64, i];
 
         let D = cN * dN[j, i] + cS * dS[j, i] + cW * dW[j, i] + cE * dE[j, i];
         image[j, i] = image[j, i] + 0.25 * lambda * D;
diff --git a/juno_scheduler/src/compile.rs b/juno_scheduler/src/compile.rs
index fbe6d8f084a5dc97e0d15a4063274c025845a23b..81fc82cd8f72d35ebe5a8f3276fe5690b2e0af88 100644
--- a/juno_scheduler/src/compile.rs
+++ b/juno_scheduler/src/compile.rs
@@ -22,6 +22,7 @@ pub enum ScheduleCompilerError {
         actual: usize,
         loc: Location,
     },
+    SemanticError(String, Location),
 }
 
 impl fmt::Display for ScheduleCompilerError {
@@ -46,6 +47,11 @@ impl fmt::Display for ScheduleCompilerError {
                 "({}, {}) -- ({}, {}): Expected {} arguments, found {}",
                 loc.0 .0, loc.0 .1, loc.1 .0, loc.1 .1, expected, actual
             ),
+            ScheduleCompilerError::SemanticError(msg, loc) => write!(
+                f,
+                "({}, {}) -- ({}, {}): {}",
+                loc.0 .0, loc.0 .1, loc.1 .0, loc.1 .1, msg,
+            ),
         }
     }
 }
@@ -76,6 +82,8 @@ enum Appliable {
     // DeleteUncalled requires special handling because it changes FunctionIDs, so it is not
     // treated like a pass
     DeleteUncalled,
+    // Test whether a feature is enabled
+    Feature,
     Schedule(Schedule),
     Device(Device),
 }
@@ -85,6 +93,8 @@ impl Appliable {
     fn is_valid_num_args(&self, num: usize) -> bool {
         match self {
             Appliable::Pass(pass) => pass.is_valid_num_args(num),
+            // Testing whether a feature is enabled takes the feature instead of a selection, so it
+            // has 0 arguments
             // Delete uncalled, Schedules, and devices do not take arguments
             _ => num == 0,
         }
@@ -159,6 +169,8 @@ impl FromStr for Appliable {
             "serialize" => Ok(Appliable::Pass(ir::Pass::Serialize)),
             "write-predication" => Ok(Appliable::Pass(ir::Pass::WritePredication)),
 
+            "feature" => Ok(Appliable::Feature),
+
             "print" => Ok(Appliable::Pass(ir::Pass::Print)),
 
             "cpu" | "llvm" => Ok(Appliable::Device(Device::LLVM)),
@@ -276,6 +288,35 @@ fn compile_stmt(
                 limit,
             }])
         }
+        parser::Stmt::IfThenElse {
+            span: _,
+            cond,
+            thn,
+            els,
+        } => {
+            let cond = compile_exp_as_expr(cond, lexer, macrostab, macros)?;
+
+            macros.open_scope();
+            let thn = ir::ScheduleStmt::Block {
+                body: compile_ops_as_block(*thn, lexer, macrostab, macros)?,
+            };
+            macros.close_scope();
+
+            macros.open_scope();
+            let els = match els {
+                Some(els) => ir::ScheduleStmt::Block {
+                    body: compile_ops_as_block(*els, lexer, macrostab, macros)?,
+                },
+                None => ir::ScheduleStmt::Block { body: vec![] },
+            };
+            macros.close_scope();
+
+            Ok(vec![ir::ScheduleStmt::IfThenElse {
+                cond,
+                thn: Box::new(thn),
+                els: Box::new(els),
+            }])
+        }
         parser::Stmt::MacroDecl { span: _, def } => {
             let parser::MacroDecl {
                 name,
@@ -381,6 +422,17 @@ fn compile_expr(
                         on: selection,
                     }))
                 }
+                Appliable::Feature => match selection {
+                    ir::Selector::Selection(mut args) if args.len() == 1 => {
+                        Ok(ExprResult::Expr(ir::ScheduleExp::Feature {
+                            feature: Box::new(args.pop().unwrap()),
+                        }))
+                    }
+                    _ => Err(ScheduleCompilerError::SemanticError(
+                        "feature requires exactly one argument as its selection".to_string(),
+                        lexer.line_col(span),
+                    )),
+                },
                 Appliable::Schedule(sched) => Ok(ExprResult::Stmt(ir::ScheduleStmt::AddSchedule {
                     sched,
                     on: selection,
diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs
index dd7a76a7da6b58bb5bc723116e84318b4b6346dc..287cd21a1ff64838942f8f2117937fd52020e75c 100644
--- a/juno_scheduler/src/ir.rs
+++ b/juno_scheduler/src/ir.rs
@@ -124,6 +124,9 @@ pub enum ScheduleExp {
     DeleteUncalled {
         on: Selector,
     },
+    Feature {
+        feature: Box<ScheduleExp>,
+    },
     Record {
         fields: Vec<(String, ScheduleExp)>,
     },
@@ -183,4 +186,9 @@ pub enum ScheduleStmt {
         device: Device,
         on: Selector,
     },
+    IfThenElse {
+        cond: ScheduleExp,
+        thn: Box<ScheduleStmt>,
+        els: Box<ScheduleStmt>,
+    },
 }
diff --git a/juno_scheduler/src/lang.l b/juno_scheduler/src/lang.l
index af154fce3d489b7b078607e52888d76d08f8e5fe..1f4f8723de4d950920d503a11c9145ad1d15d24a 100644
--- a/juno_scheduler/src/lang.l
+++ b/juno_scheduler/src/lang.l
@@ -20,12 +20,15 @@
 \.              "."
 
 apply           "apply"
+else            "else"
 fixpoint        "fixpoint"
+if              "if"
 let             "let"
 macro           "macro_keyword"
 on              "on"
 set             "set"
 target          "target"
+then            "then"
 
 true            "true"
 false           "false"
diff --git a/juno_scheduler/src/lang.y b/juno_scheduler/src/lang.y
index 451f035b8122ac1a694567327a44776c9326fff2..55c82b9dc368e7cade9500fb852c9d48320be7d2 100644
--- a/juno_scheduler/src/lang.y
+++ b/juno_scheduler/src/lang.y
@@ -27,6 +27,10 @@ Stmt -> Stmt
       { Stmt::ExprStmt { span: $span, exp: $1 } }
   | 'fixpoint' FixpointLimit '{' Schedule '}'
       { Stmt::Fixpoint { span: $span, limit: $2, body: Box::new($4) } }
+  | 'if' Expr '{' Schedule '}'
+      { Stmt::IfThenElse { span: $span, cond: $2, thn: Box::new($4), els: None } }
+  | 'if' Expr '{' Schedule '}' 'else' '{' Schedule '}'
+      { Stmt::IfThenElse { span: $span, cond: $2, thn: Box::new($4), els: Some(Box::new($8)) } }
   | MacroDecl
       { Stmt::MacroDecl { span: $span, def: $1 } }
   ;
@@ -163,6 +167,7 @@ pub enum Stmt {
   AssignStmt { span: Span, var: Span, rhs: Expr },
   ExprStmt   { span: Span, exp: Expr },
   Fixpoint   { span: Span, limit: FixpointLimit, body: Box<OperationList> },
+  IfThenElse { span: Span, cond: Expr, thn: Box<OperationList>, els: Option<Box<OperationList>> },
   MacroDecl  { span: Span, def: MacroDecl },
 }
 
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 2e31192a5e971d5eb2c32e07e966eb4cfe687da0..97bec5445eeeb9e9f849ef129b8ce9cf8c242798 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -16,6 +16,7 @@ use juno_utils::stringtab::StringTable;
 
 use std::cell::RefCell;
 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
+use std::env;
 use std::fmt;
 use std::fs::File;
 use std::io::Write;
@@ -1199,6 +1200,23 @@ fn schedule_interpret(
             // were made
             Ok(i > 1)
         }
+        ScheduleStmt::IfThenElse { cond, thn, els } => {
+            let (cond, modified) = interp_expr(pm, cond, stringtab, env, functions)?;
+            let Value::Boolean { val: cond } = cond else {
+                return Err(SchedulerError::SemanticError(
+                    "Condition must be a boolean value".to_string(),
+                ));
+            };
+            let changed = schedule_interpret(
+                pm,
+                if cond { &*thn } else { &*els },
+                stringtab,
+                env,
+                functions,
+            )?;
+
+            Ok(modified || changed)
+        }
         ScheduleStmt::Block { body } => {
             let mut modified = false;
             env.open_scope();
@@ -1339,7 +1357,10 @@ fn interp_expr(
                     }
                 }
                 Value::Record { fields } => match fields.get(field) {
-                    None => Err(SchedulerError::UndefinedField(field.clone())),
+                    None => Err(SchedulerError::UndefinedField(format!(
+                        "{} not in {:?}",
+                        field, fields
+                    ))),
                     Some(v) => Ok((v.clone(), changed)),
                 },
             }
@@ -1443,6 +1464,23 @@ fn interp_expr(
                 changed,
             ))
         }
+        ScheduleExp::Feature { feature } => {
+            let (feature, modified) = interp_expr(pm, &*feature, stringtab, env, functions)?;
+            let Value::String { val } = feature else {
+                return Err(SchedulerError::SemanticError(
+                    "Feature expects a single string argument (instead of a selection)".to_string(),
+                ));
+            };
+            // To test for features, the scheduler needs to be invoked from a build script so that
+            // Cargo provides the enabled features via environment variables
+            let key = "CARGO_FEATURE_".to_string() + &val.to_uppercase().replace("-", "_");
+            Ok((
+                Value::Boolean {
+                    val: env::var(key).is_ok(),
+                },
+                modified,
+            ))
+        }
         ScheduleExp::Record { fields } => {
             let mut result = HashMap::new();
             let mut changed = false;