diff --git a/juno_samples/rodinia/backprop/src/backprop.jn b/juno_samples/rodinia/backprop/src/backprop.jn
index 94c4334c1cae17a396384ad6135432e3e80f70e3..2ca57c9f3d98624f04621f42fe28d33272fcd4e6 100644
--- a/juno_samples/rodinia/backprop/src/backprop.jn
+++ b/juno_samples/rodinia/backprop/src/backprop.jn
@@ -6,10 +6,9 @@ fn squash(x: f32) -> f32 {
 fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f32[m + 1] {
   @res let result : f32[m + 1];
   result[0] = 1.0;
-
   @outer_loop for j in 1..=m {
-    let sum = 0.0;
-    @inner_loop for k in 0..=n {
+    let sum = weights[0, j] * vals[0];
+    @inner_loop for k in 1..=n {
       sum += weights[k, j] * vals[k];
     }
     result[j] = squash(sum);
@@ -19,13 +18,16 @@ fn layer_forward<n, m: usize>(vals: f32[n + 1], weights: f32[n + 1, m + 1]) -> f
 }
 
 fn output_error<n: usize>(target: f32[n + 1], actual: f32[n + 1]) -> f32, f32[n + 1] {
-  let errsum = 0.0;
-  let delta : f32[n + 1];
-
-  for j in 1..=n {
+  @loop1 @res let delta : f32[n + 1];
+  @loop1 delta[0] = 0.0;
+  @loop1 for j in 1..=n {
     let a = actual[j];
     let t = target[j];
     delta[j] = a * (1.0 - a) * (t - a);
+  }
+
+  let errsum = 0.0;
+  @loop2 for j in 1..=n {
     errsum += abs!(delta[j]);
   }
 
@@ -37,10 +39,9 @@ fn hidden_error<hidden_n, output_n: usize>(
   hidden_weights: f32[hidden_n + 1, output_n + 1],
   hidden_vals: f32[hidden_n + 1],
 ) -> f32, f32[hidden_n + 1] {
-  let errsum = 0.0;
-  let delta : f32[hidden_n + 1];
-
-  for j in 1..=hidden_n {
+  @loop1 @res let delta : f32[hidden_n + 1];
+  @loop1 delta[0] = 0.0;
+  @loop1 for j in 1..=hidden_n {
     let h = hidden_vals[j];
 
     let sum = 0.0;
@@ -49,6 +50,10 @@ fn hidden_error<hidden_n, output_n: usize>(
     }
 
     delta[j] = h * (1.0 - h) * sum;
+  }
+
+  let errsum = 0.0;
+  @loop2 for j in 1..=hidden_n {
     errsum += abs!(delta[j]);
   }
 
@@ -89,8 +94,8 @@ fn backprop<input_n, hidden_n, output_n: usize>(
   let hidden_vals = layer_forward::<input_n, hidden_n>(input_vals, input_weights);
   let output_vals = layer_forward::<hidden_n, output_n>(hidden_vals, hidden_weights);
 
-  let out_err, out_delta = output_error::<output_n>(target, output_vals);
-  let hid_err, hid_delta = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals);
+  @output_error let out_err, out_delta = output_error::<output_n>(target, output_vals);
+  @hidden_error let hid_err, hid_delta = hidden_error::<hidden_n, output_n>(out_delta, hidden_weights, hidden_vals);
 
   let hidden_weights, hidden_prev_weights
     = adjust_weights::<hidden_n, output_n>(out_delta, hidden_vals, hidden_weights, hidden_prev_weights);
diff --git a/juno_samples/rodinia/backprop/src/cpu.sch b/juno_samples/rodinia/backprop/src/cpu.sch
index de34d660bcc5e3d95d58aa63524bffdbc0b8f67e..661ec531b6b56d4965e20d149cc45598c0c51956 100644
--- a/juno_samples/rodinia/backprop/src/cpu.sch
+++ b/juno_samples/rodinia/backprop/src/cpu.sch
@@ -12,7 +12,7 @@ simpl!(*);
 inline(layer_forward);
 delete-uncalled(*);
 
-no-memset(layer_forward@res);
+no-memset(layer_forward@res, output_error@res, hidden_error@res);
 lift-dc-math(*);
 loop-bound-canon(*);
 simpl!(*);
@@ -25,6 +25,8 @@ fixpoint {
 }
 reduce-slf(*);
 simpl!(*);
+fork-interchange[0, 1](adjust_weights);
+simpl!(*);
 
 fork-split(*);
 unforkify(*);
diff --git a/juno_samples/rodinia/backprop/src/gpu.sch b/juno_samples/rodinia/backprop/src/gpu.sch
index 2011860dab535e174a22d91bf340bbd178080a3c..d0be79db50addb8c9f8b553dff7b68a71f3f4fe8 100644
--- a/juno_samples/rodinia/backprop/src/gpu.sch
+++ b/juno_samples/rodinia/backprop/src/gpu.sch
@@ -1,24 +1,51 @@
-gvn(*);
-dce(*);
+macro simpl!(X) {
+  ccp(X);
+  simplify-cfg(X);
+  lift-dc-math(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  infer-schedules(X);
+}
+
+no-memset(layer_forward@res, output_error@res, hidden_error@res);
 phi-elim(*);
-dce(*);
-crc(*);
-dce(*);
-slf(*);
-dce(*);
+let output_loop1 = outline(output_error@loop1);
+let output_loop2 = outline(output_error@loop2);
+let hidden_loop1 = outline(hidden_error@loop1);
+let hidden_loop2 = outline(hidden_error@loop2);
+simpl!(*);
+inline(layer_forward, backprop@output_error, backprop@hidden_error);
+delete-uncalled(*);
+gpu(layer_forward, output_loop1, output_loop2, hidden_loop1, hidden_loop2, adjust_weights);
+const-inline(*);
 
-let auto = auto-outline(backprop);
-gpu(auto.backprop);
+lift-dc-math(*);
+loop-bound-canon(*);
+simpl!(*);
+lift-dc-math(*);
+slf(*);
+fixpoint {
+  forkify(*);
+  fork-guard-elim(*);
+  fork-coalesce(*);
+}
+reduce-slf(*);
+simpl!(*);
 
-inline(auto.backprop);
-inline(auto.backprop);
-delete-uncalled(*);
+fork-tile[16, 0, false, true](layer_forward@inner_loop);
+let out = fork-split(layer_forward@inner_loop);
+clean-monoid-reduces(layer_forward);
+simpl!(layer_forward);
+let fission = fork-fission[out._1_layer_forward.fj0](layer_forward);
+simpl!(layer_forward);
 
-sroa[true](*);
-dce(*);
-float-collections(*);
-reuse-products(*);
-dce(*);
+fork-dim-merge(adjust_weights);
+simpl!(adjust_weights);
+fork-extend[32](adjust_weights);
+fork-tile[32, 0, false, true](adjust_weights);
+fork-split(adjust_weights);
+simpl!(adjust_weights);
 
+xdot[true](*);
 gcm(*);
-