diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
index 76808149e7b99dce97e43f0e936536d3d13b7417..effdc6b2830680f35f3aa1f2e443ed1b63934c3a 100644
--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
@@ -1,26 +1,51 @@
-phi-elim(*);
+macro optimize!(X) {  // Cleanup pipeline on selection X: gvn/phi-elim/dce, then ip-sroa/sroa/dce, then gvn/phi-elim/dce again
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  ip-sroa(X);
+  sroa(X);
+  dce(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+}
+
+macro codegen!(X) {  // Runs gcm, float-collections, dce, gcm on the selection X (this file invokes it with `*`)
+  gcm(X);
+  float-collections(X);
+  dce(X);
+  gcm(X);
+}
 
-forkify(*);
-fork-guard-elim(*);
-dce(*);
+optimize!(*);  // first optimize! round over `*`, run before the forkify fixpoint below
 
-fixpoint {
-  reduce-slf(*);
-  slf(*);
-  infer-schedules(*);
+fixpoint panic after 20 {  // NOTE(review): presumably reruns the enclosed passes until nothing changes and panics past 20 iterations — confirm
+  forkify(matmul);
+  fork-guard-elim(matmul);
 }
-fork-coalesce(*);
-infer-schedules(*);
-dce(*);
-rewrite(*);
-fixpoint {
-  simplify-cfg(*);
-  dce(*);
+
+optimize!(*);  // second optimize! round, run after the forkify fixpoint
+
+fixpoint panic after 20 {  // NOTE(review): presumably iterates reduce-slf/slf/infer-schedules on matmul to a fixpoint, panicking past 20 iterations — confirm
+  reduce-slf(matmul);
+  slf(matmul);
+  infer-schedules(matmul);
 }
+dce(matmul);  // dce on matmul once the fixpoint block above has finished
 
-ip-sroa(*);
-sroa(*);
-dce(*);
+// Tile outer and middle loops into 32x32 sized blocks
+fork-tile[32, 0, false, true](matmul@outer \ matmul@inner);  // NOTE(review): `A \ B` presumably selects A excluding B — confirm
+// Merge outer and middle loops and interchange so blocks are first
+fork-coalesce(matmul@outer \ matmul@inner);
+fork-interchange[1, 2](matmul@outer \ matmul@inner);
+// Split forks; bind the result so the pieces can be selected as split.matmul.fjN below
+let split = fork-split(matmul);
+// Join the threads and then blocks into a single fork each
+fork-coalesce(split.matmul.fj2 \ matmul@inner);
+fork-coalesce(split.matmul.fj0 \ split.matmul.fj2);
 
+let auto = auto-outline(*);  // bind the outlining result so auto.matmul can be targeted below
 float-collections(*);
-gcm(*);
+gpu(auto.matmul);  // NOTE(review): presumably assigns the outlined matmul to the GPU backend — confirm
+
+codegen!(*);  // final step: invoke the codegen! macro with `*`