diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
index effdc6b2830680f35f3aa1f2e443ed1b63934c3a..76808149e7b99dce97e43f0e936536d3d13b7417 100644
--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
@@ -1,51 +1,26 @@
-macro optimize!(X) {
-  gvn(X);
-  phi-elim(X);
-  dce(X);
-  ip-sroa(X);
-  sroa(X);
-  dce(X);
-  gvn(X);
-  phi-elim(X);
-  dce(X);
-}
-
-macro codegen!(X) {
-  gcm(*);
-  float-collections(*);
-  dce(*);
-  gcm(*);
-}
+phi-elim(*);
 
-optimize!(*);
+forkify(*);
+fork-guard-elim(*);
+dce(*);
 
-fixpoint panic after 20 {
-  forkify(matmul);
-  fork-guard-elim(matmul);
+fixpoint {
+  reduce-slf(*);
+  slf(*);
+  infer-schedules(*);
 }
-
-optimize!(*);
-
-fixpoint panic after 20 {
-  reduce-slf(matmul);
-  slf(matmul);
-  infer-schedules(matmul);
+fork-coalesce(*);
+infer-schedules(*);
+dce(*);
+rewrite(*);
+fixpoint {
+  simplify-cfg(*);
+  dce(*);
 }
-dce(matmul);
 
-// Tile outer and middle loops into 32x32 sized blocks
-fork-tile[32, 0, false, true](matmul@outer \ matmul@inner);
-// Merge outer and middle loops and interchange so blocks are first
-fork-coalesce(matmul@outer \ matmul@inner);
-fork-interchange[1, 2](matmul@outer \ matmul@inner);
-// Split forks
-let split = fork-split(matmul);
-// Join the threads and then blocks into a single fork each
-fork-coalesce(split.matmul.fj2 \ matmul@inner);
-fork-coalesce(split.matmul.fj0 \ split.matmul.fj2);
+ip-sroa(*);
+sroa(*);
+dce(*);
 
-let auto = auto-outline(*);
 float-collections(*);
-gpu(auto.matmul);
-
-codegen!(*);
+gcm(*);