From 199a8a80e72e8377faedea785a879847edf11ff7 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Sat, 22 Feb 2025 18:44:44 -0600
Subject: [PATCH] Manual gpu schedule

---
 juno_samples/matmul/src/gpu.sch | 63 +++++++++++++++++++++++----------
 1 file changed, 44 insertions(+), 19 deletions(-)

diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
index 76808149..effdc6b2 100644
--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
@@ -1,26 +1,51 @@
-phi-elim(*);
+macro optimize!(X) { // Cleanup pipeline on selection X: gvn/phi-elim/dce, then ip-sroa/sroa/dce, then gvn/phi-elim/dce again.
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  ip-sroa(X);
+  sroa(X);
+  dce(X);
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+}
+
+macro codegen!(X) { // Runs gcm, float-collections, dce, then gcm again on the selection X passed by the caller.
+  gcm(X);
+  float-collections(X); // NOTE(review): confirm float-collections accepts a selection other than `*`.
+  dce(X);
+  gcm(X);
+}
 
-forkify(*);
-fork-guard-elim(*);
-dce(*);
+optimize!(*);
 
-fixpoint {
-  reduce-slf(*);
-  slf(*);
-  infer-schedules(*);
+fixpoint panic after 20 { // Repeats forkify + fork-guard-elim on matmul; presumably panics if no fixpoint is reached within 20 iterations (confirm).
+  forkify(matmul);
+  fork-guard-elim(matmul);
 }
-fork-coalesce(*);
-infer-schedules(*);
-dce(*);
-rewrite(*);
-fixpoint {
-  simplify-cfg(*);
-  dce(*);
+
+optimize!(*);
+
+fixpoint panic after 20 { // Repeats reduce-slf, slf and infer-schedules on matmul; presumably panics if no fixpoint is reached within 20 iterations (confirm).
+  reduce-slf(matmul);
+  slf(matmul);
+  infer-schedules(matmul);
 }
+dce(matmul); // Single dce over matmul once the fixpoint loop has finished.
 
-ip-sroa(*);
-sroa(*);
-dce(*);
+// Tile the outer and middle loops (selection: matmul@outer without matmul@inner) into 32x32 blocks
+fork-tile[32, 0, false, true](matmul@outer \ matmul@inner);
+// Merge the outer and middle loops, then interchange so the block dimensions come first
+fork-coalesce(matmul@outer \ matmul@inner);
+fork-interchange[1, 2](matmul@outer \ matmul@inner);
+// Split the forks; the result is bound to `split` so the pieces (split.matmul.fj0, split.matmul.fj2) can be selected below
+let split = fork-split(matmul);
+// Coalesce the threads first (fj2 without @inner), then the blocks (fj0 without fj2), into a single fork each
+fork-coalesce(split.matmul.fj2 \ matmul@inner);
+fork-coalesce(split.matmul.fj0 \ split.matmul.fj2);
 
+let auto = auto-outline(*); // Bind the auto-outline result so the outlined matmul can be selected as auto.matmul.
 float-collections(*);
-gcm(*);
+gpu(auto.matmul); // Presumably assigns the outlined matmul to the GPU device (confirm against the scheduler docs).
+
+codegen!(*); // Expands to the gcm / float-collections / dce / gcm sequence defined in the codegen! macro.
-- 
GitLab