From d179193de2081aec72dfe06baa96dd58612b98a4 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 24 Feb 2025 15:28:02 -0600
Subject: [PATCH] Parallel tiled cpu schedule

---
 juno_samples/matmul/src/cpu.sch | 63 +++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 juno_samples/matmul/src/cpu.sch

diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
new file mode 100644
index 00000000..bef45ca2
--- /dev/null
+++ b/juno_samples/matmul/src/cpu.sch
@@ -0,0 +1,63 @@
+macro optimize!(X) {  // Cleanup pipeline: runs each pass below, in order, on the selection X
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  ip-sroa(X);
+  sroa(X);
+  dce(X);
+  gvn(X);  // gvn / phi-elim / dce are repeated after the two sroa passes
+  phi-elim(X);
+  dce(X);
+}
+
+macro codegen!(X) {  // NOTE(review): X is never used; every pass below is applied to `*` -- confirm intent
+  gcm(*);
+  float-collections(*);
+  dce(*);
+  gcm(*);  // second gcm run, after float-collections and dce
+}
+
+optimize!(*);  // first of three optimize! runs in this schedule
+
+fixpoint panic after 20 {  // NOTE(review): presumably panics if not converged within 20 iterations; confirm
+  forkify(matmul);
+  fork-guard-elim(matmul);
+}
+
+// Mark the whole loop nest as associative: any order of iterations is equivalent
+associative(matmul@outer);
+
+// Tile the outer 2 loops to create 16 parallel threads (each responsible for
+// computing one block of the output)
+let par = matmul@outer \ matmul@inner;  // NOTE(review): presumably what is labeled @outer minus what is labeled @inner; confirm
+fork-tile[4, 0, false, true](par);  // tiling factor 4 (4 x 4 = the 16 threads noted above); TODO confirm the other three arguments
+fork-coalesce(par);
+fork-interchange[0, 1](par);  // NOTE(review): presumably index pairs of fork dimensions to swap: (0,1), (2,3), then (1,2); confirm
+fork-interchange[2, 3](par);
+fork-interchange[1, 2](par);
+
+let split = fork-split(*);
+fork-coalesce(split.matmul.fj0 \ split.matmul.fj2);  // the same selection is coalesced here and passed to parallel-fork on the next line
+parallel-fork(split.matmul.fj0 \ split.matmul.fj2);
+
+// Pull the body of the parallel loop out into its own device function
+let body = outline(split.matmul.fj2);  // fj2 is the right-hand operand of `\` in the two statements above
+cpu(body);
+
+// Tile the loop nest for cache performance; 16x16x16 tile
+fork-tile[16, 0, false, true](body);  // same argument pattern as the fork-tile on par, with 16 in place of 4
+fixpoint { fork-coalesce(body); }  // no `panic after N` clause here, unlike the forkify fixpoint
+
+fork-interchange[1, 2](body);  // same three-step index pattern as the interchanges on par, with each index shifted up by one
+fork-interchange[3, 4](body);
+fork-interchange[2, 3](body);
+
+optimize!(*);
+
+fork-split(body);  // result is not bound to a name, unlike the earlier `let split = fork-split(*)`
+reduce-slf(*);
+unforkify(body);  // NOTE(review): presumably turns the forks in body back into loops; confirm
+
+optimize!(*);
+
+codegen!(*);
-- 
GitLab