diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..bef45ca2aa75f07491998cb67cedcee8b3f2049b
--- /dev/null
+++ b/juno_samples/matmul/src/cpu.sch
@@ -0,0 +1,63 @@
+macro optimize!(X) { // Reusable cleanup sequence; every pass below runs on the selection X.
+  gvn(X);      // presumably global value numbering -- confirm against the pass manager
+  phi-elim(X); // presumably removes redundant phis -- confirm
+  dce(X);      // presumably dead code elimination -- confirm
+  ip-sroa(X);  // presumably the interprocedural variant of sroa -- confirm
+  sroa(X);     // presumably scalar replacement of aggregates -- confirm
+  dce(X);
+  gvn(X);      // the gvn / phi-elim / dce trio from the top of the macro is repeated after the sroa passes
+  phi-elim(X);
+  dce(X);
+}
+
+macro codegen!(X) { // NOTE(review): X is never referenced; every pass below runs on `*` whatever argument is passed.
+  gcm(*);               // presumably global code motion -- confirm
+  float-collections(*);
+  dce(*);
+  gcm(*);               // gcm is run a second time, after float-collections and dce
+}
+
+optimize!(*); // First statement of the schedule: run the optimize! sequence on everything (`*`).
+
+fixpoint panic after 20 { // Repeats the two passes below on matmul; presumably panics if no fixpoint is reached within 20 iterations -- confirm.
+  forkify(matmul);
+  fork-guard-elim(matmul);
+}
+
+// Mark the whole loop nest as associative; any order of iterations is equivalent.
+associative(matmul@outer);
+
+// Tile the outer 2 loops to create 16 parallel threads (each responsible for
+// computing one block of the output).
+let par = matmul@outer \ matmul@inner; // Selection reused by the passes below; presumably `\` is set difference (outer minus inner) -- confirm.
+fork-tile[4, 0, false, true](par);     // First argument 4 is presumably the tile size -- confirm the meaning of the remaining arguments.
+fork-coalesce(par);
+fork-interchange[0, 1](par);           // Three interchanges in a row; presumably each swaps the two listed fork dimensions -- confirm.
+fork-interchange[2, 3](par);
+fork-interchange[1, 2](par);
+
+let split = fork-split(*);             // Run on everything; the result is bound so pieces can be addressed as split.matmul.fjN below.
+fork-coalesce(split.matmul.fj0 \ split.matmul.fj2);
+parallel-fork(split.matmul.fj0 \ split.matmul.fj2); // Same selection as the fork-coalesce on the previous line.
+
+// Pull the body of the parallel loop out into its own device function
+let body = outline(split.matmul.fj2); // `body` refers to the outlined function for the rest of the schedule.
+cpu(body);                            // presumably assigns `body` to the CPU device -- confirm
+
+// Tile the loop nest for cache performance; 16x16x16 tile
+fork-tile[16, 0, false, true](body);
+fixpoint { fork-coalesce(body); }     // No iteration cap here, unlike the `panic after 20` fixpoint used for forkify.
+
+fork-interchange[1, 2](body);         // Same three-interchange pattern as used on `par`, with each index shifted up by one.
+fork-interchange[3, 4](body);
+fork-interchange[2, 3](body);
+
+optimize!(*);     // Re-run the optimize! sequence on everything after the tiling and interchange steps.
+
+fork-split(body); // Result is not bound this time, unlike the earlier `let split = fork-split(*)`.
+reduce-slf(*);
+unforkify(body);  // presumably turns the fork-joins in `body` back into loops -- confirm
+
+optimize!(*);
+
+codegen!(*);      // Final step: the gcm / float-collections / dce / gcm sequence defined in the codegen! macro.