Skip to content
Snippets Groups Projects

Fork reshape

Merged Aaron Councilman requested to merge fork-reshape into main
1 file
+ 34
20
Compare changes
  • Side-by-side
  • Inline
@@ -10,27 +10,45 @@ macro optimize!(X) {
dce(X);
}
macro codegen!(X) {
gcm(*);
float-collections(*);
dce(*);
gcm(*);
macro codegen-prep!(X) {
optimize!(X);
gcm(X);
float-collections(X);
dce(X);
gcm(X);
}
optimize!(*);
// Applies forkify and then fork-guard-elim to X inside a fixpoint block.
// NOTE(review): presumably the pair is repeated until nothing changes;
// confirm against the scheduler's fixpoint semantics.
macro forkify!(X) {
fixpoint {
forkify(X);
fork-guard-elim(X);
}
}
// Shorthand for fork-tile on X: forwards n as the first bracketed argument
// and fixes the remaining three arguments to 0, false, true.
// NOTE(review): the meaning of those three fixed arguments is not visible
// here; confirm against the fork-tile pass definition.
macro fork-tile![n](X) {
fork-tile[n, 0, false, true](X);
}
// Applies parallel-fork and then parallel-reduce to X, in that order.
macro parallelize!(X) {
parallel-fork(X);
parallel-reduce(X);
}
fixpoint panic after 20 {
forkify(matmul);
fork-guard-elim(matmul);
macro unforkify!(X) {
fork-split(X);
unforkify(X);
}
optimize!(*);
forkify!(*);
// Mark the whole loop nest as associative: any order of iterations is equivalent
associative(matmul@outer);
// Tile the outer 2 loops to create 16 parallel threads (each responsible for
// computing one block of the output)
let par = matmul@outer \ matmul@inner;
fork-tile[4, 0, false, true](par);
fork-tile![4](par);
fork-coalesce(par);
fork-interchange[0, 1](par);
fork-interchange[2, 3](par);
@@ -38,29 +56,25 @@ fork-interchange[1, 2](par);
let split = fork-split(*);
fork-coalesce(split.matmul.fj0 \ split.matmul.fj2);
parallel-fork(split.matmul.fj0 \ split.matmul.fj2);
parallelize!(split.matmul.fj0 \ split.matmul.fj2);
// Pull the body of the parallel loop out into its own device function
let body = outline(split.matmul.fj2);
cpu(body);
// Tile the loop nest for cache performance; 16x16x16 tile
fork-tile[16, 0, false, true](body);
fork-tile![16](body);
fixpoint { fork-coalesce(body); }
fork-interchange[1, 2](body);
fork-interchange[3, 4](body);
fork-interchange[2, 3](body);
optimize!(*);
fork-split(body);
reduce-slf(*);
unforkify(body);
optimize!(*);
parallel-reduce(split.matmul.fj0);
xdot[true](*);
codegen!(*);
unforkify!(body);
codegen-prep!(*);
Loading