From d179193de2081aec72dfe06baa96dd58612b98a4 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Mon, 24 Feb 2025 15:28:02 -0600
Subject: [PATCH] Parallel tiled cpu schedule

---
 juno_samples/matmul/src/cpu.sch | 63 +++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 juno_samples/matmul/src/cpu.sch

diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
new file mode 100644
index 00000000..bef45ca2
--- /dev/null
+++ b/juno_samples/matmul/src/cpu.sch
@@ -0,0 +1,63 @@
+macro optimize!(X) { // Cleanup pipeline applied to the selection X; pass order is significant, keep as authored
+  gvn(X);
+  phi-elim(X);
+  dce(X);
+  ip-sroa(X);
+  sroa(X);
+  dce(X);
+  gvn(X); // second gvn / phi-elim / dce round, run after the two sroa passes
+  phi-elim(X);
+  dce(X);
+}
+
+macro codegen!(X) { // NOTE(review): X is never used; every pass below is applied to `*` -- confirm this is intended
+  gcm(*);
+  float-collections(*);
+  dce(*);
+  gcm(*);
+}
+
+optimize!(*);
+
+fixpoint panic after 20 { // repeats the two passes below; presumably panics if no fixed point is reached within 20 rounds -- TODO confirm
+  forkify(matmul);
+  fork-guard-elim(matmul);
+}
+
+// Mark the whole loop nest as associative: any order of iterations is equivalent
+associative(matmul@outer);
+
+// Tile the outer 2 loops to create 16 parallel threads (each responsible for
+// computing one block of the output)
+let par = matmul@outer \ matmul@inner; // presumably the `outer` label minus the `inner` one, i.e. the outer 2 loops -- TODO confirm
+fork-tile[4, 0, false, true](par); // the 4 here presumably yields the 4 x 4 = 16 threads mentioned above -- TODO confirm
+fork-coalesce(par);
+fork-interchange[0, 1](par);
+fork-interchange[2, 3](par);
+fork-interchange[1, 2](par);
+
+let split = fork-split(*);
+fork-coalesce(split.matmul.fj0 \ split.matmul.fj2);
+parallel-fork(split.matmul.fj0 \ split.matmul.fj2); // same selection as the fork-coalesce on the previous line
+
+// Pull the body of the parallel loop out into its own device function
+let body = outline(split.matmul.fj2);
+cpu(body); // presumably assigns the outlined function to the CPU backend -- TODO confirm
+
+// Tile the loop nest for cache performance; 16x16x16 tile
+fork-tile[16, 0, false, true](body);
+fixpoint { fork-coalesce(body); } // unlike the fixpoint above, no `panic after` bound is given here
+
+fork-interchange[1, 2](body);
+fork-interchange[3, 4](body);
+fork-interchange[2, 3](body);
+
+optimize!(*);
+
+fork-split(body); // result is not bound to a name, unlike `split` above
+reduce-slf(*);
+unforkify(body);
+
+optimize!(*);
+
+codegen!(*);
-- 
GitLab