From 10f9373682232536ad0c400e4365d88e2f7050f0 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Wed, 26 Feb 2025 14:49:03 -0600
Subject: [PATCH] Clean-up cpu schedule

---
 juno_samples/matmul/src/cpu.sch | 54 +++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 20 deletions(-)

diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
index 2fcf3108..fb08c254 100644
--- a/juno_samples/matmul/src/cpu.sch
+++ b/juno_samples/matmul/src/cpu.sch
@@ -10,27 +10,45 @@ macro optimize!(X) {
   dce(X);
 }
 
-macro codegen!(X) {
-  gcm(*);
-  float-collections(*);
-  dce(*);
-  gcm(*);
+macro codegen-prep!(X) {
+  optimize!(X);
+  gcm(X);
+  float-collections(X);
+  dce(X);
+  gcm(X);
 }
 
-optimize!(*);
+macro forkify!(X) {
+  fixpoint {
+    forkify(X);
+    fork-guard-elim(X);
+  }
+}
+
+macro fork-tile![n](X) {
+  fork-tile[n, 0, false, true](X);
+}
+
+macro parallelize!(X) {
+  parallel-fork(X);
+  parallel-reduce(X);
+}
 
-fixpoint panic after 20 {
-  forkify(matmul);
-  fork-guard-elim(matmul);
+macro unforkify!(X) {
+  fork-split(X);
+  unforkify(X);
 }
 
+optimize!(*);
+forkify!(*);
+
 // Mark the whole loop nest as associative, any order of iterations is equivalent
 associative(matmul@outer);
 
 // Tile the outer 2 loops to create 16 parallel threads (each responsible for
 // computing one block of the output
 let par = matmul@outer \ matmul@inner;
-fork-tile[4, 0, false, true](par);
+fork-tile![4](par);
 fork-coalesce(par);
 fork-interchange[0, 1](par);
 fork-interchange[2, 3](par);
@@ -38,29 +56,25 @@ fork-interchange[1, 2](par);
 
 let split = fork-split(*);
 fork-coalesce(split.matmul.fj0 \ split.matmul.fj2);
-parallel-fork(split.matmul.fj0 \ split.matmul.fj2);
+
+parallelize!(split.matmul.fj0 \ split.matmul.fj2);
 
 // Pull the body of the parallel loop out into its own device function
 let body = outline(split.matmul.fj2);
 cpu(body);
 
 // Tile the loop nest for cache performance; 16x16x16 tile
-fork-tile[16, 0, false, true](body);
+fork-tile![16](body);
 fixpoint {
   fork-coalesce(body);
 }
-
 fork-interchange[1, 2](body);
 fork-interchange[3, 4](body);
 fork-interchange[2, 3](body);
 
-optimize!(*);
-
 fork-split(body);
 reduce-slf(*);
-unforkify(body);
-
-optimize!(*);
-parallel-reduce(split.matmul.fj0);
 xdot[true](*);
-codegen!(*);
+unforkify!(body);
+
+codegen-prep!(*);
-- 
GitLab