diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
index 2fcf3108d12690d2173ed52ce95aa8edc341b7f5..fb08c2546449f43bf94653eea12478c54828aded 100644
--- a/juno_samples/matmul/src/cpu.sch
+++ b/juno_samples/matmul/src/cpu.sch
@@ -10,27 +10,45 @@ macro optimize!(X) {
   dce(X);
 }
 
-macro codegen!(X) {
-  gcm(*);
-  float-collections(*);
-  dce(*);
-  gcm(*);
+macro codegen-prep!(X) {
+  optimize!(X);
+  gcm(X);
+  float-collections(X);
+  dce(X);
+  gcm(X);
 }
 
-optimize!(*);
+macro forkify!(X) {
+  fixpoint {
+    forkify(X);
+    fork-guard-elim(X);
+  }
+}
+
+macro fork-tile![n](X) {
+  fork-tile[n, 0, false, true](X);
+}
+
+macro parallelize!(X) {
+  parallel-fork(X);
+  parallel-reduce(X);
+}
 
-fixpoint panic after 20 {
-  forkify(matmul);
-  fork-guard-elim(matmul);
+macro unforkify!(X) {
+  fork-split(X);
+  unforkify(X);
 }
 
+optimize!(*);
+forkify!(*);
+
 // Mark the whole loop nest as associative, any order of iterations is equivalent
 associative(matmul@outer);
 
 // Tile the outer 2 loops to create 16 parallel threads (each responsible for
 // computing one block of the output
 let par = matmul@outer \ matmul@inner;
-fork-tile[4, 0, false, true](par);
+fork-tile![4](par);
 fork-coalesce(par);
 fork-interchange[0, 1](par);
 fork-interchange[2, 3](par);
@@ -38,29 +56,25 @@ fork-interchange[1, 2](par);
 
 let split = fork-split(*);
 fork-coalesce(split.matmul.fj0 \ split.matmul.fj2);
-parallel-fork(split.matmul.fj0 \ split.matmul.fj2);
+
+parallelize!(split.matmul.fj0 \ split.matmul.fj2);
 
 // Pull the body of the parallel loop out into its own device function
 let body = outline(split.matmul.fj2);
 cpu(body);
 
 // Tile the loop nest for cache performance; 16x16x16 tile
-fork-tile[16, 0, false, true](body);
+fork-tile![16](body);
 fixpoint {
   fork-coalesce(body);
 }
-
 fork-interchange[1, 2](body);
 fork-interchange[3, 4](body);
 fork-interchange[2, 3](body);
 
-optimize!(*);
-
 fork-split(body);
 reduce-slf(*);
-unforkify(body);
-
-optimize!(*);
-parallel-reduce(split.matmul.fj0);
 xdot[true](*);
-codegen!(*);
+unforkify!(body);
+
+codegen-prep!(*);