// Schedule for `matmul`: clean up the IR, forkify the loops, tile and
// parallelize the outer fork nest, outline the inner work for the CPU backend,
// then tile again for cache behavior before preparing for codegen.

// Cleanup pipeline applied to X: gvn / phi-elim / dce, then ip-sroa and sroa,
// followed by a second gvn / phi-elim / dce round to tidy up after sroa.
macro optimize!(X) {
  gvn(X);
  phi-elim(X);
  dce(X);
  ip-sroa(X);
  sroa(X);
  dce(X);
  gvn(X);
  phi-elim(X);
  dce(X);
}

// Final preparation of X: run optimize!, then gcm, float-collections, dce,
// and gcm once more.
macro codegen-prep!(X) {
  optimize!(X);
  gcm(X);
  float-collections(X);
  dce(X);
  gcm(X);
}

// Runs forkify and fork-guard-elim on X inside a `fixpoint` block.
macro forkify!(X) {
  fixpoint {
    forkify(X);
    fork-guard-elim(X);
  }
}

// Applies fork-tile to X, forwarding `n` as its first argument (presumably the
// tile size) with the remaining arguments fixed to 0, false, true.
// NOTE(review): the original declaration had no parameter lists even though the
// body references `n` and `X`; `![n](X)` is restored here.
macro fork-tile![n](X) {
  fork-tile[n, 0, false, true](X);
}

// Marks X with parallel-fork and parallel-reduce.
macro parallelize!(X) {
  parallel-fork(X);
  parallel-reduce(X);
}

// Runs fork-split followed by unforkify on X.
macro unforkify!(X) {
  fork-split(X);
  unforkify(X);
}

optimize!(*);
forkify!(*);
associative(matmul@outer);

// Parallelize by computing output array as 16 chunks
let par = matmul@outer \ matmul@inner;
// NOTE(review): the tile argument was missing in the original; 4 is inferred
// from "16 chunks" (4 x 4 over the two tiled dimensions) -- confirm.
fork-tile![4](par);
let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
parallelize!(outer \ inner);

// Outline the inner portion into its own function and assign it to the CPU.
let body = outline(inner);
cpu(body);

// Tile for cache, assuming 64B cache lines
// NOTE(review): the tile argument was missing in the original; 16 is inferred
// from 64B lines with 4-byte elements -- confirm against the element type.
fork-tile![16](body);
let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);

reduce-slf(inner);
unforkify!(body);
codegen-prep!(*);