// optimize!(X): general cleanup pipeline applied to the selection X.
// Runs gvn, phi-elim and dce, then ip-sroa and sroa, and finally repeats the
// dce / gvn / phi-elim / dce sequence on the same X.
// NOTE(review): the passes themselves are defined outside this file; the
// names presumably stand for global value numbering, phi elimination, dead
// code elimination and (inter-procedural) scalar replacement of aggregates
// -- confirm against the pass definitions.
macro optimize!(X) {
  gvn(X);
  phi-elim(X);
  dce(X);
  ip-sroa(X);
  sroa(X);
  // Second cleanup round, run after the two sroa passes.
  dce(X);
  gvn(X);
  phi-elim(X);
  dce(X);
}

// codegen-prep!(X): final pipeline applied to X; in this file it is the last
// statement of the schedule.
// Expands optimize!(X) first, then runs gcm, float-collections and dce,
// followed by gcm a second time.
// NOTE(review): gcm presumably means global code motion, and the name
// suggests this prepares X for code generation -- confirm.
macro codegen-prep!(X) {
  optimize!(X);
  gcm(X);
  float-collections(X);
  dce(X);
  // gcm is run again after float-collections and dce.
  gcm(X);
}

// forkify!(X): runs forkify(X) followed by fork-guard-elim(X) inside a
// fixpoint block.
// NOTE(review): presumably the fixpoint block repeats its body until the
// passes stop making changes -- confirm the exact termination rule.
macro forkify!(X) {
  fixpoint {
    forkify(X);
    fork-guard-elim(X);
  }
}

// fork-tile![n](X): shorthand for the fork-tile pass that takes one bracketed
// parameter n and a selection X. It forwards n as the first pass argument and
// fixes the remaining three arguments to 0, false, true.
// NOTE(review): the meaning of the fixed arguments is not visible in this
// file -- confirm against the fork-tile pass definition before changing them.
macro fork-tile![n](X) {
  fork-tile[n, 0, false, true](X);
}

// parallelize!(X): runs parallel-fork(X) and then parallel-reduce(X), in that
// order, on the same selection.
// NOTE(review): presumably these mark forks and their reductions in X for
// parallel execution -- confirm against the pass definitions.
macro parallelize!(X) {
  parallel-fork(X);
  parallel-reduce(X);
}

// unforkify!(X): runs fork-split(X) and then the unforkify pass on X.
// The macro (invoked with `!`) shares its base name with the unforkify pass
// it wraps; the macro form always performs fork-split first.
macro unforkify!(X) {
  fork-split(X);
  unforkify(X);
}

// Top-level schedule. Statements run in the order written.
// NOTE(review): `*` presumably selects the whole module, and `a \ b`
// presumably selects what is in a but not in b -- confirm.

// Clean up everything, turn loops into forks, then apply the associative
// pass to the matmul@outer label.
optimize!(*);
forkify!(*);
associative(matmul@outer);

// Parallelize by computing the output array as 16 chunks.
// `par` is matmul@outer with matmul@inner excluded; it is tiled with
// parameter 4 and the resulting fork dimensions are regrouped into three
// groups, of which only the first two (outer, inner) are kept.
let par = matmul@outer \ matmul@inner;
fork-tile![4](par);
let (outer, inner, _) = fork-reshape[[1, 3], [0], [2]](par);
parallelize!(outer \ inner);

// Outline `inner` into its own selection `body` and apply the cpu pass to it.
let body = outline(inner);
cpu(body);

// Tile for cache, assuming 64B cache lines.
// NOTE(review): tile parameter 16 presumably corresponds to 16 4-byte
// elements per cache line -- confirm the element type.
fork-tile![16](body);
// Rebinds `outer` and `inner` (shadowing the earlier bindings); only `inner`
// is used in the statements below.
let (outer, inner) = fork-reshape[[0, 2, 4, 1, 3], [5]](body);

// Apply reduce-slf to the new `inner`, convert the forks in `body` back with
// unforkify!, and finish with the codegen-prep! pipeline over everything.
reduce-slf(inner);
unforkify!(body);
codegen-prep!(*);