// simpl!(X): run a fixed bundle of passes over the selection X, in the order
// listed below. This schedule invokes it after most transformation steps.
// NOTE(review): the pass names suggest a general simplification/cleanup
// pipeline; the precise effect of each pass is defined by the scheduler, not
// by this file -- confirm before reordering or dropping any of them.
macro simpl!(X) {
  ccp(X);
  simplify-cfg(X);
  lift-dc-math(X);
  gvn(X);
  phi-elim(X);
  dce(X);
  infer-schedules(X);
}

// Setup: preprocess `bfs` and bind four of its labeled regions to names.
phi-elim(bfs);
no-memset(bfs@cost);
// Each labeled region (`bfs@<label>`) is passed to `outline` and the result is
// bound to a name that the rest of the schedule uses as a selection.
// NOTE(review): later steps address results for `loop2` through the field name
// `bfs_2`, so the order of these `outline` calls presumably determines the
// generated names (init -> bfs_0, loop1 -> bfs_1, ...) -- confirm before
// reordering.
let init = outline(bfs@cost_init);
let loop1 = outline(bfs@loop1);
let loop2 = outline(bfs@loop2);
let loop3 = outline(bfs@loop3);
parallel-reduce(loop1);

// Whole-program phase: every step here is applied to the `*` selection, with
// the simpl! bundle re-run after each transformation step.
// NOTE(review): `*` presumably selects every function in the module -- confirm.
simpl!(*);
predication(*);
const-inline(*);
loop-bound-canon(*);
simpl!(*);
// NOTE(review): `fixpoint` presumably repeats its body until the passes stop
// making changes -- confirm against the scheduler documentation.
fixpoint {
  forkify(*);
  fork-guard-elim(*);
}
simpl!(*);
// predication is applied a second time here, after forkify/fork-guard-elim.
predication(*);
simpl!(*);
reduce-slf(*);
simpl!(*);

// loop1: tile, split, then hand the function to the `gpu` pass.
// NOTE(review): the meaning of the fork-tile parameters [32, 0, false, true]
// is not visible in this file (presumably tile size 32 on dimension 0 plus two
// flags) -- confirm before changing them. The same parameters are used for
// every fork-tile call in this schedule.
fork-tile[32, 0, false, true](loop1);
fork-split(loop1);
gpu(loop1);

// loop2 / loop3: re-run forkify and fork-guard-elim on just these two
// selections inside a fixpoint block, then tile and split them.
fixpoint {
  forkify(loop2, loop3);
  fork-guard-elim(loop2, loop3);
}

simpl!(loop2, loop3);
fork-tile[32, 0, false, true](loop2, loop3);
// The fork-split result is kept in `out`; a later step reads `out.bfs_2.fj0`
// from it. `out` is rebound by later `let out = ...` statements.
let out = fork-split(loop2, loop3);
clean-monoid-reduces(loop2, loop3);
simpl!(loop2, loop3);
// Only loop3 is passed to `gpu` at this point; loop2 is processed further
// below before any part of it is passed to `gpu`.
gpu(loop3);

// loop2: two rounds of fork-fission, each followed by tiling/splitting the
// `fj_bottom` part of the fission result, then outlining of the pieces.
// Statement order matters: `out` is rebound twice below and each use reads the
// most recent binding.

// Round 1: fission at `out.bfs_2.fj0`, where `out` is the fork-split result
// bound earlier in this schedule.
let fission1 = fork-fission[out.bfs_2.fj0](loop2);
simpl!(loop2);
fork-tile[32, 0, false, true](fission1.bfs_2.fj_bottom);
let out = fork-split(fission1.bfs_2.fj_bottom);
clean-monoid-reduces(loop2);
simpl!(loop2);
// Round 2: fission at `fj0` of the split produced in round 1.
let fission2 = fork-fission[out.bfs_2.fj0](loop2);
simpl!(loop2);
fork-tile[32, 0, false, true](fission2.bfs_2.fj_bottom);
let out = fork-split(fission2.bfs_2.fj_bottom);
clean-monoid-reduces(loop2);
simpl!(loop2);
// Outline three pieces: the `fj_top` of each fission round, and `fj0` of the
// final split (`out` here is the round-2 fork-split result).
let top = outline(fission1.bfs_2.fj_top);
let middle = outline(fission2.bfs_2.fj_top);
let bottom = outline(out.bfs_2.fj0);
const-inline(loop2, top, middle, bottom);
// no-memset is applied to `top` and `middle` only, not `bottom`.
no-memset(top, middle);
// The three outlined pieces are passed to `gpu`; loop2 itself is not.
gpu(top, middle, bottom);
simpl!(loop2, top, middle, bottom);

// Final steps: unforkify is applied to `init` only, then gcm runs over `*`.
// NOTE(review): gcm is presumably the final code-motion/placement pass that
// must run last -- confirm against the scheduler documentation.
unforkify(init);
gcm(*);