// simpl!(X): reusable cleanup pipeline applied to the selection X.
// Invokes ccp, simplify-cfg, lift-dc-math, gvn, phi-elim, dce and
// infer-schedules on X, in exactly that order. The rest of this schedule
// calls it after each group of structural transformations.
// NOTE(review): the pass names suggest constant propagation, CFG
// simplification, value numbering and dead-code elimination -- confirm
// against the scheduler's pass documentation before reordering anything.
macro simpl!(X) {
  ccp(X);
  simplify-cfg(X);
  lift-dc-math(X);
  gvn(X);
  phi-elim(X);
  dce(X);
  // Runs last in the macro, after all of the cleanup passes above.
  infer-schedules(X);
}

// Stage 1: prepare `bfs` and split it into separately schedulable pieces.
// phi-elim runs on bfs before any region is outlined.
phi-elim(bfs);
no-memset(bfs@cost);
// Outline three labeled regions of bfs (cost_init, loop1, loop2) and bind
// the results to names that the rest of this schedule refers to.
let init = outline(bfs@cost_init);
let traverse = outline(bfs@loop1);
let collect = outline(bfs@loop2);
parallel-reduce(traverse, collect);
// NOTE(review): make_stop_prod is not bound anywhere in this file;
// presumably it names a function in the program being scheduled -- confirm.
no-memset(make_stop_prod);
// Applies `gpu` to the three outlined pieces plus make_stop_prod.
// NOTE(review): presumably this assigns them to the GPU device -- confirm.
gpu(init, traverse, make_stop_prod, collect);

// Stage 2: pipeline applied to every function (`*`). Each transformation
// step below is followed by simpl!(*) before the next step runs.
simpl!(*);
predication(*);
const-inline(*);
loop-bound-canon(*);
simpl!(*);
// Repeat forkify + fork-guard-elim inside a fixpoint block.
// NOTE(review): presumably this iterates until neither pass makes further
// changes -- confirm the fixpoint semantics.
fixpoint {
  forkify(*);
  fork-guard-elim(*);
}
simpl!(*);
// predication is run a second time, now after forkify has been applied.
predication(*);
simpl!(*);
reduce-slf(*);
simpl!(*);

// Stage 3: run the forkify / fork-guard-elim fixpoint again, restricted to
// `collect`, then clean up only `collect` with simpl!.
// NOTE(review): this repeats the whole-program fixpoint above for a single
// function; presumably the intervening passes expose new opportunities in
// `collect` -- confirm before removing it as redundant.
fixpoint {
  forkify(collect);
  fork-guard-elim(collect);
}
simpl!(collect);

// Stage 4: tile and split the forks of the three outlined functions.
// make_stop_prod is passed to gpu(...) earlier but is not tiled or split here.
// NOTE(review): the meaning of the fork-tile arguments [1024, 0, false, true]
// is not visible in this file; 1024 is presumably the tile size -- confirm
// against the fork-tile pass documentation.
fork-tile[1024, 0, false, true](init, traverse, collect);
fork-split(init, traverse, collect);

// gcm is the final pass of the schedule and is applied to every function.
gcm(*);