// simpl!(X): runs a fixed sequence of passes over the selection X, in this
// order: ccp, simplify-cfg, lift-dc-math, gvn, phi-elim, dce, infer-schedules.
// The schedule below invokes it after most transformation steps.
// NOTE(review): presumably serves as the general clean-up pipeline; the pass
// order is kept exactly as written because later passes may depend on it.
macro simpl!(X) {
  ccp(X);
  simplify-cfg(X);
  lift-dc-math(X);
  gvn(X);
  phi-elim(X);
  dce(X);
  infer-schedules(X);
}

// Stage 1: module-wide preparation.
// Apply no-memset to the value labeled `scratch` inside `srad`.
no-memset(srad@scratch);
phi-elim(*);
// Outline the labeled loops of `srad` into their own functions: `loop1` on its
// own (bound to sum_loop), and `loop2` together with `loop3` (bound to
// main_loops).
let sum_loop = outline(srad@loop1);
let main_loops = outline(srad@loop2 | srad@loop3);
// Mark main_loops, extract and compress for the GPU.
// NOTE(review): `extract` and `compress` are not bound anywhere in this file;
// presumably they are functions defined in the module being scheduled.
gpu(main_loops, extract, compress);
simpl!(*);
const-inline[true](*);
crc(*);
slf(*);
// write-predication once, then predication twice, each followed by simpl!.
write-predication(*);
simpl!(*);
predication(*);
simpl!(*);
predication(*);
simpl!(*);
// Repeat forkify / fork-guard-elim / fork-coalesce as a group.
// NOTE(review): `fixpoint` presumably iterates until the passes stop making
// changes; confirm against the scheduler documentation.
fixpoint {
  forkify(*);
  fork-guard-elim(*);
  fork-coalesce(*);
}
simpl!(*);
// Run reduce-slf, array-slf and slf in turn over the whole module, each
// followed by simpl!.
reduce-slf(*);
simpl!(*);
array-slf(*);
simpl!(*);
slf(*);
simpl!(*);

// Stage 2: restructure sum_loop in three rounds of tile -> split -> clean-up.
// Each round rebinds `out` to the result of its fork-split; the next round's
// fork-fission is keyed on `out.srad_0.fj0` from the round before it.
// NOTE(review): the meaning of the fork-tile arguments [32, 0, false, true] is
// not visible here (presumably tile size 32 on dimension 0 plus two flags);
// confirm against the fork-tile pass documentation.

// Round 1: operates on the whole of sum_loop.
fork-dim-merge(sum_loop);
simpl!(sum_loop);
fork-tile[32, 0, false, true](sum_loop);
let out = fork-split(sum_loop);
clean-monoid-reduces(sum_loop);
simpl!(sum_loop);

// Round 2: fission at the fj0 label from round 1, then tile and split only
// the `fj_bottom` part of the fission result. Clean-up still runs on all of
// sum_loop.
let fission1 = fork-fission[out.srad_0.fj0](sum_loop);
simpl!(sum_loop);
fork-tile[32, 0, false, true](fission1.srad_0.fj_bottom);
let out = fork-split(fission1.srad_0.fj_bottom);
clean-monoid-reduces(sum_loop);
simpl!(sum_loop);

// Round 3: same shape as round 2, keyed on the fj0 label from round 2's split.
// The `out` bound here is the one that remains visible to later statements.
let fission2 = fork-fission[out.srad_0.fj0](sum_loop);
simpl!(sum_loop);
fork-tile[32, 0, false, true](fission2.srad_0.fj_bottom);
let out = fork-split(fission2.srad_0.fj_bottom);
clean-monoid-reduces(sum_loop);
simpl!(sum_loop);

// Stage 3: outline three pieces of the restructured sum loop into separate
// functions and mark them for the GPU: the `fj_top` part of each fission
// result, and the `fj0` label of `out`. `out` here is the most recent
// `let out = fork-split(...)` binding above, since each rebinding shadows the
// previous one.
let first = outline(fission1.srad_0.fj_top);
let second = outline(fission2.srad_0.fj_top);
let third = outline(out.srad_0.fj0);
gpu(first, second, third);
// Module-wide const-inline (this time with argument `false`), then ip-sroa,
// sroa, and the simpl! pipeline.
const-inline[false](*);
ip-sroa(*);
sroa(*);
simpl!(*);

// Stage 4: apply the same six-step recipe to main_loops, extract and compress
// in turn: fork-interchange[0, 1], fork-dim-merge, fork-tile, dce, fork-split,
// then simpl!. The fork-split result is not bound to a name in any of the
// three stanzas.
// NOTE(review): the meaning of the fork-tile arguments [32, 0, false, true] is
// not visible here; confirm against the fork-tile pass documentation.

// main_loops
fork-interchange[0, 1](main_loops);
fork-dim-merge(main_loops);
fork-tile[32, 0, false, true](main_loops);
dce(main_loops);
fork-split(main_loops);
simpl!(main_loops);

// extract
fork-interchange[0, 1](extract);
fork-dim-merge(extract);
fork-tile[32, 0, false, true](extract);
dce(extract);
fork-split(extract);
simpl!(extract);

// compress
fork-interchange[0, 1](compress);
fork-dim-merge(compress);
fork-tile[32, 0, false, true](compress);
dce(compress);
fork-split(compress);
simpl!(compress);

// Final step: run gcm over the whole module, after every other pass in this
// schedule.
gcm(*);