// Pass schedule for the `srad` function (GPU variant, judging by the gpu(...)
// calls below). Statements run top to bottom; the order is significant.

// simpl!(X): runs the cleanup passes listed below, in this order, on selection X.
macro simpl!(X) {
  ccp(X);
  simplify-cfg(X);
  lift-dc-math(X);
  gvn(X);
  phi-elim(X);
  dce(X);
  infer-schedules(X);
}

// --- Setup: outline the labeled loops and mark device functions -------------
no-memset(srad@scratch);
phi-elim(*);
let sum_loop = outline(srad@loop1);
let main_loops = outline(srad@loop2 | srad@loop3);
// NOTE(review): `extract` and `compress` are not bound in this file; presumably
// they name functions defined by the program being scheduled - confirm.
gpu(main_loops, extract, compress);
simpl!(*);

// --- Whole-program cleanup, each step followed by simpl! --------------------
const-inline[true](*);
crc(*);
slf(*);
write-predication(*);
simpl!(*);
predication(*);
simpl!(*);
predication(*);
simpl!(*);

// --- Fork-join formation, repeated until the fixpoint block converges -------
fixpoint {
  forkify(*);
  fork-guard-elim(*);
  fork-coalesce(*);
}
simpl!(*);
reduce-slf(*);
simpl!(*);
array-slf(*);
simpl!(*);
slf(*);
simpl!(*);

// --- sum_loop: three rounds of tile / split, with a fission between rounds --
// Round 1 operates on sum_loop itself.
fork-dim-merge(sum_loop);
simpl!(sum_loop);
fork-tile[32, 0, false, true](sum_loop);
let split1 = fork-split(sum_loop);
clean-monoid-reduces(sum_loop);
simpl!(sum_loop);

// Round 2 operates on the `fj_bottom` label produced by the first fission.
let fission1 = fork-fission[split1.srad_0.fj0](sum_loop);
simpl!(sum_loop);
fork-tile[32, 0, false, true](fission1.srad_0.fj_bottom);
let split2 = fork-split(fission1.srad_0.fj_bottom);
clean-monoid-reduces(sum_loop);
simpl!(sum_loop);

// Round 3 operates on the `fj_bottom` label produced by the second fission.
let fission2 = fork-fission[split2.srad_0.fj0](sum_loop);
simpl!(sum_loop);
fork-tile[32, 0, false, true](fission2.srad_0.fj_bottom);
let split3 = fork-split(fission2.srad_0.fj_bottom);
clean-monoid-reduces(sum_loop);
simpl!(sum_loop);

// Outline the two fission tops and the final split's `fj0`, then mark all
// three outlined functions with gpu(...).
let first = outline(fission1.srad_0.fj_top);
let second = outline(fission2.srad_0.fj_top);
let third = outline(split3.srad_0.fj0);
gpu(first, second, third);

// --- Post-outlining cleanup -------------------------------------------------
const-inline[false](*);
ip-sroa(*);
sroa(*);
simpl!(*);

// --- Same interchange / merge / tile / split recipe for each function -------
fork-interchange[0, 1](main_loops);
fork-dim-merge(main_loops);
fork-tile[32, 0, false, true](main_loops);
dce(main_loops);
fork-split(main_loops);
simpl!(main_loops);

fork-interchange[0, 1](extract);
fork-dim-merge(extract);
fork-tile[32, 0, false, true](extract);
dce(extract);
fork-split(extract);
simpl!(extract);

fork-interchange[0, 1](compress);
fork-dim-merge(compress);
fork-tile[32, 0, false, true](compress);
dce(compress);
fork-split(compress);
simpl!(compress);

// --- Final pass over everything ---------------------------------------------
gcm(*);