diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index effdc6b2830680f35f3aa1f2e443ed1b63934c3a..76808149e7b99dce97e43f0e936536d3d13b7417 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -1,51 +1,26 @@ -macro optimize!(X) { - gvn(X); - phi-elim(X); - dce(X); - ip-sroa(X); - sroa(X); - dce(X); - gvn(X); - phi-elim(X); - dce(X); -} - -macro codegen!(X) { - gcm(*); - float-collections(*); - dce(*); - gcm(*); -} +phi-elim(*); -optimize!(*); +forkify(*); +fork-guard-elim(*); +dce(*); -fixpoint panic after 20 { - forkify(matmul); - fork-guard-elim(matmul); +fixpoint { + reduce-slf(*); + slf(*); + infer-schedules(*); } - -optimize!(*); - -fixpoint panic after 20 { - reduce-slf(matmul); - slf(matmul); - infer-schedules(matmul); +fork-coalesce(*); +infer-schedules(*); +dce(*); +rewrite(*); +fixpoint { + simplify-cfg(*); + dce(*); } -dce(matmul); -// Tile outer and middle loops into 32x32 sized blocks -fork-tile[32, 0, false, true](matmul@outer \ matmul@inner); -// Merge outer and middle loops and interchange so blocks are first -fork-coalesce(matmul@outer \ matmul@inner); -fork-interchange[1, 2](matmul@outer \ matmul@inner); -// Split forks -let split = fork-split(matmul); -// Join the threads and then blocks into a single fork each -fork-coalesce(split.matmul.fj2 \ matmul@inner); -fork-coalesce(split.matmul.fj0 \ split.matmul.fj2); +ip-sroa(*); +sroa(*); +dce(*); -let auto = auto-outline(*); float-collections(*); -gpu(auto.matmul); - -codegen!(*); +gcm(*);