From d173f53bfae51bf9b27e795104eaa4e18b537acc Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Thu, 27 Feb 2025 16:55:11 -0600
Subject: [PATCH] fused sum reduction is very fast

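---
Reviewer notes (this section is not part of the commit message):

* The outlined `srad@loop1` binding is renamed from `init_loop` to
  `sum_loop` and is no longer passed to the first `gpu(...)` call.
* A new pass sequence is appended before `gcm(*)`: `sum_loop` goes through
  fork-dim-merge, then fork-tile[32, ...] + fork-split, then fork-fission on
  `out.srad_0.fj0`; the resulting `fj_bottom` is tiled and split a second
  time, with clean-monoid-reduces and simpl! run after each split.
* `fission.srad_0.fj_top` and `out.srad_0.fj0` are then outlined as `top` and
  `bottom` and handed to a separate `gpu(top, bottom)` call, followed by
  ip-sroa, sroa and simpl! over everything.
* `out` is bound twice; the `out.srad_0.fj0` outlined as `bottom` comes after
  the second `let out`, i.e. the fork-split of `fission.srad_0.fj_bottom`.
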
 juno_samples/rodinia/srad/src/gpu.sch | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch
index f7885f9b..289548f9 100644
--- a/juno_samples/rodinia/srad/src/gpu.sch
+++ b/juno_samples/rodinia/srad/src/gpu.sch
@@ -9,9 +9,9 @@ macro simpl!(X) {
 }
 
 phi-elim(*);
-let init_loop = outline(srad@loop1);
+let sum_loop = outline(srad@loop1);
 let main_loops = outline(srad@loop2 | srad@loop3);
-gpu(init_loop, main_loops, extract, compress);
+gpu(main_loops, extract, compress);
 simpl!(*);
 const-inline[true](*);
 crc(*);
@@ -35,4 +35,23 @@ simpl!(*);
 slf(*);
 simpl!(*);
 
+fork-dim-merge(sum_loop);
+simpl!(sum_loop);
+fork-tile[32, 0, false, true](sum_loop);
+let out = fork-split(sum_loop);
+clean-monoid-reduces(sum_loop);
+simpl!(sum_loop);
+let fission = fork-fission[out.srad_0.fj0](sum_loop);
+simpl!(sum_loop);
+fork-tile[32, 0, false, true](fission.srad_0.fj_bottom);
+let out = fork-split(fission.srad_0.fj_bottom);
+clean-monoid-reduces(sum_loop);
+simpl!(sum_loop);
+let top = outline(fission.srad_0.fj_top);
+let bottom = outline(out.srad_0.fj0);
+gpu(top, bottom);
+ip-sroa(*);
+sroa(*);
+simpl!(*);
+
 gcm(*);
-- 
GitLab