From 2b2b2ea58ce976029b911e0b2d06346f6e889857 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 14:23:33 -0600
Subject: [PATCH 1/6] Use same align in GPU

---
 hercules_cg/src/gpu.rs | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 5f2feedd..14341756 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -2101,19 +2101,7 @@ namespace cg = cooperative_groups;
     }
 
     fn get_alignment(&self, type_id: TypeID) -> usize {
-        match &self.types[type_id.idx()] {
-            Type::Array(element_type, _) => self.get_alignment(*element_type),
-            Type::Product(fields) | Type::Summation(fields) => fields
-                .iter()
-                .map(|field| self.get_alignment(*field))
-                .max()
-                .unwrap_or(0),
-            Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 | Type::Float8 => 1,
-            Type::Integer16 | Type::UnsignedInteger16 | Type::BFloat16 => 2,
-            Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4,
-            Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8,
-            _ => panic!("Unsupported type for alignment"),
-        }
+        get_type_alignment(&self.types, type_id)
     }
 
     fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String {
-- 
GitLab


From 790b50d7cf518ea5698f7b00e173210a1d4fb700 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 14:37:30 -0600
Subject: [PATCH 2/6] Properly align array elements

---
 hercules_cg/src/cpu.rs |  6 +++++-
 hercules_cg/src/gpu.rs | 16 ++++++++++++++--
 hercules_cg/src/rt.rs  | 16 +++++++++++++++-
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index 552dc3a3..b15cf301 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -839,6 +839,8 @@ impl<'a> CPUContext<'a> {
                     //
                     //     ((0 * s1 + p1) * s2 + p2) * s3 + p3 ...
                     let elem_size = self.codegen_type_size(elem, body)?;
+                    let elem_align = get_type_alignment(&self.types, elem);
+                    let aligned_elem_size = Self::round_up_to(&elem_size, elem_align, body)?;
                     let mut acc_offset = "0".to_string();
                     for (p, s) in zip(pos, dims) {
                         let p = self.get_value(*p, false);
@@ -848,7 +850,7 @@ impl<'a> CPUContext<'a> {
                     }
 
                     // Convert offset in # elements -> # bytes.
-                    acc_offset = Self::multiply(&acc_offset, &elem_size, body)?;
+                    acc_offset = Self::multiply(&acc_offset, &aligned_elem_size, body)?;
                     acc_ptr = Self::gep(&acc_ptr, &acc_offset, body)?;
                     collect_ty = elem;
                 }
@@ -910,6 +912,8 @@ impl<'a> CPUContext<'a> {
                 // The size of an array is the size of the element multipled by
                 // the dynamic constant bounds.
                 let mut acc_size = self.codegen_type_size(elem, body)?;
+                acc_size =
+                    Self::round_up_to(&acc_size, get_type_alignment(&self.types, elem), body)?;
                 for dc in bounds {
                     acc_size = Self::multiply(&acc_size, &format!("%dc{}", dc.idx()), body)?;
                 }
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 14341756..25bbf1be 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1877,7 +1877,11 @@ namespace cg = cooperative_groups;
                         ")".repeat(array_indices.len())
                     ));
                     let element_size = self.get_size(*element_type, None);
-                    index_ptr.push_str(&format!(" * ({})", element_size));
+                    let element_align = self.get_alignment(*element_type);
+                    index_ptr.push_str(&format!(
+                        " * (({} + {} - 1 / {} * {}))",
+                        element_size, element_align, element_align, element_align
+                    ));
                     type_id = *element_type;
                 }
             }
@@ -2049,7 +2053,15 @@ namespace cg = cooperative_groups;
             Type::Array(element_type, extents) => {
                 assert!(num_fields.is_none());
                 let array_size = multiply_dcs(extents);
-                format!("{} * {}", self.get_size(*element_type, None), array_size)
+                let elem_align = self.get_alignment(type_id);
+                format!(
+                    "(({} + {} - 1) / {} * {}) * {}",
+                    self.get_size(*element_type, None),
+                    elem_align,
+                    elem_align,
+                    elem_align,
+                    array_size
+                )
             }
             Type::Product(fields) => {
                 let num_fields = num_fields.unwrap_or(fields.len());
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 3db0f16f..8fa0c09e 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -1111,6 +1111,13 @@ impl<'a> RTContext<'a> {
                     //
                     //     ((0 * s1 + p1) * s2 + p2) * s3 + p3 ...
                     let elem_size = self.codegen_type_size(elem);
+                    let elem_align = get_type_alignment(&self.module.types, elem);
+                    let aligned_elem_size = format!(
+                        "(({} + {}) & !{})",
+                        elem_size,
+                        elem_align - 1,
+                        elem_align - 1
+                    );
                     for (p, s) in zip(pos, dims) {
                         let p = self.get_value(*p, bb, false);
                         acc_offset = format!("{} * ", acc_offset);
@@ -1119,7 +1126,7 @@ impl<'a> RTContext<'a> {
                     }
 
                     // Convert offset in # elements -> # bytes.
-                    acc_offset = format!("({} * {})", acc_offset, elem_size);
+                    acc_offset = format!("({} * {})", acc_offset, aligned_elem_size);
                     collect_ty = elem;
                 }
             }
@@ -1192,6 +1199,13 @@ impl<'a> RTContext<'a> {
                 // The size of an array is the size of the element multipled by
                 // the dynamic constant bounds.
                 let mut acc_size = self.codegen_type_size(elem);
+                let elem_align = get_type_alignment(&self.module.types, elem);
+                acc_size = format!(
+                    "(({} + {}) & !{})",
+                    acc_size,
+                    elem_align - 1,
+                    elem_align - 1
+                );
                 for dc in bounds {
                     acc_size = format!("{} * ", acc_size);
                     self.codegen_dynamic_constant(*dc, &mut acc_size).unwrap();
-- 
GitLab


From 9a58de9e055004d05d0df66d062987c94819b790 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 14:46:40 -0600
Subject: [PATCH 3/6] whoops

---
 hercules_cg/src/gpu.rs | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 25bbf1be..d857fdda 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1879,8 +1879,10 @@ namespace cg = cooperative_groups;
                     let element_size = self.get_size(*element_type, None);
                     let element_align = self.get_alignment(*element_type);
                     index_ptr.push_str(&format!(
-                        " * (({} + {} - 1 / {} * {}))",
-                        element_size, element_align, element_align, element_align
+                        " * (({} + {}) & ~{})",
+                        element_size,
+                        element_align - 1,
+                        element_align - 1
                     ));
                     type_id = *element_type;
                 }
@@ -1932,8 +1934,10 @@ namespace cg = cooperative_groups;
                     let alignment = self.get_alignment(*type_id);
                     let size = self.get_size(*type_id, None);
                     *dynamic_shared_offset = format!(
-                        "(({} + {} - 1) / {}) * {}",
-                        dynamic_shared_offset, alignment, alignment, alignment
+                        "(({} + {}) & ~{})",
+                        dynamic_shared_offset,
+                        alignment - 1,
+                        alignment - 1,
                     );
                     write!(
                         w,
@@ -1981,8 +1985,10 @@ namespace cg = cooperative_groups;
                     let alignment = self.get_alignment(*type_id);
                     let size = self.get_size(*type_id, None);
                     *dynamic_shared_offset = format!(
-                        "(({} + {} - 1) / {}) * {}",
-                        dynamic_shared_offset, alignment, alignment, alignment
+                        "(({} + {}) & ~{})",
+                        dynamic_shared_offset,
+                        alignment - 1,
+                        alignment - 1
                     );
                     write!(
                         w,
@@ -2023,8 +2029,10 @@ namespace cg = cooperative_groups;
                 let alignment = self.get_alignment(*type_id);
                 let size = self.get_size(*type_id, None);
                 *dynamic_shared_offset = format!(
-                    "(({} + {} - 1) / {}) * {}",
-                    dynamic_shared_offset, alignment, alignment, alignment
+                    "(({} + {}) & ~{})",
+                    dynamic_shared_offset,
+                    alignment - 1,
+                    alignment - 1,
                 );
                 write!(
                     w,
@@ -2055,11 +2063,10 @@ namespace cg = cooperative_groups;
                 let array_size = multiply_dcs(extents);
                 let elem_align = self.get_alignment(type_id);
                 format!(
-                    "(({} + {} - 1) / {} * {}) * {}",
+                    "(({} + {}) & ~{}) * {}",
                     self.get_size(*element_type, None),
-                    elem_align,
-                    elem_align,
-                    elem_align,
+                    elem_align - 1,
+                    elem_align - 1,
                     array_size
                 )
             }
@@ -2073,10 +2080,7 @@ namespace cg = cooperative_groups;
                         if acc == "0" {
                             size
                         } else {
-                            format!(
-                                "({} + {} - 1) / {} * {} + {}",
-                                acc, align, align, align, size
-                            )
+                            format!("(({} + {}) & ~{}) + {}", acc, align - 1, align - 1, size)
                         }
                     })
             }
@@ -2101,8 +2105,10 @@ namespace cg = cooperative_groups;
                     .max()
                     .unwrap_or(0);
                 format!(
-                    "({} + {} - 1) / {} * {}",
-                    max_size, max_alignment, max_alignment, max_alignment
+                    "(({} + {}) & ~{})",
+                    max_size,
+                    max_alignment - 1,
+                    max_alignment - 1,
                 )
             }
             _ => {
-- 
GitLab


From 5cee37adcca4c3de33c025f4975d381cf113f2bf Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 14:52:16 -0600
Subject: [PATCH 4/6] Revert "whoops"

This reverts commit 9a58de9e055004d05d0df66d062987c94819b790.
---
 hercules_cg/src/gpu.rs | 42 ++++++++++++++++++------------------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index d857fdda..e3dcc0c3 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1879,10 +1879,8 @@ namespace cg = cooperative_groups;
                     let element_size = self.get_size(*element_type, None);
                     let element_align = self.get_alignment(*element_type);
                     index_ptr.push_str(&format!(
-                        " * (({} + {}) & ~{})",
-                        element_size,
-                        element_align - 1,
-                        element_align - 1
+                        " * (({} + {} - 1) / {} * {})",
+                        element_size, element_align, element_align, element_align
                     ));
                     type_id = *element_type;
                 }
@@ -1934,10 +1932,8 @@ namespace cg = cooperative_groups;
                     let alignment = self.get_alignment(*type_id);
                     let size = self.get_size(*type_id, None);
                     *dynamic_shared_offset = format!(
-                        "(({} + {}) & ~{})",
-                        dynamic_shared_offset,
-                        alignment - 1,
-                        alignment - 1,
+                        "(({} + {} - 1) / {}) * {}",
+                        dynamic_shared_offset, alignment, alignment, alignment
                     );
                     write!(
                         w,
@@ -1985,10 +1981,8 @@ namespace cg = cooperative_groups;
                     let alignment = self.get_alignment(*type_id);
                     let size = self.get_size(*type_id, None);
                     *dynamic_shared_offset = format!(
-                        "(({} + {}) & ~{})",
-                        dynamic_shared_offset,
-                        alignment - 1,
-                        alignment - 1
+                        "(({} + {} - 1) / {}) * {}",
+                        dynamic_shared_offset, alignment, alignment, alignment
                     );
                     write!(
                         w,
@@ -2029,10 +2023,8 @@ namespace cg = cooperative_groups;
                 let alignment = self.get_alignment(*type_id);
                 let size = self.get_size(*type_id, None);
                 *dynamic_shared_offset = format!(
-                    "(({} + {}) & ~{})",
-                    dynamic_shared_offset,
-                    alignment - 1,
-                    alignment - 1,
+                    "(({} + {} - 1) / {}) * {}",
+                    dynamic_shared_offset, alignment, alignment, alignment
                 );
                 write!(
                     w,
@@ -2063,10 +2055,11 @@ namespace cg = cooperative_groups;
                 let array_size = multiply_dcs(extents);
                 let elem_align = self.get_alignment(type_id);
                 format!(
-                    "(({} + {}) & ~{}) * {}",
+                    "(({} + {} - 1) / {} * {}) * {}",
                     self.get_size(*element_type, None),
-                    elem_align - 1,
-                    elem_align - 1,
+                    elem_align,
+                    elem_align,
+                    elem_align,
                     array_size
                 )
             }
@@ -2080,7 +2073,10 @@ namespace cg = cooperative_groups;
                         if acc == "0" {
                             size
                         } else {
-                            format!("(({} + {}) & ~{}) + {}", acc, align - 1, align - 1, size)
+                            format!(
+                                "({} + {} - 1) / {} * {} + {}",
+                                acc, align, align, align, size
+                            )
                         }
                     })
             }
@@ -2105,10 +2101,8 @@ namespace cg = cooperative_groups;
                     .max()
                     .unwrap_or(0);
                 format!(
-                    "(({} + {}) & ~{})",
-                    max_size,
-                    max_alignment - 1,
-                    max_alignment - 1,
+                    "({} + {} - 1) / {} * {}",
+                    max_size, max_alignment, max_alignment, max_alignment
                 )
             }
             _ => {
-- 
GitLab


From d71ec87a12a16b0d9929900af51f6469f8c65d02 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 15:02:26 -0600
Subject: [PATCH 5/6] increase alignment of AlignedSlice

---
 juno_samples/rodinia/cfd/src/setup.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/juno_samples/rodinia/cfd/src/setup.rs b/juno_samples/rodinia/cfd/src/setup.rs
index ad2ee961..b996f057 100644
--- a/juno_samples/rodinia/cfd/src/setup.rs
+++ b/juno_samples/rodinia/cfd/src/setup.rs
@@ -20,11 +20,11 @@ pub const VAR_DENSITY_ENERGY: usize = VAR_MOMENTUM + NDIM;
 pub const NVAR: usize = VAR_DENSITY_ENERGY + 1;
 pub const deg_angle_of_attack: f32 = 0.0;
 
-#[repr(align(32))]
-struct Alignment([u8; 32]);
+#[repr(align(64))]
+struct Alignment([u8; 64]);
 
 // An aligned slice is stored as a boxed slice and an offset number of elements
-// that we skip over to get the desired alignment (of 32 bytes)
+// that we skip over to get the desired alignment (of 64 bytes)
 pub struct AlignedSlice<T> {
     slice: Box<[T]>,
     offset: usize,
@@ -37,8 +37,8 @@ where
 {
     pub fn of_len(len: usize) -> Self {
         // The maximum number of elements that may be waisted in getting the alignment we need is
-        // (32 - alignment of T) / size of T
-        let extra_elements = (32 - std::mem::align_of::<T>()) / std::mem::size_of::<T>();
+        // (64 - alignment of T) / size of T
+        let extra_elements = (64 - std::mem::align_of::<T>()) / std::mem::size_of::<T>();
         let slice: Box<[T]> = (0..len + extra_elements)
             .map(|_| Default::default())
             .collect();
-- 
GitLab


From eeee9868779303fc8137463a08d8e7f6d45c56eb Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 25 Feb 2025 15:05:48 -0600
Subject: [PATCH 6/6] whoops

---
 hercules_cg/src/gpu.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index e3dcc0c3..a3eea274 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -2053,7 +2053,7 @@ namespace cg = cooperative_groups;
             Type::Array(element_type, extents) => {
                 assert!(num_fields.is_none());
                 let array_size = multiply_dcs(extents);
-                let elem_align = self.get_alignment(type_id);
+                let elem_align = self.get_alignment(*element_type);
                 format!(
                     "(({} + {} - 1) / {} * {}) * {}",
                     self.get_size(*element_type, None),
-- 
GitLab