From 2b2b2ea58ce976029b911e0b2d06346f6e889857 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 25 Feb 2025 14:23:33 -0600 Subject: [PATCH 1/6] Use same align in GPU --- hercules_cg/src/gpu.rs | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 5f2feedd..14341756 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -2101,19 +2101,7 @@ namespace cg = cooperative_groups; } fn get_alignment(&self, type_id: TypeID) -> usize { - match &self.types[type_id.idx()] { - Type::Array(element_type, _) => self.get_alignment(*element_type), - Type::Product(fields) | Type::Summation(fields) => fields - .iter() - .map(|field| self.get_alignment(*field)) - .max() - .unwrap_or(0), - Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 | Type::Float8 => 1, - Type::Integer16 | Type::UnsignedInteger16 | Type::BFloat16 => 2, - Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => 4, - Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => 8, - _ => panic!("Unsupported type for alignment"), - } + get_type_alignment(&self.types, type_id) } fn codegen_intrinsic(&self, intrinsic: &Intrinsic, ty: &Type) -> String { -- GitLab From 790b50d7cf518ea5698f7b00e173210a1d4fb700 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 25 Feb 2025 14:37:30 -0600 Subject: [PATCH 2/6] Properly align array elements --- hercules_cg/src/cpu.rs | 6 +++++- hercules_cg/src/gpu.rs | 16 ++++++++++++++-- hercules_cg/src/rt.rs | 16 +++++++++++++++- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs index 552dc3a3..b15cf301 100644 --- a/hercules_cg/src/cpu.rs +++ b/hercules_cg/src/cpu.rs @@ -839,6 +839,8 @@ impl<'a> CPUContext<'a> { // // ((0 * s1 + p1) * s2 + p2) * s3 + p3 ... let elem_size = self.codegen_type_size(elem, body)?; + let elem_align = get_type_alignment(&self.types, elem); + let aligned_elem_size = Self::round_up_to(&elem_size, elem_align, body)?; let mut acc_offset = "0".to_string(); for (p, s) in zip(pos, dims) { let p = self.get_value(*p, false); @@ -848,7 +850,7 @@ impl<'a> CPUContext<'a> { } // Convert offset in # elements -> # bytes. - acc_offset = Self::multiply(&acc_offset, &elem_size, body)?; + acc_offset = Self::multiply(&acc_offset, &aligned_elem_size, body)?; acc_ptr = Self::gep(&acc_ptr, &acc_offset, body)?; collect_ty = elem; } @@ -910,6 +912,8 @@ impl<'a> CPUContext<'a> { // The size of an array is the size of the element multipled by // the dynamic constant bounds. let mut acc_size = self.codegen_type_size(elem, body)?; + acc_size = + Self::round_up_to(&acc_size, get_type_alignment(&self.types, elem), body)?; for dc in bounds { acc_size = Self::multiply(&acc_size, &format!("%dc{}", dc.idx()), body)?; } diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 14341756..25bbf1be 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1877,7 +1877,11 @@ namespace cg = cooperative_groups; ")".repeat(array_indices.len()) )); let element_size = self.get_size(*element_type, None); - index_ptr.push_str(&format!(" * ({})", element_size)); + let element_align = self.get_alignment(*element_type); + index_ptr.push_str(&format!( + " * (({} + {} - 1 / {} * {}))", + element_size, element_align, element_align, element_align + )); type_id = *element_type; } } @@ -2049,7 +2053,15 @@ namespace cg = cooperative_groups; Type::Array(element_type, extents) => { assert!(num_fields.is_none()); let array_size = multiply_dcs(extents); - format!("{} * {}", self.get_size(*element_type, None), array_size) + let elem_align = self.get_alignment(type_id); + format!( + "(({} + {} - 1) / {} * {}) * {}", + self.get_size(*element_type, None), + elem_align, + elem_align, + elem_align, + array_size + ) } Type::Product(fields) => { let num_fields = num_fields.unwrap_or(fields.len()); diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 3db0f16f..8fa0c09e 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -1111,6 +1111,13 @@ impl<'a> RTContext<'a> { // // ((0 * s1 + p1) * s2 + p2) * s3 + p3 ... let elem_size = self.codegen_type_size(elem); + let elem_align = get_type_alignment(&self.module.types, elem); + let aligned_elem_size = format!( + "(({} + {}) & !{})", + elem_size, + elem_align - 1, + elem_align - 1 + ); for (p, s) in zip(pos, dims) { let p = self.get_value(*p, bb, false); acc_offset = format!("{} * ", acc_offset); @@ -1119,7 +1126,7 @@ impl<'a> RTContext<'a> { } // Convert offset in # elements -> # bytes. - acc_offset = format!("({} * {})", acc_offset, elem_size); + acc_offset = format!("({} * {})", acc_offset, aligned_elem_size); collect_ty = elem; } } @@ -1192,6 +1199,13 @@ impl<'a> RTContext<'a> { // The size of an array is the size of the element multipled by // the dynamic constant bounds. let mut acc_size = self.codegen_type_size(elem); + let elem_align = get_type_alignment(&self.module.types, elem); + acc_size = format!( + "(({} + {}) & !{})", + acc_size, + elem_align - 1, + elem_align - 1 + ); for dc in bounds { acc_size = format!("{} * ", acc_size); self.codegen_dynamic_constant(*dc, &mut acc_size).unwrap(); -- GitLab From 9a58de9e055004d05d0df66d062987c94819b790 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 25 Feb 2025 14:46:40 -0600 Subject: [PATCH 3/6] whoops --- hercules_cg/src/gpu.rs | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index 25bbf1be..d857fdda 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1879,8 +1879,10 @@ namespace cg = cooperative_groups; let element_size = self.get_size(*element_type, None); let element_align = self.get_alignment(*element_type); index_ptr.push_str(&format!( - " * (({} + {} - 1 / {} * {}))", - element_size, element_align, element_align, element_align + " * (({} + {}) & ~{})", + element_size, + element_align - 1, + element_align - 1 )); type_id = *element_type; } @@ -1932,8 +1934,10 @@ namespace cg = cooperative_groups; let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); *dynamic_shared_offset = format!( - "(({} + {} - 1) / {}) * {}", - dynamic_shared_offset, alignment, alignment, alignment + "(({} + {}) & ~{})", + dynamic_shared_offset, + alignment - 1, + alignment - 1, ); write!( w, @@ -1981,8 +1985,10 @@ namespace cg = cooperative_groups; let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); *dynamic_shared_offset = format!( - "(({} + {} - 1) / {}) * {}", - dynamic_shared_offset, alignment, alignment, alignment + "(({} + {}) & ~{})", + dynamic_shared_offset, + alignment - 1, + alignment - 1 ); write!( w, @@ -2023,8 +2029,10 @@ namespace cg = cooperative_groups; let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); *dynamic_shared_offset = format!( - "(({} + {} - 1) / {}) * {}", - dynamic_shared_offset, alignment, alignment, alignment + "(({} + {}) & ~{})", + dynamic_shared_offset, + alignment - 1, + alignment - 1, ); write!( w, @@ -2055,11 +2063,10 @@ namespace cg = cooperative_groups; let array_size = multiply_dcs(extents); let elem_align = self.get_alignment(type_id); format!( - "(({} + {} - 1) / {} * {}) * {}", + "(({} + {}) & ~{}) * {}", self.get_size(*element_type, None), - elem_align, - elem_align, - elem_align, + elem_align - 1, + elem_align - 1, array_size ) } @@ -2073,10 +2080,7 @@ namespace cg = cooperative_groups; if acc == "0" { size } else { - format!( - "({} + {} - 1) / {} * {} + {}", - acc, align, align, align, size - ) + format!("(({} + {}) & ~{}) + {}", acc, align - 1, align - 1, size) } }) } @@ -2101,8 +2105,10 @@ namespace cg = cooperative_groups; .max() .unwrap_or(0); format!( - "({} + {} - 1) / {} * {}", - max_size, max_alignment, max_alignment, max_alignment + "(({} + {}) & ~{})", + max_size, + max_alignment - 1, + max_alignment - 1, ) } _ => { -- GitLab From 5cee37adcca4c3de33c025f4975d381cf113f2bf Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 25 Feb 2025 14:52:16 -0600 Subject: [PATCH 4/6] Revert "whoops" This reverts commit 9a58de9e055004d05d0df66d062987c94819b790. --- hercules_cg/src/gpu.rs | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index d857fdda..e3dcc0c3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1879,10 +1879,8 @@ namespace cg = cooperative_groups; let element_size = self.get_size(*element_type, None); let element_align = self.get_alignment(*element_type); index_ptr.push_str(&format!( - " * (({} + {}) & ~{})", - element_size, - element_align - 1, - element_align - 1 + " * (({} + {} - 1) / {} * {})", + element_size, element_align, element_align, element_align )); type_id = *element_type; } @@ -1934,10 +1932,8 @@ namespace cg = cooperative_groups; let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); *dynamic_shared_offset = format!( - "(({} + {}) & ~{})", - dynamic_shared_offset, - alignment - 1, - alignment - 1, + "(({} + {} - 1) / {}) * {}", + dynamic_shared_offset, alignment, alignment, alignment ); write!( w, @@ -1985,10 +1981,8 @@ namespace cg = cooperative_groups; let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); *dynamic_shared_offset = format!( - "(({} + {}) & ~{})", - dynamic_shared_offset, - alignment - 1, - alignment - 1 + "(({} + {} - 1) / {}) * {}", + dynamic_shared_offset, alignment, alignment, alignment ); write!( w, @@ -2029,10 +2023,8 @@ namespace cg = cooperative_groups; let alignment = self.get_alignment(*type_id); let size = self.get_size(*type_id, None); *dynamic_shared_offset = format!( - "(({} + {}) & ~{})", - dynamic_shared_offset, - alignment - 1, - alignment - 1, + "(({} + {} - 1) / {}) * {}", + dynamic_shared_offset, alignment, alignment, alignment ); write!( w, @@ -2063,10 +2055,11 @@ namespace cg = cooperative_groups; let array_size = multiply_dcs(extents); let elem_align = self.get_alignment(type_id); format!( - "(({} + {}) & ~{}) * {}", + "(({} + {} - 1) / {} * {}) * {}", self.get_size(*element_type, None), - elem_align - 1, - elem_align - 1, + elem_align, + elem_align, + elem_align, array_size ) } @@ -2080,7 +2073,10 @@ namespace cg = cooperative_groups; if acc == "0" { size } else { - format!("(({} + {}) & ~{}) + {}", acc, align - 1, align - 1, size) + format!( + "({} + {} - 1) / {} * {} + {}", + acc, align, align, align, size + ) } }) } @@ -2105,10 +2101,8 @@ namespace cg = cooperative_groups; .max() .unwrap_or(0); format!( - "(({} + {}) & ~{})", - max_size, - max_alignment - 1, - max_alignment - 1, + "({} + {} - 1) / {} * {}", + max_size, max_alignment, max_alignment, max_alignment ) } _ => { -- GitLab From d71ec87a12a16b0d9929900af51f6469f8c65d02 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 25 Feb 2025 15:02:26 -0600 Subject: [PATCH 5/6] increase alignment of AlignedSlice --- juno_samples/rodinia/cfd/src/setup.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/juno_samples/rodinia/cfd/src/setup.rs b/juno_samples/rodinia/cfd/src/setup.rs index ad2ee961..b996f057 100644 --- a/juno_samples/rodinia/cfd/src/setup.rs +++ b/juno_samples/rodinia/cfd/src/setup.rs @@ -20,11 +20,11 @@ pub const VAR_DENSITY_ENERGY: usize = VAR_MOMENTUM + NDIM; pub const NVAR: usize = VAR_DENSITY_ENERGY + 1; pub const deg_angle_of_attack: f32 = 0.0; -#[repr(align(32))] -struct Alignment([u8; 32]); +#[repr(align(64))] +struct Alignment([u8; 64]); // An aligned slice is stored as a boxed slice and an offset number of elements -// that we skip over to get the desired alignment (of 32 bytes) +// that we skip over to get the desired alignment (of 64 bytes) pub struct AlignedSlice<T> { slice: Box<[T]>, offset: usize, @@ -37,8 +37,8 @@ where { pub fn of_len(len: usize) -> Self { // The maximum number of elements that may be waisted in getting the alignment we need is - // (32 - alignment of T) / size of T - let extra_elements = (32 - std::mem::align_of::<T>()) / std::mem::size_of::<T>(); + // (64 - alignment of T) / size of T + let extra_elements = (64 - std::mem::align_of::<T>()) / std::mem::size_of::<T>(); let slice: Box<[T]> = (0..len + extra_elements) .map(|_| Default::default()) .collect(); -- GitLab From eeee9868779303fc8137463a08d8e7f6d45c56eb Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 25 Feb 2025 15:05:48 -0600 Subject: [PATCH 6/6] whoops --- hercules_cg/src/gpu.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index e3dcc0c3..a3eea274 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -2053,7 +2053,7 @@ namespace cg = cooperative_groups; Type::Array(element_type, extents) => { assert!(num_fields.is_none()); let array_size = multiply_dcs(extents); - let elem_align = self.get_alignment(type_id); + let elem_align = self.get_alignment(*element_type); format!( "(({} + {} - 1) / {} * {}) * {}", self.get_size(*element_type, None), -- GitLab