From e73e2a976eccadc249d7e9560dec63c41e123426 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 16 Feb 2025 18:16:37 -0600
Subject: [PATCH 1/4] Emit align on pointer params, loads, and stores; add -ffast-math

---
 hercules_cg/src/cpu.rs   | 13 ++++++++-----
 juno_scheduler/src/pm.rs |  3 +++
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index 7d87170b..a19218c7 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -103,8 +103,9 @@ impl<'a> CPUContext<'a> {
             } else {
                 write!(
                     w,
-                    "{} noalias nofree nonnull noundef %p{}",
+                    "{} noalias nofree nonnull noundef align({}) %p{}",
                     self.get_type(*ty),
+                    get_type_alignment(&self.types, *ty),
                     idx
                 )?;
             }
@@ -506,10 +507,11 @@ impl<'a> CPUContext<'a> {
                     // load.
                     write!(
                         body,
-                        "  {} = load {}, ptr {}\n",
+                        "  {} = load {}, ptr {}, align {}\n",
                         self.get_value(id, false),
                         self.get_type(self_ty),
-                        index_ptr_name
+                        index_ptr_name,
+                        get_type_alignment(self.types, collect_ty),
                     )?;
                 } else {
                     // If this read doesn't reach a primitive type, just return
@@ -538,9 +540,10 @@ impl<'a> CPUContext<'a> {
                     // perform a single store of the data value.
                     write!(
                         body,
-                        "  store {}, ptr {}\n",
+                        "  store {}, ptr {}, align {}\n",
                         self.get_value(data, true),
-                        index_ptr_name
+                        index_ptr_name,
+                        get_type_alignment(self.types, collect_ty),
                     )?;
                 } else {
                     // If the data item being written is not a primitive type,
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 410e614c..1c00e3d0 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -795,6 +795,7 @@ impl PassManager {
             .arg(&llvm_path)
             .arg("-c")
             .arg("-O3")
+            .arg("-ffast-math")
             .arg("-march=native")
             .arg("-o")
             .arg(&llvm_object)
@@ -830,6 +831,8 @@ impl PassManager {
             let mut nvcc_process = Command::new("nvcc")
                 .arg("-c")
                 .arg("-O3")
+                .arg("-ffast-math")
+                .arg("-march=native")
                 .arg("-diag-suppress")
                 .arg("177")
                 .arg("-o")
-- 
GitLab


From 7764ca63040186f02411516448f8e7f81236fc0c Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 16 Feb 2025 18:19:50 -0600
Subject: [PATCH 2/4] Use larger alignment for arrays

---
 hercules_cg/src/cpu.rs | 6 ++----
 hercules_cg/src/lib.rs | 5 +++--
 hercules_rt/src/lib.rs | 6 ++++--
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index a19218c7..f90657a4 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -507,11 +507,10 @@ impl<'a> CPUContext<'a> {
                     // load.
                     write!(
                         body,
-                        "  {} = load {}, ptr {}, align {}\n",
+                        "  {} = load {}, ptr {}\n",
                         self.get_value(id, false),
                         self.get_type(self_ty),
                         index_ptr_name,
-                        get_type_alignment(self.types, collect_ty),
                     )?;
                 } else {
                     // If this read doesn't reach a primitive type, just return
@@ -540,10 +539,9 @@ impl<'a> CPUContext<'a> {
                     // perform a single store of the data value.
                     write!(
                         body,
-                        "  store {}, ptr {}, align {}\n",
+                        "  store {}, ptr {}\n",
                         self.get_value(data, true),
                         index_ptr_name,
-                        get_type_alignment(self.types, collect_ty),
                     )?;
                 } else {
                     // If the data item being written is not a primitive type,
diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs
index 15946f72..af2420d8 100644
--- a/hercules_cg/src/lib.rs
+++ b/hercules_cg/src/lib.rs
@@ -16,7 +16,7 @@ use std::collections::BTreeMap;
 
 use hercules_ir::*;
 
-pub const LARGEST_ALIGNMENT: usize = 8;
+pub const LARGEST_ALIGNMENT: usize = 32;
 
 /*
  * The alignment of a type does not depend on dynamic constants.
@@ -33,7 +33,8 @@ pub fn get_type_alignment(types: &Vec<Type>, ty: TypeID) -> usize {
             .map(|id| get_type_alignment(types, *id))
             .max()
             .unwrap_or(1),
-        Type::Array(elem, _) => get_type_alignment(types, elem),
+        // Use a large alignment for arrays to generate better vector code.
+        Type::Array(_, _) => LARGEST_ALIGNMENT,
     }
 }
 
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index f8fdf2ef..0c3ffd80 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -13,8 +13,10 @@ use std::sync::OnceLock;
  * src/rt.rs (the RT backend).
  */
 
+pub const LARGEST_ALIGNMENT: usize = 32;
+
 pub unsafe fn __cpu_alloc(size: usize) -> *mut u8 {
-    let ptr = alloc(Layout::from_size_align(size, 16).unwrap());
+    let ptr = alloc(Layout::from_size_align(size, LARGEST_ALIGNMENT).unwrap());
     if cfg!(feature = "debug") {
         eprintln!("__cpu_alloc: {:?}, {}", ptr, size);
         assert!(!ptr.is_null() || size == 0);
@@ -27,7 +29,7 @@ pub unsafe fn __cpu_dealloc(ptr: *mut u8, size: usize) {
         eprintln!("__cpu_dealloc: {:?}, {}", ptr, size);
         assert!(!ptr.is_null() || size == 0);
     }
-    dealloc(ptr, Layout::from_size_align(size, 16).unwrap())
+    dealloc(ptr, Layout::from_size_align(size, LARGEST_ALIGNMENT).unwrap())
 }
 
 pub unsafe fn __cpu_zero_mem(ptr: *mut u8, size: usize) {
-- 
GitLab


From 6ba77d30bb843202e19bdc746d80356fc8a652c2 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 16 Feb 2025 18:23:53 -0600
Subject: [PATCH 3/4] Remove -ffast-math from nvcc invocation

---
 juno_scheduler/src/pm.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 1c00e3d0..e1ff113e 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -831,7 +831,6 @@ impl PassManager {
             let mut nvcc_process = Command::new("nvcc")
                 .arg("-c")
                 .arg("-O3")
-                .arg("-ffast-math")
                 .arg("-march=native")
                 .arg("-diag-suppress")
                 .arg("177")
-- 
GitLab


From c4f5220dca24514978ab8adfa41f096b12bce54e Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 16 Feb 2025 18:27:54 -0600
Subject: [PATCH 4/4] Remove -march=native from nvcc invocation

---
 juno_scheduler/src/pm.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index e1ff113e..4555e024 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -831,7 +831,6 @@ impl PassManager {
             let mut nvcc_process = Command::new("nvcc")
                 .arg("-c")
                 .arg("-O3")
-                .arg("-march=native")
                 .arg("-diag-suppress")
                 .arg("177")
                 .arg("-o")
-- 
GitLab