diff --git a/hercules_cg/src/cpu.rs b/hercules_cg/src/cpu.rs
index 7d87170baa08322ba44bec42f34b6a132ecd8a35..f90657a4b81313a2dcadd6405bc86635c51606a0 100644
--- a/hercules_cg/src/cpu.rs
+++ b/hercules_cg/src/cpu.rs
@@ -103,8 +103,9 @@ impl<'a> CPUContext<'a> {
             } else {
                 write!(
                     w,
-                    "{} noalias nofree nonnull noundef %p{}",
+                    "{} noalias nofree nonnull noundef align({}) %p{}",
                     self.get_type(*ty),
+                    get_type_alignment(&self.types, *ty),
                     idx
                 )?;
             }
@@ -509,7 +510,7 @@ impl<'a> CPUContext<'a> {
                         "  {} = load {}, ptr {}\n",
                         self.get_value(id, false),
                         self.get_type(self_ty),
-                        index_ptr_name
+                        index_ptr_name,
                     )?;
                 } else {
                     // If this read doesn't reach a primitive type, just return
@@ -540,7 +541,7 @@ impl<'a> CPUContext<'a> {
                         body,
                         "  store {}, ptr {}\n",
                         self.get_value(data, true),
-                        index_ptr_name
+                        index_ptr_name,
                     )?;
                 } else {
                     // If the data item being written is not a primitive type,
diff --git a/hercules_cg/src/lib.rs b/hercules_cg/src/lib.rs
index 15946f72f86c44eaa2bf537450c1d22a57ba6992..af2420d83a550e7a50ff22962302612f21627995 100644
--- a/hercules_cg/src/lib.rs
+++ b/hercules_cg/src/lib.rs
@@ -16,7 +16,7 @@ use std::collections::BTreeMap;
 
 use hercules_ir::*;
 
-pub const LARGEST_ALIGNMENT: usize = 8;
+pub const LARGEST_ALIGNMENT: usize = 32;
 
 /*
  * The alignment of a type does not depend on dynamic constants.
@@ -33,7 +33,8 @@ pub fn get_type_alignment(types: &Vec<Type>, ty: TypeID) -> usize {
             .map(|id| get_type_alignment(types, *id))
             .max()
             .unwrap_or(1),
-        Type::Array(elem, _) => get_type_alignment(types, elem),
+        // Over-align arrays to LARGEST_ALIGNMENT to generate better vector code.
+        Type::Array(_, _) => LARGEST_ALIGNMENT,
     }
 }
 
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index e9b8f11f00edac8763ff0e376991fb1220af293c..4cf9b51a3e1ef18213a18ab039e00b1399aac21c 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -13,8 +13,10 @@ use std::sync::OnceLock;
  * src/rt.rs (the RT backend).
  */
 
+pub const LARGEST_ALIGNMENT: usize = 32;
+
 pub unsafe fn __cpu_alloc(size: usize) -> *mut u8 {
-    let ptr = alloc(Layout::from_size_align(size, 16).unwrap());
+    let ptr = alloc(Layout::from_size_align(size, LARGEST_ALIGNMENT).unwrap());
     if cfg!(feature = "debug") {
         eprintln!("__cpu_alloc: {:?}, {}", ptr, size);
         assert!(!ptr.is_null() || size == 0);
@@ -27,7 +29,7 @@ pub unsafe fn __cpu_dealloc(ptr: *mut u8, size: usize) {
         eprintln!("__cpu_dealloc: {:?}, {}", ptr, size);
         assert!(!ptr.is_null() || size == 0);
     }
-    dealloc(ptr, Layout::from_size_align(size, 16).unwrap())
+    dealloc(ptr, Layout::from_size_align(size, LARGEST_ALIGNMENT).unwrap())
 }
 
 pub unsafe fn __cpu_zero_mem(ptr: *mut u8, size: usize) {
diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs
index 9b77e51c4fa8b2a78a8fb7dc1b3b25b638eb1aac..d83ff0bb5df2a2814d550faeb800a4f0392e15c4 100644
--- a/juno_scheduler/src/pm.rs
+++ b/juno_scheduler/src/pm.rs
@@ -795,6 +795,7 @@ impl PassManager {
             .arg(&llvm_path)
             .arg("-c")
             .arg("-O3")
+            .arg("-ffast-math")
             .arg("-march=native")
             .arg("-o")
             .arg(&llvm_object)