From 004eb2c67b7436a3cdf570c3704034d21b458d39 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Tue, 18 Feb 2025 16:27:21 -0600
Subject: [PATCH] Auto align in hercules box

---
 Cargo.lock             |  3 +++
 hercules_rt/Cargo.toml |  2 +-
 hercules_rt/src/lib.rs | 39 ++++++++++++++++++++++++++++-----------
 3 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 1973fbbe..41ca98b7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1065,6 +1065,9 @@ dependencies = [
 [[package]]
 name = "hercules_rt"
 version = "0.1.0"
+dependencies = [
+ "aligned-vec",
+]
 
 [[package]]
 name = "hercules_tests"
diff --git a/hercules_rt/Cargo.toml b/hercules_rt/Cargo.toml
index 46886b12..61a6188a 100644
--- a/hercules_rt/Cargo.toml
+++ b/hercules_rt/Cargo.toml
@@ -9,4 +9,4 @@ cuda = []
 debug = []
 
 [dependencies]
-
+aligned-vec = "*"
\ No newline at end of file
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 3b79dc48..5e8c031a 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -4,9 +4,10 @@ use std::alloc::{alloc, dealloc, Layout};
 use std::marker::PhantomData;
 use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
 use std::slice::{from_raw_parts, from_raw_parts_mut};
-
 use std::sync::OnceLock;
 
+use aligned_vec::AVec;
+
 /*
  * Define supporting types, functions, and macros for Hercules RT functions. For
  * a more in-depth discussion of the design of these utilities, see hercules_cg/
@@ -463,7 +464,7 @@ unsafe impl Sync for __RawPtrSendSync {}
  */
 pub struct HerculesImmBox<'a, T> {
     #[allow(dead_code)]
-    cpu_alloc: OnceLock<Vec<T>>,
+    cpu_alloc: OnceLock<AVec<T>>,
     #[cfg(feature = "cuda")]
     cuda_alloc: OnceLock<CUDABox>,
 
@@ -472,16 +473,32 @@ pub struct HerculesImmBox<'a, T> {
     cuda_ref: OnceLock<HerculesCUDARef<'a>>,
 }
 
-impl<'a, T> From<&'a [T]> for HerculesImmBox<'a, T> {
+impl<'a, T: Clone> From<&'a [T]> for HerculesImmBox<'a, T> {
     fn from(value: &'a [T]) -> Self {
-        HerculesImmBox {
-            cpu_alloc: OnceLock::new(),
-            #[cfg(feature = "cuda")]
-            cuda_alloc: OnceLock::new(),
-
-            cpu_ref: OnceLock::from(HerculesCPURef::from_slice(value)),
-            #[cfg(feature = "cuda")]
-            cuda_ref: OnceLock::new(),
+        if value.as_ptr().is_aligned_to(32) {
+            HerculesImmBox {
+                cpu_alloc: OnceLock::new(),
+                #[cfg(feature = "cuda")]
+                cuda_alloc: OnceLock::new(),
+
+                cpu_ref: OnceLock::from(HerculesCPURef::from_slice(value)),
+                #[cfg(feature = "cuda")]
+                cuda_ref: OnceLock::new(),
+            }
+        } else {
+            let cpu_alloc = AVec::from_slice(32, value);
+            let (ptr, size) = (cpu_alloc.as_ptr() as *mut u8, std::mem::size_of_val(value));
+            // SAFETY: `ptr`/`size` cover the copy owned by `cpu_alloc`, which the box keeps.
+            let cpu_ref = unsafe { HerculesCPURef::__from_parts(ptr, size) };
+            HerculesImmBox {
+                cpu_alloc: OnceLock::from(cpu_alloc),
+                #[cfg(feature = "cuda")]
+                cuda_alloc: OnceLock::new(),
+
+                cpu_ref: OnceLock::from(cpu_ref),
+                #[cfg(feature = "cuda")]
+                cuda_ref: OnceLock::new(),
+            }
         }
     }
 }
-- 
GitLab