From 004eb2c67b7436a3cdf570c3704034d21b458d39 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Tue, 18 Feb 2025 16:27:21 -0600 Subject: [PATCH] Auto align in hercules box --- Cargo.lock | 3 +++ hercules_rt/Cargo.toml | 2 +- hercules_rt/src/lib.rs | 39 ++++++++++++++++++++++++++++----------- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1973fbbe..41ca98b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1065,6 +1065,9 @@ dependencies = [ [[package]] name = "hercules_rt" version = "0.1.0" +dependencies = [ + "aligned-vec", +] [[package]] name = "hercules_tests" diff --git a/hercules_rt/Cargo.toml b/hercules_rt/Cargo.toml index 46886b12..61a6188a 100644 --- a/hercules_rt/Cargo.toml +++ b/hercules_rt/Cargo.toml @@ -9,4 +9,4 @@ cuda = [] debug = [] [dependencies] - +aligned-vec = "*" \ No newline at end of file diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index 3b79dc48..5e8c031a 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -4,9 +4,10 @@ use std::alloc::{alloc, dealloc, Layout}; use std::marker::PhantomData; use std::ptr::{copy_nonoverlapping, write_bytes, NonNull}; use std::slice::{from_raw_parts, from_raw_parts_mut}; - use std::sync::OnceLock; +use aligned_vec::AVec; + /* * Define supporting types, functions, and macros for Hercules RT functions. For * a more in-depth discussion of the design of these utilities, see hercules_cg/ @@ -463,7 +464,7 @@ unsafe impl Sync for __RawPtrSendSync {} */ pub struct HerculesImmBox<'a, T> { #[allow(dead_code)] - cpu_alloc: OnceLock<Vec<T>>, + cpu_alloc: OnceLock<AVec<T>>, #[cfg(feature = "cuda")] cuda_alloc: OnceLock<CUDABox>, @@ -472,16 +473,32 @@ pub struct HerculesImmBox<'a, T> { cuda_ref: OnceLock<HerculesCUDARef<'a>>, } -impl<'a, T> From<&'a [T]> for HerculesImmBox<'a, T> { +impl<'a, T: Clone> From<&'a [T]> for HerculesImmBox<'a, T> { fn from(value: &'a [T]) -> Self { - HerculesImmBox { - cpu_alloc: OnceLock::new(), - #[cfg(feature = "cuda")] - cuda_alloc: OnceLock::new(), - - cpu_ref: OnceLock::from(HerculesCPURef::from_slice(value)), - #[cfg(feature = "cuda")] - cuda_ref: OnceLock::new(), + if value.as_ptr().is_aligned_to(32) { + HerculesImmBox { + cpu_alloc: OnceLock::new(), + #[cfg(feature = "cuda")] + cuda_alloc: OnceLock::new(), + + cpu_ref: OnceLock::from(HerculesCPURef::from_slice(value)), + #[cfg(feature = "cuda")] + cuda_ref: OnceLock::new(), + } + } else { + let cpu_alloc = AVec::from_slice(32, value); + let size = value.len() * size_of::<T>(); + let cpu_ref = + unsafe { HerculesCPURef::__from_parts(cpu_alloc.as_ptr() as *mut u8, size) }; + HerculesImmBox { + cpu_alloc: OnceLock::from(cpu_alloc), + #[cfg(feature = "cuda")] + cuda_alloc: OnceLock::new(), + + cpu_ref: OnceLock::from(cpu_ref), + #[cfg(feature = "cuda")] + cuda_ref: OnceLock::new(), + } } } } -- GitLab