Skip to content
Snippets Groups Projects

Safe rust interface

Merged Aaron Councilman requested to merge safe-rust-interface into main
All threads resolved!
Files
3
+ 429
0
#![feature(once_cell_try)]
use std::alloc::{alloc, dealloc, Layout};
use std::marker::PhantomData;
use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
use std::slice::{from_raw_parts, from_raw_parts_mut};
use std::sync::OnceLock;
/*
* Define supporting types, functions, and macros for Hercules RT functions. For
* a more in-depth discussion of the design of these utilities, see hercules_cg/
@@ -278,6 +282,14 @@ impl<'a> HerculesCUDARefMut<'a> {
}
}
/// Produces a second `HerculesCUDARefMut` to the same buffer (same pointer
/// and size). Taking `&'a mut self` keeps the original mutably borrowed for
/// the duplicate's entire lifetime, so exclusive access to the underlying
/// memory is preserved even though two handles exist.
pub fn dup(&'a mut self) -> Self {
    HerculesCUDARefMut {
        ptr: self.ptr,
        size: self.size,
        _phantom: PhantomData,
    }
}
/// Exposes the raw pointer to the underlying buffer.
///
/// # Safety
/// The caller must not use the returned pointer beyond the lifetime of this
/// reference, and must not create aliasing accesses that violate the
/// exclusivity this mutable reference guarantees.
pub unsafe fn __ptr(&self) -> *mut u8 {
    self.ptr.as_ptr()
}
@@ -330,6 +342,10 @@ impl CUDABox {
_phantom: PhantomData,
}
}
/// Returns the size of this allocation in bytes.
pub fn get_bytes(&self) -> usize {
    self.size
}
}
#[cfg(feature = "cuda")]
@@ -354,3 +370,416 @@ macro_rules! runner {
<concat_idents!(HerculesRunner_, $x)>::new()
};
}
/*
* A HerculesBox holds memory that can be on any device and provides a common interface to moving
* data where it is needed.
*
* It can hold CPU and device allocations to basically point at the memory it represents. It can
* also hold Hercules references either to those allocations it holds or to other allocations not
* held by this Box (in which case the appropriate allocation field should be None).
*
* The data held at all of its non-None allocations and references is maintained so that it is the
* same, and so methods will attempt to use the reference or allocation that is most convenient.
*
* HerculesImmBox holds references to immutable memory only. All operations on these are through
* immutable references, though internally it uses OnceLocks to protect its resources since the Box
* may be used in multiple parallel threads if it is used in parallel Hercules code invocation.
* We use OnceLocks since the data is immutable and so once it has been placed on a device movement
* is not necessary.
*
* We maintain the invariant that at least one of the device references is always set, their
* associated allocations may or may not be set, as those may not be needed if the allocation is
* held elsewhere.
*
* HerculesMutBox holds memory on some device and can produce mutable references to that data
* on any device. All these operations are through mutable references since this ensures exclusive
* access to the Box and therefore to the underlying device memory. Because of the exclusive access
* locks are not needed.
*
* Note that a HerculesMutBox may hold multiple allocations at once, but it tracks the "definitive"
* copy to be the one borrowed mutably most recently (since it may have been updated). The extra
* allocations are kept around to avoid reallocation if memory is moved back to the device.
*/
pub struct HerculesImmBox<'a, T> {
    // Owned CPU copy, set only when the data had to be copied to the host;
    // `cpu_ref` then borrows from it (hence the field is never read directly).
    #[allow(dead_code)]
    cpu_alloc: OnceLock<Vec<T>>,
    // Owned CUDA copy, set only when the data had to be copied to the device.
    #[cfg(feature = "cuda")]
    cuda_alloc: OnceLock<CUDABox>,
    // Reference to the CPU copy of the data, if one exists.
    cpu_ref: OnceLock<HerculesCPURef<'a>>,
    // Reference to the CUDA copy of the data, if one exists.
    #[cfg(feature = "cuda")]
    cuda_ref: OnceLock<HerculesCUDARef<'a>>,
}
impl<'a, T> From<&'a [T]> for HerculesImmBox<'a, T> {
    /// Wraps a borrowed CPU slice. Only the CPU reference starts populated;
    /// device copies are created lazily when first requested.
    fn from(value: &'a [T]) -> Self {
        let cpu_ref = OnceLock::from(HerculesCPURef::from_slice(value));
        Self {
            cpu_alloc: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_alloc: OnceLock::new(),
            cpu_ref,
            #[cfg(feature = "cuda")]
            cuda_ref: OnceLock::new(),
        }
    }
}
impl<'a, T> From<HerculesCPURef<'a>> for HerculesImmBox<'a, T> {
fn from(value: HerculesCPURef<'a>) -> Self {
HerculesImmBox {
cpu_alloc: OnceLock::new(),
#[cfg(feature = "cuda")]
cuda_alloc: OnceLock::new(),
cpu_ref: OnceLock::from(value),
#[cfg(feature = "cuda")]
cuda_ref: OnceLock::new(),
}
}
}
// If we are building from a mutable reference, we demote that to a non-mutable reference since we
// don't hold mutable references.
impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesImmBox<'a, T> {
fn from(value: HerculesCPURefMut<'a>) -> Self {
HerculesImmBox {
cpu_alloc: OnceLock::new(),
#[cfg(feature = "cuda")]
cuda_alloc: OnceLock::new(),
cpu_ref: OnceLock::from(value.as_ref()),
#[cfg(feature = "cuda")]
cuda_ref: OnceLock::new(),
}
}
}
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARef<'a>> for HerculesImmBox<'a, T> {
    /// Adopts an existing CUDA reference; a CPU copy is only made on demand.
    fn from(value: HerculesCUDARef<'a>) -> Self {
        let cuda_ref = OnceLock::from(value);
        Self {
            cpu_alloc: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_alloc: OnceLock::new(),
            cpu_ref: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_ref,
        }
    }
}
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesImmBox<'a, T> {
    /// Demotes a mutable CUDA reference to an immutable one, as this box only
    /// holds immutable references.
    fn from(value: HerculesCUDARefMut<'a>) -> Self {
        let cuda_ref = OnceLock::from(value.as_ref());
        Self {
            cpu_alloc: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_alloc: OnceLock::new(),
            cpu_ref: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_ref,
        }
    }
}
impl<'a, T> HerculesImmBox<'a, T>
where
    T: Default + Clone
{
    /// Returns the data as a CPU slice, copying it from the device first if
    /// no CPU copy exists yet.
    // NOTE(review): the returned lifetime is `'a` even though the backing
    // storage may be the box-owned `cpu_alloc` — confirm this is sound given
    // how callers hold the box.
    pub fn as_slice<'b>(&'b self) -> &'a [T] {
        self.as_cpu_ref().as_slice()
    }

    /// Clones the data into a freshly allocated `Vec`.
    pub fn to_vec<'b>(&'b self) -> Vec<T> {
        Vec::from(self.as_cpu_ref().as_slice())
    }

    /// Returns a CPU reference to the data. If only a CUDA reference exists,
    /// the data is copied to a host allocation exactly once (guarded by the
    /// `OnceLock`, so concurrent callers race safely).
    ///
    /// Panics if the box holds neither a CPU nor a CUDA reference.
    pub fn as_cpu_ref<'b>(&'b self) -> HerculesCPURef<'a> {
        if let Some(cpu_ref) = self.cpu_ref.get() {
            cpu_ref.clone()
        } else {
            #[cfg(feature = "cuda")]
            if let Some(cuda_ref) = self.cuda_ref.get() {
                return
                self.cpu_ref.get_or_init(|| {
                    // Element count implied by the device buffer's byte size.
                    let elements = unsafe { cuda_ref.__size() / size_of::<T>() };

                    let mut alloc = Vec::new();
                    alloc.resize_with(elements, Default::default);
                    // Copy device data into the freshly default-initialized host buffer.
                    let _ = cuda_ref.clone().to_cpu_ref(&mut alloc);

                    // Store the allocation in the box so the returned reference
                    // can borrow from stable storage.
                    self.cpu_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cpu_alloc was set unexpectedly");
                    let alloc = self.cpu_alloc.get().unwrap();

                    HerculesCPURef::from_slice(alloc)
                }).clone();
            }

            panic!("HerculesImmBox has no reference to data")
        }
    }

    /// Returns a CUDA reference to the data. If only a CPU reference exists,
    /// the data is copied to a device allocation exactly once.
    ///
    /// Panics if the box holds neither a CPU nor a CUDA reference.
    #[cfg(feature = "cuda")]
    pub fn as_cuda_ref<'b>(&'b self) -> HerculesCUDARef<'a> {
        if let Some(cuda_ref) = self.cuda_ref.get() {
            cuda_ref.clone()
        } else {
            if let Some(cpu_ref) = self.cpu_ref.get() {
                return self.cuda_ref.get_or_init(|| {
                    // Copy data to CUDA device
                    let alloc = CUDABox::from_cpu_ref(cpu_ref.clone());
                    self.cuda_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cuda_alloc was set unexpectedly");
                    self.cuda_alloc.get().unwrap().get_ref()
                }).clone();
            }

            panic!("HerculesImmBox has no reference to data")
        }
    }
}
// Which device currently holds the definitive (most recently borrowed
// mutably) copy of a HerculesMutBox's data.
enum HerculesMutBoxLocation {
    CPU,
    #[cfg(feature = "cuda")]
    CUDA,
}
// A possibly-absent piece of memory, held either as a borrowed reference (R)
// or as an owned allocation (A).
enum Allocation<R, A> {
    None,
    Reference(R),
    Allocation(A),
}

impl<R, A> Allocation<R, A> {
    // Moves the value out, leaving `Allocation::None` in its place.
    fn take(&mut self) -> Allocation<R, A> {
        std::mem::replace(self, Allocation::None)
    }
}
// A box for mutable data. `loc` names the device holding the definitive copy;
// any other allocation present is stale but kept to avoid reallocating when
// the data moves back.
pub struct HerculesMutBox<'a, T> {
    loc: HerculesMutBoxLocation,
    // CPU copy: a borrowed mutable slice or an owned Vec.
    cpu_alloc: Allocation<&'a mut [T], Vec<T>>,
    // CUDA copy: a borrowed mutable device reference or an owned device buffer.
    #[cfg(feature = "cuda")]
    cuda_alloc: Allocation<HerculesCUDARefMut<'a>, CUDABox>,
}
impl<'a, T> From<&'a mut [T]> for HerculesMutBox<'a, T> {
    /// Borrows a mutable CPU slice as the definitive copy.
    fn from(value: &'a mut [T]) -> Self {
        let cpu_alloc = Allocation::Reference(value);
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc,
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::None,
        }
    }
}
impl<'a, T> From<Vec<T>> for HerculesMutBox<'a, T> {
    /// Takes ownership of a CPU vector as the definitive copy.
    fn from(value: Vec<T>) -> Self {
        let cpu_alloc = Allocation::Allocation(value);
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc,
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::None,
        }
    }
}
impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<'a, T> {
    /// Clones the referenced CPU data into an owned allocation, since an
    /// immutable reference cannot back a mutable box.
    fn from(value: HerculesCPURef<'a>) -> Self {
        let owned: Vec<T> = value.as_slice().to_vec();
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc: Allocation::Allocation(owned),
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::None,
        }
    }
}
impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> {
    /// Borrows the underlying mutable CPU slice as the definitive copy.
    fn from(value: HerculesCPURefMut<'a>) -> Self {
        let cpu_alloc = Allocation::Reference(value.as_slice());
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc,
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::None,
        }
    }
}
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<'a, T> {
    /// Copies the referenced device data into an owned device allocation,
    /// since an immutable reference cannot back a mutable box.
    fn from(value: HerculesCUDARef<'a>) -> Self {
        let cuda_box = CUDABox::from_cuda_ref(value);
        Self {
            loc: HerculesMutBoxLocation::CUDA,
            cpu_alloc: Allocation::None,
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::Allocation(cuda_box),
        }
    }
}
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
    /// Borrows a mutable device reference as the definitive copy.
    fn from(value: HerculesCUDARefMut<'a>) -> Self {
        let cuda_alloc = Allocation::Reference(value);
        Self {
            loc: HerculesMutBoxLocation::CUDA,
            cpu_alloc: Allocation::None,
            #[cfg(feature = "cuda")]
            cuda_alloc,
        }
    }
}
impl<'a, T> HerculesMutBox<'a, T>
where
    T: Default + Clone
{
    /// Returns a mutable CPU slice of the data, copying it back from the
    /// device first if the definitive copy currently lives there.
    pub fn as_slice(&'a mut self) -> &'a mut [T] {
        self.as_cpu_ref().as_slice()
    }

    /// Produces a mutable CPU reference to the data, transferring it from the
    /// device if needed and marking the CPU copy as definitive.
    ///
    /// Panics if no allocation exists for the current location.
    pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> {
        match self.loc {
            HerculesMutBoxLocation::CPU => {
                match self.cpu_alloc {
                    Allocation::None => panic!("No CPU reference"),
                    Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val),
                    Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val),
                }
            }
            #[cfg(feature = "cuda")]
            HerculesMutBoxLocation::CUDA => {
                // The definitive copy is on the device; take an immutable view
                // of it to copy from.
                let cuda_ref : HerculesCUDARef<'a> =
                    match self.cuda_alloc {
                        Allocation::None => panic!("No GPU reference"),
                        Allocation::Reference(ref mut val) => val.dup().as_ref(),
                        Allocation::Allocation(ref val) => val.get_ref(),
                    };
                // Element count implied by the device buffer's byte size.
                let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
                // Allocate host memory (if needed): reuse the existing CPU
                // reference/allocation only if its length matches exactly.
                let cpu_alloc : Allocation<&'a mut [T], Vec<T>> =
                    match self.cpu_alloc.take() {
                        Allocation::Reference(val) if val.len() == elements => Allocation::Reference(val),
                        Allocation::Allocation(val) if val.len() == elements => Allocation::Allocation(val),
                        _ => {
                            let mut alloc = Vec::new();
                            alloc.resize_with(elements, Default::default);
                            Allocation::Allocation(alloc)
                        }
                    };
                // Store first so the returned reference borrows from stable
                // storage inside the box.
                self.cpu_alloc = cpu_alloc;
                let cpu_ref : &'a mut [T] =
                    match &mut self.cpu_alloc {
                        Allocation::None => panic!(),
                        Allocation::Reference(val) => val,
                        Allocation::Allocation(val) => val,
                    };
                // Transfer data from CUDA device
                let cpu_ref = cuda_ref.to_cpu_ref(cpu_ref);
                // The CPU copy is now the definitive one.
                self.loc = HerculesMutBoxLocation::CPU;
                cpu_ref
            }
        }
    }

    /// Produces a mutable CUDA reference to the data, transferring it to the
    /// device if needed and marking the CUDA copy as definitive.
    ///
    /// Panics if no allocation exists for the current location.
    #[cfg(feature = "cuda")]
    pub fn as_cuda_ref(&'a mut self) -> HerculesCUDARefMut<'a> {
        match self.loc {
            HerculesMutBoxLocation::CPU => {
                let cpu_ref : &'a [T] =
                    match self.cpu_alloc {
                        Allocation::None => panic!("No CPU reference"),
                        Allocation::Reference(ref val) => val,
                        Allocation::Allocation(ref val) => val,
                    };
                let size = cpu_ref.len() * size_of::<T>();
                // Reuse an existing device allocation of the right size;
                // otherwise create a fresh one (which also performs the copy,
                // tracked by `copied` so we don't transfer twice).
                let (cuda_alloc, copied) =
                    match self.cuda_alloc.take() {
                        Allocation::Reference(val) if unsafe { val.__size() == size } => (Allocation::Reference(val), false),
                        Allocation::Allocation(val) if val.get_bytes() == size => (Allocation::Allocation(val), false),
                        _ => {
                            let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref));
                            (Allocation::Allocation(alloc), true)
                        }
                    };
                self.cuda_alloc = cuda_alloc;
                let cuda_ref =
                    match self.cuda_alloc {
                        Allocation::None => panic!(),
                        Allocation::Reference(ref mut val) => val.dup(),
                        Allocation::Allocation(ref mut val) => val.get_ref_mut(),
                    };
                // Reused allocations still hold stale data; copy host -> device.
                if !copied {
                    unsafe {
                        __copy_cpu_to_cuda(cuda_ref.__ptr(), cpu_ref.as_ptr() as *mut u8, size);
                    }
                }
                // The device copy is now the definitive one.
                self.loc = HerculesMutBoxLocation::CUDA;
                cuda_ref
            }
            HerculesMutBoxLocation::CUDA => {
                match self.cuda_alloc {
                    Allocation::None => panic!("No GPU reference"),
                    Allocation::Reference(ref mut val) => val.dup(),
                    Allocation::Allocation(ref mut val) => val.get_ref_mut(),
                }
            }
        }
    }
}
// Conversion trait for obtaining an immutable reference of the requested
// device type (`T`) from a HerculesImmBox.
pub trait HerculesImmBoxTo<'a, T> {
    fn to(&self) -> T;
}

impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
where T: Default + Clone
{
    // Delegates to as_cpu_ref, copying from the device on first use if needed.
    fn to(&self) -> HerculesCPURef<'a> {
        self.as_cpu_ref()
    }
}

#[cfg(feature = "cuda")]
impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
where T: Default + Clone
{
    // Delegates to as_cuda_ref, copying to the device on first use if needed.
    fn to(&self) -> HerculesCUDARef<'a> {
        self.as_cuda_ref()
    }
}
// Conversion trait for obtaining a mutable reference of the requested device
// type (`T`) from a HerculesMutBox; takes `&mut self` to preserve exclusivity.
pub trait HerculesMutBoxTo<'a, T> {
    fn to(&'a mut self) -> T;
}

impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
where T: Default + Clone
{
    // Delegates to as_cpu_ref, moving data back from the device if needed.
    fn to(&'a mut self) -> HerculesCPURefMut<'a> {
        self.as_cpu_ref()
    }
}

#[cfg(feature = "cuda")]
impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
where T: Default + Clone
{
    // Delegates to as_cuda_ref, moving data to the device if needed.
    fn to(&'a mut self) -> HerculesCUDARefMut<'a> {
        self.as_cuda_ref()
    }
}
Loading