Skip to content
Snippets Groups Projects

Safe rust interface

Merged Aaron Councilman requested to merge safe-rust-interface into main
Files
3
+ 353
0
 
#![feature(once_cell_try)]
 
use std::alloc::{alloc, dealloc, Layout};
use std::alloc::{alloc, dealloc, Layout};
use std::marker::PhantomData;
use std::marker::PhantomData;
use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
use std::slice::{from_raw_parts, from_raw_parts_mut};
use std::slice::{from_raw_parts, from_raw_parts_mut};
 
use std::sync::OnceLock;
 
/*
/*
* Define supporting types, functions, and macros for Hercules RT functions. For
* Define supporting types, functions, and macros for Hercules RT functions. For
* a more in-depth discussion of the design of these utilities, see hercules_cg/
* a more in-depth discussion of the design of these utilities, see hercules_cg/
@@ -330,6 +334,10 @@ impl CUDABox {
@@ -330,6 +334,10 @@ impl CUDABox {
_phantom: PhantomData,
_phantom: PhantomData,
}
}
}
}
 
 
/// Returns the size of this CUDA allocation in bytes.
pub fn get_bytes(&self) -> usize {
    self.size
}
}
}
#[cfg(feature = "cuda")]
#[cfg(feature = "cuda")]
@@ -354,3 +362,348 @@ macro_rules! runner {
@@ -354,3 +362,348 @@ macro_rules! runner {
<concat_idents!(HerculesRunner_, $x)>::new()
<concat_idents!(HerculesRunner_, $x)>::new()
};
};
}
}
 
 
/*
 
* A HerculesBox holds memory that can be on any device and provides a common interface to moving
 
* data where it is needed.
 
*
 
* It can hold CPU and device allocations to basically point at the memory it represents. It can
 
* also hold Hercules references either to those allocations it holds or to other allocations not
 
* held by this Box (in which case the appropriate allocation field should be None).
 
*
 
* The data held at all of its non-None allocations and references is maintained so that it is the
 
* same, and so methods will attempt to use the reference or allocation that is most convenient.
 
*
 
* HerculesImmBox holds references to immutable memory only. All operations on these are through
 
* immutable references, though internally it uses OnceLocks to protect its resources since the Box
 
* may be used in multiple parallel threads if it is used in parallel Hercules code invocation.
 
* We use OnceLocks since the data is immutable and so once it has been placed on a device movement
 
* is not necessary.
 
*
 
* We maintain the invariant that at least one of the device references is always set, their
 
* associated allocations may or may not be set, as those may not be needed if the allocation is
 
* held elsewhere.
 
*
 
* HerculesMutBox holds memory on some device and can produce mutable references to that data on
* any device. All these operations are through mutable references since this ensures exclusive
 
* access to the Box and therefore to the underlying device memory. Because of the exclusive access
 
* locks are not needed.
 
*
 
* Note that a HerculesMutBox may hold multiple allocations at once, but it tracks the "definitive"
 
* copy to be the one borrowed mutably most recently (since it may have been updated). The extra
 
* allocations are kept around to avoid reallocation if memory is moved back to the device.
 
*/
 
/// An immutable box of data that lazily materializes a copy of the data on
/// each device as it is requested.
///
/// Invariant (see module comment above): at least one of `cpu_ref`/`cuda_ref`
/// is always set. The `*_alloc` fields are only set when this box itself owns
/// the backing allocation; they stay `None`/unset when the memory is held
/// elsewhere. `OnceLock` protects each resource so the box can be shared by
/// parallel Hercules invocations.
pub struct HerculesImmBox<'a, T> {
    // CPU allocation owned by this box, set only when a host copy had to be
    // created here (in `as_cpu_ref`).
    #[allow(dead_code)]
    cpu_alloc: OnceLock<Vec<T>>,
    // CUDA allocation owned by this box, set only when a device copy had to
    // be created here (in `as_cuda_ref`).
    #[cfg(feature = "cuda")]
    cuda_alloc: OnceLock<CUDABox>,

    // Reference to the CPU copy of the data (may borrow memory held elsewhere).
    cpu_ref: OnceLock<HerculesCPURef<'a>>,
    // Reference to the CUDA copy of the data (may borrow memory held elsewhere).
    #[cfg(feature = "cuda")]
    cuda_ref: OnceLock<HerculesCUDARef<'a>>,
}
 
 
/// Wraps a borrowed CPU slice without copying; the CPU reference is set
/// eagerly and any device copy is made lazily on demand.
impl<'a, T> From<&'a [T]> for HerculesImmBox<'a, T> {
    fn from(slice: &'a [T]) -> Self {
        // `set` on a freshly created OnceLock cannot fail.
        let cpu_ref = OnceLock::new();
        let _ = cpu_ref.set(HerculesCPURef::from_slice(slice));

        Self {
            cpu_alloc: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_alloc: OnceLock::new(),

            cpu_ref,
            #[cfg(feature = "cuda")]
            cuda_ref: OnceLock::new(),
        }
    }
}
 
 
impl<'a, T> From<HerculesCPURef<'a>> for HerculesImmBox<'a, T> {
 
fn from(value: HerculesCPURef<'a>) -> Self {
 
HerculesImmBox {
 
cpu_alloc: OnceLock::new(),
 
#[cfg(feature = "cuda")]
 
cuda_alloc: OnceLock::new(),
 
 
cpu_ref: OnceLock::from(value),
 
#[cfg(feature = "cuda")]
 
cuda_ref: OnceLock::new(),
 
}
 
}
 
}
 
 
// If we are building from a mutable reference, we demote that to a non-mutable reference since we
 
// don't hold mutable references.
 
impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesImmBox<'a, T> {
 
fn from(value: HerculesCPURefMut<'a>) -> Self {
 
HerculesImmBox {
 
cpu_alloc: OnceLock::new(),
 
#[cfg(feature = "cuda")]
 
cuda_alloc: OnceLock::new(),
 
 
cpu_ref: OnceLock::from(value.as_ref()),
 
#[cfg(feature = "cuda")]
 
cuda_ref: OnceLock::new(),
 
}
 
}
 
}
 
 
/// Wraps an existing device reference; a CPU copy is only made on demand.
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARef<'a>> for HerculesImmBox<'a, T> {
    fn from(value: HerculesCUDARef<'a>) -> Self {
        // `set` on a freshly created OnceLock cannot fail.
        let cuda_ref = OnceLock::new();
        let _ = cuda_ref.set(value);

        Self {
            cpu_alloc: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_alloc: OnceLock::new(),

            cpu_ref: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_ref,
        }
    }
}
 
 
// A mutable device reference is demoted to an immutable one, since
// HerculesImmBox never hands out mutable access.
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesImmBox<'a, T> {
    fn from(value: HerculesCUDARefMut<'a>) -> Self {
        // Demote and reuse the immutable-reference constructor.
        Self::from(value.as_ref())
    }
}
 
 
impl<'a, T> HerculesImmBox<'a, T>
where
    T: Default + Clone
{
    /// Borrow the data as a CPU slice, transferring it from the device first
    /// if no CPU copy exists yet.
    pub fn as_slice(&'a self) -> &'a [T] {
        self.as_cpu_ref().as_slice()
    }

    /// Clone the data into a freshly allocated `Vec`, transferring it from
    /// the device first if no CPU copy exists yet.
    pub fn to_vec(&'a self) -> Vec<T> {
        Vec::from(self.as_cpu_ref().as_slice())
    }

    /// Get a CPU reference to the data, lazily copying it from the CUDA
    /// device on first use if this box only holds a device reference.
    ///
    /// Panics if the box holds no reference at all (which would violate the
    /// invariant that at least one device reference is always set).
    pub fn as_cpu_ref(&'a self) -> HerculesCPURef<'a> {
        if let Some(cpu_ref) = self.cpu_ref.get() {
            cpu_ref.clone()
        } else {
            #[cfg(feature = "cuda")]
            if let Some(cuda_ref) = self.cuda_ref.get() {
                // get_or_init ensures only one thread performs the
                // device-to-host transfer even under concurrent calls.
                return
                self.cpu_ref.get_or_init(|| {
                    // Element count derived from the device allocation's byte
                    // size; assumes the size divides evenly by size_of::<T>().
                    let elements = unsafe { cuda_ref.__size() / size_of::<T>() };

                    let mut alloc = Vec::new();
                    alloc.resize_with(elements, Default::default);
                    let _ = cuda_ref.clone().to_cpu_ref(&mut alloc);

                    // Store the host allocation in the box so the returned
                    // reference can borrow from it for lifetime 'a.
                    self.cpu_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cpu_alloc was set unexpectedly");
                    let alloc = self.cpu_alloc.get().unwrap();
                    HerculesCPURef::from_slice(alloc)
                }).clone();
            }

            panic!("HerculesImmBox has no reference to data")
        }
    }

    /// Get a CUDA reference to the data, lazily copying it to the device on
    /// first use if this box only holds a CPU reference.
    ///
    /// Panics if the box holds no reference at all.
    #[cfg(feature = "cuda")]
    pub fn as_cuda_ref(&'a self) -> HerculesCUDARef<'a> {
        if let Some(cuda_ref) = self.cuda_ref.get() {
            cuda_ref.clone()
        } else {
            if let Some(cpu_ref) = self.cpu_ref.get() {
                // get_or_init ensures only one thread performs the
                // host-to-device transfer even under concurrent calls.
                return self.cuda_ref.get_or_init(|| {
                    // Copy data to CUDA device
                    let alloc = CUDABox::from_cpu_ref(cpu_ref.clone());
                    self.cuda_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cuda_alloc was set unexpectedly");

                    // Borrow the device reference from the allocation now
                    // owned by this box.
                    self.cuda_alloc.get().unwrap().get_ref()
                }).clone();
            }

            panic!("HerculesImmBox has no reference to data")
        }
    }
}
 
 
/// Which device currently holds the definitive (most recently mutably
/// borrowed) copy of a `HerculesMutBox`'s data.
enum HerculesMutBoxLocation {
    CPU,
    #[cfg(feature = "cuda")]
    CUDA,
}
 
 
/// A mutable box of data that can live on the CPU or a CUDA device and hand
/// out mutable references on either.
///
/// All access is through `&mut self`, so no locking is needed. Stale
/// allocations are kept around so moving data back to a device can reuse
/// them instead of reallocating (see module comment above).
pub struct HerculesMutBox<T> {
    // Identifies the allocation currently holding the definitive copy.
    loc: HerculesMutBoxLocation,

    // Host allocation, if one has been created.
    cpu_alloc: Option<Vec<T>>,
    // Device allocation, if one has been created.
    #[cfg(feature = "cuda")]
    cuda_alloc: Option<CUDABox>,
}
 
 
/// Copies the slice's contents into an owned CPU allocation; the original
/// slice is only read from.
impl<T: Clone> From<&mut [T]> for HerculesMutBox<T> {
    fn from(data: &mut [T]) -> Self {
        let owned = data.to_vec();
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc: Some(owned),
            #[cfg(feature = "cuda")]
            cuda_alloc: None,
        }
    }
}
 
 
/// Copies the referenced CPU data into an owned CPU allocation.
impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<T> {
    fn from(value: HerculesCPURef<'a>) -> Self {
        let owned = value.as_slice().to_vec();
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc: Some(owned),
            #[cfg(feature = "cuda")]
            cuda_alloc: None,
        }
    }
}
 
 
/// Copies the referenced CPU data into an owned CPU allocation; the mutable
/// reference is only read from.
impl<'a, T: Clone> From<HerculesCPURefMut<'a>> for HerculesMutBox<T> {
    fn from(value: HerculesCPURefMut<'a>) -> Self {
        let owned = value.as_slice().to_vec();
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc: Some(owned),
            #[cfg(feature = "cuda")]
            cuda_alloc: None,
        }
    }
}
 
 
/// Copies the referenced device data into an owned CUDA allocation.
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<T> {
    fn from(value: HerculesCUDARef<'a>) -> Self {
        let device_copy = CUDABox::from_cuda_ref(value);
        Self {
            loc: HerculesMutBoxLocation::CUDA,
            cpu_alloc: None,
            #[cfg(feature = "cuda")]
            cuda_alloc: Some(device_copy),
        }
    }
}
 
 
/// Copies the referenced device data into an owned CUDA allocation; the
/// mutable reference is only read from.
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<T> {
    fn from(value: HerculesCUDARefMut<'a>) -> Self {
        let device_copy = CUDABox::from_cuda_ref(value.as_ref());
        Self {
            loc: HerculesMutBoxLocation::CUDA,
            cpu_alloc: None,
            #[cfg(feature = "cuda")]
            cuda_alloc: Some(device_copy),
        }
    }
}
 
 
impl<T> HerculesMutBox<T>
where
    T: Default
{
    /// Mutably borrow the data as a CPU slice, transferring it from the
    /// device first if the definitive copy currently lives on the device.
    pub fn as_slice(&mut self) -> &mut [T] {
        self.as_cpu_ref().as_slice()
    }

    /// Consume the box and return the data as an owned `Vec`.
    pub fn to_vec(mut self) -> Vec<T> {
        // Bring to CPU (if needed)
        let _ = self.as_cpu_ref();
        // After as_cpu_ref, loc == CPU and cpu_alloc is guaranteed Some.
        self.cpu_alloc.unwrap()
    }

    /// Get a mutable CPU reference to the definitive copy of the data,
    /// transferring it from the CUDA device if necessary. Afterwards the CPU
    /// allocation is the definitive copy.
    pub fn as_cpu_ref<'a>(&'a mut self) -> HerculesCPURefMut<'a> {
        match self.loc {
            HerculesMutBoxLocation::CPU => {
                HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap())
            }
            #[cfg(feature = "cuda")]
            HerculesMutBoxLocation::CUDA => {
                let cuda_alloc = self.cuda_alloc.as_ref().unwrap();
                // Element count derived from the device allocation's byte
                // size; assumes it divides evenly by size_of::<T>().
                let elements = cuda_alloc.get_bytes() / size_of::<T>();

                // Allocate host memory (if needed); an existing host
                // allocation is reused only when its length matches.
                if self.cpu_alloc.is_none() || self.cpu_alloc.as_ref().unwrap().len() != elements {
                    let mut alloc = Vec::new();
                    alloc.resize_with(elements, Default::default);
                    self.cpu_alloc = Some(alloc);
                }

                // Transfer data from CUDA device
                let cpu_alloc = self.cpu_alloc.as_mut().unwrap();
                let _ = cuda_alloc.get_ref().to_cpu_ref(cpu_alloc);

                // The CPU now holds the definitive copy; the device
                // allocation is kept for potential reuse.
                self.loc = HerculesMutBoxLocation::CPU;
                HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap())
            }
        }
    }

    /// Get a mutable CUDA reference to the definitive copy of the data,
    /// transferring it to the device if necessary. Afterwards the device
    /// allocation is the definitive copy.
    #[cfg(feature = "cuda")]
    pub fn as_cuda_ref<'a>(&'a mut self) -> HerculesCUDARefMut<'a> {
        match self.loc {
            HerculesMutBoxLocation::CPU => {
                // TODO: CUDABox does not provide an interface for copying data to it, so currently
                // we just reallocate it
                let cpu_ref = HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap());
                let cuda_alloc = CUDABox::from_cpu_ref(cpu_ref);

                self.cuda_alloc = Some(cuda_alloc);
                self.loc = HerculesMutBoxLocation::CUDA;
                self.cuda_alloc.as_mut().unwrap().get_ref_mut()
            }
            HerculesMutBoxLocation::CUDA => {
                self.cuda_alloc.as_mut().unwrap().get_ref_mut()
            }
        }
    }
}
 
 
/// Uniform interface for borrowing the contents of a `HerculesImmBox` as a
/// particular immutable device reference type `T`.
pub trait HerculesImmBoxTo<'a, T> {
    fn to(&'a self) -> T;
}
 
 
impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
 
where T: Default + Clone
 
{
 
fn to(&'a self) -> HerculesCPURef<'a> {
 
self.as_cpu_ref()
 
}
 
}
 
 
/// Borrow the box's contents as a CUDA reference, copying to the device
/// if necessary.
#[cfg(feature = "cuda")]
impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
where
    T: Default + Clone,
{
    fn to(&'a self) -> HerculesCUDARef<'a> {
        HerculesImmBox::as_cuda_ref(self)
    }
}
 
 
/// Uniform interface for mutably borrowing the contents of a
/// `HerculesMutBox` as a particular mutable device reference type `T`.
pub trait HerculesMutBoxTo<'a, T> {
    fn to(&'a mut self) -> T;
}
 
 
impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<T>
 
where T: Default + Clone
 
{
 
fn to(&'a mut self) -> HerculesCPURefMut<'a> {
 
self.as_cpu_ref()
 
}
 
}
 
 
/// Mutably borrow the box's contents on the CUDA device, moving the data to
/// the device if necessary.
#[cfg(feature = "cuda")]
impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<T>
where
    T: Default + Clone,
{
    fn to(&'a mut self) -> HerculesCUDARefMut<'a> {
        HerculesMutBox::as_cuda_ref(self)
    }
}
Loading