Skip to content
Snippets Groups Projects

Safe rust interface

Merged Aaron Councilman requested to merge safe-rust-interface into main
All threads resolved!
Files
3
+ 429
0
#![feature(once_cell_try)]
use std::alloc::{alloc, dealloc, Layout};
use std::marker::PhantomData;
use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
use std::slice::{from_raw_parts, from_raw_parts_mut};
use std::sync::OnceLock;
/*
* Define supporting types, functions, and macros for Hercules RT functions. For
* a more in-depth discussion of the design of these utilities, see hercules_cg/
@@ -278,6 +282,14 @@ impl<'a> HerculesCUDARefMut<'a> {
}
}
/// Produces a second `HerculesCUDARefMut` to the same buffer (same pointer
/// and size). Taking `&'a mut self` keeps the original mutably borrowed for
/// the duplicate's entire lifetime, so exclusive access to the underlying
/// memory is preserved even though two handles exist.
pub fn dup(&'a mut self) -> Self {
    HerculesCUDARefMut {
        ptr: self.ptr,
        size: self.size,
        _phantom: PhantomData,
    }
}
/// Exposes the raw pointer to the underlying buffer.
///
/// # Safety
/// The caller must not use the returned pointer beyond the lifetime of this
/// reference, and must not create aliasing accesses that violate the
/// exclusivity this mutable reference guarantees.
pub unsafe fn __ptr(&self) -> *mut u8 {
    self.ptr.as_ptr()
}
@@ -330,6 +342,10 @@ impl CUDABox {
_phantom: PhantomData,
}
}
/// Returns the size of this allocation in bytes.
pub fn get_bytes(&self) -> usize {
    self.size
}
}
#[cfg(feature = "cuda")]
@@ -354,3 +370,416 @@ macro_rules! runner {
<concat_idents!(HerculesRunner_, $x)>::new()
};
}
/*
* A HerculesBox holds memory that can be on any device and provides a common interface to moving
* data where it is needed.
*
* It can hold CPU and device allocations to basically point at the memory it represents. It can
* also hold Hercules references either to those allocations it holds or to other allocations not
* held by this Box (in which case the appropriate allocation field should be None).
*
* The data held at all of its non-None allocations and references is maintained so that it is the
* same, and so methods will attempt to use the reference or allocation that is most convenient.
*
* HerculesImmBox holds references to immutable memory only. All operations on these are through
* immutable references, though internally it uses OnceLocks to protect its resources since the Box
* may be used in multiple parallel threads if it is used in parallel Hercules code invocation.
* We use OnceLocks since the data is immutable and so once it has been placed on a device movement
* is not necessary.
*
* We maintain the invariant that at least one of the device references is always set, their
* associated allocations may or may not be set, as those may not be needed if the allocation is
* held elsewhere.
*
* HerculesMutBox holds memory on some device and can produce mutable references to that data
* on any device. All these operations are through mutable references since this ensures exclusive
* access to the Box and therefore to the underlying device memory. Because of the exclusive access
* locks are not needed.
*
* Note that a HerculesMutBox may hold multiple allocations at once, but it tracks the "definitive"
* copy to be the one borrowed mutably most recently (since it may have been updated). The extra
* allocations are kept around to avoid reallocation if memory is moved back to the device.
*/
pub struct HerculesImmBox<'a, T> {
    // Owned CPU copy, set only when the data had to be copied to the host;
    // `cpu_ref` then borrows from it (hence the field is never read directly).
    #[allow(dead_code)]
    cpu_alloc: OnceLock<Vec<T>>,
    // Owned CUDA copy, set only when the data had to be copied to the device.
    #[cfg(feature = "cuda")]
    cuda_alloc: OnceLock<CUDABox>,
    // Reference to the CPU copy of the data, if one exists.
    cpu_ref: OnceLock<HerculesCPURef<'a>>,
    // Reference to the CUDA copy of the data, if one exists.
    #[cfg(feature = "cuda")]
    cuda_ref: OnceLock<HerculesCUDARef<'a>>,
}
impl<'a, T> From<&'a [T]> for HerculesImmBox<'a, T> {
    /// Wraps a borrowed CPU slice. Only the CPU reference starts populated;
    /// device copies are created lazily when first requested.
    fn from(value: &'a [T]) -> Self {
        let cpu_ref = OnceLock::from(HerculesCPURef::from_slice(value));
        Self {
            cpu_alloc: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_alloc: OnceLock::new(),
            cpu_ref,
            #[cfg(feature = "cuda")]
            cuda_ref: OnceLock::new(),
        }
    }
}
impl<'a, T> From<HerculesCPURef<'a>> for HerculesImmBox<'a, T> {
fn from(value: HerculesCPURef<'a>) -> Self {
HerculesImmBox {
cpu_alloc: OnceLock::new(),
#[cfg(feature = "cuda")]
cuda_alloc: OnceLock::new(),
cpu_ref: OnceLock::from(value),
#[cfg(feature = "cuda")]
cuda_ref: OnceLock::new(),
}
}
}
// If we are building from a mutable reference, we demote that to a non-mutable reference since we
// don't hold mutable references.
impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesImmBox<'a, T> {
fn from(value: HerculesCPURefMut<'a>) -> Self {
HerculesImmBox {
cpu_alloc: OnceLock::new(),
#[cfg(feature = "cuda")]
cuda_alloc: OnceLock::new(),
cpu_ref: OnceLock::from(value.as_ref()),
#[cfg(feature = "cuda")]
cuda_ref: OnceLock::new(),
}
}
}
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARef<'a>> for HerculesImmBox<'a, T> {
    /// Adopts an existing CUDA reference; a CPU copy is only made on demand.
    fn from(value: HerculesCUDARef<'a>) -> Self {
        let cuda_ref = OnceLock::from(value);
        Self {
            cpu_alloc: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_alloc: OnceLock::new(),
            cpu_ref: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_ref,
        }
    }
}
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesImmBox<'a, T> {
    /// Demotes a mutable CUDA reference to an immutable one, as this box only
    /// holds immutable references.
    fn from(value: HerculesCUDARefMut<'a>) -> Self {
        let cuda_ref = OnceLock::from(value.as_ref());
        Self {
            cpu_alloc: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_alloc: OnceLock::new(),
            cpu_ref: OnceLock::new(),
            #[cfg(feature = "cuda")]
            cuda_ref,
        }
    }
}
impl<'a, T> HerculesImmBox<'a, T>
where
    T: Default + Clone
{
    /// Returns the data as a CPU slice, copying it from the device first if
    /// no CPU copy exists yet.
    // NOTE(review): the returned lifetime is `'a` even though the backing
    // storage may be the box-owned `cpu_alloc` — confirm this is sound given
    // how callers hold the box.
    pub fn as_slice<'b>(&'b self) -> &'a [T] {
        self.as_cpu_ref().as_slice()
    }

    /// Clones the data into a freshly allocated `Vec`.
    pub fn to_vec<'b>(&'b self) -> Vec<T> {
        Vec::from(self.as_cpu_ref().as_slice())
    }

    /// Returns a CPU reference to the data. If only a CUDA reference exists,
    /// the data is copied to a host allocation exactly once (guarded by the
    /// `OnceLock`, so concurrent callers race safely).
    ///
    /// Panics if the box holds neither a CPU nor a CUDA reference.
    pub fn as_cpu_ref<'b>(&'b self) -> HerculesCPURef<'a> {
        if let Some(cpu_ref) = self.cpu_ref.get() {
            cpu_ref.clone()
        } else {
            #[cfg(feature = "cuda")]
            if let Some(cuda_ref) = self.cuda_ref.get() {
                return
                self.cpu_ref.get_or_init(|| {
                    // Element count implied by the device buffer's byte size.
                    let elements = unsafe { cuda_ref.__size() / size_of::<T>() };

                    let mut alloc = Vec::new();
                    alloc.resize_with(elements, Default::default);
                    // Copy device data into the freshly default-initialized host buffer.
                    let _ = cuda_ref.clone().to_cpu_ref(&mut alloc);

                    // Store the allocation in the box so the returned reference
                    // can borrow from stable storage.
                    self.cpu_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cpu_alloc was set unexpectedly");
                    let alloc = self.cpu_alloc.get().unwrap();

                    HerculesCPURef::from_slice(alloc)
                }).clone();
            }

            panic!("HerculesImmBox has no reference to data")
        }
    }

    /// Returns a CUDA reference to the data. If only a CPU reference exists,
    /// the data is copied to a device allocation exactly once.
    ///
    /// Panics if the box holds neither a CPU nor a CUDA reference.
    #[cfg(feature = "cuda")]
    pub fn as_cuda_ref<'b>(&'b self) -> HerculesCUDARef<'a> {
        if let Some(cuda_ref) = self.cuda_ref.get() {
            cuda_ref.clone()
        } else {
            if let Some(cpu_ref) = self.cpu_ref.get() {
                return self.cuda_ref.get_or_init(|| {
                    // Copy data to CUDA device
                    let alloc = CUDABox::from_cpu_ref(cpu_ref.clone());
                    self.cuda_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cuda_alloc was set unexpectedly");
                    self.cuda_alloc.get().unwrap().get_ref()
                }).clone();
            }

            panic!("HerculesImmBox has no reference to data")
        }
    }
}
// Which device currently holds the definitive (most recently borrowed
// mutably) copy of a HerculesMutBox's data.
enum HerculesMutBoxLocation {
    CPU,
    #[cfg(feature = "cuda")]
    CUDA,
}
// A possibly-absent piece of memory, held either as a borrowed reference (R)
// or as an owned allocation (A).
enum Allocation<R, A> {
    None,
    Reference(R),
    Allocation(A),
}

impl<R, A> Allocation<R, A> {
    // Moves the value out, leaving `Allocation::None` in its place.
    fn take(&mut self) -> Allocation<R, A> {
        std::mem::replace(self, Allocation::None)
    }
}
// A box for mutable data. `loc` names the device holding the definitive copy;
// any other allocation present is stale but kept to avoid reallocating when
// the data moves back.
pub struct HerculesMutBox<'a, T> {
    loc: HerculesMutBoxLocation,
    // CPU copy: a borrowed mutable slice or an owned Vec.
    cpu_alloc: Allocation<&'a mut [T], Vec<T>>,
    // CUDA copy: a borrowed mutable device reference or an owned device buffer.
    #[cfg(feature = "cuda")]
    cuda_alloc: Allocation<HerculesCUDARefMut<'a>, CUDABox>,
}
impl<'a, T> From<&'a mut [T]> for HerculesMutBox<'a, T> {
    /// Borrows a mutable CPU slice as the definitive copy.
    fn from(value: &'a mut [T]) -> Self {
        let cpu_alloc = Allocation::Reference(value);
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc,
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::None,
        }
    }
}
impl<'a, T> From<Vec<T>> for HerculesMutBox<'a, T> {
    /// Takes ownership of a CPU vector as the definitive copy.
    fn from(value: Vec<T>) -> Self {
        let cpu_alloc = Allocation::Allocation(value);
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc,
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::None,
        }
    }
}
impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<'a, T> {
    /// Clones the referenced CPU data into an owned allocation, since an
    /// immutable reference cannot back a mutable box.
    fn from(value: HerculesCPURef<'a>) -> Self {
        let owned: Vec<T> = value.as_slice().to_vec();
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc: Allocation::Allocation(owned),
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::None,
        }
    }
}
impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> {
    /// Borrows the underlying mutable CPU slice as the definitive copy.
    fn from(value: HerculesCPURefMut<'a>) -> Self {
        let cpu_alloc = Allocation::Reference(value.as_slice());
        Self {
            loc: HerculesMutBoxLocation::CPU,
            cpu_alloc,
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::None,
        }
    }
}
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<'a, T> {
    /// Copies the referenced device data into an owned device allocation,
    /// since an immutable reference cannot back a mutable box.
    fn from(value: HerculesCUDARef<'a>) -> Self {
        let cuda_box = CUDABox::from_cuda_ref(value);
        Self {
            loc: HerculesMutBoxLocation::CUDA,
            cpu_alloc: Allocation::None,
            #[cfg(feature = "cuda")]
            cuda_alloc: Allocation::Allocation(cuda_box),
        }
    }
}
#[cfg(feature = "cuda")]
impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
    /// Borrows a mutable device reference as the definitive copy.
    fn from(value: HerculesCUDARefMut<'a>) -> Self {
        let cuda_alloc = Allocation::Reference(value);
        Self {
            loc: HerculesMutBoxLocation::CUDA,
            cpu_alloc: Allocation::None,
            #[cfg(feature = "cuda")]
            cuda_alloc,
        }
    }
}
impl<'a, T> HerculesMutBox<'a, T>
where
    T: Default + Clone
{
    /// Returns a mutable CPU slice of the data, copying it back from the
    /// device first if the definitive copy currently lives there.
    pub fn as_slice(&'a mut self) -> &'a mut [T] {
        self.as_cpu_ref().as_slice()
    }

    /// Produces a mutable CPU reference to the data, transferring it from the
    /// device if needed and marking the CPU copy as definitive.
    ///
    /// Panics if no allocation exists for the current location.
    pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> {
        match self.loc {
            HerculesMutBoxLocation::CPU => {
                match self.cpu_alloc {
                    Allocation::None => panic!("No CPU reference"),
                    Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val),
                    Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val),
                }
            }
            #[cfg(feature = "cuda")]
            HerculesMutBoxLocation::CUDA => {
                // The definitive copy is on the device; take an immutable view
                // of it to copy from.
                let cuda_ref : HerculesCUDARef<'a> =
                    match self.cuda_alloc {
                        Allocation::None => panic!("No GPU reference"),
                        Allocation::Reference(ref mut val) => val.dup().as_ref(),
                        Allocation::Allocation(ref val) => val.get_ref(),
                    };
                // Element count implied by the device buffer's byte size.
                let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
                // Allocate host memory (if needed): reuse the existing CPU
                // reference/allocation only if its length matches exactly.
                let cpu_alloc : Allocation<&'a mut [T], Vec<T>> =
                    match self.cpu_alloc.take() {
                        Allocation::Reference(val) if val.len() == elements => Allocation::Reference(val),
                        Allocation::Allocation(val) if val.len() == elements => Allocation::Allocation(val),
                        _ => {
                            let mut alloc = Vec::new();
                            alloc.resize_with(elements, Default::default);
                            Allocation::Allocation(alloc)
                        }
                    };
                // Store first so the returned reference borrows from stable
                // storage inside the box.
                self.cpu_alloc = cpu_alloc;
                let cpu_ref : &'a mut [T] =
                    match &mut self.cpu_alloc {
                        Allocation::None => panic!(),
                        Allocation::Reference(val) => val,
                        Allocation::Allocation(val) => val,
                    };
                // Transfer data from CUDA device
                let cpu_ref = cuda_ref.to_cpu_ref(cpu_ref);
                // The CPU copy is now the definitive one.
                self.loc = HerculesMutBoxLocation::CPU;
                cpu_ref
            }
        }
    }

    /// Produces a mutable CUDA reference to the data, transferring it to the
    /// device if needed and marking the CUDA copy as definitive.
    ///
    /// Panics if no allocation exists for the current location.
    #[cfg(feature = "cuda")]
    pub fn as_cuda_ref(&'a mut self) -> HerculesCUDARefMut<'a> {
        match self.loc {
            HerculesMutBoxLocation::CPU => {
                let cpu_ref : &'a [T] =
                    match self.cpu_alloc {
                        Allocation::None => panic!("No CPU reference"),
                        Allocation::Reference(ref val) => val,
                        Allocation::Allocation(ref val) => val,
                    };
                let size = cpu_ref.len() * size_of::<T>();
                // Reuse an existing device allocation of the right size;
                // otherwise create a fresh one (which also performs the copy,
                // tracked by `copied` so we don't transfer twice).
                let (cuda_alloc, copied) =
                    match self.cuda_alloc.take() {
                        Allocation::Reference(val) if unsafe { val.__size() == size } => (Allocation::Reference(val), false),
                        Allocation::Allocation(val) if val.get_bytes() == size => (Allocation::Allocation(val), false),
                        _ => {
                            let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref));
                            (Allocation::Allocation(alloc), true)
                        }
                    };
                self.cuda_alloc = cuda_alloc;
                let cuda_ref =
                    match self.cuda_alloc {
                        Allocation::None => panic!(),
                        Allocation::Reference(ref mut val) => val.dup(),
                        Allocation::Allocation(ref mut val) => val.get_ref_mut(),
                    };
                // Reused allocations still hold stale data; copy host -> device.
                if !copied {
                    unsafe {
                        __copy_cpu_to_cuda(cuda_ref.__ptr(), cpu_ref.as_ptr() as *mut u8, size);
                    }
                }
                // The device copy is now the definitive one.
                self.loc = HerculesMutBoxLocation::CUDA;
                cuda_ref
            }
            HerculesMutBoxLocation::CUDA => {
                match self.cuda_alloc {
                    Allocation::None => panic!("No GPU reference"),
                    Allocation::Reference(ref mut val) => val.dup(),
                    Allocation::Allocation(ref mut val) => val.get_ref_mut(),
                }
            }
        }
    }
}
// Conversion trait for obtaining an immutable reference of the requested
// device type (`T`) from a HerculesImmBox.
pub trait HerculesImmBoxTo<'a, T> {
    fn to(&self) -> T;
}

impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
where T: Default + Clone
{
    // Delegates to as_cpu_ref, copying from the device on first use if needed.
    fn to(&self) -> HerculesCPURef<'a> {
        self.as_cpu_ref()
    }
}

#[cfg(feature = "cuda")]
impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
where T: Default + Clone
{
    // Delegates to as_cuda_ref, copying to the device on first use if needed.
    fn to(&self) -> HerculesCUDARef<'a> {
        self.as_cuda_ref()
    }
}
// Conversion trait for obtaining a mutable reference of the requested device
// type (`T`) from a HerculesMutBox; takes `&mut self` to preserve exclusivity.
pub trait HerculesMutBoxTo<'a, T> {
    fn to(&'a mut self) -> T;
}

impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
where T: Default + Clone
{
    // Delegates to as_cpu_ref, moving data back from the device if needed.
    fn to(&'a mut self) -> HerculesCPURefMut<'a> {
        self.as_cpu_ref()
    }
}

#[cfg(feature = "cuda")]
impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
where T: Default + Clone
{
    // Delegates to as_cuda_ref, moving data to the device if needed.
    fn to(&'a mut self) -> HerculesCUDARefMut<'a> {
        self.as_cuda_ref()
    }
}
Loading