From 70b71cfed7283dc00c4d2ace97feaa7b1de98eac Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Wed, 5 Feb 2025 09:42:45 -0600 Subject: [PATCH 01/11] First attempt at new HerculesBox --- hercules_rt/src/lib.rs | 306 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index 12b64fa3..e99f12e3 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -330,6 +330,10 @@ impl CUDABox { _phantom: PhantomData, } } + + pub fn get_bytes(&self) -> usize { + self.size + } } #[cfg(feature = "cuda")] @@ -367,3 +371,305 @@ impl __RawPtrSendSync { unsafe impl Send for __RawPtrSendSync {} unsafe impl Sync for __RawPtrSendSync {} + +/* + * A HerculesBox holds memory that can be on any device and provides a common interface to moving + * data where it is needed. + * + * It can hold CPU and device allocations to basically point at the memory it represents. It can + * also hold Hercules references either to those allocations it holds or to other allocations not + * held by this Box (in which case the appropriate allocation field should be None). + * + * The data held at all of its non-None allocations and references is maintained so that it is the + * same, and so methods will attempt to use the reference or allocation that is most convenient. + * + * When mutable references are required of this Box, it will allocate space on the required device + * and eliminate any references or allocations it had on other devices since it must assume its + * data is modified through the mutable reference. 
+ */ +pub struct HerculesBox<'a, T> { + cpu_alloc: Option<Vec<T>>, + cpu_ref: Option<HerculesCPURef<'a>>, + + #[cfg(feature = "cuda")] + cuda_alloc: Option<CudaBox>, + #[cfg(feature = "cuda")] + cuda_ref: Option<HerculesCUDARef<'a>>, +} + +impl<'a, T> From<&'a [T]> for HerculesBox<'a, T> { + fn from(value: &'a [T]) -> Self { + HerculesBox { + cpu_alloc: None, + cpu_ref: Some(HerculesCPURef::from_slice(value)), + + #[cfg(feature = "cuda")] + cuda_alloc: None, + #[cfg(feature = "cuda")] + cuda_ref: None, + } + } +} + +impl<'a, T> From<HerculesCPURef<'a>> for HerculesBox<'a, T> { + fn from(value: HerculesCPURef<'a>) -> Self { + HerculesBox { + cpu_alloc: None, + cpu_ref: Some(value), + + #[cfg(feature = "cuda")] + cuda_alloc: None, + #[cfg(feature = "cuda")] + cuda_ref: None, + } + } +} + +// If we are building from a mutable reference, we demote that to a non-mutable reference since we +// don't hold mutable references. This means if we construct a box and then request a mutable +// reference from it that we will copy the data into a new allocation and then return a reference +// to that. 
+impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesBox<'a, T> { + fn from(value: HerculesCPURefMut<'a>) -> Self { + HerculesBox { + cpu_alloc: None, + cpu_ref: Some(value.as_ref()), + + #[cfg(feature = "cuda")] + cuda_alloc: None, + #[cfg(feature = "cuda")] + cuda_ref: None, + } + } +} + +#[cfg(feature = "cuda")] +impl<'a, T> From<HerculesCUDARef<'a>> for HerculesBox<'a, T> { + fn from(value: HerculesCUDARef<'a>) -> Self { + HerculesBox { + cpu_alloc: None, + cpu_ref: None, + cpu_ref_mut: None, + + #[cfg(feature = "cuda")] + cuda_alloc: None, + #[cfg(feature = "cuda")] + cuda_ref: Some(value), + #[cfg(feature = "cuda")] + cuda_ref_mut: None, + } + } +} + +#[cfg(feature = "cuda")] +impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesBox<'a, T> { + fn from(value: HerculesCUDARefMut<'a>) -> Self { + HerculesBox { + cpu_alloc: None, + cpu_ref: None, + cpu_ref_mut: None, + + #[cfg(feature = "cuda")] + cuda_alloc: None, + #[cfg(feature = "cuda")] + cuda_ref: None, + #[cfg(feature = "cuda")] + cuda_ref_mut: Some(value), + } + } +} + +impl<'a, T> HerculesBox<'a, T> +where + T: Default + Clone +{ + fn as_slice(&'a mut self) -> &'a [T] { + self.as_cpu_ref().as_slice() + } + + fn as_slice_mut(&'a mut self) -> &'a mut [T] { + self.as_cpu_ref_mut().as_slice() + } + + fn as_cpu_ref(&'a mut self) -> HerculesCPURef<'a> { + if self.cpu_ref.is_some() { + self.cpu_ref.clone().unwrap() + } else if self.cpu_alloc.is_some() { + // This could occur if a mutable reference had been created from this box + let cpu_ref = HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice()); + self.cpu_ref = Some(cpu_ref.clone()); + cpu_ref + } else { + #[cfg(feature = "cuda")] + { + let cuda_ref = + if self.cuda_ref.is_some() { + Some(self.cuda_ref.clone().unwrap()) + } else if self.cuda_alloc.is_some() { + Some(self.cuda_alloc.as_ref().unwrap().get_ref()) + } else { + None + }; + + if let Some(cuda_ref) = cuda_ref { + let elements = cuda_ref.__size() / size_of::<T>(); + + // Transfer 
memory back to CPU using our cpu_alloc + let mut alloc = Vec::new(); + alloc.resize_with(elements, Default::default); + self.cpu_alloc = Some(alloc); + + let alloc = self.cpu_alloc.as_mut().unwrap(); + assert!(alloc.len() == elements); + + let cpu_ref = cuda_ref.to_cpu_ref(alloc); + self.cpu_ref = Some(cpu_ref.clone()); + return cpu_ref; + } + } + + panic!("HerculesBox has no reference to data") + } + } + + fn as_cpu_ref_mut(&'a mut self) -> HerculesCPURefMut<'a> { + // If we have a CPU allocation already, we'll use that and we don't need to make any copies + if self.cpu_alloc.is_some() { + // Eliminate any other references/allocations since the data may be modified + self.cpu_ref = None; + #[cfg(feature = "cuda")] + { + // TODO: We don't actually need to de-allocate our other allocations, just mark + // them so we know they don't hold the correct data + self.cuda_alloc = None; + self.cuda_ref = None; + } + + HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap().as_mut()) + } else if self.cpu_ref.is_some() { + // The data is in CPU memory, but we don't have exclusive access to it, so we need to + // copy it + let slice = self.cpu_ref.take().unwrap().as_slice(); + + #[cfg(feature = "cuda")] + { + self.cuda_alloc = None; // TODO + self.cuda_ref = None; + } + + self.cpu_alloc = Some(slice.to_vec()); + + HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap().as_mut()) + } else { + #[cfg(feature = "cuda")] + { + let cuda_ref = + if self.cuda_ref.is_some() { + Some(self.cuda_ref.clone().unwrap()) + } else if self.cuda_alloc.is_some() { + Some(self.cuda_alloc.as_ref().unwrap().get_ref()) + } else { + None + }; + + if let Some(cuda_ref) = cuda_ref { + let elements = cuda_ref.__size() / size_of::<T>(); + + // Transfer memory back to CPU using our cpu_alloc + let mut alloc = Vec::new(); + alloc.resize_with(elements, Default::default); + self.cpu_alloc = Some(alloc); + + let alloc = self.cpu_alloc.as_mut().unwrap(); + assert!(alloc.len() == elements); + + let 
cpu_ref = cuda_ref.to_cpu_ref(alloc); + + // Eliminate other references + self.cpu_ref = None; + self.cuda_alloc = None; // TODO + self.cuda_ref = None; + + return cpu_ref; + } + } + + panic!("HerculesBox has no reference to data") + } + } + + #[cfg(feature = "cuda")] + fn as_cuda_ref(&'a mut self) -> HerculesCUDARef<'a> { + if self.cuda_ref.is_some() { + self.cuda_ref.clone().unwrap() + } else if self.cuda_alloc.is_some() { + // This could occur if a mutable reference had been created from this box + let cuda_ref = self.cuda_alloc.as_ref().unwrap().get_ref(); + self.cuda_ref = Some(cuda_ref.clone()); + cuda_ref + } else { + let cpu_ref = + if self.cpu_ref.is_some() { + Some(cpu_ref.clone().unwrap()) + } else if self.cpu_alloc.is_some() { + Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice())) + } else { + None + }; + + if let Some(cpu_ref) = cpu_ref { + // Copy data to CUDA device + self.cuda_alloc = Some(CUDABox::from_cpu_ref(cpu_ref)); + let alloc = self.cuda_alloc.as_ref().unwrap(); + + let cuda_ref = alloc.get_ref(); + self.cuda_ref = Some(cuda_ref.clone()); + return cuda_ref; + } + + panic!("HerculesBox has no reference to data") + } + } + + #[cfg(feature = "cuda")] + fn as_cuda_ref_mut(&'a mut self) -> HerculesCUDARefMut<'a> { + if self.cuda_alloc.is_some() { + self.cpu_alloc = None; // TODO + self.cpu_ref = None; + self.cuda_ref = None; + + self.cuda_alloc.as_mut().unwrap().get_ref_mut() + } else if self.cuda_ref.is_some() { + // The data is in CUDA memory, but we don't have exclusive access to it, so we need to + // copy it + let cuda_alloc = CUDABox::from_cuda_ref(self.cuda_ref.take().unwrap()); + self.cuda_alloc = Some(cuda_alloc); + + self.cpu_alloc = None; // TODO + self.cpu_ref = None; + + self.cuda_alloc.as_mut().unwrap().get_ref_mut() + } else { + let cpu_ref = + if self.cpu_ref.is_some() { + Some(cpu_ref.clone().unwrap()) + } else if self.cpu_alloc.is_some() { + 
Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice())) + } else { + None + }; + + if let Some(cpu_ref) = cpu_ref { + // Copy data to CUDA device + self.cuda_alloc = Some(CUDABox::from_cpu_ref(cpu_ref)); + + self.cpu_alloc = None; // TODO + self.cpu_ref = None; + + return self.cuda_alloc.as_mut().unwrap().get_ref_mut(); + } + + panic!("HerculesBox has no reference to data") + } + } +} -- GitLab From c076928b463f57acff0efd783d996d1424a08e91 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Wed, 5 Feb 2025 09:48:23 -0600 Subject: [PATCH 02/11] Fix HerculesBox --- hercules_rt/src/lib.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index e99f12e3..10985cc7 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -392,7 +392,7 @@ pub struct HerculesBox<'a, T> { cpu_ref: Option<HerculesCPURef<'a>>, #[cfg(feature = "cuda")] - cuda_alloc: Option<CudaBox>, + cuda_alloc: Option<CUDABox>, #[cfg(feature = "cuda")] cuda_ref: Option<HerculesCUDARef<'a>>, } @@ -449,14 +449,11 @@ impl<'a, T> From<HerculesCUDARef<'a>> for HerculesBox<'a, T> { HerculesBox { cpu_alloc: None, cpu_ref: None, - cpu_ref_mut: None, #[cfg(feature = "cuda")] cuda_alloc: None, #[cfg(feature = "cuda")] cuda_ref: Some(value), - #[cfg(feature = "cuda")] - cuda_ref_mut: None, } } } @@ -467,14 +464,11 @@ impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesBox<'a, T> { HerculesBox { cpu_alloc: None, cpu_ref: None, - cpu_ref_mut: None, #[cfg(feature = "cuda")] cuda_alloc: None, #[cfg(feature = "cuda")] - cuda_ref: None, - #[cfg(feature = "cuda")] - cuda_ref_mut: Some(value), + cuda_ref: Some(value.as_ref()), } } } @@ -610,7 +604,7 @@ where } else { let cpu_ref = if self.cpu_ref.is_some() { - Some(cpu_ref.clone().unwrap()) + Some(self.cpu_ref.clone().unwrap()) } else if self.cpu_alloc.is_some() { 
Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice())) } else { @@ -652,7 +646,7 @@ where } else { let cpu_ref = if self.cpu_ref.is_some() { - Some(cpu_ref.clone().unwrap()) + Some(self.cpu_ref.clone().unwrap()) } else if self.cpu_alloc.is_some() { Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice())) } else { -- GitLab From 4694aae1fdea13f503133e5ebd168b3abb8d3f3a Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Wed, 5 Feb 2025 14:34:05 -0600 Subject: [PATCH 03/11] Fix immutable boxes and make them thread safe --- hercules_rt/src/lib.rs | 325 ++++++++---------------- juno_samples/cava/src/main.rs | 74 ++---- juno_samples/edge_detection/src/main.rs | 90 ++----- 3 files changed, 158 insertions(+), 331 deletions(-) diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index 10985cc7..6d03bd25 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -1,8 +1,12 @@ +#![feature(once_cell_try)] + use std::alloc::{alloc, dealloc, Layout}; use std::marker::PhantomData; use std::ptr::{copy_nonoverlapping, write_bytes, NonNull}; use std::slice::{from_raw_parts, from_raw_parts_mut}; +use std::sync::OnceLock; + /* * Define supporting types, functions, and macros for Hercules RT functions. For * a more in-depth discussion of the design of these utilities, see hercules_cg/ @@ -383,287 +387,178 @@ unsafe impl Sync for __RawPtrSendSync {} * The data held at all of its non-None allocations and references is maintained so that it is the * same, and so methods will attempt to use the reference or allocation that is most convenient. * - * When mutable references are required of this Box, it will allocate space on the required device - * and eliminate any references or allocations it had on other devices since it must assume its - * data is modified through the mutable reference. + * HerculesImmBox hold references to immutable memory only. 
All operations on these is through + * immutable references, though internally it uses OnceLocks to protect its resources since the Box + * may be used in multiple parallel threads if it is used in parallel Hercules code invocation. + * We use OnceLocks since the data is immutable and so once it has been placed on a device movement + * is not necessary. + * + * We maintain the invariant that at least one of the device references is always set, their + * associated allocations may or may not be set, as those may not be needed if the allocation is + * help elsewhere. + * + * HerculesMutBox is TODO. */ -pub struct HerculesBox<'a, T> { - cpu_alloc: Option<Vec<T>>, - cpu_ref: Option<HerculesCPURef<'a>>, - +pub struct HerculesImmBox<'a, T> { + // NOTE: We only need OnceLock if we're allowed to launch multiple Hercules program in + // parallel, if that's not necessary we can probably get away with using OnceCell + #[allow(dead_code)] + cpu_alloc: OnceLock<Vec<T>>, #[cfg(feature = "cuda")] - cuda_alloc: Option<CUDABox>, + cuda_alloc: OnceLock<CUDABox>, + + cpu_ref: OnceLock<HerculesCPURef<'a>>, #[cfg(feature = "cuda")] - cuda_ref: Option<HerculesCUDARef<'a>>, + cuda_ref: OnceLock<HerculesCUDARef<'a>>, } -impl<'a, T> From<&'a [T]> for HerculesBox<'a, T> { +impl<'a, T> From<&'a [T]> for HerculesImmBox<'a, T> { fn from(value: &'a [T]) -> Self { - HerculesBox { - cpu_alloc: None, - cpu_ref: Some(HerculesCPURef::from_slice(value)), - + HerculesImmBox { + cpu_alloc: OnceLock::new(), #[cfg(feature = "cuda")] - cuda_alloc: None, + cuda_alloc: OnceLock::new(), + + cpu_ref: OnceLock::from(HerculesCPURef::from_slice(value)), #[cfg(feature = "cuda")] - cuda_ref: None, + cuda_ref: OnceLock::new(), } } } -impl<'a, T> From<HerculesCPURef<'a>> for HerculesBox<'a, T> { +impl<'a, T> From<HerculesCPURef<'a>> for HerculesImmBox<'a, T> { fn from(value: HerculesCPURef<'a>) -> Self { - HerculesBox { - cpu_alloc: None, - cpu_ref: Some(value), - + HerculesImmBox { + cpu_alloc: OnceLock::new(), 
#[cfg(feature = "cuda")] - cuda_alloc: None, + cuda_alloc: OnceLock::new(), + + cpu_ref: OnceLock::from(value), #[cfg(feature = "cuda")] - cuda_ref: None, + cuda_ref: OnceLock::new(), } } } // If we are building from a mutable reference, we demote that to a non-mutable reference since we -// don't hold mutable references. This means if we construct a box and then request a mutable -// reference from it that we will copy the data into a new allocation and then return a reference -// to that. -impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesBox<'a, T> { +// don't hold mutable references. +impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesImmBox<'a, T> { fn from(value: HerculesCPURefMut<'a>) -> Self { - HerculesBox { - cpu_alloc: None, - cpu_ref: Some(value.as_ref()), - + HerculesImmBox { + cpu_alloc: OnceLock::new(), #[cfg(feature = "cuda")] - cuda_alloc: None, + cuda_alloc: OnceLock::new(), + + cpu_ref: OnceLock::from(value.as_ref()), #[cfg(feature = "cuda")] - cuda_ref: None, + cuda_ref: OnceLock::new(), } } } #[cfg(feature = "cuda")] -impl<'a, T> From<HerculesCUDARef<'a>> for HerculesBox<'a, T> { +impl<'a, T> From<HerculesCUDARef<'a>> for HerculesImmBox<'a, T> { fn from(value: HerculesCUDARef<'a>) -> Self { - HerculesBox { - cpu_alloc: None, - cpu_ref: None, - + HerculesImmBox { + cpu_alloc: OnceLock::new(), #[cfg(feature = "cuda")] - cuda_alloc: None, + cuda_alloc: OnceLock::new(), + + cpu_ref: OnceLock::new(), #[cfg(feature = "cuda")] - cuda_ref: Some(value), + cuda_ref: OnceLock::from(value), } } } #[cfg(feature = "cuda")] -impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesBox<'a, T> { +impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesImmBox<'a, T> { fn from(value: HerculesCUDARefMut<'a>) -> Self { - HerculesBox { - cpu_alloc: None, - cpu_ref: None, - + HerculesImmBox { + cpu_alloc: OnceLock::new(), #[cfg(feature = "cuda")] - cuda_alloc: None, + cuda_alloc: OnceLock::new(), + + cpu_ref: OnceLock::new(), #[cfg(feature = "cuda")] - cuda_ref: 
Some(value.as_ref()), + cuda_ref: OnceLock::from(value.as_ref()), } } } -impl<'a, T> HerculesBox<'a, T> +impl<'a, T> HerculesImmBox<'a, T> where T: Default + Clone { - fn as_slice(&'a mut self) -> &'a [T] { + pub fn as_slice(&'a self) -> &'a [T] { self.as_cpu_ref().as_slice() } - fn as_slice_mut(&'a mut self) -> &'a mut [T] { - self.as_cpu_ref_mut().as_slice() + pub fn to_vec(&'a self) -> Vec<T> { + Vec::from(self.as_cpu_ref().as_slice()) } - fn as_cpu_ref(&'a mut self) -> HerculesCPURef<'a> { - if self.cpu_ref.is_some() { - self.cpu_ref.clone().unwrap() - } else if self.cpu_alloc.is_some() { - // This could occur if a mutable reference had been created from this box - let cpu_ref = HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice()); - self.cpu_ref = Some(cpu_ref.clone()); - cpu_ref + pub fn as_cpu_ref(&'a self) -> HerculesCPURef<'a> { + if let Some(cpu_ref) = self.cpu_ref.get() { + cpu_ref.clone() } else { #[cfg(feature = "cuda")] - { - let cuda_ref = - if self.cuda_ref.is_some() { - Some(self.cuda_ref.clone().unwrap()) - } else if self.cuda_alloc.is_some() { - Some(self.cuda_alloc.as_ref().unwrap().get_ref()) - } else { - None - }; - - if let Some(cuda_ref) = cuda_ref { - let elements = cuda_ref.__size() / size_of::<T>(); - - // Transfer memory back to CPU using our cpu_alloc - let mut alloc = Vec::new(); - alloc.resize_with(elements, Default::default); - self.cpu_alloc = Some(alloc); - - let alloc = self.cpu_alloc.as_mut().unwrap(); - assert!(alloc.len() == elements); - - let cpu_ref = cuda_ref.to_cpu_ref(alloc); - self.cpu_ref = Some(cpu_ref.clone()); - return cpu_ref; - } + if let Some(cuda_ref) = self.cuda_ref.get() { + return + self.cpu_ref.get_or_init(|| { + let elements = unsafe { cuda_ref.__size() / size_of::<T>() }; + + let mut alloc = Vec::new(); + alloc.resize_with(elements, Default::default); + let _ = cuda_ref.clone().to_cpu_ref(&mut alloc); + + self.cpu_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cpu_alloc was set 
unexpectedly"); + let alloc = self.cpu_alloc.get().unwrap(); + HerculesCPURef::from_slice(alloc) + }).clone(); } - panic!("HerculesBox has no reference to data") + panic!("HerculesImmBox has no reference to data") } } - fn as_cpu_ref_mut(&'a mut self) -> HerculesCPURefMut<'a> { - // If we have a CPU allocation already, we'll use that and we don't need to make any copies - if self.cpu_alloc.is_some() { - // Eliminate any other references/allocations since the data may be modified - self.cpu_ref = None; - #[cfg(feature = "cuda")] - { - // TODO: We don't actually need to de-allocate our other allocations, just mark - // them so we know they don't hold the correct data - self.cuda_alloc = None; - self.cuda_ref = None; - } - - HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap().as_mut()) - } else if self.cpu_ref.is_some() { - // The data is in CPU memory, but we don't have exclusive access to it, so we need to - // copy it - let slice = self.cpu_ref.take().unwrap().as_slice(); - - #[cfg(feature = "cuda")] - { - self.cuda_alloc = None; // TODO - self.cuda_ref = None; - } - - self.cpu_alloc = Some(slice.to_vec()); - - HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap().as_mut()) + #[cfg(feature = "cuda")] + pub fn as_cuda_ref(&'a self) -> HerculesCUDARef<'a> { + if let Some(cuda_ref) = self.cuda_ref.get() { + cuda_ref.clone() } else { - #[cfg(feature = "cuda")] - { - let cuda_ref = - if self.cuda_ref.is_some() { - Some(self.cuda_ref.clone().unwrap()) - } else if self.cuda_alloc.is_some() { - Some(self.cuda_alloc.as_ref().unwrap().get_ref()) - } else { - None - }; - - if let Some(cuda_ref) = cuda_ref { - let elements = cuda_ref.__size() / size_of::<T>(); - - // Transfer memory back to CPU using our cpu_alloc - let mut alloc = Vec::new(); - alloc.resize_with(elements, Default::default); - self.cpu_alloc = Some(alloc); - - let alloc = self.cpu_alloc.as_mut().unwrap(); - assert!(alloc.len() == elements); - - let cpu_ref = cuda_ref.to_cpu_ref(alloc); - - 
// Eliminate other references - self.cpu_ref = None; - self.cuda_alloc = None; // TODO - self.cuda_ref = None; - - return cpu_ref; - } + if let Some(cpu_ref) = self.cpu_ref.get() { + return self.cuda_ref.get_or_init(|| { + // Copy data to CUDA device + let alloc = CUDABox::from_cpu_ref(cpu_ref.clone()); + self.cuda_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cuda_alloc was set unexpectedly"); + + self.cuda_alloc.get().unwrap().get_ref() + }).clone(); } - panic!("HerculesBox has no reference to data") + panic!("HerculesImmBox has no reference to data") } } +} - #[cfg(feature = "cuda")] - fn as_cuda_ref(&'a mut self) -> HerculesCUDARef<'a> { - if self.cuda_ref.is_some() { - self.cuda_ref.clone().unwrap() - } else if self.cuda_alloc.is_some() { - // This could occur if a mutable reference had been created from this box - let cuda_ref = self.cuda_alloc.as_ref().unwrap().get_ref(); - self.cuda_ref = Some(cuda_ref.clone()); - cuda_ref - } else { - let cpu_ref = - if self.cpu_ref.is_some() { - Some(self.cpu_ref.clone().unwrap()) - } else if self.cpu_alloc.is_some() { - Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice())) - } else { - None - }; - - if let Some(cpu_ref) = cpu_ref { - // Copy data to CUDA device - self.cuda_alloc = Some(CUDABox::from_cpu_ref(cpu_ref)); - let alloc = self.cuda_alloc.as_ref().unwrap(); - - let cuda_ref = alloc.get_ref(); - self.cuda_ref = Some(cuda_ref.clone()); - return cuda_ref; - } +pub trait HerculesBoxTo<'a, T> { + fn to(&'a self) -> T; +} - panic!("HerculesBox has no reference to data") - } +impl<'a, T> HerculesBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T> +where T: Default + Clone +{ + fn to(&'a self) -> HerculesCPURef<'a> { + self.as_cpu_ref() } +} - #[cfg(feature = "cuda")] - fn as_cuda_ref_mut(&'a mut self) -> HerculesCUDARefMut<'a> { - if self.cuda_alloc.is_some() { - self.cpu_alloc = None; // TODO - self.cpu_ref = None; - self.cuda_ref = None; - - 
self.cuda_alloc.as_mut().unwrap().get_ref_mut() - } else if self.cuda_ref.is_some() { - // The data is in CUDA memory, but we don't have exclusive access to it, so we need to - // copy it - let cuda_alloc = CUDABox::from_cuda_ref(self.cuda_ref.take().unwrap()); - self.cuda_alloc = Some(cuda_alloc); - - self.cpu_alloc = None; // TODO - self.cpu_ref = None; - - self.cuda_alloc.as_mut().unwrap().get_ref_mut() - } else { - let cpu_ref = - if self.cpu_ref.is_some() { - Some(self.cpu_ref.clone().unwrap()) - } else if self.cpu_alloc.is_some() { - Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice())) - } else { - None - }; - - if let Some(cpu_ref) = cpu_ref { - // Copy data to CUDA device - self.cuda_alloc = Some(CUDABox::from_cpu_ref(cpu_ref)); - - self.cpu_alloc = None; // TODO - self.cpu_ref = None; - - return self.cuda_alloc.as_mut().unwrap().get_ref_mut(); - } - - panic!("HerculesBox has no reference to data") - } +#[cfg(feature = "cuda")] +impl<'a, T> HerculesBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T> +where T: Default + Clone +{ + fn to(&'a self) -> HerculesCUDARef<'a> { + self.as_cuda_ref() } } diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index e8a7e4e9..9d0f4702 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -8,16 +8,12 @@ use self::camera_model::*; use self::cava_rust::CHAN; use self::image_proc::*; -#[cfg(feature = "cuda")] -use hercules_rt::CUDABox; -use hercules_rt::{runner, HerculesCPURef}; +use hercules_rt::{runner, HerculesImmBox, HerculesBoxTo}; use image::ImageError; use clap::Parser; -use std::mem; - juno_build::juno!("cava"); fn run_cava( @@ -38,62 +34,34 @@ fn run_cava( assert_eq!(coefs.len(), 4 * CHAN); assert_eq!(tonemap.len(), 256 * CHAN); - #[cfg(not(feature = "cuda"))] - { - let image = HerculesCPURef::from_slice(image); - let tstw = HerculesCPURef::from_slice(tstw); - let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts); - let weights = 
HerculesCPURef::from_slice(weights); - let coefs = HerculesCPURef::from_slice(coefs); - let tonemap = HerculesCPURef::from_slice(tonemap); - let mut r = runner!(cava); + let image = HerculesImmBox::from(image); + let tstw = HerculesImmBox::from(tstw); + let ctrl_pts = HerculesImmBox::from(ctrl_pts); + let weights = HerculesImmBox::from(weights); + let coefs = HerculesImmBox::from(coefs); + let tonemap = HerculesImmBox::from(tonemap); + + let mut r = runner!(cava); + + HerculesImmBox::from( async_std::task::block_on(async { r.run( rows as u64, cols as u64, num_ctrl_pts as u64, - image, - tstw, - ctrl_pts, - weights, - coefs, - tonemap, + image.to(), + tstw.to(), + ctrl_pts.to(), + weights.to(), + coefs.to(), + tonemap.to(), ) .await }) - .as_slice::<u8>() - .to_vec() - .into_boxed_slice() - } - - #[cfg(feature = "cuda")] - { - let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image)); - let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw)); - let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts)); - let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights)); - let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs)); - let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap)); - let mut r = runner!(cava); - let res = async_std::task::block_on(async { - r.run( - rows as u64, - cols as u64, - num_ctrl_pts as u64, - image.get_ref(), - tstw.get_ref(), - ctrl_pts.get_ref(), - weights.get_ref(), - coefs.get_ref(), - tonemap.get_ref(), - ) - .await - }); - let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() }; - let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice(); - res.to_cpu_ref(&mut res_cpu); - res_cpu - } + ) + .as_slice() + .to_vec() + .into_boxed_slice() } enum Error { diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs index 3b067ebd..80a334b7 100644 --- a/juno_samples/edge_detection/src/main.rs +++ 
b/juno_samples/edge_detection/src/main.rs @@ -2,9 +2,7 @@ mod edge_detection_rust; -#[cfg(feature = "cuda")] -use hercules_rt::CUDABox; -use hercules_rt::{runner, HerculesCPURef}; +use hercules_rt::{runner, HerculesImmBox, HerculesBoxTo}; use std::slice::from_raw_parts; @@ -106,38 +104,18 @@ fn edge_detection_harness(args: EdgeDetectionInputs) { 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, 0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036, ]; - #[cfg(not(feature = "cuda"))] - let gaussian_filter_h = HerculesCPURef::from_slice(&gaussian_filter); - #[cfg(feature = "cuda")] - let gaussian_filter_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&gaussian_filter)); - #[cfg(feature = "cuda")] - let gaussian_filter_h = gaussian_filter_cuda.get_ref(); + let gaussian_filter_h = HerculesImmBox::from(gaussian_filter.as_slice()); let sz: usize = 3; let structure: Vec<f32> = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]; - #[cfg(not(feature = "cuda"))] - let structure_h = HerculesCPURef::from_slice(&structure); - #[cfg(feature = "cuda")] - let structure_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&structure)); - #[cfg(feature = "cuda")] - let structure_h = structure_cuda.get_ref(); + let structure_h = HerculesImmBox::from(structure.as_slice()); let sb: usize = 3; let sx: Vec<f32> = vec![-1.0, 0.0, 1.0, -2.0, 0.0, 2.0, -1.0, 0.0, 1.0]; - #[cfg(not(feature = "cuda"))] - let sx_h = HerculesCPURef::from_slice(&sx); - #[cfg(feature = "cuda")] - let sx_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sx)); - #[cfg(feature = "cuda")] - let sx_h = sx_cuda.get_ref(); + let sx_h = HerculesImmBox::from(sx.as_slice()); let sy: Vec<f32> = vec![-1.0, -2.0, -1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 1.0]; - #[cfg(not(feature = "cuda"))] - let sy_h = HerculesCPURef::from_slice(&sy); - #[cfg(feature = "cuda")] - let sy_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sy)); - #[cfg(feature = "cuda")] - let sy_h = sy_cuda.get_ref(); + 
let sy_h = HerculesImmBox::from(sy.as_slice()); let theta: f32 = 0.1; @@ -203,39 +181,28 @@ fn edge_detection_harness(args: EdgeDetectionInputs) { let input = unsafe { from_raw_parts(ptr, height * width) }; - #[cfg(not(feature = "cuda"))] - let input_h = HerculesCPURef::from_slice(input); - #[cfg(feature = "cuda")] - let input_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(input)); - #[cfg(feature = "cuda")] - let input_h = input_cuda.get_ref(); - - let result = async_std::task::block_on(async { - r.run( - height as u64, - width as u64, - gs as u64, - sz as u64, - sb as u64, - input_h, - gaussian_filter_h.clone(), - structure_h.clone(), - sx_h.clone(), - sy_h.clone(), - theta, + let input_h = HerculesImmBox::from(input); + + let result = + HerculesImmBox::from( + async_std::task::block_on(async { + r.run( + height as u64, + width as u64, + gs as u64, + sz as u64, + sb as u64, + input_h.to(), + gaussian_filter_h.to(), + structure_h.to(), + sx_h.to(), + sy_h.to(), + theta, + ) + .await + }) ) - .await - }); - - #[cfg(not(feature = "cuda"))] - let result: Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice(); - #[cfg(feature = "cuda")] - let result: Box<[f32]> = { - let num_out = unsafe { result.__size() / std::mem::size_of::<f32>() }; - let mut res_cpu: Box<[f32]> = vec![0.0; num_out].into_boxed_slice(); - result.to_cpu_ref(&mut res_cpu); - res_cpu - }; + .to_vec(); if display { let result = frame_from_slice(&result, height, width); @@ -261,10 +228,7 @@ fn edge_detection_harness(args: EdgeDetectionInputs) { theta, ); - assert_eq!( - result.as_ref(), - <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result) - ); + assert_eq!(result, rust_result); println!("Frames {} match", i); if display_verify { -- GitLab From 71adbd48c355bd48deda33c34da0c74b72e4f8ea Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Wed, 5 Feb 2025 17:07:58 -0600 Subject: [PATCH 04/11] Add HerculesMutBox --- hercules_rt/src/lib.rs | 170 
+++++++++++++++++++++++- juno_samples/cava/src/main.rs | 2 +- juno_samples/edge_detection/src/main.rs | 2 +- 3 files changed, 166 insertions(+), 8 deletions(-) diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index 6d03bd25..c287e093 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -397,11 +397,16 @@ unsafe impl Sync for __RawPtrSendSync {} * associated allocations may or may not be set, as those may not be needed if the allocation is * help elsewhere. * - * HerculesMutBox is TODO. + * HerculesMutBox holds memory on some device and can produce mutable references to that data on + * any device. All these operations are through mutable references since this ensures exclusive + * access to the Box and therefore to the underlying device memory. Because of the exclusive access + * locks are not needed. + * + * Note that a HerculesMutBox may hold multiple allocations at once, but it tracks the "definitive" + * copy to be the one borrowed mutably most recently (since it may have been updated). The extra + * allocations are kept around to avoid reallocation if memory is moved back to the device. 
*/ pub struct HerculesImmBox<'a, T> { - // NOTE: We only need OnceLock if we're allowed to launch multiple Hercules program in - // parallel, if that's not necessary we can probably get away with using OnceCell #[allow(dead_code)] cpu_alloc: OnceLock<Vec<T>>, #[cfg(feature = "cuda")] @@ -542,11 +547,143 @@ where } } -pub trait HerculesBoxTo<'a, T> { +enum HerculesMutBoxLocation { + CPU, + #[cfg(feature = "cuda")] + CUDA, +} + +pub struct HerculesMutBox<T> { + loc: HerculesMutBoxLocation, + + cpu_alloc: Option<Vec<T>>, + #[cfg(feature = "cuda")] + cuda_alloc: Option<CUDABox>, +} + +impl<T: Clone> From<&mut [T]> for HerculesMutBox<T> { + fn from(value: &mut [T]) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CPU, + cpu_alloc: Some(value.to_vec()), + #[cfg(feature = "cuda")] + cuda_alloc: None, + } + } +} + +impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<T> { + fn from(value: HerculesCPURef<'a>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CPU, + cpu_alloc: Some(value.as_slice().to_vec()), + #[cfg(feature = "cuda")] + cuda_alloc: None, + } + } +} + +impl<'a, T: Clone> From<HerculesCPURefMut<'a>> for HerculesMutBox<T> { + fn from(value: HerculesCPURefMut<'a>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CPU, + cpu_alloc: Some(value.as_slice().to_vec()), + #[cfg(feature = "cuda")] + cuda_alloc: None, + } + } +} + +#[cfg(feature = "cuda")] +impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<T> { + fn from(value: HerculesCUDARef<'a>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CUDA, + cpu_alloc: None, + #[cfg(feature = "cuda")] + cuda_alloc: Some(CUDABox::from_cuda_ref(value)), + } + } +} + +#[cfg(feature = "cuda")] +impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<T> { + fn from(value: HerculesCUDARefMut<'a>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CUDA, + cpu_alloc: None, + #[cfg(feature = "cuda")] + cuda_alloc: Some(CUDABox::from_cuda_ref(value.as_ref())), + 
} + } +} + +impl<T> HerculesMutBox<T> +where + T: Default +{ + pub fn as_slice(&mut self) -> &mut [T] { + self.as_cpu_ref().as_slice() + } + + pub fn to_vec(mut self) -> Vec<T> { + // Bring to CPU (if needed) + let _ = self.as_cpu_ref(); + self.cpu_alloc.unwrap() + } + + pub fn as_cpu_ref<'a>(&'a mut self) -> HerculesCPURefMut<'a> { + match self.loc { + HerculesMutBoxLocation::CPU => { + HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap()) + } + #[cfg(feature = "cuda")] + HerculesMutBoxLocation::CUDA => { + let cuda_alloc = self.cuda_alloc.as_ref().unwrap(); + let elements = cuda_alloc.get_bytes() / size_of::<T>(); + + // Allocate host memory (if needed) + if self.cpu_alloc.is_none() || self.cpu_alloc.as_ref().unwrap().len() != elements { + let mut alloc = Vec::new(); + alloc.resize_with(elements, Default::default); + self.cpu_alloc = Some(alloc); + } + + // Transfer data from CUDA device + let cpu_alloc = self.cpu_alloc.as_mut().unwrap(); + let _ = cuda_alloc.get_ref().to_cpu_ref(cpu_alloc); + + self.loc = HerculesMutBoxLocation::CPU; + HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap()) + } + } + } + + #[cfg(feature = "cuda")] + pub fn as_cuda_ref<'a>(&'a mut self) -> HerculesCUDARefMut<'a> { + match self.loc { + HerculesMutBoxLocation::CPU => { + // TODO: CUDABox does not provide an interface for copying data to it, so currently + // we just reallocate it + let cpu_ref = HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap()); + let cuda_alloc = CUDABox::from_cpu_ref(cpu_ref); + + self.cuda_alloc = Some(cuda_alloc); + self.loc = HerculesMutBoxLocation::CUDA; + self.cuda_alloc.as_mut().unwrap().get_ref_mut() + } + HerculesMutBoxLocation::CUDA => { + self.cuda_alloc.as_mut().unwrap().get_ref_mut() + } + } + } +} + +pub trait HerculesImmBoxTo<'a, T> { fn to(&'a self) -> T; } -impl<'a, T> HerculesBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T> +impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T> where T: 
Default + Clone { fn to(&'a self) -> HerculesCPURef<'a> { @@ -555,10 +692,31 @@ where T: Default + Clone } #[cfg(feature = "cuda")] -impl<'a, T> HerculesBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T> +impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T> where T: Default + Clone { fn to(&'a self) -> HerculesCUDARef<'a> { self.as_cuda_ref() } } + +pub trait HerculesMutBoxTo<'a, T> { + fn to(&'a mut self) -> T; +} + +impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<T> +where T: Default + Clone +{ + fn to(&'a mut self) -> HerculesCPURefMut<'a> { + self.as_cpu_ref() + } +} + +#[cfg(feature = "cuda")] +impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<T> +where T: Default + Clone +{ + fn to(&'a mut self) -> HerculesCUDARefMut<'a> { + self.as_cuda_ref() + } +} diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index 9d0f4702..18024a0f 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -8,7 +8,7 @@ use self::camera_model::*; use self::cava_rust::CHAN; use self::image_proc::*; -use hercules_rt::{runner, HerculesImmBox, HerculesBoxTo}; +use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo}; use image::ImageError; diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs index 80a334b7..c74e2098 100644 --- a/juno_samples/edge_detection/src/main.rs +++ b/juno_samples/edge_detection/src/main.rs @@ -2,7 +2,7 @@ mod edge_detection_rust; -use hercules_rt::{runner, HerculesImmBox, HerculesBoxTo}; +use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo}; use std::slice::from_raw_parts; -- GitLab From 573d79f910f7bcdfbf82420d486e1677e08b7741 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Thu, 6 Feb 2025 08:56:55 -0600 Subject: [PATCH 05/11] Add example of safe runner interface --- juno_samples/cava/src/main.rs | 52 ++++++++++++++++++++++++----------- 1 file changed, 
36 insertions(+), 16 deletions(-) diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index 18024a0f..2c5ddc13 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -16,6 +16,28 @@ use clap::Parser; juno_build::juno!("cava"); +async fn safe_run<'a>( + runner: &'a mut HerculesRunner_cava, r: u64, c: u64, num_ctrl_pts: u64, + input: &'a HerculesImmBox<'a, u8>, tstw: &'a HerculesImmBox<'a, f32>, + ctrl_pts: &'a HerculesImmBox<'a, f32>, weights: &'a HerculesImmBox<'a, f32>, + coefs: &'a HerculesImmBox<'a, f32>, tonemap: &'a HerculesImmBox<'a, f32>, +) -> HerculesImmBox<'a, u8> { + HerculesImmBox::from( + runner.run( + r, + c, + num_ctrl_pts, + input.to(), + tstw.to(), + ctrl_pts.to(), + weights.to(), + coefs.to(), + tonemap.to() + ) + .await + ) +} + fn run_cava( rows: usize, cols: usize, @@ -43,22 +65,20 @@ fn run_cava( let mut r = runner!(cava); - HerculesImmBox::from( - async_std::task::block_on(async { - r.run( - rows as u64, - cols as u64, - num_ctrl_pts as u64, - image.to(), - tstw.to(), - ctrl_pts.to(), - weights.to(), - coefs.to(), - tonemap.to(), - ) - .await - }) - ) + async_std::task::block_on(async { + safe_run(&mut r, + rows as u64, + cols as u64, + num_ctrl_pts as u64, + &image, + &tstw, + &ctrl_pts, + &weights, + &coefs, + &tonemap, + ) + .await + }) .as_slice() .to_vec() .into_boxed_slice() -- GitLab From 3693e9c9a90fce83c156906b01bbbea007f42fd9 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Thu, 6 Feb 2025 12:14:03 -0600 Subject: [PATCH 06/11] Improve HerculesMutBox. 
Edge detection safe interface issues --- hercules_rt/src/lib.rs | 176 +++++++++++++++++------- juno_samples/cava/src/main.rs | 6 +- juno_samples/edge_detection/src/main.rs | 64 ++++++--- 3 files changed, 174 insertions(+), 72 deletions(-) diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index c287e093..f8fdf2ef 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -282,6 +282,14 @@ impl<'a> HerculesCUDARefMut<'a> { } } + pub fn dup(&'a mut self) -> Self { + HerculesCUDARefMut { + ptr: self.ptr, + size: self.size, + _phantom: PhantomData, + } + } + pub unsafe fn __ptr(&self) -> *mut u8 { self.ptr.as_ptr() } @@ -553,127 +561,195 @@ enum HerculesMutBoxLocation { CUDA, } -pub struct HerculesMutBox<T> { +enum Allocation<R, A> { + None, + Reference(R), + Allocation(A), +} + +impl<R, A> Allocation<R, A> { + fn take(&mut self) -> Allocation<R, A> { + std::mem::replace(self, Allocation::None) + } +} + +pub struct HerculesMutBox<'a, T> { loc: HerculesMutBoxLocation, - cpu_alloc: Option<Vec<T>>, + cpu_alloc: Allocation<&'a mut [T], Vec<T>>, #[cfg(feature = "cuda")] - cuda_alloc: Option<CUDABox>, + cuda_alloc: Allocation<HerculesCUDARefMut<'a>, CUDABox>, } -impl<T: Clone> From<&mut [T]> for HerculesMutBox<T> { - fn from(value: &mut [T]) -> Self { +impl<'a, T> From<&'a mut [T]> for HerculesMutBox<'a, T> { + fn from(value: &'a mut [T]) -> Self { HerculesMutBox { loc: HerculesMutBoxLocation::CPU, - cpu_alloc: Some(value.to_vec()), + cpu_alloc: Allocation::Reference(value), #[cfg(feature = "cuda")] - cuda_alloc: None, + cuda_alloc: Allocation::None, } } } -impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<T> { +impl<'a, T> From<Vec<T>> for HerculesMutBox<'a, T> { + fn from(value: Vec<T>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CPU, + cpu_alloc: Allocation::Allocation(value), + #[cfg(feature = "cuda")] + cuda_alloc: Allocation::None, + } + } +} + +impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<'a, T> { fn 
from(value: HerculesCPURef<'a>) -> Self { HerculesMutBox { loc: HerculesMutBoxLocation::CPU, - cpu_alloc: Some(value.as_slice().to_vec()), + cpu_alloc: Allocation::Allocation(value.as_slice().to_vec()), #[cfg(feature = "cuda")] - cuda_alloc: None, + cuda_alloc: Allocation::None, } } } -impl<'a, T: Clone> From<HerculesCPURefMut<'a>> for HerculesMutBox<T> { +impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> { fn from(value: HerculesCPURefMut<'a>) -> Self { HerculesMutBox { loc: HerculesMutBoxLocation::CPU, - cpu_alloc: Some(value.as_slice().to_vec()), + cpu_alloc: Allocation::Reference(value.as_slice()), #[cfg(feature = "cuda")] - cuda_alloc: None, + cuda_alloc: Allocation::None, } } } #[cfg(feature = "cuda")] -impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<T> { +impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<'a, T> { fn from(value: HerculesCUDARef<'a>) -> Self { HerculesMutBox { loc: HerculesMutBoxLocation::CUDA, - cpu_alloc: None, + cpu_alloc: Allocation::None, #[cfg(feature = "cuda")] - cuda_alloc: Some(CUDABox::from_cuda_ref(value)), + cuda_alloc: Allocation::Allocation(CUDABox::from_cuda_ref(value)), } } } #[cfg(feature = "cuda")] -impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<T> { +impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> { fn from(value: HerculesCUDARefMut<'a>) -> Self { HerculesMutBox { loc: HerculesMutBoxLocation::CUDA, - cpu_alloc: None, + cpu_alloc: Allocation::None, #[cfg(feature = "cuda")] - cuda_alloc: Some(CUDABox::from_cuda_ref(value.as_ref())), + cuda_alloc: Allocation::Reference(value), } } } -impl<T> HerculesMutBox<T> +impl<'a, T> HerculesMutBox<'a, T> where - T: Default + T: Default + Clone { - pub fn as_slice(&mut self) -> &mut [T] { + pub fn as_slice(&'a mut self) -> &'a mut [T] { self.as_cpu_ref().as_slice() } - pub fn to_vec(mut self) -> Vec<T> { - // Bring to CPU (if needed) - let _ = self.as_cpu_ref(); - self.cpu_alloc.unwrap() - } - - pub fn as_cpu_ref<'a>(&'a mut 
self) -> HerculesCPURefMut<'a> { + pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> { match self.loc { HerculesMutBoxLocation::CPU => { - HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap()) + match self.cpu_alloc { + Allocation::None => panic!("No CPU reference"), + Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val), + Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val), + } } #[cfg(feature = "cuda")] HerculesMutBoxLocation::CUDA => { - let cuda_alloc = self.cuda_alloc.as_ref().unwrap(); - let elements = cuda_alloc.get_bytes() / size_of::<T>(); + let cuda_ref : HerculesCUDARef<'a> = + match self.cuda_alloc { + Allocation::None => panic!("No GPU reference"), + Allocation::Reference(ref mut val) => val.dup().as_ref(), + Allocation::Allocation(ref val) => val.get_ref(), + }; + + let elements = unsafe { cuda_ref.__size() / size_of::<T>() }; // Allocate host memory (if needed) - if self.cpu_alloc.is_none() || self.cpu_alloc.as_ref().unwrap().len() != elements { - let mut alloc = Vec::new(); - alloc.resize_with(elements, Default::default); - self.cpu_alloc = Some(alloc); - } + let cpu_alloc : Allocation<&'a mut [T], Vec<T>> = + match self.cpu_alloc.take() { + Allocation::Reference(val) if val.len() == elements => Allocation::Reference(val), + Allocation::Allocation(val) if val.len() == elements => Allocation::Allocation(val), + _ => { + let mut alloc = Vec::new(); + alloc.resize_with(elements, Default::default); + Allocation::Allocation(alloc) + } + }; + self.cpu_alloc = cpu_alloc; + let cpu_ref : &'a mut [T] = + match &mut self.cpu_alloc { + Allocation::None => panic!(), + Allocation::Reference(val) => val, + Allocation::Allocation(val) => val, + }; // Transfer data from CUDA device - let cpu_alloc = self.cpu_alloc.as_mut().unwrap(); - let _ = cuda_alloc.get_ref().to_cpu_ref(cpu_alloc); + let cpu_ref = cuda_ref.to_cpu_ref(cpu_ref); self.loc = HerculesMutBoxLocation::CPU; - 
HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap()) + cpu_ref } } } #[cfg(feature = "cuda")] - pub fn as_cuda_ref<'a>(&'a mut self) -> HerculesCUDARefMut<'a> { + pub fn as_cuda_ref(&'a mut self) -> HerculesCUDARefMut<'a> { match self.loc { HerculesMutBoxLocation::CPU => { - // TODO: CUDABox does not provide an interface for copying data to it, so currently - // we just reallocate it - let cpu_ref = HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap()); - let cuda_alloc = CUDABox::from_cpu_ref(cpu_ref); + let cpu_ref : &'a [T] = + match self.cpu_alloc { + Allocation::None => panic!("No CPU reference"), + Allocation::Reference(ref val) => val, + Allocation::Allocation(ref val) => val, + }; + + let size = cpu_ref.len() * size_of::<T>(); + let (cuda_alloc, copied) = + match self.cuda_alloc.take() { + Allocation::Reference(val) if unsafe { val.__size() == size } => (Allocation::Reference(val), false), + Allocation::Allocation(val) if val.get_bytes() == size => (Allocation::Allocation(val), false), + _ => { + let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref)); + (Allocation::Allocation(alloc), true) + } + }; + self.cuda_alloc = cuda_alloc; + + let cuda_ref = + match self.cuda_alloc { + Allocation::None => panic!(), + Allocation::Reference(ref mut val) => val.dup(), + Allocation::Allocation(ref mut val) => val.get_ref_mut(), + }; + + if !copied { + unsafe { + __copy_cpu_to_cuda(cuda_ref.__ptr(), cpu_ref.as_ptr() as *mut u8, size); + } + } - self.cuda_alloc = Some(cuda_alloc); self.loc = HerculesMutBoxLocation::CUDA; - self.cuda_alloc.as_mut().unwrap().get_ref_mut() + cuda_ref } HerculesMutBoxLocation::CUDA => { - self.cuda_alloc.as_mut().unwrap().get_ref_mut() + match self.cuda_alloc { + Allocation::None => panic!("No GPU reference"), + Allocation::Reference(ref mut val) => val.dup(), + Allocation::Allocation(ref mut val) => val.get_ref_mut(), + } } } } @@ -704,7 +780,7 @@ pub trait HerculesMutBoxTo<'a, T> { fn to(&'a mut self) -> T; 
} -impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<T> +impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> where T: Default + Clone { fn to(&'a mut self) -> HerculesCPURefMut<'a> { @@ -713,7 +789,7 @@ where T: Default + Clone } #[cfg(feature = "cuda")] -impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<T> +impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> where T: Default + Clone { fn to(&'a mut self) -> HerculesCUDARefMut<'a> { diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index 2c5ddc13..c1acbe3f 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -8,7 +8,7 @@ use self::camera_model::*; use self::cava_rust::CHAN; use self::image_proc::*; -use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo}; +use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox}; use image::ImageError; @@ -21,8 +21,8 @@ async fn safe_run<'a>( input: &'a HerculesImmBox<'a, u8>, tstw: &'a HerculesImmBox<'a, f32>, ctrl_pts: &'a HerculesImmBox<'a, f32>, weights: &'a HerculesImmBox<'a, f32>, coefs: &'a HerculesImmBox<'a, f32>, tonemap: &'a HerculesImmBox<'a, f32>, -) -> HerculesImmBox<'a, u8> { - HerculesImmBox::from( +) -> HerculesMutBox<'a, u8> { + HerculesMutBox::from( runner.run( r, c, diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs index c74e2098..32dfaba9 100644 --- a/juno_samples/edge_detection/src/main.rs +++ b/juno_samples/edge_detection/src/main.rs @@ -2,7 +2,7 @@ mod edge_detection_rust; -use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo}; +use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox}; use std::slice::from_raw_parts; @@ -84,6 +84,33 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat { converted } +async fn safe_run<'a>(runner: &'a mut HerculesRunner_edge_detection, + n: u64, m: u64, 
gs: u64, sz: u64, sb: u64, + input: &'a HerculesImmBox<'a, f32>, + gaussian_filter: &'a HerculesImmBox<'a, f32>, + structure: &'a HerculesImmBox<'a, f32>, + sx: &'a HerculesImmBox<'a, f32>, + sy: &'a HerculesImmBox<'a, f32>, + theta: f32 +) -> HerculesMutBox<'a, f32> { + HerculesMutBox::from( + runner.run( + n, + m, + gs, + sz, + sb, + input.to(), + gaussian_filter.to(), + structure.to(), + sx.to(), + sy.to(), + theta, + ) + .await + ) +} + fn edge_detection_harness(args: EdgeDetectionInputs) { let EdgeDetectionInputs { input, @@ -184,24 +211,23 @@ fn edge_detection_harness(args: EdgeDetectionInputs) { let input_h = HerculesImmBox::from(input); let result = - HerculesImmBox::from( - async_std::task::block_on(async { - r.run( - height as u64, - width as u64, - gs as u64, - sz as u64, - sb as u64, - input_h.to(), - gaussian_filter_h.to(), - structure_h.to(), - sx_h.to(), - sy_h.to(), - theta, - ) - .await - }) - ) + async_std::task::block_on(async { + safe_run(&mut r, + height as u64, + width as u64, + gs as u64, + sz as u64, + sb as u64, + &input_h, + &gaussian_filter_h, + &structure_h, + &sx_h, + &sy_h, + theta, + ) + .await + }) + .as_slice() .to_vec(); if display { -- GitLab From 81525f538a4012f7c4a590a96929fe7ad025de63 Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Thu, 6 Feb 2025 14:28:58 -0600 Subject: [PATCH 07/11] Fixing lifetime issues --- hercules_rt/src/lib.rs | 14 +++++++------- juno_samples/cava/src/main.rs | 8 ++++---- juno_samples/edge_detection/src/main.rs | 12 ++++++------ 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index f8fdf2ef..848309e3 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -503,15 +503,15 @@ impl<'a, T> HerculesImmBox<'a, T> where T: Default + Clone { - pub fn as_slice(&'a self) -> &'a [T] { + pub fn as_slice<'b>(&'b self) -> &'a [T] { self.as_cpu_ref().as_slice() } - pub fn to_vec(&'a self) -> Vec<T> { + pub fn 
to_vec<'b>(&'b self) -> Vec<T> { Vec::from(self.as_cpu_ref().as_slice()) } - pub fn as_cpu_ref(&'a self) -> HerculesCPURef<'a> { + pub fn as_cpu_ref<'b>(&'b self) -> HerculesCPURef<'a> { if let Some(cpu_ref) = self.cpu_ref.get() { cpu_ref.clone() } else { @@ -536,7 +536,7 @@ where } #[cfg(feature = "cuda")] - pub fn as_cuda_ref(&'a self) -> HerculesCUDARef<'a> { + pub fn as_cuda_ref<'b>(&'b self) -> HerculesCUDARef<'a> { if let Some(cuda_ref) = self.cuda_ref.get() { cuda_ref.clone() } else { @@ -756,13 +756,13 @@ where } pub trait HerculesImmBoxTo<'a, T> { - fn to(&'a self) -> T; + fn to(&self) -> T; } impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T> where T: Default + Clone { - fn to(&'a self) -> HerculesCPURef<'a> { + fn to(&self) -> HerculesCPURef<'a> { self.as_cpu_ref() } } @@ -771,7 +771,7 @@ where T: Default + Clone impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T> where T: Default + Clone { - fn to(&'a self) -> HerculesCUDARef<'a> { + fn to(&self) -> HerculesCUDARef<'a> { self.as_cuda_ref() } } diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index c1acbe3f..72a9d823 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -16,11 +16,11 @@ use clap::Parser; juno_build::juno!("cava"); -async fn safe_run<'a>( +async fn safe_run<'a, 'b: 'a>( runner: &'a mut HerculesRunner_cava, r: u64, c: u64, num_ctrl_pts: u64, - input: &'a HerculesImmBox<'a, u8>, tstw: &'a HerculesImmBox<'a, f32>, - ctrl_pts: &'a HerculesImmBox<'a, f32>, weights: &'a HerculesImmBox<'a, f32>, - coefs: &'a HerculesImmBox<'a, f32>, tonemap: &'a HerculesImmBox<'a, f32>, + input: &HerculesImmBox<'b, u8>, tstw: &HerculesImmBox<'b, f32>, + ctrl_pts: &HerculesImmBox<'b, f32>, weights: &HerculesImmBox<'b, f32>, + coefs: &HerculesImmBox<'b, f32>, tonemap: &HerculesImmBox<'b, f32>, ) -> HerculesMutBox<'a, u8> { HerculesMutBox::from( runner.run( diff --git 
a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs index 32dfaba9..7452a0ec 100644 --- a/juno_samples/edge_detection/src/main.rs +++ b/juno_samples/edge_detection/src/main.rs @@ -84,13 +84,13 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat { converted } -async fn safe_run<'a>(runner: &'a mut HerculesRunner_edge_detection, +async fn safe_run<'a, 'b: 'a>(runner: &'a mut HerculesRunner_edge_detection, n: u64, m: u64, gs: u64, sz: u64, sb: u64, - input: &'a HerculesImmBox<'a, f32>, - gaussian_filter: &'a HerculesImmBox<'a, f32>, - structure: &'a HerculesImmBox<'a, f32>, - sx: &'a HerculesImmBox<'a, f32>, - sy: &'a HerculesImmBox<'a, f32>, + input: &HerculesImmBox<'b, f32>, + gaussian_filter: &HerculesImmBox<'b, f32>, + structure: &HerculesImmBox<'b, f32>, + sx: &HerculesImmBox<'b, f32>, + sy: &HerculesImmBox<'b, f32>, theta: f32 ) -> HerculesMutBox<'a, f32> { HerculesMutBox::from( -- GitLab From 255898ca0b050139e2e8c65887bcbfbe1d86fd8f Mon Sep 17 00:00:00 2001 From: Aaron Councilman <aaronjc4@illinois.edu> Date: Thu, 6 Feb 2025 14:47:15 -0600 Subject: [PATCH 08/11] Actually fixed this time --- hercules_rt/src/lib.rs | 14 +++++++------- juno_samples/cava/src/main.rs | 10 ++++++---- juno_samples/edge_detection/src/main.rs | 20 ++++++++++++-------- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs index 848309e3..f8fdf2ef 100644 --- a/hercules_rt/src/lib.rs +++ b/hercules_rt/src/lib.rs @@ -503,15 +503,15 @@ impl<'a, T> HerculesImmBox<'a, T> where T: Default + Clone { - pub fn as_slice<'b>(&'b self) -> &'a [T] { + pub fn as_slice(&'a self) -> &'a [T] { self.as_cpu_ref().as_slice() } - pub fn to_vec<'b>(&'b self) -> Vec<T> { + pub fn to_vec(&'a self) -> Vec<T> { Vec::from(self.as_cpu_ref().as_slice()) } - pub fn as_cpu_ref<'b>(&'b self) -> HerculesCPURef<'a> { + pub fn as_cpu_ref(&'a self) -> HerculesCPURef<'a> { if let Some(cpu_ref) = self.cpu_ref.get() 
{ cpu_ref.clone() } else { @@ -536,7 +536,7 @@ where } #[cfg(feature = "cuda")] - pub fn as_cuda_ref<'b>(&'b self) -> HerculesCUDARef<'a> { + pub fn as_cuda_ref(&'a self) -> HerculesCUDARef<'a> { if let Some(cuda_ref) = self.cuda_ref.get() { cuda_ref.clone() } else { @@ -756,13 +756,13 @@ where } pub trait HerculesImmBoxTo<'a, T> { - fn to(&self) -> T; + fn to(&'a self) -> T; } impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T> where T: Default + Clone { - fn to(&self) -> HerculesCPURef<'a> { + fn to(&'a self) -> HerculesCPURef<'a> { self.as_cpu_ref() } } @@ -771,7 +771,7 @@ where T: Default + Clone impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T> where T: Default + Clone { - fn to(&self) -> HerculesCUDARef<'a> { + fn to(&'a self) -> HerculesCUDARef<'a> { self.as_cuda_ref() } } diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index 72a9d823..b4a0f6fd 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -16,11 +16,13 @@ use clap::Parser; juno_build::juno!("cava"); -async fn safe_run<'a, 'b: 'a>( +// Individual lifetimes are not needed in this example but should probably be generated for +// flexibility +async fn safe_run<'a, 'b: 'a, 'c: 'a, 'd: 'a, 'e: 'a, 'f: 'a, 'g: 'a>( runner: &'a mut HerculesRunner_cava, r: u64, c: u64, num_ctrl_pts: u64, - input: &HerculesImmBox<'b, u8>, tstw: &HerculesImmBox<'b, f32>, - ctrl_pts: &HerculesImmBox<'b, f32>, weights: &HerculesImmBox<'b, f32>, - coefs: &HerculesImmBox<'b, f32>, tonemap: &HerculesImmBox<'b, f32>, + input: &'b HerculesImmBox<'b, u8>, tstw: &'c HerculesImmBox<'c, f32>, + ctrl_pts: &'d HerculesImmBox<'d, f32>, weights: &'e HerculesImmBox<'e, f32>, + coefs: &'f HerculesImmBox<'f, f32>, tonemap: &'g HerculesImmBox<'g, f32>, ) -> HerculesMutBox<'a, u8> { HerculesMutBox::from( runner.run( diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs index 7452a0ec..9605e69d 
100644 --- a/juno_samples/edge_detection/src/main.rs +++ b/juno_samples/edge_detection/src/main.rs @@ -84,14 +84,18 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat { converted } -async fn safe_run<'a, 'b: 'a>(runner: &'a mut HerculesRunner_edge_detection, - n: u64, m: u64, gs: u64, sz: u64, sb: u64, - input: &HerculesImmBox<'b, f32>, - gaussian_filter: &HerculesImmBox<'b, f32>, - structure: &HerculesImmBox<'b, f32>, - sx: &HerculesImmBox<'b, f32>, - sy: &HerculesImmBox<'b, f32>, - theta: f32 +// If all of the HerculesImmBox are given lifetimes 'b: &'b HerculesImmBox<'b, f32> +// this fails to borrow-check (I think because the input is declared inside the loop while +// everything else is declared outside it) +async fn safe_run<'a, 'b: 'a, 'c: 'a, 'd: 'a, 'e: 'a, 'f: 'a>( + runner: &'a mut HerculesRunner_edge_detection, + n: u64, m: u64, gs: u64, sz: u64, sb: u64, + input: &'b HerculesImmBox<'b, f32>, + gaussian_filter: &'c HerculesImmBox<'c, f32>, + structure: &'d HerculesImmBox<'d, f32>, + sx: &'e HerculesImmBox<'e, f32>, + sy: &'f HerculesImmBox<'f, f32>, + theta: f32 ) -> HerculesMutBox<'a, f32> { HerculesMutBox::from( runner.run( -- GitLab From 27ce61d09d94e20a1584876a5615fc6bfadd98bb Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Fri, 7 Feb 2025 10:15:26 -0600 Subject: [PATCH 09/11] Clarifying comment regarding run() lifetimes --- hercules_cg/src/rt.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 2c5f7c35..6ea9b45e 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -1001,6 +1001,9 @@ impl<'a> RTContext<'a> { )?; } write!(w, "}}}}")?; + // Every reference, including the runner and the Hercules Refs, have the + // same lifetime, since a returned object may come from backing memory + // or from one of the parameters. 
write!(w, "async fn run<'a>(&'a mut self")?; for idx in 0..func.num_dynamic_constants { write!(w, ", dc_p{}: u64", idx)?; -- GitLab From 3b1f9e24af05094d2de630ccf299728f4e139372 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 9 Feb 2025 09:25:32 -0600 Subject: [PATCH 10/11] Be more precise about lifetimes in RT backend --- hercules_cg/src/rt.rs | 50 +++++++++++++++++++++++++++++----- hercules_ir/src/collections.rs | 6 +++- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 30b1d22c..b79e4953 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -1013,10 +1013,28 @@ impl<'a> RTContext<'a> { )?; } write!(w, "}}}}")?; - // Every reference, including the runner and the Hercules Refs, have the - // same lifetime, since a returned object may come from backing memory - // or from one of the parameters. - write!(w, "async fn run<'a>(&'a mut self")?; + + // Every reference that may be returned has the same lifetime. Every + // other reference gets its own unique lifetime. 
+ let returned_origins: HashSet<_> = self.collection_objects[&self.func_id] + .returned_objects() + .into_iter() + .map(|obj| self.collection_objects[&self.func_id].origin(*obj)) + .collect(); + + write!(w, "async fn run<'runner, 'returned")?; + for idx in 0..func.param_types.len() { + write!(w, ", 'p{}", idx)?; + } + write!( + w, + ">(&'{} mut self", + if returned_origins.iter().any(|origin| !origin.is_parameter()) { + "returned" + } else { + "runner" + } + )?; for idx in 0..func.num_dynamic_constants { write!(w, ", dc_p{}: u64", idx)?; } @@ -1032,8 +1050,19 @@ impl<'a> RTContext<'a> { let mutability = if param_muts[idx] { "Mut" } else { "" }; write!( w, - ", p{}: ::hercules_rt::Hercules{}Ref{}<'a>", - idx, device, mutability + ", p{}: ::hercules_rt::Hercules{}Ref{}<'{}>", + idx, + device, + mutability, + if returned_origins.iter().any(|origin| origin + .try_parameter() + .map(|oidx| idx == oidx) + .unwrap_or(false)) + { + "returned".to_string() + } else { + format!("p{}", idx) + } )?; } } @@ -1048,10 +1077,13 @@ impl<'a> RTContext<'a> { let mutability = if return_mut { "Mut" } else { "" }; write!( w, - ") -> ::hercules_rt::Hercules{}Ref{}<'a> {{", + ") -> ::hercules_rt::Hercules{}Ref{}<'returned> {{", device, mutability )?; } + + // Start with possibly re-allocating the backing memory if it's not + // large enough. write!(w, "unsafe {{")?; for (device, (total, _)) in self.backing_allocations[&self.func_id].iter() { write!(w, "let size = ")?; @@ -1087,6 +1119,8 @@ impl<'a> RTContext<'a> { )?; } } + + // Call the wrapped function. write!(w, "let ret = {}(", func.name)?; for (device, _) in self.backing_allocations[&self.func_id].iter() { write!( @@ -1120,6 +1154,8 @@ impl<'a> RTContext<'a> { )?; } write!(w, "}}}}")?; + + // De-allocate the backing memory on drop. 
write!( w, "}}impl Drop for HerculesRunner_{} {{#[allow(unused_unsafe)]fn drop(&mut self) {{unsafe {{", diff --git a/hercules_ir/src/collections.rs b/hercules_ir/src/collections.rs index 1bc650e9..d236d5b5 100644 --- a/hercules_ir/src/collections.rs +++ b/hercules_ir/src/collections.rs @@ -36,7 +36,7 @@ use crate::*; * - For each function, which collection objects may be returned? * - For each collection object, how was it originated? */ -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum CollectionObjectOrigin { Parameter(usize), Constant(NodeID), @@ -57,6 +57,10 @@ pub struct FunctionCollectionObjects { pub type CollectionObjects = BTreeMap<FunctionID, FunctionCollectionObjects>; impl CollectionObjectOrigin { + pub fn is_parameter(&self) -> bool { + self.try_parameter().is_some() + } + pub fn try_parameter(&self) -> Option<usize> { match self { CollectionObjectOrigin::Parameter(index) => Some(*index), -- GitLab From 3d6432d7fa7a44ad5811a81e35041c7be3770a97 Mon Sep 17 00:00:00 2001 From: Russel Arbore <russel.jma@gmail.com> Date: Sun, 9 Feb 2025 09:30:44 -0600 Subject: [PATCH 11/11] edge detection no longer needs longer than lifetime annotations --- juno_samples/edge_detection/src/main.rs | 80 +++++++++++++------------ 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs index 9605e69d..60ccb515 100644 --- a/juno_samples/edge_detection/src/main.rs +++ b/juno_samples/edge_detection/src/main.rs @@ -84,34 +84,36 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat { converted } -// If all of the HerculesImmBox are given lifetimes 'b: &'b HerculesImmBox<'b, f32> -// this fails to borrow-check (I think because the input is declared inside the loop while -// everything else is declared outside it) -async fn safe_run<'a, 'b: 'a, 'c: 'a, 'd: 'a, 'e: 'a, 'f: 'a>( +async fn safe_run<'a, 'b, 'c, 'd, 'e, 'f>( 
runner: &'a mut HerculesRunner_edge_detection, - n: u64, m: u64, gs: u64, sz: u64, sb: u64, + n: u64, + m: u64, + gs: u64, + sz: u64, + sb: u64, input: &'b HerculesImmBox<'b, f32>, gaussian_filter: &'c HerculesImmBox<'c, f32>, structure: &'d HerculesImmBox<'d, f32>, sx: &'e HerculesImmBox<'e, f32>, sy: &'f HerculesImmBox<'f, f32>, - theta: f32 + theta: f32, ) -> HerculesMutBox<'a, f32> { HerculesMutBox::from( - runner.run( - n, - m, - gs, - sz, - sb, - input.to(), - gaussian_filter.to(), - structure.to(), - sx.to(), - sy.to(), - theta, - ) - .await + runner + .run( + n, + m, + gs, + sz, + sb, + input.to(), + gaussian_filter.to(), + structure.to(), + sx.to(), + sy.to(), + theta, + ) + .await, ) } @@ -214,25 +216,25 @@ fn edge_detection_harness(args: EdgeDetectionInputs) { let input_h = HerculesImmBox::from(input); - let result = - async_std::task::block_on(async { - safe_run(&mut r, - height as u64, - width as u64, - gs as u64, - sz as u64, - sb as u64, - &input_h, - &gaussian_filter_h, - &structure_h, - &sx_h, - &sy_h, - theta, - ) - .await - }) - .as_slice() - .to_vec(); + let result = async_std::task::block_on(async { + safe_run( + &mut r, + height as u64, + width as u64, + gs as u64, + sz as u64, + sb as u64, + &input_h, + &gaussian_filter_h, + &structure_h, + &sx_h, + &sy_h, + theta, + ) + .await + }) + .as_slice() + .to_vec(); if display { let result = frame_from_slice(&result, height, width); -- GitLab