diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 232274824184a52f037cd783d4729d7a389b7c04..b79e4953346f2f47a81bf928eadb647ed7217645 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -1013,7 +1013,28 @@ impl<'a> RTContext<'a> {
             )?;
         }
         write!(w, "}}}}")?;
-        write!(w, "async fn run<'a>(&'a mut self")?;
+
+        // Every reference that may be returned has the same lifetime. Every
+        // other reference gets its own unique lifetime.
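+        // As an illustration (hypothetical signature, CPU device assumed): for
+        // a function whose only returned object originates from parameter 1,
+        // this emits roughly
+        //   async fn run<'runner, 'returned, 'p0, 'p1>(&'runner mut self,
+        //       p0: HerculesCPURef<'p0>, p1: HerculesCPURef<'returned>)
+        //       -> HerculesCPURef<'returned>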
+        let returned_origins: HashSet<_> = self.collection_objects[&self.func_id]
+            .returned_objects()
+            .into_iter()
+            .map(|obj| self.collection_objects[&self.func_id].origin(*obj))
+            .collect();
+
+        write!(w, "async fn run<'runner, 'returned")?;
+        for idx in 0..func.param_types.len() {
+            write!(w, ", 'p{}", idx)?;
+        }
+        write!(
+            w,
+            ">(&'{} mut self",
+            if returned_origins.iter().any(|origin| !origin.is_parameter()) {
+                "returned"
+            } else {
+                "runner"
+            }
+        )?;
         for idx in 0..func.num_dynamic_constants {
             write!(w, ", dc_p{}: u64", idx)?;
         }
@@ -1029,8 +1050,19 @@ impl<'a> RTContext<'a> {
             let mutability = if param_muts[idx] { "Mut" } else { "" };
             write!(
                 w,
-                ", p{}: ::hercules_rt::Hercules{}Ref{}<'a>",
-                idx, device, mutability
+                ", p{}: ::hercules_rt::Hercules{}Ref{}<'{}>",
+                idx,
+                device,
+                mutability,
+                if returned_origins.iter().any(|origin| origin
+                    .try_parameter()
+                    .map(|oidx| idx == oidx)
+                    .unwrap_or(false))
+                {
+                    "returned".to_string()
+                } else {
+                    format!("p{}", idx)
+                }
             )?;
         }
     }
@@ -1045,10 +1077,13 @@ impl<'a> RTContext<'a> {
         let mutability = if return_mut { "Mut" } else { "" };
         write!(
             w,
-            ") -> ::hercules_rt::Hercules{}Ref{}<'a> {{",
+            ") -> ::hercules_rt::Hercules{}Ref{}<'returned> {{",
             device, mutability
         )?;
     }
+
+        // Start with possibly re-allocating the backing memory if it's not
+        // large enough.
         write!(w, "unsafe {{")?;
         for (device, (total, _)) in self.backing_allocations[&self.func_id].iter() {
             write!(w, "let size = ")?;
@@ -1084,6 +1119,8 @@ impl<'a> RTContext<'a> {
             )?;
         }
     }
+
+        // Call the wrapped function.
         write!(w, "let ret = {}(", func.name)?;
         for (device, _) in self.backing_allocations[&self.func_id].iter() {
             write!(
@@ -1117,6 +1154,8 @@ impl<'a> RTContext<'a> {
             )?;
         }
         write!(w, "}}}}")?;
+
+        // De-allocate the backing memory on drop.
         write!(
             w,
             "}}impl Drop for HerculesRunner_{} {{#[allow(unused_unsafe)]fn drop(&mut self) {{unsafe {{",
diff --git a/hercules_ir/src/collections.rs b/hercules_ir/src/collections.rs
index 1bc650e947e02b4d99bb5fde4173d2744b25a2cb..d236d5b55cd61f5f0cbbd721660779f2d3ba5e61 100644
--- a/hercules_ir/src/collections.rs
+++ b/hercules_ir/src/collections.rs
@@ -36,7 +36,7 @@ use crate::*;
  * - For each function, which collection objects may be returned?
  * - For each collection object, how was it originated?
  */
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum CollectionObjectOrigin {
     Parameter(usize),
     Constant(NodeID),
@@ -57,6 +57,10 @@ pub struct FunctionCollectionObjects {
 pub type CollectionObjects = BTreeMap<FunctionID, FunctionCollectionObjects>;
 
 impl CollectionObjectOrigin {
+    pub fn is_parameter(&self) -> bool {
+        self.try_parameter().is_some()
+    }
+
     pub fn try_parameter(&self) -> Option<usize> {
         match self {
             CollectionObjectOrigin::Parameter(index) => Some(*index),
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 12b64fa3ce7c5f73c2c7470dc546d9913203aa3f..f8fdf2effa9bfc9bbf900b79915b8716706bde6c 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -1,8 +1,12 @@
+#![feature(once_cell_try)]
+
 use std::alloc::{alloc, dealloc, Layout};
 use std::marker::PhantomData;
 use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
 use std::slice::{from_raw_parts, from_raw_parts_mut};
 
+use std::sync::OnceLock;
+
 /*
  * Define supporting types, functions, and macros for Hercules RT functions. For
  * a more in-depth discussion of the design of these utilities, see hercules_cg/
@@ -278,6 +282,14 @@ impl<'a> HerculesCUDARefMut<'a> {
         }
     }
 
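+    // Reborrows this mutable reference as a new HerculesCUDARefMut with the
+    // same pointer and size; HerculesMutBox uses this to hand out a device
+    // reference without consuming the allocation it stores.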
+    pub fn dup(&'a mut self) -> Self {
+        HerculesCUDARefMut {
+            ptr: self.ptr,
+            size: self.size,
+            _phantom: PhantomData,
+        }
+    }
+
     pub unsafe fn __ptr(&self) -> *mut u8 {
         self.ptr.as_ptr()
     }
@@ -330,6 +342,10 @@ impl CUDABox {
             _phantom: PhantomData,
         }
     }
+
+    pub fn get_bytes(&self) -> usize {
+        self.size
+    }
 }
 
 #[cfg(feature = "cuda")]
@@ -367,3 +383,416 @@ impl __RawPtrSendSync {
 
 unsafe impl Send for __RawPtrSendSync {}
 unsafe impl Sync for __RawPtrSendSync {}
+
+/*
+ * A HerculesBox holds memory that can be on any device and provides a common interface for moving
+ * data where it is needed.
+ *
+ * It can hold CPU and device allocations that point at the memory it represents. It can also hold
+ * Hercules references, either to those allocations it holds or to other allocations not held by
+ * this Box (in which case the appropriate allocation field should be None).
+ *
+ * The data held at all of its non-None allocations and references is maintained so that it is the
+ * same, and so methods will attempt to use the reference or allocation that is most convenient.
+ *
+ * HerculesImmBox holds references to immutable memory only. All operations on it are through
+ * immutable references, though internally it uses OnceLocks to protect its resources, since the
+ * Box may be shared between threads if it is used in a parallel Hercules code invocation. We use
+ * OnceLocks since the data is immutable, and so once it has been placed on a device no further
+ * movement is necessary.
+ *
+ * We maintain the invariant that at least one of the device references is always set; the
+ * associated allocations may or may not be set, as those may not be needed if the allocation is
+ * held elsewhere.
+ *
+ * HerculesMutBox holds memory on some device and can produce mutable references to that data on
+ * any device. All these operations are through mutable references, since this ensures exclusive
+ * access to the Box and therefore to the underlying device memory. Because of the exclusive
+ * access, locks are not needed.
+ *
+ * Note that a HerculesMutBox may hold multiple allocations at once, but it tracks the
+ * "definitive" copy as the one borrowed mutably most recently (since it may have been updated).
+ * The extra allocations are kept around to avoid reallocation if memory is moved back to the
+ * device.
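+ *
+ * A minimal usage sketch (CPU path; the element type, sizes, and variable names are
+ * illustrative):
+ *
+ *     let data = vec![0.0f32; 1024];
+ *     let input = HerculesImmBox::from(data.as_slice());
+ *     let mut output = HerculesMutBox::from(vec![0.0f32; 1024]);
+ *     let in_ref = input.as_cpu_ref();   // materialized lazily, then cached
+ *     let out_ref = output.as_cpu_ref(); // moves the definitive copy to the CPU if needed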
()).expect("HerculesImmBox cuda_alloc was set unexpectedly"); + + self.cuda_alloc.get().unwrap().get_ref() + }).clone(); + } + + panic!("HerculesImmBox has no reference to data") + } + } +} + +enum HerculesMutBoxLocation { + CPU, + #[cfg(feature = "cuda")] + CUDA, +} + +enum Allocation<R, A> { + None, + Reference(R), + Allocation(A), +} + +impl<R, A> Allocation<R, A> { + fn take(&mut self) -> Allocation<R, A> { + std::mem::replace(self, Allocation::None) + } +} + +pub struct HerculesMutBox<'a, T> { + loc: HerculesMutBoxLocation, + + cpu_alloc: Allocation<&'a mut [T], Vec<T>>, + #[cfg(feature = "cuda")] + cuda_alloc: Allocation<HerculesCUDARefMut<'a>, CUDABox>, +} + +impl<'a, T> From<&'a mut [T]> for HerculesMutBox<'a, T> { + fn from(value: &'a mut [T]) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CPU, + cpu_alloc: Allocation::Reference(value), + #[cfg(feature = "cuda")] + cuda_alloc: Allocation::None, + } + } +} + +impl<'a, T> From<Vec<T>> for HerculesMutBox<'a, T> { + fn from(value: Vec<T>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CPU, + cpu_alloc: Allocation::Allocation(value), + #[cfg(feature = "cuda")] + cuda_alloc: Allocation::None, + } + } +} + +impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<'a, T> { + fn from(value: HerculesCPURef<'a>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CPU, + cpu_alloc: Allocation::Allocation(value.as_slice().to_vec()), + #[cfg(feature = "cuda")] + cuda_alloc: Allocation::None, + } + } +} + +impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> { + fn from(value: HerculesCPURefMut<'a>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CPU, + cpu_alloc: Allocation::Reference(value.as_slice()), + #[cfg(feature = "cuda")] + cuda_alloc: Allocation::None, + } + } +} + +#[cfg(feature = "cuda")] +impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<'a, T> { + fn from(value: HerculesCUDARef<'a>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CUDA, + cpu_alloc: Allocation::None, + #[cfg(feature = "cuda")] + cuda_alloc: Allocation::Allocation(CUDABox::from_cuda_ref(value)), + } + } +} + +#[cfg(feature = "cuda")] +impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> { + fn from(value: HerculesCUDARefMut<'a>) -> Self { + HerculesMutBox { + loc: HerculesMutBoxLocation::CUDA, + cpu_alloc: Allocation::None, + #[cfg(feature = "cuda")] + cuda_alloc: Allocation::Reference(value), + } + } +} + +impl<'a, T> HerculesMutBox<'a, T> +where + T: Default + Clone +{ + pub fn as_slice(&'a mut self) -> &'a mut [T] { + self.as_cpu_ref().as_slice() + } + + pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> { + match self.loc { + HerculesMutBoxLocation::CPU => { + match self.cpu_alloc { + Allocation::None => panic!("No CPU reference"), + Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val), + Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val), + } + } + #[cfg(feature = "cuda")] + HerculesMutBoxLocation::CUDA => { + let cuda_ref : HerculesCUDARef<'a> = + match self.cuda_alloc { + Allocation::None => panic!("No GPU reference"), + Allocation::Reference(ref mut val) => val.dup().as_ref(), + Allocation::Allocation(ref val) => val.get_ref(), + }; + + let elements = unsafe { cuda_ref.__size() / size_of::<T>() }; + + // Allocate host memory (if needed) + let cpu_alloc : Allocation<&'a mut [T], Vec<T>> = + match self.cpu_alloc.take() { + Allocation::Reference(val) if val.len() == elements => Allocation::Reference(val), + 
+enum Allocation<R, A> {
+    None,
+    Reference(R),
+    Allocation(A),
+}
+
+impl<R, A> Allocation<R, A> {
+    fn take(&mut self) -> Allocation<R, A> {
+        std::mem::replace(self, Allocation::None)
+    }
+}
+
+pub struct HerculesMutBox<'a, T> {
+    loc: HerculesMutBoxLocation,
+
+    cpu_alloc: Allocation<&'a mut [T], Vec<T>>,
+    #[cfg(feature = "cuda")]
+    cuda_alloc: Allocation<HerculesCUDARefMut<'a>, CUDABox>,
+}
+
+impl<'a, T> From<&'a mut [T]> for HerculesMutBox<'a, T> {
+    fn from(value: &'a mut [T]) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Reference(value),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+}
+
+impl<'a, T> From<Vec<T>> for HerculesMutBox<'a, T> {
+    fn from(value: Vec<T>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Allocation(value),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+}
+
+impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCPURef<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Allocation(value.as_slice().to_vec()),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+}
+
+impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCPURefMut<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Reference(value.as_slice()),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCUDARef<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CUDA,
+            cpu_alloc: Allocation::None,
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::Allocation(CUDABox::from_cuda_ref(value)),
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCUDARefMut<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CUDA,
+            cpu_alloc: Allocation::None,
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::Reference(value),
+        }
+    }
+}
+
+impl<'a, T> HerculesMutBox<'a, T>
+where
+    T: Default + Clone
+{
+    pub fn as_slice(&'a mut self) -> &'a mut [T] {
+        self.as_cpu_ref().as_slice()
+    }
+
+    pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> {
+        match self.loc {
+            HerculesMutBoxLocation::CPU => {
+                match self.cpu_alloc {
+                    Allocation::None => panic!("No CPU reference"),
+                    Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val),
+                    Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val),
+                }
+            }
+            #[cfg(feature = "cuda")]
+            HerculesMutBoxLocation::CUDA => {
+                let cuda_ref: HerculesCUDARef<'a> = match self.cuda_alloc {
+                    Allocation::None => panic!("No GPU reference"),
+                    Allocation::Reference(ref mut val) => val.dup().as_ref(),
+                    Allocation::Allocation(ref val) => val.get_ref(),
+                };
+
+                let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
+
+                // Allocate host memory (if needed).
+                let cpu_alloc: Allocation<&'a mut [T], Vec<T>> = match self.cpu_alloc.take() {
+                    Allocation::Reference(val) if val.len() == elements => Allocation::Reference(val),
+                    Allocation::Allocation(val) if val.len() == elements => Allocation::Allocation(val),
+                    _ => {
+                        let mut alloc = Vec::new();
+                        alloc.resize_with(elements, Default::default);
+                        Allocation::Allocation(alloc)
+                    }
+                };
+                self.cpu_alloc = cpu_alloc;
+                let cpu_ref: &'a mut [T] = match &mut self.cpu_alloc {
+                    Allocation::None => panic!(),
+                    Allocation::Reference(val) => val,
+                    Allocation::Allocation(val) => val,
+                };
+
+                // Transfer data from the CUDA device.
+                let cpu_ref = cuda_ref.to_cpu_ref(cpu_ref);
+
+                self.loc = HerculesMutBoxLocation::CPU;
+                cpu_ref
+            }
+        }
+    }
+
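+    // Moves the definitive copy to the CUDA device. An existing device
+    // allocation is reused when its byte size matches; otherwise a fresh
+    // CUDABox is built from the CPU data, in which case `copied` records that
+    // the transfer already happened and the explicit copy below is skipped.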
+    #[cfg(feature = "cuda")]
+    pub fn as_cuda_ref(&'a mut self) -> HerculesCUDARefMut<'a> {
+        match self.loc {
+            HerculesMutBoxLocation::CPU => {
+                let cpu_ref: &'a [T] = match self.cpu_alloc {
+                    Allocation::None => panic!("No CPU reference"),
+                    Allocation::Reference(ref val) => val,
+                    Allocation::Allocation(ref val) => val,
+                };
+
+                let size = cpu_ref.len() * size_of::<T>();
+                let (cuda_alloc, copied) = match self.cuda_alloc.take() {
+                    Allocation::Reference(val) if unsafe { val.__size() == size } => (Allocation::Reference(val), false),
+                    Allocation::Allocation(val) if val.get_bytes() == size => (Allocation::Allocation(val), false),
+                    _ => {
+                        let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref));
+                        (Allocation::Allocation(alloc), true)
+                    }
+                };
+                self.cuda_alloc = cuda_alloc;
+
+                let cuda_ref = match self.cuda_alloc {
+                    Allocation::None => panic!(),
+                    Allocation::Reference(ref mut val) => val.dup(),
+                    Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+                };
+
+                if !copied {
+                    unsafe {
+                        __copy_cpu_to_cuda(cuda_ref.__ptr(), cpu_ref.as_ptr() as *mut u8, size);
+                    }
+                }
+
+                self.loc = HerculesMutBoxLocation::CUDA;
+                cuda_ref
+            }
+            HerculesMutBoxLocation::CUDA => {
+                match self.cuda_alloc {
+                    Allocation::None => panic!("No GPU reference"),
+                    Allocation::Reference(ref mut val) => val.dup(),
+                    Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+                }
+            }
+        }
+    }
+}
+
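+// These conversion traits let a caller write `boxed.to()` and have type
+// inference pick the reference type (CPU or CUDA) that the generated `run`
+// signature expects, so device-agnostic wrappers compile under either feature
+// set.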
+pub trait HerculesImmBoxTo<'a, T> {
+    fn to(&'a self) -> T;
+}
+
+impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
+where T: Default + Clone
+{
+    fn to(&'a self) -> HerculesCPURef<'a> {
+        self.as_cpu_ref()
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
+where T: Default + Clone
+{
+    fn to(&'a self) -> HerculesCUDARef<'a> {
+        self.as_cuda_ref()
+    }
+}
+
+pub trait HerculesMutBoxTo<'a, T> {
+    fn to(&'a mut self) -> T;
+}
+
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
+where T: Default + Clone
+{
+    fn to(&'a mut self) -> HerculesCPURefMut<'a> {
+        self.as_cpu_ref()
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
+where T: Default + Clone
+{
+    fn to(&'a mut self) -> HerculesCUDARefMut<'a> {
+        self.as_cuda_ref()
+    }
+}
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index 8368a74f42d2a795d9febfb42ec5cf1707958772..b4a0f6fd7652fe35e0a3177c07cf2782a26e17a4 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -8,9 +8,7 @@ use self::camera_model::*;
 use self::cava_rust::CHAN;
 use self::image_proc::*;
 
-#[cfg(feature = "cuda")]
-use hercules_rt::CUDABox;
-use hercules_rt::{runner, HerculesCPURef};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
 
 use image::ImageError;
 
@@ -18,6 +16,30 @@ use clap::Parser;
 
 juno_build::juno!("cava");
 
+// Individual lifetimes are not needed in this example, but should probably be
+// generated for flexibility.
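+// The outlives-bounds (`'b: 'a`, etc.) keep every input alive at least as long
+// as the returned box, which may alias memory passed in or held by the runner.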
+async fn safe_run<'a, 'b: 'a, 'c: 'a, 'd: 'a, 'e: 'a, 'f: 'a, 'g: 'a>(
+    runner: &'a mut HerculesRunner_cava, r: u64, c: u64, num_ctrl_pts: u64,
+    input: &'b HerculesImmBox<'b, u8>, tstw: &'c HerculesImmBox<'c, f32>,
+    ctrl_pts: &'d HerculesImmBox<'d, f32>, weights: &'e HerculesImmBox<'e, f32>,
+    coefs: &'f HerculesImmBox<'f, f32>, tonemap: &'g HerculesImmBox<'g, f32>,
+) -> HerculesMutBox<'a, u8> {
+    HerculesMutBox::from(
+        runner.run(
+            r,
+            c,
+            num_ctrl_pts,
+            input.to(),
+            tstw.to(),
+            ctrl_pts.to(),
+            weights.to(),
+            coefs.to(),
+            tonemap.to()
+        )
+        .await
+    )
+}
+
 fn run_cava(
     rows: usize,
     cols: usize,
@@ -36,62 +58,32 @@ fn run_cava(
     assert_eq!(coefs.len(), 4 * CHAN);
     assert_eq!(tonemap.len(), 256 * CHAN);
 
-    #[cfg(not(feature = "cuda"))]
-    {
-        let image = HerculesCPURef::from_slice(image);
-        let tstw = HerculesCPURef::from_slice(tstw);
-        let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts);
-        let weights = HerculesCPURef::from_slice(weights);
-        let coefs = HerculesCPURef::from_slice(coefs);
-        let tonemap = HerculesCPURef::from_slice(tonemap);
-        let mut r = runner!(cava);
-        async_std::task::block_on(async {
-            r.run(
-                rows as u64,
-                cols as u64,
-                num_ctrl_pts as u64,
-                image,
-                tstw,
-                ctrl_pts,
-                weights,
-                coefs,
-                tonemap,
-            )
-            .await
-        })
-        .as_slice::<u8>()
-        .to_vec()
-        .into_boxed_slice()
-    }
-
-    #[cfg(feature = "cuda")]
-    {
-        let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image));
-        let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw));
-        let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts));
-        let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
-        let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
-        let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
-        let mut r = runner!(cava);
-        let res = async_std::task::block_on(async {
-            r.run(
-                rows as u64,
-                cols as u64,
-                num_ctrl_pts as u64,
-                image.get_ref(),
-                tstw.get_ref(),
-                ctrl_pts.get_ref(),
-                weights.get_ref(),
-                coefs.get_ref(),
-                tonemap.get_ref(),
-            )
-            .await
-        });
-        let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() };
-        let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
-        res.to_cpu_ref(&mut res_cpu);
-        res_cpu
-    }
+    let image = HerculesImmBox::from(image);
+    let tstw = HerculesImmBox::from(tstw);
+    let ctrl_pts = HerculesImmBox::from(ctrl_pts);
+    let weights = HerculesImmBox::from(weights);
+    let coefs = HerculesImmBox::from(coefs);
+    let tonemap = HerculesImmBox::from(tonemap);
+
+    let mut r = runner!(cava);
+
+    async_std::task::block_on(async {
+        safe_run(&mut r,
+            rows as u64,
+            cols as u64,
+            num_ctrl_pts as u64,
+            &image,
+            &tstw,
+            &ctrl_pts,
+            &weights,
+            &coefs,
+            &tonemap,
+        )
+        .await
+    })
+    .as_slice()
+    .to_vec()
+    .into_boxed_slice()
 }
 
 enum Error {
diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
index 3b067ebd0c74ba4fe4b1cd4a39cf4f0b0c8b46cd..60ccb51565bdaa6d0f1837385a9de7ac52dc0128 100644
--- a/juno_samples/edge_detection/src/main.rs
+++ b/juno_samples/edge_detection/src/main.rs
@@ -2,9 +2,7 @@
 
 mod edge_detection_rust;
 
-#[cfg(feature = "cuda")]
-use hercules_rt::CUDABox;
-use hercules_rt::{runner, HerculesCPURef};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
 
 use std::slice::from_raw_parts;
 
@@ -86,6 +84,39 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat {
     converted
 }
 
+async fn safe_run<'a, 'b, 'c, 'd, 'e, 'f>(
+    runner: &'a mut HerculesRunner_edge_detection,
+    n: u64,
+    m: u64,
+    gs: u64,
+    sz: u64,
+    sb: u64,
+    input: &'b HerculesImmBox<'b, f32>,
+    gaussian_filter: &'c HerculesImmBox<'c, f32>,
+    structure: &'d HerculesImmBox<'d, f32>,
+    sx: &'e HerculesImmBox<'e, f32>,
+    sy: &'f HerculesImmBox<'f, f32>,
+    theta: f32,
+) -> HerculesMutBox<'a, f32> {
+    HerculesMutBox::from(
+        runner
+            .run(
+                n,
+                m,
+                gs,
+                sz,
+                sb,
+                input.to(),
+                gaussian_filter.to(),
+                structure.to(),
+                sx.to(),
+                sy.to(),
+                theta,
+            )
+            .await,
+    )
+}
+
 fn edge_detection_harness(args: EdgeDetectionInputs) {
     let EdgeDetectionInputs {
         input,
@@ -106,38 +137,18 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
         0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, 0.000036, 0.000363, 0.001446,
         0.002291, 0.001446, 0.000363, 0.000036,
     ];
-    #[cfg(not(feature = "cuda"))]
-    let gaussian_filter_h = HerculesCPURef::from_slice(&gaussian_filter);
-    #[cfg(feature = "cuda")]
-    let gaussian_filter_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&gaussian_filter));
-    #[cfg(feature = "cuda")]
-    let gaussian_filter_h = gaussian_filter_cuda.get_ref();
+    let gaussian_filter_h = HerculesImmBox::from(gaussian_filter.as_slice());
 
     let sz: usize = 3;
     let structure: Vec<f32> = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let structure_h = HerculesCPURef::from_slice(&structure);
-    #[cfg(feature = "cuda")]
-    let structure_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&structure));
-    #[cfg(feature = "cuda")]
-    let structure_h = structure_cuda.get_ref();
+    let structure_h = HerculesImmBox::from(structure.as_slice());
 
     let sb: usize = 3;
     let sx: Vec<f32> = vec![-1.0, 0.0, 1.0, -2.0, 0.0, 2.0, -1.0, 0.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let sx_h = HerculesCPURef::from_slice(&sx);
-    #[cfg(feature = "cuda")]
-    let sx_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sx));
-    #[cfg(feature = "cuda")]
-    let sx_h = sx_cuda.get_ref();
+    let sx_h = HerculesImmBox::from(sx.as_slice());
 
     let sy: Vec<f32> = vec![-1.0, -2.0, -1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let sy_h = HerculesCPURef::from_slice(&sy);
-    #[cfg(feature = "cuda")]
-    let sy_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sy));
-    #[cfg(feature = "cuda")]
-    let sy_h = sy_cuda.get_ref();
+    let sy_h = HerculesImmBox::from(sy.as_slice());
 
     let theta: f32 = 0.1;
 
@@ -203,39 +214,27 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
 
         let input = unsafe { from_raw_parts(ptr, height * width) };
 
-        #[cfg(not(feature = "cuda"))]
-        let input_h = HerculesCPURef::from_slice(input);
-        #[cfg(feature = "cuda")]
-        let input_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(input));
-        #[cfg(feature = "cuda")]
-        let input_h = input_cuda.get_ref();
+        let input_h = HerculesImmBox::from(input);
 
         let result = async_std::task::block_on(async {
-            r.run(
+            safe_run(
+                &mut r,
                 height as u64,
                 width as u64,
                 gs as u64,
                 sz as u64,
                 sb as u64,
-                input_h,
-                gaussian_filter_h.clone(),
-                structure_h.clone(),
-                sx_h.clone(),
-                sy_h.clone(),
+                &input_h,
+                &gaussian_filter_h,
+                &structure_h,
+                &sx_h,
+                &sy_h,
                 theta,
             )
             .await
-        });
-
-        #[cfg(not(feature = "cuda"))]
-        let result: Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
-        #[cfg(feature = "cuda")]
-        let result: Box<[f32]> = {
-            let num_out = unsafe { result.__size() / std::mem::size_of::<f32>() };
-            let mut res_cpu: Box<[f32]> = vec![0.0; num_out].into_boxed_slice();
-            result.to_cpu_ref(&mut res_cpu);
-            res_cpu
-        };
+        })
+        .as_slice()
+        .to_vec();
 
         if display {
             let result = frame_from_slice(&result, height, width);
@@ -261,10 +260,7 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
             theta,
         );
 
-        assert_eq!(
-            result.as_ref(),
-            <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result)
-        );
+        assert_eq!(result, rust_result);
         println!("Frames {} match", i);
 
         if display_verify {