Commit 67e74f86 authored by Aaron Councilman, committed by rarbore2

Safe rust interface

parent 5ae86850
1 merge request: !149 Safe rust interface
@@ -1013,7 +1013,28 @@ impl<'a> RTContext<'a> {
             )?;
         }
         write!(w, "}}}}")?;
-        write!(w, "async fn run<'a>(&'a mut self")?;
+        // Every reference that may be returned has the same lifetime. Every
+        // other reference gets its own unique lifetime.
+        let returned_origins: HashSet<_> = self.collection_objects[&self.func_id]
+            .returned_objects()
+            .into_iter()
+            .map(|obj| self.collection_objects[&self.func_id].origin(*obj))
+            .collect();
+        write!(w, "async fn run<'runner, 'returned")?;
+        for idx in 0..func.param_types.len() {
+            write!(w, ", 'p{}", idx)?;
+        }
+        write!(
+            w,
+            ">(&'{} mut self",
+            if returned_origins.iter().any(|origin| !origin.is_parameter()) {
+                "returned"
+            } else {
+                "runner"
+            }
+        )?;
         for idx in 0..func.num_dynamic_constants {
             write!(w, ", dc_p{}: u64", idx)?;
         }
@@ -1029,8 +1050,19 @@ impl<'a> RTContext<'a> {
             let mutability = if param_muts[idx] { "Mut" } else { "" };
             write!(
                 w,
-                ", p{}: ::hercules_rt::Hercules{}Ref{}<'a>",
-                idx, device, mutability
+                ", p{}: ::hercules_rt::Hercules{}Ref{}<'{}>",
+                idx,
+                device,
+                mutability,
+                if returned_origins.iter().any(|origin| origin
+                    .try_parameter()
+                    .map(|oidx| idx == oidx)
+                    .unwrap_or(false))
+                {
+                    "returned".to_string()
+                } else {
+                    format!("p{}", idx)
+                }
             )?;
         }
     }
@@ -1045,10 +1077,13 @@ impl<'a> RTContext<'a> {
             let mutability = if return_mut { "Mut" } else { "" };
             write!(
                 w,
-                ") -> ::hercules_rt::Hercules{}Ref{}<'a> {{",
+                ") -> ::hercules_rt::Hercules{}Ref{}<'returned> {{",
                 device, mutability
             )?;
         }
+        // Start with possibly re-allocating the backing memory if it's not
+        // large enough.
         write!(w, "unsafe {{")?;
         for (device, (total, _)) in self.backing_allocations[&self.func_id].iter() {
             write!(w, "let size = ")?;
@@ -1084,6 +1119,8 @@ impl<'a> RTContext<'a> {
                 )?;
             }
         }
+
+        // Call the wrapped function.
         write!(w, "let ret = {}(", func.name)?;
         for (device, _) in self.backing_allocations[&self.func_id].iter() {
             write!(
@@ -1117,6 +1154,8 @@ impl<'a> RTContext<'a> {
             )?;
         }
         write!(w, "}}}}")?;
+
+        // De-allocate the backing memory on drop.
         write!(
             w,
             "}}impl Drop for HerculesRunner_{} {{#[allow(unused_unsafe)]fn drop(&mut self) {{unsafe {{",
...
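For illustration, this is roughly the shape of the signature the codegen above emits for a hypothetical two-parameter function compiled for the CPU device, where the collection-object analysis says only parameter 0 may be returned (the function and device here are assumptions, not part of this commit):

```rust
// Sketch of an emitted signature. p0 may be returned, so it shares the
// 'returned lifetime; p1 keeps its own lifetime 'p1. Since every returned
// origin is a parameter, the runner itself is only borrowed for 'runner.
async fn run<'runner, 'returned, 'p0, 'p1>(
    &'runner mut self,
    p0: ::hercules_rt::HerculesCPURef<'returned>,
    p1: ::hercules_rt::HerculesCPURef<'p1>,
) -> ::hercules_rt::HerculesCPURef<'returned> {
    // ... generated body ...
}
```

If any returned object originates from something other than a parameter (e.g. a constant), the receiver is instead borrowed as `&'returned mut self`, so the result cannot outlive the runner's backing memory.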
@@ -36,7 +36,7 @@ use crate::*;
  * - For each function, which collection objects may be returned?
  * - For each collection object, how was it originated?
  */
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum CollectionObjectOrigin {
     Parameter(usize),
     Constant(NodeID),
@@ -57,6 +57,10 @@ pub struct FunctionCollectionObjects {
 pub type CollectionObjects = BTreeMap<FunctionID, FunctionCollectionObjects>;

 impl CollectionObjectOrigin {
+    pub fn is_parameter(&self) -> bool {
+        self.try_parameter().is_some()
+    }
+
     pub fn try_parameter(&self) -> Option<usize> {
         match self {
             CollectionObjectOrigin::Parameter(index) => Some(*index),
...
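The new predicate is just a convenience over `try_parameter`; a quick illustrative check of how the two agree:

```rust
// Illustrative only: a Parameter origin answers both queries consistently.
let origin = CollectionObjectOrigin::Parameter(2);
assert!(origin.is_parameter());
assert_eq!(origin.try_parameter(), Some(2));
```

The added `Hash` derive is what lets the codegen above collect returned origins into a `HashSet` when computing lifetimes.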
+#![feature(once_cell_try)]
+
 use std::alloc::{alloc, dealloc, Layout};
 use std::marker::PhantomData;
 use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
 use std::slice::{from_raw_parts, from_raw_parts_mut};
+use std::sync::OnceLock;

 /*
  * Define supporting types, functions, and macros for Hercules RT functions. For
  * a more in-depth discussion of the design of these utilities, see hercules_cg/
@@ -278,6 +282,14 @@ impl<'a> HerculesCUDARefMut<'a> {
         }
     }

+    pub fn dup(&'a mut self) -> Self {
+        HerculesCUDARefMut {
+            ptr: self.ptr,
+            size: self.size,
+            _phantom: PhantomData,
+        }
+    }
+
     pub unsafe fn __ptr(&self) -> *mut u8 {
         self.ptr.as_ptr()
     }
@@ -330,6 +342,10 @@ impl CUDABox {
             _phantom: PhantomData,
         }
     }
+
+    pub fn get_bytes(&self) -> usize {
+        self.size
+    }
 }

 #[cfg(feature = "cuda")]
@@ -367,3 +383,416 @@ impl __RawPtrSendSync {
 unsafe impl Send for __RawPtrSendSync {}
 unsafe impl Sync for __RawPtrSendSync {}
+
+/*
+ * A HerculesBox holds memory that can be on any device and provides a common interface for moving
+ * data where it is needed.
+ *
+ * It can hold CPU and device allocations that point at the memory it represents. It can also hold
+ * Hercules references, either to those allocations it holds or to other allocations not held by
+ * this box (in which case the appropriate allocation field should be None).
+ *
+ * The data held at all of its non-None allocations and references is kept the same, so methods
+ * will use whichever reference or allocation is most convenient.
+ *
+ * HerculesImmBox holds references to immutable memory only. All operations on it go through
+ * immutable references, though internally it uses OnceLocks to protect its resources, since the
+ * box may be shared between threads if it is used in a parallel Hercules code invocation. We use
+ * OnceLocks since the data is immutable, so once it has been placed on a device no further
+ * movement is necessary.
+ *
+ * We maintain the invariant that at least one of the device references is always set; their
+ * associated allocations may or may not be set, as those may not be needed if the allocation is
+ * held elsewhere.
+ *
+ * HerculesMutBox holds memory on some device and can produce mutable references to that data on
+ * any device. All of these operations go through mutable references, since this ensures exclusive
+ * access to the box and therefore to the underlying device memory. Because of the exclusive
+ * access, locks are not needed.
+ *
+ * Note that a HerculesMutBox may hold multiple allocations at once, but it tracks the "definitive"
+ * copy as the one borrowed mutably most recently (since it may have been updated). The extra
+ * allocations are kept around to avoid reallocation if memory is moved back to the device.
+ */
+pub struct HerculesImmBox<'a, T> {
+    #[allow(dead_code)]
+    cpu_alloc: OnceLock<Vec<T>>,
+    #[cfg(feature = "cuda")]
+    cuda_alloc: OnceLock<CUDABox>,
+
+    cpu_ref: OnceLock<HerculesCPURef<'a>>,
+    #[cfg(feature = "cuda")]
+    cuda_ref: OnceLock<HerculesCUDARef<'a>>,
+}
+
+impl<'a, T> From<&'a [T]> for HerculesImmBox<'a, T> {
+    fn from(value: &'a [T]) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::from(HerculesCPURef::from_slice(value)),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::new(),
+        }
+    }
+}
+
+impl<'a, T> From<HerculesCPURef<'a>> for HerculesImmBox<'a, T> {
+    fn from(value: HerculesCPURef<'a>) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::from(value),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::new(),
+        }
+    }
+}
+
+// If we are building from a mutable reference, we demote it to an immutable reference, since we
+// don't hold mutable references.
+impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesImmBox<'a, T> {
+    fn from(value: HerculesCPURefMut<'a>) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::from(value.as_ref()),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::new(),
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARef<'a>> for HerculesImmBox<'a, T> {
+    fn from(value: HerculesCUDARef<'a>) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::from(value),
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesImmBox<'a, T> {
+    fn from(value: HerculesCUDARefMut<'a>) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::from(value.as_ref()),
+        }
+    }
+}
+impl<'a, T> HerculesImmBox<'a, T>
+where
+    T: Default + Clone,
+{
+    pub fn as_slice(&'a self) -> &'a [T] {
+        self.as_cpu_ref().as_slice()
+    }
+
+    pub fn to_vec(&'a self) -> Vec<T> {
+        Vec::from(self.as_cpu_ref().as_slice())
+    }
+
+    pub fn as_cpu_ref(&'a self) -> HerculesCPURef<'a> {
+        if let Some(cpu_ref) = self.cpu_ref.get() {
+            cpu_ref.clone()
+        } else {
+            #[cfg(feature = "cuda")]
+            if let Some(cuda_ref) = self.cuda_ref.get() {
+                return self
+                    .cpu_ref
+                    .get_or_init(|| {
+                        let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
+
+                        let mut alloc = Vec::new();
+                        alloc.resize_with(elements, Default::default);
+                        let _ = cuda_ref.clone().to_cpu_ref(&mut alloc);
+
+                        self.cpu_alloc
+                            .set(alloc)
+                            .map_err(|_| ())
+                            .expect("HerculesImmBox cpu_alloc was set unexpectedly");
+                        let alloc = self.cpu_alloc.get().unwrap();
+                        HerculesCPURef::from_slice(alloc)
+                    })
+                    .clone();
+            }
+
+            panic!("HerculesImmBox has no reference to data")
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    pub fn as_cuda_ref(&'a self) -> HerculesCUDARef<'a> {
+        if let Some(cuda_ref) = self.cuda_ref.get() {
+            cuda_ref.clone()
+        } else {
+            if let Some(cpu_ref) = self.cpu_ref.get() {
+                return self
+                    .cuda_ref
+                    .get_or_init(|| {
+                        // Copy data to the CUDA device.
+                        let alloc = CUDABox::from_cpu_ref(cpu_ref.clone());
+                        self.cuda_alloc
+                            .set(alloc)
+                            .map_err(|_| ())
+                            .expect("HerculesImmBox cuda_alloc was set unexpectedly");
+                        self.cuda_alloc.get().unwrap().get_ref()
+                    })
+                    .clone();
+            }
+
+            panic!("HerculesImmBox has no reference to data")
+        }
+    }
+}
+enum HerculesMutBoxLocation {
+    CPU,
+    #[cfg(feature = "cuda")]
+    CUDA,
+}
+
+enum Allocation<R, A> {
+    None,
+    Reference(R),
+    Allocation(A),
+}
+
+impl<R, A> Allocation<R, A> {
+    fn take(&mut self) -> Allocation<R, A> {
+        std::mem::replace(self, Allocation::None)
+    }
+}
+
+pub struct HerculesMutBox<'a, T> {
+    loc: HerculesMutBoxLocation,
+    cpu_alloc: Allocation<&'a mut [T], Vec<T>>,
+    #[cfg(feature = "cuda")]
+    cuda_alloc: Allocation<HerculesCUDARefMut<'a>, CUDABox>,
+}
+
+impl<'a, T> From<&'a mut [T]> for HerculesMutBox<'a, T> {
+    fn from(value: &'a mut [T]) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Reference(value),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+}
+
+impl<'a, T> From<Vec<T>> for HerculesMutBox<'a, T> {
+    fn from(value: Vec<T>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Allocation(value),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+}
+
+impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCPURef<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Allocation(value.as_slice().to_vec()),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+}
+
+impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCPURefMut<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Reference(value.as_slice()),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCUDARef<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CUDA,
+            cpu_alloc: Allocation::None,
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::Allocation(CUDABox::from_cuda_ref(value)),
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCUDARefMut<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CUDA,
+            cpu_alloc: Allocation::None,
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::Reference(value),
+        }
+    }
+}
+impl<'a, T> HerculesMutBox<'a, T>
+where
+    T: Default + Clone,
+{
+    pub fn as_slice(&'a mut self) -> &'a mut [T] {
+        self.as_cpu_ref().as_slice()
+    }
+
+    pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> {
+        match self.loc {
+            HerculesMutBoxLocation::CPU => match self.cpu_alloc {
+                Allocation::None => panic!("No CPU reference"),
+                Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val),
+                Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val),
+            },
+            #[cfg(feature = "cuda")]
+            HerculesMutBoxLocation::CUDA => {
+                let cuda_ref: HerculesCUDARef<'a> = match self.cuda_alloc {
+                    Allocation::None => panic!("No GPU reference"),
+                    Allocation::Reference(ref mut val) => val.dup().as_ref(),
+                    Allocation::Allocation(ref val) => val.get_ref(),
+                };
+                let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
+
+                // Allocate host memory (if needed).
+                let cpu_alloc: Allocation<&'a mut [T], Vec<T>> = match self.cpu_alloc.take() {
+                    Allocation::Reference(val) if val.len() == elements => {
+                        Allocation::Reference(val)
+                    }
+                    Allocation::Allocation(val) if val.len() == elements => {
+                        Allocation::Allocation(val)
+                    }
+                    _ => {
+                        let mut alloc = Vec::new();
+                        alloc.resize_with(elements, Default::default);
+                        Allocation::Allocation(alloc)
+                    }
+                };
+                self.cpu_alloc = cpu_alloc;
+                let cpu_ref: &'a mut [T] = match &mut self.cpu_alloc {
+                    Allocation::None => panic!(),
+                    Allocation::Reference(val) => val,
+                    Allocation::Allocation(val) => val,
+                };
+
+                // Transfer data from the CUDA device.
+                let cpu_ref = cuda_ref.to_cpu_ref(cpu_ref);
+
+                self.loc = HerculesMutBoxLocation::CPU;
+                cpu_ref
+            }
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    pub fn as_cuda_ref(&'a mut self) -> HerculesCUDARefMut<'a> {
+        match self.loc {
+            HerculesMutBoxLocation::CPU => {
+                let cpu_ref: &'a [T] = match self.cpu_alloc {
+                    Allocation::None => panic!("No CPU reference"),
+                    Allocation::Reference(ref val) => val,
+                    Allocation::Allocation(ref val) => val,
+                };
+                let size = cpu_ref.len() * size_of::<T>();
+
+                // Allocate device memory (if needed); a fresh CUDABox copies the data as
+                // it is constructed.
+                let (cuda_alloc, copied) = match self.cuda_alloc.take() {
+                    Allocation::Reference(val) if unsafe { val.__size() == size } => {
+                        (Allocation::Reference(val), false)
+                    }
+                    Allocation::Allocation(val) if val.get_bytes() == size => {
+                        (Allocation::Allocation(val), false)
+                    }
+                    _ => {
+                        let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref));
+                        (Allocation::Allocation(alloc), true)
+                    }
+                };
+                self.cuda_alloc = cuda_alloc;
+                let cuda_ref = match self.cuda_alloc {
+                    Allocation::None => panic!(),
+                    Allocation::Reference(ref mut val) => val.dup(),
+                    Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+                };
+                if !copied {
+                    unsafe {
+                        __copy_cpu_to_cuda(cuda_ref.__ptr(), cpu_ref.as_ptr() as *mut u8, size);
+                    }
+                }
+
+                self.loc = HerculesMutBoxLocation::CUDA;
+                cuda_ref
+            }
+            HerculesMutBoxLocation::CUDA => match self.cuda_alloc {
+                Allocation::None => panic!("No GPU reference"),
+                Allocation::Reference(ref mut val) => val.dup(),
+                Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+            },
+        }
+    }
+}
+pub trait HerculesImmBoxTo<'a, T> {
+    fn to(&'a self) -> T;
+}
+
+impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
+where
+    T: Default + Clone,
+{
+    fn to(&'a self) -> HerculesCPURef<'a> {
+        self.as_cpu_ref()
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
+where
+    T: Default + Clone,
+{
+    fn to(&'a self) -> HerculesCUDARef<'a> {
+        self.as_cuda_ref()
+    }
+}
+
+pub trait HerculesMutBoxTo<'a, T> {
+    fn to(&'a mut self) -> T;
+}
+
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
+where
+    T: Default + Clone,
+{
+    fn to(&'a mut self) -> HerculesCPURefMut<'a> {
+        self.as_cpu_ref()
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
+where
+    T: Default + Clone,
+{
+    fn to(&'a mut self) -> HerculesCUDARefMut<'a> {
+        self.as_cuda_ref()
+    }
+}
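To see how these pieces compose from a caller's perspective, a minimal usage sketch; the Hercules function name `example` and its `run` signature are assumptions, not part of this commit:

```rust
use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};

// Hypothetical Hercules function taking a length and one input collection;
// `runner!` instantiates the generated HerculesRunner_example.
juno_build::juno!("example");

fn main() {
    let data: Vec<f32> = vec![1.0; 1024];
    // Wrap host data once; the box lazily mirrors it to whichever device
    // the compiled backend actually asks for.
    let input = HerculesImmBox::from(data.as_slice());
    let mut r = runner!(example);
    let mut out = HerculesMutBox::from(async_std::task::block_on(async {
        r.run(1024, input.to()).await
    }));
    // as_slice() transfers data back to the CPU only if the definitive copy
    // currently lives on a device.
    let host: &mut [f32] = out.as_slice();
    assert_eq!(host.len(), 1024);
}
```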
@@ -8,9 +8,7 @@ use self::camera_model::*;
 use self::cava_rust::CHAN;
 use self::image_proc::*;

-#[cfg(feature = "cuda")]
-use hercules_rt::CUDABox;
-use hercules_rt::{runner, HerculesCPURef};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};

 use image::ImageError;
@@ -18,6 +16,30 @@ use clap::Parser;

 juno_build::juno!("cava");

+// Individual lifetimes are not needed in this example but should probably be generated for
+// flexibility.
+async fn safe_run<'a, 'b: 'a, 'c: 'a, 'd: 'a, 'e: 'a, 'f: 'a, 'g: 'a>(
+    runner: &'a mut HerculesRunner_cava,
+    r: u64,
+    c: u64,
+    num_ctrl_pts: u64,
+    input: &'b HerculesImmBox<'b, u8>,
+    tstw: &'c HerculesImmBox<'c, f32>,
+    ctrl_pts: &'d HerculesImmBox<'d, f32>,
+    weights: &'e HerculesImmBox<'e, f32>,
+    coefs: &'f HerculesImmBox<'f, f32>,
+    tonemap: &'g HerculesImmBox<'g, f32>,
+) -> HerculesMutBox<'a, u8> {
+    HerculesMutBox::from(
+        runner
+            .run(
+                r,
+                c,
+                num_ctrl_pts,
+                input.to(),
+                tstw.to(),
+                ctrl_pts.to(),
+                weights.to(),
+                coefs.to(),
+                tonemap.to(),
+            )
+            .await,
+    )
+}
+
 fn run_cava(
     rows: usize,
     cols: usize,
@@ -36,62 +58,32 @@ fn run_cava(
     assert_eq!(coefs.len(), 4 * CHAN);
     assert_eq!(tonemap.len(), 256 * CHAN);

-    #[cfg(not(feature = "cuda"))]
-    {
-        let image = HerculesCPURef::from_slice(image);
-        let tstw = HerculesCPURef::from_slice(tstw);
-        let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts);
-        let weights = HerculesCPURef::from_slice(weights);
-        let coefs = HerculesCPURef::from_slice(coefs);
-        let tonemap = HerculesCPURef::from_slice(tonemap);
-        let mut r = runner!(cava);
-        async_std::task::block_on(async {
-            r.run(
-                rows as u64,
-                cols as u64,
-                num_ctrl_pts as u64,
-                image,
-                tstw,
-                ctrl_pts,
-                weights,
-                coefs,
-                tonemap,
-            )
-            .await
-        })
-        .as_slice::<u8>()
-        .to_vec()
-        .into_boxed_slice()
-    }
-    #[cfg(feature = "cuda")]
-    {
-        let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image));
-        let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw));
-        let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts));
-        let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
-        let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
-        let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
-        let mut r = runner!(cava);
-        let res = async_std::task::block_on(async {
-            r.run(
-                rows as u64,
-                cols as u64,
-                num_ctrl_pts as u64,
-                image.get_ref(),
-                tstw.get_ref(),
-                ctrl_pts.get_ref(),
-                weights.get_ref(),
-                coefs.get_ref(),
-                tonemap.get_ref(),
-            )
-            .await
-        });
-        let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() };
-        let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
-        res.to_cpu_ref(&mut res_cpu);
-        res_cpu
-    }
+    let image = HerculesImmBox::from(image);
+    let tstw = HerculesImmBox::from(tstw);
+    let ctrl_pts = HerculesImmBox::from(ctrl_pts);
+    let weights = HerculesImmBox::from(weights);
+    let coefs = HerculesImmBox::from(coefs);
+    let tonemap = HerculesImmBox::from(tonemap);
+
+    let mut r = runner!(cava);
+    async_std::task::block_on(async {
+        safe_run(
+            &mut r,
+            rows as u64,
+            cols as u64,
+            num_ctrl_pts as u64,
+            &image,
+            &tstw,
+            &ctrl_pts,
+            &weights,
+            &coefs,
+            &tonemap,
+        )
+        .await
+    })
+    .as_slice()
+    .to_vec()
+    .into_boxed_slice()
 }

 enum Error {
...
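What makes `input.to()` in `safe_run` device-agnostic is that `HerculesImmBoxTo` is implemented for both `HerculesCPURef` and `HerculesCUDARef` targets, so the parameter type of the generated `run` selects the impl. A minimal sketch of that inference on the CPU side (the `takes_cpu` helper is hypothetical, not part of the commit):

```rust
use hercules_rt::{HerculesCPURef, HerculesImmBox, HerculesImmBoxTo};

// Stand-in for a generated run() parameter that wants a CPU reference.
fn takes_cpu<'a>(r: HerculesCPURef<'a>) -> usize {
    r.as_slice::<f32>().len()
}

fn main() {
    let v = vec![0.0f32; 16];
    let boxed = HerculesImmBox::from(v.as_slice());
    // `.to()` resolves to as_cpu_ref() here because takes_cpu demands a
    // HerculesCPURef; with a CUDA-compiled kernel it would resolve to
    // as_cuda_ref() instead and the data would be copied to the device once.
    assert_eq!(takes_cpu(boxed.to()), 16);
}
```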
@@ -2,9 +2,7 @@
 mod edge_detection_rust;

-#[cfg(feature = "cuda")]
-use hercules_rt::CUDABox;
-use hercules_rt::{runner, HerculesCPURef};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};

 use std::slice::from_raw_parts;
@@ -86,6 +84,39 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat {
     converted
 }

+async fn safe_run<'a, 'b, 'c, 'd, 'e, 'f>(
+    runner: &'a mut HerculesRunner_edge_detection,
+    n: u64,
+    m: u64,
+    gs: u64,
+    sz: u64,
+    sb: u64,
+    input: &'b HerculesImmBox<'b, f32>,
+    gaussian_filter: &'c HerculesImmBox<'c, f32>,
+    structure: &'d HerculesImmBox<'d, f32>,
+    sx: &'e HerculesImmBox<'e, f32>,
+    sy: &'f HerculesImmBox<'f, f32>,
+    theta: f32,
+) -> HerculesMutBox<'a, f32> {
+    HerculesMutBox::from(
+        runner
+            .run(
+                n,
+                m,
+                gs,
+                sz,
+                sb,
+                input.to(),
+                gaussian_filter.to(),
+                structure.to(),
+                sx.to(),
+                sy.to(),
+                theta,
+            )
+            .await,
+    )
+}
+
 fn edge_detection_harness(args: EdgeDetectionInputs) {
     let EdgeDetectionInputs {
         input,
@@ -106,38 +137,18 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
         0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, 0.000036, 0.000363, 0.001446,
         0.002291, 0.001446, 0.000363, 0.000036,
     ];
-    #[cfg(not(feature = "cuda"))]
-    let gaussian_filter_h = HerculesCPURef::from_slice(&gaussian_filter);
-    #[cfg(feature = "cuda")]
-    let gaussian_filter_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&gaussian_filter));
-    #[cfg(feature = "cuda")]
-    let gaussian_filter_h = gaussian_filter_cuda.get_ref();
+    let gaussian_filter_h = HerculesImmBox::from(gaussian_filter.as_slice());

     let sz: usize = 3;
     let structure: Vec<f32> = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let structure_h = HerculesCPURef::from_slice(&structure);
-    #[cfg(feature = "cuda")]
-    let structure_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&structure));
-    #[cfg(feature = "cuda")]
-    let structure_h = structure_cuda.get_ref();
+    let structure_h = HerculesImmBox::from(structure.as_slice());

     let sb: usize = 3;
     let sx: Vec<f32> = vec![-1.0, 0.0, 1.0, -2.0, 0.0, 2.0, -1.0, 0.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let sx_h = HerculesCPURef::from_slice(&sx);
-    #[cfg(feature = "cuda")]
-    let sx_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sx));
-    #[cfg(feature = "cuda")]
-    let sx_h = sx_cuda.get_ref();
+    let sx_h = HerculesImmBox::from(sx.as_slice());

     let sy: Vec<f32> = vec![-1.0, -2.0, -1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let sy_h = HerculesCPURef::from_slice(&sy);
-    #[cfg(feature = "cuda")]
-    let sy_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sy));
-    #[cfg(feature = "cuda")]
-    let sy_h = sy_cuda.get_ref();
+    let sy_h = HerculesImmBox::from(sy.as_slice());

     let theta: f32 = 0.1;
@@ -203,39 +214,27 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
         let input = unsafe { from_raw_parts(ptr, height * width) };

-        #[cfg(not(feature = "cuda"))]
-        let input_h = HerculesCPURef::from_slice(input);
-        #[cfg(feature = "cuda")]
-        let input_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(input));
-        #[cfg(feature = "cuda")]
-        let input_h = input_cuda.get_ref();
+        let input_h = HerculesImmBox::from(input);

         let result = async_std::task::block_on(async {
-            r.run(
+            safe_run(
+                &mut r,
                 height as u64,
                 width as u64,
                 gs as u64,
                 sz as u64,
                 sb as u64,
-                input_h,
-                gaussian_filter_h.clone(),
-                structure_h.clone(),
-                sx_h.clone(),
-                sy_h.clone(),
+                &input_h,
+                &gaussian_filter_h,
+                &structure_h,
+                &sx_h,
+                &sy_h,
                 theta,
             )
             .await
-        });
-
-        #[cfg(not(feature = "cuda"))]
-        let result: Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
-        #[cfg(feature = "cuda")]
-        let result: Box<[f32]> = {
-            let num_out = unsafe { result.__size() / std::mem::size_of::<f32>() };
-            let mut res_cpu: Box<[f32]> = vec![0.0; num_out].into_boxed_slice();
-            result.to_cpu_ref(&mut res_cpu);
-            res_cpu
-        };
+        })
+        .as_slice()
+        .to_vec();

         if display {
             let result = frame_from_slice(&result, height, width);
@@ -261,10 +260,7 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
                 theta,
             );

-            assert_eq!(
-                result.as_ref(),
-                <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result)
-            );
+            assert_eq!(result, rust_result);

             println!("Frames {} match", i);

             if display_verify {
...