diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 232274824184a52f037cd783d4729d7a389b7c04..b79e4953346f2f47a81bf928eadb647ed7217645 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -1013,7 +1013,28 @@ impl<'a> RTContext<'a> {
         write!(w, "}}}}")?;
-        write!(w, "async fn run<'a>(&'a mut self")?;
+        // Every reference that may be returned has the same lifetime. Every
+        // other reference gets its own unique lifetime.
+        let returned_origins: HashSet<_> = self.collection_objects[&self.func_id]
+            .returned_objects()
+            .into_iter()
+            .map(|obj| self.collection_objects[&self.func_id].origin(*obj))
+            .collect();
+        write!(w, "async fn run<'runner, 'returned")?;
+        for idx in 0..func.param_types.len() {
+            write!(w, ", 'p{}", idx)?;
+        }
+        write!(
+            w,
+            ">(&'{} mut self",
+            if returned_origins.iter().any(|origin| !origin.is_parameter()) {
+                "returned"
+            } else {
+                "runner"
+            }
+        )?;
         for idx in 0..func.num_dynamic_constants {
             write!(w, ", dc_p{}: u64", idx)?;
@@ -1029,8 +1050,19 @@ impl<'a> RTContext<'a> {
                 let mutability = if param_muts[idx] { "Mut" } else { "" };
-                    ", p{}: ::hercules_rt::Hercules{}Ref{}<'a>",
-                    idx, device, mutability
+                    ", p{}: ::hercules_rt::Hercules{}Ref{}<'{}>",
+                    idx,
+                    device,
+                    mutability,
+                    if returned_origins.iter().any(|origin| origin
+                        .try_parameter()
+                        .map(|oidx| idx == oidx)
+                        .unwrap_or(false))
+                    {
+                        "returned".to_string()
+                    } else {
+                        format!("p{}", idx)
+                    }
@@ -1045,10 +1077,13 @@ impl<'a> RTContext<'a> {
             let mutability = if return_mut { "Mut" } else { "" };
-                ") -> ::hercules_rt::Hercules{}Ref{}<'a> {{",
+                ") -> ::hercules_rt::Hercules{}Ref{}<'returned> {{",
                 device, mutability
+        // Start with possibly re-allocating the backing memory if it's not
+        // large enough.
         write!(w, "unsafe {{")?;
         for (device, (total, _)) in self.backing_allocations[&self.func_id].iter() {
             write!(w, "let size = ")?;
@@ -1084,6 +1119,8 @@ impl<'a> RTContext<'a> {
+        // Call the wrapped function.
         write!(w, "let ret = {}(", func.name)?;
         for (device, _) in self.backing_allocations[&self.func_id].iter() {
@@ -1117,6 +1154,8 @@ impl<'a> RTContext<'a> {
         write!(w, "}}}}")?;
+        // De-allocate the backing memory on drop.
             "}}impl Drop for HerculesRunner_{} {{#[allow(unused_unsafe)]fn drop(&mut self) {{unsafe {{",
diff --git a/hercules_ir/src/collections.rs b/hercules_ir/src/collections.rs
index 1bc650e947e02b4d99bb5fde4173d2744b25a2cb..d236d5b55cd61f5f0cbbd721660779f2d3ba5e61 100644
--- a/hercules_ir/src/collections.rs
+++ b/hercules_ir/src/collections.rs
@@ -36,7 +36,7 @@ use crate::*;
  * - For each function, which collection objects may be returned?
  * - For each collection object, how was it originated?
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum CollectionObjectOrigin {
@@ -57,6 +57,10 @@ pub struct FunctionCollectionObjects {
 pub type CollectionObjects = BTreeMap<FunctionID, FunctionCollectionObjects>;
 impl CollectionObjectOrigin {
+    pub fn is_parameter(&self) -> bool {
+        self.try_parameter().is_some()
+    }
     pub fn try_parameter(&self) -> Option<usize> {
         match self {
             CollectionObjectOrigin::Parameter(index) => Some(*index),
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 12b64fa3ce7c5f73c2c7470dc546d9913203aa3f..f8fdf2effa9bfc9bbf900b79915b8716706bde6c 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -1,8 +1,12 @@
 use std::alloc::{alloc, dealloc, Layout};
 use std::marker::PhantomData;
 use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
 use std::slice::{from_raw_parts, from_raw_parts_mut};
+use std::sync::OnceLock;
  * Define supporting types, functions, and macros for Hercules RT functions. For
  * a more in-depth discussion of the design of these utilities, see hercules_cg/
@@ -278,6 +282,14 @@ impl<'a> HerculesCUDARefMut<'a> {
+    pub fn dup(&'a mut self) -> Self {
+        HerculesCUDARefMut {
+            ptr: self.ptr,
+            size: self.size,
+            _phantom: PhantomData,
+        }
+    }
     pub unsafe fn __ptr(&self) -> *mut u8 {
@@ -330,6 +342,10 @@ impl CUDABox {
             _phantom: PhantomData,
+    pub fn get_bytes(&self) -> usize {
+        self.size
+    }
 #[cfg(feature = "cuda")]
@@ -367,3 +383,416 @@ impl __RawPtrSendSync {
 unsafe impl Send for __RawPtrSendSync {}
 unsafe impl Sync for __RawPtrSendSync {}
+ * A HerculesBox holds memory that can be on any device and provides a common interface to moving
+ * data where it is needed.
+ *
+ * It can hold CPU and device allocations to basically point at the memory it represents. It can
+ * also hold Hercules references either to those allocations it holds or to other allocations not
+ * held by this Box (in which case the appropriate allocation field should be None).
+ *
+ * The data held at all of its non-None allocations and references is maintained so that it is the
+ * same, and so methods will attempt to use the reference or allocation that is most convenient.
+ * 
+ * HerculesImmBox holds references to immutable memory only. All operations on it are through
+ * immutable references, though internally it uses OnceLocks to protect its resources since the Box
+ * may be shared across threads when it is used in parallel Hercules code invocations.
+ * We use OnceLocks since the data is immutable, so once it has been placed on a device no further
+ * movement is necessary.
+ *
+ * We maintain the invariant that at least one of the device references is always set; their
+ * associated allocations may or may not be set, as those may not be needed if the allocation is
+ * held elsewhere.
+ *
+ * HerculesMutBox holds memory on some device and can produce mutable references to that data on
+ * any device. All these operations are through mutable references since this ensures exclusive
+ * access to the Box and therefore to the underlying device memory. Because access is exclusive,
+ * locks are not needed.
+ *
+ * Note that a HerculesMutBox may hold multiple allocations at once, but it treats the copy that
+ * was most recently borrowed mutably as the "definitive" one (since it may have been updated). The
+ * extra allocations are kept around to avoid reallocation if the data is moved back to a device.
+ */
+pub struct HerculesImmBox<'a, T> {
+    #[allow(dead_code)]
+    cpu_alloc: OnceLock<Vec<T>>,
+    #[cfg(feature = "cuda")]
+    cuda_alloc: OnceLock<CUDABox>,
+    cpu_ref: OnceLock<HerculesCPURef<'a>>,
+    #[cfg(feature = "cuda")]
+    cuda_ref: OnceLock<HerculesCUDARef<'a>>,
+impl<'a, T> From<&'a [T]> for HerculesImmBox<'a, T> {
+    fn from(value: &'a [T]) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::from(HerculesCPURef::from_slice(value)),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::new(),
+        }
+    }
+impl<'a, T> From<HerculesCPURef<'a>> for HerculesImmBox<'a, T> {
+    fn from(value: HerculesCPURef<'a>) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::from(value),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::new(),
+        }
+    }
+// If we are building from a mutable reference, we demote that to a non-mutable reference since we
+// don't hold mutable references.
+impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesImmBox<'a, T> {
+    fn from(value: HerculesCPURefMut<'a>) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::from(value.as_ref()),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::new(),
+        }
+    }
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARef<'a>> for HerculesImmBox<'a, T> {
+    fn from(value: HerculesCUDARef<'a>) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::from(value),
+        }
+    }
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesImmBox<'a, T> {
+    fn from(value: HerculesCUDARefMut<'a>) -> Self {
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: OnceLock::new(),
+            cpu_ref: OnceLock::new(),
+            #[cfg(feature = "cuda")]
+            cuda_ref: OnceLock::from(value.as_ref()),
+        }
+    }
+impl<'a, T> HerculesImmBox<'a, T> 
+    T: Default + Clone
+    pub fn as_slice(&'a self) -> &'a [T] {
+        self.as_cpu_ref().as_slice()
+    }
+    pub fn to_vec(&'a self) -> Vec<T> {
+        Vec::from(self.as_cpu_ref().as_slice())
+    }
+    pub fn as_cpu_ref(&'a self) -> HerculesCPURef<'a> {
+        if let Some(cpu_ref) = self.cpu_ref.get() {
+            cpu_ref.clone()
+        } else {
+            #[cfg(feature = "cuda")]
+            if let Some(cuda_ref) = self.cuda_ref.get() {
+                return 
+                    self.cpu_ref.get_or_init(|| {
+                        let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
+                        let mut alloc = Vec::new();
+                        alloc.resize_with(elements, Default::default);
+                        let _ = cuda_ref.clone().to_cpu_ref(&mut alloc);
+                        self.cpu_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cpu_alloc was set unexpectedly");
+                        let alloc = self.cpu_alloc.get().unwrap();
+                        HerculesCPURef::from_slice(alloc)
+                    }).clone();
+            }
+            panic!("HerculesImmBox has no reference to data")
+        }
+    }
+    #[cfg(feature = "cuda")]
+    pub fn as_cuda_ref(&'a self) -> HerculesCUDARef<'a> {
+        if let Some(cuda_ref) = self.cuda_ref.get() {
+            cuda_ref.clone()
+        } else {
+            if let Some(cpu_ref) = self.cpu_ref.get() {
+                return self.cuda_ref.get_or_init(|| {
+                    // Copy data to CUDA device
+                    let alloc = CUDABox::from_cpu_ref(cpu_ref.clone());
+                    self.cuda_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cuda_alloc was set unexpectedly");
+                    self.cuda_alloc.get().unwrap().get_ref()
+                }).clone();
+            }
+            panic!("HerculesImmBox has no reference to data")
+        }
+    }
+enum HerculesMutBoxLocation {
+    CPU,
+    #[cfg(feature = "cuda")]
+    CUDA,
+enum Allocation<R, A> {
+    None,
+    Reference(R),
+    Allocation(A),
+impl<R, A> Allocation<R, A> {
+    fn take(&mut self) -> Allocation<R, A> {
+        std::mem::replace(self, Allocation::None)
+    }
+pub struct HerculesMutBox<'a, T> {
+    loc: HerculesMutBoxLocation,
+    cpu_alloc: Allocation<&'a mut [T], Vec<T>>,
+    #[cfg(feature = "cuda")]
+    cuda_alloc: Allocation<HerculesCUDARefMut<'a>, CUDABox>,
+impl<'a, T> From<&'a mut [T]> for HerculesMutBox<'a, T> {
+    fn from(value: &'a mut [T]) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Reference(value),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+impl<'a, T> From<Vec<T>> for HerculesMutBox<'a, T> {
+    fn from(value: Vec<T>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Allocation(value),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCPURef<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Allocation(value.as_slice().to_vec()),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCPURefMut<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Reference(value.as_slice()),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCUDARef<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CUDA,
+            cpu_alloc: Allocation::None,
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::Allocation(CUDABox::from_cuda_ref(value)),
+        }
+    }
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
+    fn from(value: HerculesCUDARefMut<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CUDA,
+            cpu_alloc: Allocation::None,
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::Reference(value),
+        }
+    }
+impl<'a, T> HerculesMutBox<'a, T>
+    T: Default + Clone
+    pub fn as_slice(&'a mut self) -> &'a mut [T] {
+        self.as_cpu_ref().as_slice()
+    }
+    pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> {
+        match self.loc {
+            HerculesMutBoxLocation::CPU => {
+                match self.cpu_alloc {
+                    Allocation::None => panic!("No CPU reference"),
+                    Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val),
+                    Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val),
+                }
+            }
+            #[cfg(feature = "cuda")]
+            HerculesMutBoxLocation::CUDA => {
+                let cuda_ref : HerculesCUDARef<'a> =
+                    match self.cuda_alloc {
+                        Allocation::None => panic!("No GPU reference"),
+                        Allocation::Reference(ref mut val) => val.dup().as_ref(),
+                        Allocation::Allocation(ref val) => val.get_ref(),
+                    };
+                let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
+                // Allocate host memory (if needed)
+                let cpu_alloc : Allocation<&'a mut [T], Vec<T>> =
+                    match self.cpu_alloc.take() {
+                        Allocation::Reference(val)  if val.len() == elements => Allocation::Reference(val),
+                        Allocation::Allocation(val) if val.len() == elements => Allocation::Allocation(val),
+                        _ => {
+                            let mut alloc = Vec::new();
+                            alloc.resize_with(elements, Default::default);
+                            Allocation::Allocation(alloc)
+                        }
+                    };
+                self.cpu_alloc = cpu_alloc;
+                let cpu_ref : &'a mut [T] =
+                    match &mut self.cpu_alloc {
+                        Allocation::None => panic!(),
+                        Allocation::Reference(val)  => val,
+                        Allocation::Allocation(val) => val,
+                    };
+                // Transfer data from CUDA device
+                let cpu_ref = cuda_ref.to_cpu_ref(cpu_ref);
+                self.loc = HerculesMutBoxLocation::CPU;
+                cpu_ref
+            }
+        }
+    }
+    #[cfg(feature = "cuda")]
+    pub fn as_cuda_ref(&'a mut self) -> HerculesCUDARefMut<'a> {
+        match self.loc {
+            HerculesMutBoxLocation::CPU => {
+                let cpu_ref : &'a [T] =
+                    match self.cpu_alloc {
+                        Allocation::None => panic!("No CPU reference"),
+                        Allocation::Reference(ref val) => val,
+                        Allocation::Allocation(ref val) => val,
+                    };
+                let size = cpu_ref.len() * size_of::<T>();
+                let (cuda_alloc, copied) =
+                    match self.cuda_alloc.take() {
+                        Allocation::Reference(val)  if unsafe { val.__size() == size } => (Allocation::Reference(val), false),
+                        Allocation::Allocation(val) if val.get_bytes() == size => (Allocation::Allocation(val), false),
+                        _ => {
+                            let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref));
+                            (Allocation::Allocation(alloc), true)
+                        }
+                    };
+                self.cuda_alloc = cuda_alloc;
+                let cuda_ref =
+                    match self.cuda_alloc {
+                        Allocation::None => panic!(),
+                        Allocation::Reference(ref mut val) => val.dup(),
+                        Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+                    };
+                if !copied {
+                    unsafe {
+                        __copy_cpu_to_cuda(cuda_ref.__ptr(), cpu_ref.as_ptr() as *mut u8, size);
+                    }
+                }
+                self.loc = HerculesMutBoxLocation::CUDA;
+                cuda_ref
+            }
+            HerculesMutBoxLocation::CUDA => {
+                match self.cuda_alloc {
+                    Allocation::None => panic!("No GPU reference"),
+                    Allocation::Reference(ref mut val) => val.dup(),
+                    Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+                }
+            }
+        }
+    }
+pub trait HerculesImmBoxTo<'a, T> {
+    fn to(&'a self) -> T;
+impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
+where T: Default + Clone
+    fn to(&'a self) -> HerculesCPURef<'a> {
+        self.as_cpu_ref()
+    }
+#[cfg(feature = "cuda")]
+impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
+where T: Default + Clone
+    fn to(&'a self) -> HerculesCUDARef<'a> {
+        self.as_cuda_ref()
+    }
+pub trait HerculesMutBoxTo<'a, T> {
+    fn to(&'a mut self) -> T;
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
+where T: Default + Clone
+    fn to(&'a mut self) -> HerculesCPURefMut<'a> {
+        self.as_cpu_ref()
+    }
+#[cfg(feature = "cuda")]
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
+where T: Default + Clone
+    fn to(&'a mut self) -> HerculesCUDARefMut<'a> {
+        self.as_cuda_ref()
+    }
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index 8368a74f42d2a795d9febfb42ec5cf1707958772..b4a0f6fd7652fe35e0a3177c07cf2782a26e17a4 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -8,9 +8,7 @@ use self::camera_model::*;
 use self::cava_rust::CHAN;
 use self::image_proc::*;
-#[cfg(feature = "cuda")]
-use hercules_rt::CUDABox;
-use hercules_rt::{runner, HerculesCPURef};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
 use image::ImageError;
@@ -18,6 +16,30 @@ use clap::Parser;
+// Individual lifetimes are not needed in this example but should probably be generated for
+// flexibility
+async fn safe_run<'a, 'b: 'a, 'c: 'a, 'd: 'a, 'e: 'a, 'f: 'a, 'g: 'a>(
+    runner: &'a mut HerculesRunner_cava, r: u64, c: u64, num_ctrl_pts: u64,
+    input: &'b HerculesImmBox<'b, u8>, tstw: &'c HerculesImmBox<'c, f32>,
+    ctrl_pts: &'d HerculesImmBox<'d, f32>, weights: &'e HerculesImmBox<'e, f32>,
+    coefs: &'f HerculesImmBox<'f, f32>, tonemap: &'g HerculesImmBox<'g, f32>,
+) -> HerculesMutBox<'a, u8> {
+    HerculesMutBox::from(
+        runner.run(
+            r,
+            c,
+            num_ctrl_pts,
+            input.to(),
+            tstw.to(),
+            ctrl_pts.to(),
+            weights.to(),
+            coefs.to(),
+            tonemap.to()
+        )
+        .await
+    )
 fn run_cava(
     rows: usize,
     cols: usize,
@@ -36,62 +58,32 @@ fn run_cava(
     assert_eq!(coefs.len(), 4 * CHAN);
     assert_eq!(tonemap.len(), 256 * CHAN);
-    #[cfg(not(feature = "cuda"))]
-    {
-        let image = HerculesCPURef::from_slice(image);
-        let tstw = HerculesCPURef::from_slice(tstw);
-        let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts);
-        let weights = HerculesCPURef::from_slice(weights);
-        let coefs = HerculesCPURef::from_slice(coefs);
-        let tonemap = HerculesCPURef::from_slice(tonemap);
-        let mut r = runner!(cava);
-        async_std::task::block_on(async {
-            r.run(
-                rows as u64,
-                cols as u64,
-                num_ctrl_pts as u64,
-                image,
-                tstw,
-                ctrl_pts,
-                weights,
-                coefs,
-                tonemap,
-            )
-            .await
-        })
-        .as_slice::<u8>()
-        .to_vec()
-        .into_boxed_slice()
-    }
-    #[cfg(feature = "cuda")]
-    {
-        let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image));
-        let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw));
-        let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts));
-        let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
-        let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
-        let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
-        let mut r = runner!(cava);
-        let res = async_std::task::block_on(async {
-            r.run(
-                rows as u64,
-                cols as u64,
-                num_ctrl_pts as u64,
-                image.get_ref(),
-                tstw.get_ref(),
-                ctrl_pts.get_ref(),
-                weights.get_ref(),
-                coefs.get_ref(),
-                tonemap.get_ref(),
-            )
-            .await
-        });
-        let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() };
-        let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
-        res.to_cpu_ref(&mut res_cpu);
-        res_cpu
-    }
+    let image = HerculesImmBox::from(image);
+    let tstw = HerculesImmBox::from(tstw);
+    let ctrl_pts = HerculesImmBox::from(ctrl_pts);
+    let weights = HerculesImmBox::from(weights);
+    let coefs = HerculesImmBox::from(coefs);
+    let tonemap = HerculesImmBox::from(tonemap);
+    let mut r = runner!(cava);
+    async_std::task::block_on(async {
+        safe_run(&mut r,
+                 rows as u64,
+                 cols as u64,
+                 num_ctrl_pts as u64,
+                 &image,
+                 &tstw,
+                 &ctrl_pts,
+                 &weights,
+                 &coefs,
+                 &tonemap,
+        )
+        .await
+    })
+    .as_slice()
+    .to_vec()
+    .into_boxed_slice()
 enum Error {
diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
index 3b067ebd0c74ba4fe4b1cd4a39cf4f0b0c8b46cd..60ccb51565bdaa6d0f1837385a9de7ac52dc0128 100644
--- a/juno_samples/edge_detection/src/main.rs
+++ b/juno_samples/edge_detection/src/main.rs
@@ -2,9 +2,7 @@
 mod edge_detection_rust;
-#[cfg(feature = "cuda")]
-use hercules_rt::CUDABox;
-use hercules_rt::{runner, HerculesCPURef};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
 use std::slice::from_raw_parts;
@@ -86,6 +84,39 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat {
+async fn safe_run<'a, 'b, 'c, 'd, 'e, 'f>(
+    runner: &'a mut HerculesRunner_edge_detection,
+    n: u64,
+    m: u64,
+    gs: u64,
+    sz: u64,
+    sb: u64,
+    input: &'b HerculesImmBox<'b, f32>,
+    gaussian_filter: &'c HerculesImmBox<'c, f32>,
+    structure: &'d HerculesImmBox<'d, f32>,
+    sx: &'e HerculesImmBox<'e, f32>,
+    sy: &'f HerculesImmBox<'f, f32>,
+    theta: f32,
+) -> HerculesMutBox<'a, f32> {
+    HerculesMutBox::from(
+        runner
+            .run(
+                n,
+                m,
+                gs,
+                sz,
+                sb,
+                input.to(),
+                gaussian_filter.to(),
+                structure.to(),
+                sx.to(),
+                sy.to(),
+                theta,
+            )
+            .await,
+    )
 fn edge_detection_harness(args: EdgeDetectionInputs) {
     let EdgeDetectionInputs {
@@ -106,38 +137,18 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
         0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, 0.000036, 0.000363, 0.001446,
         0.002291, 0.001446, 0.000363, 0.000036,
-    #[cfg(not(feature = "cuda"))]
-    let gaussian_filter_h = HerculesCPURef::from_slice(&gaussian_filter);
-    #[cfg(feature = "cuda")]
-    let gaussian_filter_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&gaussian_filter));
-    #[cfg(feature = "cuda")]
-    let gaussian_filter_h = gaussian_filter_cuda.get_ref();
+    let gaussian_filter_h = HerculesImmBox::from(gaussian_filter.as_slice());
     let sz: usize = 3;
     let structure: Vec<f32> = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let structure_h = HerculesCPURef::from_slice(&structure);
-    #[cfg(feature = "cuda")]
-    let structure_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&structure));
-    #[cfg(feature = "cuda")]
-    let structure_h = structure_cuda.get_ref();
+    let structure_h = HerculesImmBox::from(structure.as_slice());
     let sb: usize = 3;
     let sx: Vec<f32> = vec![-1.0, 0.0, 1.0, -2.0, 0.0, 2.0, -1.0, 0.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let sx_h = HerculesCPURef::from_slice(&sx);
-    #[cfg(feature = "cuda")]
-    let sx_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sx));
-    #[cfg(feature = "cuda")]
-    let sx_h = sx_cuda.get_ref();
+    let sx_h = HerculesImmBox::from(sx.as_slice());
     let sy: Vec<f32> = vec![-1.0, -2.0, -1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let sy_h = HerculesCPURef::from_slice(&sy);
-    #[cfg(feature = "cuda")]
-    let sy_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sy));
-    #[cfg(feature = "cuda")]
-    let sy_h = sy_cuda.get_ref();
+    let sy_h = HerculesImmBox::from(sy.as_slice());
     let theta: f32 = 0.1;
@@ -203,39 +214,27 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
         let input = unsafe { from_raw_parts(ptr, height * width) };
-        #[cfg(not(feature = "cuda"))]
-        let input_h = HerculesCPURef::from_slice(input);
-        #[cfg(feature = "cuda")]
-        let input_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(input));
-        #[cfg(feature = "cuda")]
-        let input_h = input_cuda.get_ref();
+        let input_h = HerculesImmBox::from(input);
         let result = async_std::task::block_on(async {
-            r.run(
+            safe_run(
+                &mut r,
                 height as u64,
                 width as u64,
                 gs as u64,
                 sz as u64,
                 sb as u64,
-                input_h,
-                gaussian_filter_h.clone(),
-                structure_h.clone(),
-                sx_h.clone(),
-                sy_h.clone(),
+                &input_h,
+                &gaussian_filter_h,
+                &structure_h,
+                &sx_h,
+                &sy_h,
-        });
-        #[cfg(not(feature = "cuda"))]
-        let result: Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
-        #[cfg(feature = "cuda")]
-        let result: Box<[f32]> = {
-            let num_out = unsafe { result.__size() / std::mem::size_of::<f32>() };
-            let mut res_cpu: Box<[f32]> = vec![0.0; num_out].into_boxed_slice();
-            result.to_cpu_ref(&mut res_cpu);
-            res_cpu
-        };
+        })
+        .as_slice()
+        .to_vec();
         if display {
             let result = frame_from_slice(&result, height, width);
@@ -261,10 +260,7 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
-            assert_eq!(
-                result.as_ref(),
-                <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result)
-            );
+            assert_eq!(result, rust_result);
             println!("Frames {} match", i);
             if display_verify {