From 70b71cfed7283dc00c4d2ace97feaa7b1de98eac Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Wed, 5 Feb 2025 09:42:45 -0600
Subject: [PATCH 01/11] First attempt at new HerculesBox

---
 hercules_rt/src/lib.rs | 306 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 306 insertions(+)

diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 12b64fa3..e99f12e3 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -330,6 +330,10 @@ impl CUDABox {
             _phantom: PhantomData,
         }
     }
+
+    pub fn get_bytes(&self) -> usize {
+        self.size
+    }
 }
 
 #[cfg(feature = "cuda")]
@@ -367,3 +371,305 @@ impl __RawPtrSendSync {
 
 unsafe impl Send for __RawPtrSendSync {}
 unsafe impl Sync for __RawPtrSendSync {}
+
+/*
+ * A HerculesBox holds memory that can be on any device and provides a common interface to moving
+ * data where it is needed.
+ *
+ * It can hold CPU and device allocations to basically point at the memory it represents. It can
+ * also hold Hercules references either to those allocations it holds or to other allocations not
+ * held by this Box (in which case the appropriate allocation field should be None).
+ *
+ * The data held at all of its non-None allocations and references is maintained so that it is the
+ * same, and so methods will attempt to use the reference or allocation that is most convenient.
+ * 
+ * When mutable references are required of this Box, it will allocate space on the required device
+ * and eliminate any references or allocations it had on other devices since it must assume its
+ * data is modified through the mutable reference.
+ */
+pub struct HerculesBox<'a, T> {
+    cpu_alloc: Option<Vec<T>>,
+    cpu_ref: Option<HerculesCPURef<'a>>,
+
+    #[cfg(feature = "cuda")]
+    cuda_alloc: Option<CudaBox>,
+    #[cfg(feature = "cuda")]
+    cuda_ref: Option<HerculesCUDARef<'a>>,
+}
+
+impl<'a, T> From<&'a [T]> for HerculesBox<'a, T> {
+    fn from(value: &'a [T]) -> Self {
+        HerculesBox {
+            cpu_alloc: None,
+            cpu_ref: Some(HerculesCPURef::from_slice(value)),
+
+            #[cfg(feature = "cuda")]
+            cuda_alloc: None,
+            #[cfg(feature = "cuda")]
+            cuda_ref: None,
+        }
+    }
+}
+
+impl<'a, T> From<HerculesCPURef<'a>> for HerculesBox<'a, T> {
+    fn from(value: HerculesCPURef<'a>) -> Self {
+        HerculesBox {
+            cpu_alloc: None,
+            cpu_ref: Some(value),
+
+            #[cfg(feature = "cuda")]
+            cuda_alloc: None,
+            #[cfg(feature = "cuda")]
+            cuda_ref: None,
+        }
+    }
+}
+
+// If we are building from a mutable reference, we demote that to a non-mutable reference since we
+// don't hold mutable references. This means if we construct a box and then request a mutable
+// reference from it that we will copy the data into a new allocation and then return a reference
+// to that.
+impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesBox<'a, T> {
+    fn from(value: HerculesCPURefMut<'a>) -> Self {
+        HerculesBox {
+            cpu_alloc: None,
+            cpu_ref: Some(value.as_ref()),
+
+            #[cfg(feature = "cuda")]
+            cuda_alloc: None,
+            #[cfg(feature = "cuda")]
+            cuda_ref: None,
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARef<'a>> for HerculesBox<'a, T> {
+    fn from(value: HerculesCUDARef<'a>) -> Self {
+        HerculesBox {
+            cpu_alloc: None,
+            cpu_ref: None,
+            cpu_ref_mut: None,
+
+            #[cfg(feature = "cuda")]
+            cuda_alloc: None,
+            #[cfg(feature = "cuda")]
+            cuda_ref: Some(value),
+            #[cfg(feature = "cuda")]
+            cuda_ref_mut: None,
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesBox<'a, T> {
+    fn from(value: HerculesCUDARefMut<'a>) -> Self {
+        HerculesBox {
+            cpu_alloc: None,
+            cpu_ref: None,
+            cpu_ref_mut: None,
+
+            #[cfg(feature = "cuda")]
+            cuda_alloc: None,
+            #[cfg(feature = "cuda")]
+            cuda_ref: None,
+            #[cfg(feature = "cuda")]
+            cuda_ref_mut: Some(value),
+        }
+    }
+}
+
+impl<'a, T> HerculesBox<'a, T> 
+where 
+    T: Default + Clone
+{
+    fn as_slice(&'a mut self) -> &'a [T] {
+        self.as_cpu_ref().as_slice()
+    }
+
+    fn as_slice_mut(&'a mut self) -> &'a mut [T] {
+        self.as_cpu_ref_mut().as_slice()
+    }
+
+    fn as_cpu_ref(&'a mut self) -> HerculesCPURef<'a> {
+        if self.cpu_ref.is_some() {
+            self.cpu_ref.clone().unwrap()
+        } else if self.cpu_alloc.is_some() {
+            // This could occur if a mutable reference had been created from this box
+            let cpu_ref = HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice());
+            self.cpu_ref = Some(cpu_ref.clone());
+            cpu_ref
+        } else {
+            #[cfg(feature = "cuda")]
+            {
+                let cuda_ref =
+                    if self.cuda_ref.is_some() {
+                        Some(self.cuda_ref.clone().unwrap())
+                    } else if self.cuda_alloc.is_some() {
+                        Some(self.cuda_alloc.as_ref().unwrap().get_ref())
+                    } else {
+                        None
+                    };
+
+                if let Some(cuda_ref) = cuda_ref {
+                    let elements = cuda_ref.__size() / size_of::<T>();
+
+                    // Transfer memory back to CPU using our cpu_alloc
+                    let mut alloc = Vec::new();
+                    alloc.resize_with(elements, Default::default);
+                    self.cpu_alloc = Some(alloc);
+                    
+                    let alloc = self.cpu_alloc.as_mut().unwrap();
+                    assert!(alloc.len() == elements);
+
+                    let cpu_ref = cuda_ref.to_cpu_ref(alloc);
+                    self.cpu_ref = Some(cpu_ref.clone());
+                    return cpu_ref;
+                }
+            }
+
+            panic!("HerculesBox has no reference to data")
+        }
+    }
+
+    fn as_cpu_ref_mut(&'a mut self) -> HerculesCPURefMut<'a> {
+        // If we have a CPU allocation already, we'll use that and we don't need to make any copies
+        if self.cpu_alloc.is_some() {
+            // Eliminate any other references/allocations since the data may be modified
+            self.cpu_ref = None;
+            #[cfg(feature = "cuda")]
+            {
+                // TODO: We don't actually need to de-allocate our other allocations, just mark
+                // them so we know they don't hold the correct data
+                self.cuda_alloc = None;
+                self.cuda_ref = None;
+            }
+
+            HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap().as_mut())
+        } else if self.cpu_ref.is_some() {
+            // The data is in CPU memory, but we don't have exclusive access to it, so we need to
+            // copy it
+            let slice = self.cpu_ref.take().unwrap().as_slice();
+
+            #[cfg(feature = "cuda")]
+            {
+                self.cuda_alloc = None; // TODO
+                self.cuda_ref = None;
+            }
+
+            self.cpu_alloc = Some(slice.to_vec());
+
+            HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap().as_mut())
+        } else {
+            #[cfg(feature = "cuda")]
+            {
+                let cuda_ref =
+                    if self.cuda_ref.is_some() {
+                        Some(self.cuda_ref.clone().unwrap())
+                    } else if self.cuda_alloc.is_some() {
+                        Some(self.cuda_alloc.as_ref().unwrap().get_ref())
+                    } else {
+                        None
+                    };
+
+                if let Some(cuda_ref) = cuda_ref {
+                    let elements = cuda_ref.__size() / size_of::<T>();
+
+                    // Transfer memory back to CPU using our cpu_alloc
+                    let mut alloc = Vec::new();
+                    alloc.resize_with(elements, Default::default);
+                    self.cpu_alloc = Some(alloc);
+
+                    let alloc = self.cpu_alloc.as_mut().unwrap();
+                    assert!(alloc.len() == elements);
+
+                    let cpu_ref = cuda_ref.to_cpu_ref(alloc);
+
+                    // Eliminate other references
+                    self.cpu_ref = None;
+                    self.cuda_alloc = None; // TODO
+                    self.cuda_ref = None;
+
+                    return cpu_ref;
+                }
+            }
+
+            panic!("HerculesBox has no reference to data")
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    fn as_cuda_ref(&'a mut self) -> HerculesCUDARef<'a> {
+        if self.cuda_ref.is_some() {
+            self.cuda_ref.clone().unwrap()
+        } else if self.cuda_alloc.is_some() {
+            // This could occur if a mutable reference had been created from this box
+            let cuda_ref = self.cuda_alloc.as_ref().unwrap().get_ref();
+            self.cuda_ref = Some(cuda_ref.clone());
+            cuda_ref
+        } else {
+            let cpu_ref =
+                if self.cpu_ref.is_some() {
+                    Some(cpu_ref.clone().unwrap())
+                } else if self.cpu_alloc.is_some() {
+                    Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice()))
+                } else {
+                    None
+                };
+
+            if let Some(cpu_ref) = cpu_ref {
+                // Copy data to CUDA device
+                self.cuda_alloc = Some(CUDABox::from_cpu_ref(cpu_ref));
+                let alloc = self.cuda_alloc.as_ref().unwrap();
+
+                let cuda_ref = alloc.get_ref();
+                self.cuda_ref = Some(cuda_ref.clone());
+                return cuda_ref;
+            }
+
+            panic!("HerculesBox has no reference to data")
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    fn as_cuda_ref_mut(&'a mut self) -> HerculesCUDARefMut<'a> {
+        if self.cuda_alloc.is_some() {
+            self.cpu_alloc = None; // TODO
+            self.cpu_ref = None;
+            self.cuda_ref = None;
+
+            self.cuda_alloc.as_mut().unwrap().get_ref_mut()
+        } else if self.cuda_ref.is_some() {
+            // The data is in CUDA memory, but we don't have exclusive access to it, so we need to
+            // copy it
+            let cuda_alloc = CUDABox::from_cuda_ref(self.cuda_ref.take().unwrap());
+            self.cuda_alloc = Some(cuda_alloc);
+
+            self.cpu_alloc = None; // TODO
+            self.cpu_ref = None;
+
+            self.cuda_alloc.as_mut().unwrap().get_ref_mut()
+        } else {
+            let cpu_ref =
+                if self.cpu_ref.is_some() {
+                    Some(cpu_ref.clone().unwrap())
+                } else if self.cpu_alloc.is_some() {
+                    Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice()))
+                } else {
+                    None
+                };
+
+            if let Some(cpu_ref) = cpu_ref {
+                // Copy data to CUDA device
+                self.cuda_alloc = Some(CUDABox::from_cpu_ref(cpu_ref));
+
+                self.cpu_alloc = None; // TODO
+                self.cpu_ref = None;
+
+                return self.cuda_alloc.as_mut().unwrap().get_ref_mut();
+            }
+
+            panic!("HerculesBox has no reference to data")
+        }
+    }
+}
-- 
GitLab


From c076928b463f57acff0efd783d996d1424a08e91 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Wed, 5 Feb 2025 09:48:23 -0600
Subject: [PATCH 02/11] Fix HerculesBox

---
 hercules_rt/src/lib.rs | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index e99f12e3..10985cc7 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -392,7 +392,7 @@ pub struct HerculesBox<'a, T> {
     cpu_ref: Option<HerculesCPURef<'a>>,
 
     #[cfg(feature = "cuda")]
-    cuda_alloc: Option<CudaBox>,
+    cuda_alloc: Option<CUDABox>,
     #[cfg(feature = "cuda")]
     cuda_ref: Option<HerculesCUDARef<'a>>,
 }
@@ -449,14 +449,11 @@ impl<'a, T> From<HerculesCUDARef<'a>> for HerculesBox<'a, T> {
         HerculesBox {
             cpu_alloc: None,
             cpu_ref: None,
-            cpu_ref_mut: None,
 
             #[cfg(feature = "cuda")]
             cuda_alloc: None,
             #[cfg(feature = "cuda")]
             cuda_ref: Some(value),
-            #[cfg(feature = "cuda")]
-            cuda_ref_mut: None,
         }
     }
 }
@@ -467,14 +464,11 @@ impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesBox<'a, T> {
         HerculesBox {
             cpu_alloc: None,
             cpu_ref: None,
-            cpu_ref_mut: None,
 
             #[cfg(feature = "cuda")]
             cuda_alloc: None,
             #[cfg(feature = "cuda")]
-            cuda_ref: None,
-            #[cfg(feature = "cuda")]
-            cuda_ref_mut: Some(value),
+            cuda_ref: Some(value.as_ref()),
         }
     }
 }
@@ -610,7 +604,7 @@ where
         } else {
             let cpu_ref =
                 if self.cpu_ref.is_some() {
-                    Some(cpu_ref.clone().unwrap())
+                    Some(self.cpu_ref.clone().unwrap())
                 } else if self.cpu_alloc.is_some() {
                     Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice()))
                 } else {
@@ -652,7 +646,7 @@ where
         } else {
             let cpu_ref =
                 if self.cpu_ref.is_some() {
-                    Some(cpu_ref.clone().unwrap())
+                    Some(self.cpu_ref.clone().unwrap())
                 } else if self.cpu_alloc.is_some() {
                     Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice()))
                 } else {
-- 
GitLab


From 4694aae1fdea13f503133e5ebd168b3abb8d3f3a Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Wed, 5 Feb 2025 14:34:05 -0600
Subject: [PATCH 03/11] Fix immutable boxes and make them thread safe

---
 hercules_rt/src/lib.rs                  | 325 ++++++++----------------
 juno_samples/cava/src/main.rs           |  74 ++----
 juno_samples/edge_detection/src/main.rs |  90 ++-----
 3 files changed, 158 insertions(+), 331 deletions(-)

diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 10985cc7..6d03bd25 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -1,8 +1,12 @@
+#![feature(once_cell_try)]
+
 use std::alloc::{alloc, dealloc, Layout};
 use std::marker::PhantomData;
 use std::ptr::{copy_nonoverlapping, write_bytes, NonNull};
 use std::slice::{from_raw_parts, from_raw_parts_mut};
 
+use std::sync::OnceLock;
+
 /*
  * Define supporting types, functions, and macros for Hercules RT functions. For
  * a more in-depth discussion of the design of these utilities, see hercules_cg/
@@ -383,287 +387,178 @@ unsafe impl Sync for __RawPtrSendSync {}
  * The data held at all of its non-None allocations and references is maintained so that it is the
  * same, and so methods will attempt to use the reference or allocation that is most convenient.
  * 
- * When mutable references are required of this Box, it will allocate space on the required device
- * and eliminate any references or allocations it had on other devices since it must assume its
- * data is modified through the mutable reference.
+ * HerculesImmBox holds references to immutable memory only. All operations on it go through
+ * immutable references, though internally it uses OnceLocks to protect its resources since the Box
+ * may be used in multiple parallel threads if it is used in parallel Hercules code invocations.
+ * We use OnceLocks since the data is immutable and so once it has been placed on a device movement
+ * is not necessary.
+ *
+ * We maintain the invariant that at least one of the device references is always set; their
+ * associated allocations may or may not be set, as those may not be needed if the allocation is
+ * held elsewhere.
+ *
+ * HerculesMutBox is TODO.
  */
-pub struct HerculesBox<'a, T> {
-    cpu_alloc: Option<Vec<T>>,
-    cpu_ref: Option<HerculesCPURef<'a>>,
-
+pub struct HerculesImmBox<'a, T> {
+    // NOTE: We only need OnceLock if we're allowed to launch multiple Hercules programs in
+    // parallel; if that's not necessary we can probably get away with using OnceCell
+    #[allow(dead_code)]
+    cpu_alloc: OnceLock<Vec<T>>,
     #[cfg(feature = "cuda")]
-    cuda_alloc: Option<CUDABox>,
+    cuda_alloc: OnceLock<CUDABox>,
+
+    cpu_ref: OnceLock<HerculesCPURef<'a>>,
     #[cfg(feature = "cuda")]
-    cuda_ref: Option<HerculesCUDARef<'a>>,
+    cuda_ref: OnceLock<HerculesCUDARef<'a>>,
 }
 
-impl<'a, T> From<&'a [T]> for HerculesBox<'a, T> {
+impl<'a, T> From<&'a [T]> for HerculesImmBox<'a, T> {
     fn from(value: &'a [T]) -> Self {
-        HerculesBox {
-            cpu_alloc: None,
-            cpu_ref: Some(HerculesCPURef::from_slice(value)),
-
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
             #[cfg(feature = "cuda")]
-            cuda_alloc: None,
+            cuda_alloc: OnceLock::new(),
+
+            cpu_ref: OnceLock::from(HerculesCPURef::from_slice(value)),
             #[cfg(feature = "cuda")]
-            cuda_ref: None,
+            cuda_ref: OnceLock::new(),
         }
     }
 }
 
-impl<'a, T> From<HerculesCPURef<'a>> for HerculesBox<'a, T> {
+impl<'a, T> From<HerculesCPURef<'a>> for HerculesImmBox<'a, T> {
     fn from(value: HerculesCPURef<'a>) -> Self {
-        HerculesBox {
-            cpu_alloc: None,
-            cpu_ref: Some(value),
-
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
             #[cfg(feature = "cuda")]
-            cuda_alloc: None,
+            cuda_alloc: OnceLock::new(),
+
+            cpu_ref: OnceLock::from(value),
             #[cfg(feature = "cuda")]
-            cuda_ref: None,
+            cuda_ref: OnceLock::new(),
         }
     }
 }
 
 // If we are building from a mutable reference, we demote that to a non-mutable reference since we
-// don't hold mutable references. This means if we construct a box and then request a mutable
-// reference from it that we will copy the data into a new allocation and then return a reference
-// to that.
-impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesBox<'a, T> {
+// don't hold mutable references.
+impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesImmBox<'a, T> {
     fn from(value: HerculesCPURefMut<'a>) -> Self {
-        HerculesBox {
-            cpu_alloc: None,
-            cpu_ref: Some(value.as_ref()),
-
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
             #[cfg(feature = "cuda")]
-            cuda_alloc: None,
+            cuda_alloc: OnceLock::new(),
+
+            cpu_ref: OnceLock::from(value.as_ref()),
             #[cfg(feature = "cuda")]
-            cuda_ref: None,
+            cuda_ref: OnceLock::new(),
         }
     }
 }
 
 #[cfg(feature = "cuda")]
-impl<'a, T> From<HerculesCUDARef<'a>> for HerculesBox<'a, T> {
+impl<'a, T> From<HerculesCUDARef<'a>> for HerculesImmBox<'a, T> {
     fn from(value: HerculesCUDARef<'a>) -> Self {
-        HerculesBox {
-            cpu_alloc: None,
-            cpu_ref: None,
-
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
             #[cfg(feature = "cuda")]
-            cuda_alloc: None,
+            cuda_alloc: OnceLock::new(),
+
+            cpu_ref: OnceLock::new(),
             #[cfg(feature = "cuda")]
-            cuda_ref: Some(value),
+            cuda_ref: OnceLock::from(value),
         }
     }
 }
 
 #[cfg(feature = "cuda")]
-impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesBox<'a, T> {
+impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesImmBox<'a, T> {
     fn from(value: HerculesCUDARefMut<'a>) -> Self {
-        HerculesBox {
-            cpu_alloc: None,
-            cpu_ref: None,
-
+        HerculesImmBox {
+            cpu_alloc: OnceLock::new(),
             #[cfg(feature = "cuda")]
-            cuda_alloc: None,
+            cuda_alloc: OnceLock::new(),
+
+            cpu_ref: OnceLock::new(),
             #[cfg(feature = "cuda")]
-            cuda_ref: Some(value.as_ref()),
+            cuda_ref: OnceLock::from(value.as_ref()),
         }
     }
 }
 
-impl<'a, T> HerculesBox<'a, T> 
+impl<'a, T> HerculesImmBox<'a, T> 
 where 
     T: Default + Clone
 {
-    fn as_slice(&'a mut self) -> &'a [T] {
+    pub fn as_slice(&'a self) -> &'a [T] {
         self.as_cpu_ref().as_slice()
     }
 
-    fn as_slice_mut(&'a mut self) -> &'a mut [T] {
-        self.as_cpu_ref_mut().as_slice()
+    pub fn to_vec(&'a self) -> Vec<T> {
+        Vec::from(self.as_cpu_ref().as_slice())
     }
 
-    fn as_cpu_ref(&'a mut self) -> HerculesCPURef<'a> {
-        if self.cpu_ref.is_some() {
-            self.cpu_ref.clone().unwrap()
-        } else if self.cpu_alloc.is_some() {
-            // This could occur if a mutable reference had been created from this box
-            let cpu_ref = HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice());
-            self.cpu_ref = Some(cpu_ref.clone());
-            cpu_ref
+    pub fn as_cpu_ref(&'a self) -> HerculesCPURef<'a> {
+        if let Some(cpu_ref) = self.cpu_ref.get() {
+            cpu_ref.clone()
         } else {
             #[cfg(feature = "cuda")]
-            {
-                let cuda_ref =
-                    if self.cuda_ref.is_some() {
-                        Some(self.cuda_ref.clone().unwrap())
-                    } else if self.cuda_alloc.is_some() {
-                        Some(self.cuda_alloc.as_ref().unwrap().get_ref())
-                    } else {
-                        None
-                    };
-
-                if let Some(cuda_ref) = cuda_ref {
-                    let elements = cuda_ref.__size() / size_of::<T>();
-
-                    // Transfer memory back to CPU using our cpu_alloc
-                    let mut alloc = Vec::new();
-                    alloc.resize_with(elements, Default::default);
-                    self.cpu_alloc = Some(alloc);
-                    
-                    let alloc = self.cpu_alloc.as_mut().unwrap();
-                    assert!(alloc.len() == elements);
-
-                    let cpu_ref = cuda_ref.to_cpu_ref(alloc);
-                    self.cpu_ref = Some(cpu_ref.clone());
-                    return cpu_ref;
-                }
+            if let Some(cuda_ref) = self.cuda_ref.get() {
+                return 
+                    self.cpu_ref.get_or_init(|| {
+                        let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
+
+                        let mut alloc = Vec::new();
+                        alloc.resize_with(elements, Default::default);
+                        let _ = cuda_ref.clone().to_cpu_ref(&mut alloc);
+
+                        self.cpu_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cpu_alloc was set unexpectedly");
+                        let alloc = self.cpu_alloc.get().unwrap();
+                        HerculesCPURef::from_slice(alloc)
+                    }).clone();
             }
 
-            panic!("HerculesBox has no reference to data")
+            panic!("HerculesImmBox has no reference to data")
         }
     }
 
-    fn as_cpu_ref_mut(&'a mut self) -> HerculesCPURefMut<'a> {
-        // If we have a CPU allocation already, we'll use that and we don't need to make any copies
-        if self.cpu_alloc.is_some() {
-            // Eliminate any other references/allocations since the data may be modified
-            self.cpu_ref = None;
-            #[cfg(feature = "cuda")]
-            {
-                // TODO: We don't actually need to de-allocate our other allocations, just mark
-                // them so we know they don't hold the correct data
-                self.cuda_alloc = None;
-                self.cuda_ref = None;
-            }
-
-            HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap().as_mut())
-        } else if self.cpu_ref.is_some() {
-            // The data is in CPU memory, but we don't have exclusive access to it, so we need to
-            // copy it
-            let slice = self.cpu_ref.take().unwrap().as_slice();
-
-            #[cfg(feature = "cuda")]
-            {
-                self.cuda_alloc = None; // TODO
-                self.cuda_ref = None;
-            }
-
-            self.cpu_alloc = Some(slice.to_vec());
-
-            HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap().as_mut())
+    #[cfg(feature = "cuda")]
+    pub fn as_cuda_ref(&'a self) -> HerculesCUDARef<'a> {
+        if let Some(cuda_ref) = self.cuda_ref.get() {
+            cuda_ref.clone()
         } else {
-            #[cfg(feature = "cuda")]
-            {
-                let cuda_ref =
-                    if self.cuda_ref.is_some() {
-                        Some(self.cuda_ref.clone().unwrap())
-                    } else if self.cuda_alloc.is_some() {
-                        Some(self.cuda_alloc.as_ref().unwrap().get_ref())
-                    } else {
-                        None
-                    };
-
-                if let Some(cuda_ref) = cuda_ref {
-                    let elements = cuda_ref.__size() / size_of::<T>();
-
-                    // Transfer memory back to CPU using our cpu_alloc
-                    let mut alloc = Vec::new();
-                    alloc.resize_with(elements, Default::default);
-                    self.cpu_alloc = Some(alloc);
-
-                    let alloc = self.cpu_alloc.as_mut().unwrap();
-                    assert!(alloc.len() == elements);
-
-                    let cpu_ref = cuda_ref.to_cpu_ref(alloc);
-
-                    // Eliminate other references
-                    self.cpu_ref = None;
-                    self.cuda_alloc = None; // TODO
-                    self.cuda_ref = None;
-
-                    return cpu_ref;
-                }
+            if let Some(cpu_ref) = self.cpu_ref.get() {
+                return self.cuda_ref.get_or_init(|| {
+                    // Copy data to CUDA device
+                    let alloc = CUDABox::from_cpu_ref(cpu_ref.clone());
+                    self.cuda_alloc.set(alloc).map_err(|_| ()).expect("HerculesImmBox cuda_alloc was set unexpectedly");
+
+                    self.cuda_alloc.get().unwrap().get_ref()
+                }).clone();
             }
 
-            panic!("HerculesBox has no reference to data")
+            panic!("HerculesImmBox has no reference to data")
         }
     }
+}
 
-    #[cfg(feature = "cuda")]
-    fn as_cuda_ref(&'a mut self) -> HerculesCUDARef<'a> {
-        if self.cuda_ref.is_some() {
-            self.cuda_ref.clone().unwrap()
-        } else if self.cuda_alloc.is_some() {
-            // This could occur if a mutable reference had been created from this box
-            let cuda_ref = self.cuda_alloc.as_ref().unwrap().get_ref();
-            self.cuda_ref = Some(cuda_ref.clone());
-            cuda_ref
-        } else {
-            let cpu_ref =
-                if self.cpu_ref.is_some() {
-                    Some(self.cpu_ref.clone().unwrap())
-                } else if self.cpu_alloc.is_some() {
-                    Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice()))
-                } else {
-                    None
-                };
-
-            if let Some(cpu_ref) = cpu_ref {
-                // Copy data to CUDA device
-                self.cuda_alloc = Some(CUDABox::from_cpu_ref(cpu_ref));
-                let alloc = self.cuda_alloc.as_ref().unwrap();
-
-                let cuda_ref = alloc.get_ref();
-                self.cuda_ref = Some(cuda_ref.clone());
-                return cuda_ref;
-            }
+pub trait HerculesBoxTo<'a, T> {
+    fn to(&'a self) -> T;
+}
 
-            panic!("HerculesBox has no reference to data")
-        }
+impl<'a, T> HerculesBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
+where T: Default + Clone
+{
+    fn to(&'a self) -> HerculesCPURef<'a> {
+        self.as_cpu_ref()
     }
+}
 
-    #[cfg(feature = "cuda")]
-    fn as_cuda_ref_mut(&'a mut self) -> HerculesCUDARefMut<'a> {
-        if self.cuda_alloc.is_some() {
-            self.cpu_alloc = None; // TODO
-            self.cpu_ref = None;
-            self.cuda_ref = None;
-
-            self.cuda_alloc.as_mut().unwrap().get_ref_mut()
-        } else if self.cuda_ref.is_some() {
-            // The data is in CUDA memory, but we don't have exclusive access to it, so we need to
-            // copy it
-            let cuda_alloc = CUDABox::from_cuda_ref(self.cuda_ref.take().unwrap());
-            self.cuda_alloc = Some(cuda_alloc);
-
-            self.cpu_alloc = None; // TODO
-            self.cpu_ref = None;
-
-            self.cuda_alloc.as_mut().unwrap().get_ref_mut()
-        } else {
-            let cpu_ref =
-                if self.cpu_ref.is_some() {
-                    Some(self.cpu_ref.clone().unwrap())
-                } else if self.cpu_alloc.is_some() {
-                    Some(HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap().as_slice()))
-                } else {
-                    None
-                };
-
-            if let Some(cpu_ref) = cpu_ref {
-                // Copy data to CUDA device
-                self.cuda_alloc = Some(CUDABox::from_cpu_ref(cpu_ref));
-
-                self.cpu_alloc = None; // TODO
-                self.cpu_ref = None;
-
-                return self.cuda_alloc.as_mut().unwrap().get_ref_mut();
-            }
-
-            panic!("HerculesBox has no reference to data")
-        }
+#[cfg(feature = "cuda")]
+impl<'a, T> HerculesBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
+where T: Default + Clone
+{
+    fn to(&'a self) -> HerculesCUDARef<'a> {
+        self.as_cuda_ref()
     }
 }
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index e8a7e4e9..9d0f4702 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -8,16 +8,12 @@ use self::camera_model::*;
 use self::cava_rust::CHAN;
 use self::image_proc::*;
 
-#[cfg(feature = "cuda")]
-use hercules_rt::CUDABox;
-use hercules_rt::{runner, HerculesCPURef};
+use hercules_rt::{runner, HerculesImmBox, HerculesBoxTo};
 
 use image::ImageError;
 
 use clap::Parser;
 
-use std::mem;
-
 juno_build::juno!("cava");
 
 fn run_cava(
@@ -38,62 +34,34 @@ fn run_cava(
     assert_eq!(coefs.len(), 4 * CHAN);
     assert_eq!(tonemap.len(), 256 * CHAN);
 
-    #[cfg(not(feature = "cuda"))]
-    {
-        let image = HerculesCPURef::from_slice(image);
-        let tstw = HerculesCPURef::from_slice(tstw);
-        let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts);
-        let weights = HerculesCPURef::from_slice(weights);
-        let coefs = HerculesCPURef::from_slice(coefs);
-        let tonemap = HerculesCPURef::from_slice(tonemap);
-        let mut r = runner!(cava);
+    let image = HerculesImmBox::from(image);
+    let tstw = HerculesImmBox::from(tstw);
+    let ctrl_pts = HerculesImmBox::from(ctrl_pts);
+    let weights = HerculesImmBox::from(weights);
+    let coefs = HerculesImmBox::from(coefs);
+    let tonemap = HerculesImmBox::from(tonemap);
+
+    let mut r = runner!(cava);
+
+    HerculesImmBox::from(
         async_std::task::block_on(async {
             r.run(
                 rows as u64,
                 cols as u64,
                 num_ctrl_pts as u64,
-                image,
-                tstw,
-                ctrl_pts,
-                weights,
-                coefs,
-                tonemap,
+                image.to(),
+                tstw.to(),
+                ctrl_pts.to(),
+                weights.to(),
+                coefs.to(),
+                tonemap.to(),
             )
             .await
         })
-        .as_slice::<u8>()
-        .to_vec()
-        .into_boxed_slice()
-    }
-
-    #[cfg(feature = "cuda")]
-    {
-        let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image));
-        let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw));
-        let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts));
-        let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
-        let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
-        let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
-        let mut r = runner!(cava);
-        let res = async_std::task::block_on(async {
-            r.run(
-                rows as u64,
-                cols as u64,
-                num_ctrl_pts as u64,
-                image.get_ref(),
-                tstw.get_ref(),
-                ctrl_pts.get_ref(),
-                weights.get_ref(),
-                coefs.get_ref(),
-                tonemap.get_ref(),
-            )
-            .await
-        });
-        let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() };
-        let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
-        res.to_cpu_ref(&mut res_cpu);
-        res_cpu
-    }
+    )
+    .as_slice()
+    .to_vec()
+    .into_boxed_slice()
 }
 
 enum Error {
diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
index 3b067ebd..80a334b7 100644
--- a/juno_samples/edge_detection/src/main.rs
+++ b/juno_samples/edge_detection/src/main.rs
@@ -2,9 +2,7 @@
 
 mod edge_detection_rust;
 
-#[cfg(feature = "cuda")]
-use hercules_rt::CUDABox;
-use hercules_rt::{runner, HerculesCPURef};
+use hercules_rt::{runner, HerculesImmBox, HerculesBoxTo};
 
 use std::slice::from_raw_parts;
 
@@ -106,38 +104,18 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
         0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, 0.000036, 0.000363, 0.001446,
         0.002291, 0.001446, 0.000363, 0.000036,
     ];
-    #[cfg(not(feature = "cuda"))]
-    let gaussian_filter_h = HerculesCPURef::from_slice(&gaussian_filter);
-    #[cfg(feature = "cuda")]
-    let gaussian_filter_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&gaussian_filter));
-    #[cfg(feature = "cuda")]
-    let gaussian_filter_h = gaussian_filter_cuda.get_ref();
+    let gaussian_filter_h = HerculesImmBox::from(gaussian_filter.as_slice());
 
     let sz: usize = 3;
     let structure: Vec<f32> = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let structure_h = HerculesCPURef::from_slice(&structure);
-    #[cfg(feature = "cuda")]
-    let structure_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&structure));
-    #[cfg(feature = "cuda")]
-    let structure_h = structure_cuda.get_ref();
+    let structure_h = HerculesImmBox::from(structure.as_slice());
 
     let sb: usize = 3;
     let sx: Vec<f32> = vec![-1.0, 0.0, 1.0, -2.0, 0.0, 2.0, -1.0, 0.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let sx_h = HerculesCPURef::from_slice(&sx);
-    #[cfg(feature = "cuda")]
-    let sx_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sx));
-    #[cfg(feature = "cuda")]
-    let sx_h = sx_cuda.get_ref();
+    let sx_h = HerculesImmBox::from(sx.as_slice());
 
     let sy: Vec<f32> = vec![-1.0, -2.0, -1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 1.0];
-    #[cfg(not(feature = "cuda"))]
-    let sy_h = HerculesCPURef::from_slice(&sy);
-    #[cfg(feature = "cuda")]
-    let sy_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sy));
-    #[cfg(feature = "cuda")]
-    let sy_h = sy_cuda.get_ref();
+    let sy_h = HerculesImmBox::from(sy.as_slice());
 
     let theta: f32 = 0.1;
 
@@ -203,39 +181,28 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
 
         let input = unsafe { from_raw_parts(ptr, height * width) };
 
-        #[cfg(not(feature = "cuda"))]
-        let input_h = HerculesCPURef::from_slice(input);
-        #[cfg(feature = "cuda")]
-        let input_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(input));
-        #[cfg(feature = "cuda")]
-        let input_h = input_cuda.get_ref();
-
-        let result = async_std::task::block_on(async {
-            r.run(
-                height as u64,
-                width as u64,
-                gs as u64,
-                sz as u64,
-                sb as u64,
-                input_h,
-                gaussian_filter_h.clone(),
-                structure_h.clone(),
-                sx_h.clone(),
-                sy_h.clone(),
-                theta,
+        let input_h = HerculesImmBox::from(input);
+
+        let result =
+            HerculesImmBox::from(
+                async_std::task::block_on(async {
+                    r.run(
+                        height as u64,
+                        width as u64,
+                        gs as u64,
+                        sz as u64,
+                        sb as u64,
+                        input_h.to(),
+                        gaussian_filter_h.to(),
+                        structure_h.to(),
+                        sx_h.to(),
+                        sy_h.to(),
+                        theta,
+                    )
+                    .await
+                })
             )
-            .await
-        });
-
-        #[cfg(not(feature = "cuda"))]
-        let result: Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
-        #[cfg(feature = "cuda")]
-        let result: Box<[f32]> = {
-            let num_out = unsafe { result.__size() / std::mem::size_of::<f32>() };
-            let mut res_cpu: Box<[f32]> = vec![0.0; num_out].into_boxed_slice();
-            result.to_cpu_ref(&mut res_cpu);
-            res_cpu
-        };
+            .to_vec();
 
         if display {
             let result = frame_from_slice(&result, height, width);
@@ -261,10 +228,7 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
                 theta,
             );
 
-            assert_eq!(
-                result.as_ref(),
-                <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result)
-            );
+            assert_eq!(result, rust_result);
             println!("Frames {} match", i);
 
             if display_verify {
-- 
GitLab


From 71adbd48c355bd48deda33c34da0c74b72e4f8ea Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Wed, 5 Feb 2025 17:07:58 -0600
Subject: [PATCH 04/11] Add HerculesMutBox

---
 hercules_rt/src/lib.rs                  | 170 +++++++++++++++++++++++-
 juno_samples/cava/src/main.rs           |   2 +-
 juno_samples/edge_detection/src/main.rs |   2 +-
 3 files changed, 166 insertions(+), 8 deletions(-)

diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 6d03bd25..c287e093 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -397,11 +397,16 @@ unsafe impl Sync for __RawPtrSendSync {}
  * associated allocations may or may not be set, as those may not be needed if the allocation is
  * help elsewhere.
  *
- * HerculesMutBox is TODO.
+ * HerculesMutBox holds memory on some device and can produce mutable references to that data on
+ * any device. All these operations are through mutable references since this ensures exclusive
+ * access to the Box and therefore to the underlying device memory. Because of the exclusive
+ * access, locks are not needed.
+ *
+ * Note that a HerculesMutBox may hold multiple allocations at once, but it tracks the "definitive"
+ * copy to be the one borrowed mutably most recently (since it may have been updated). The extra
+ * allocations are kept around to avoid reallocation if memory is moved back to the device.
  */
 pub struct HerculesImmBox<'a, T> {
-    // NOTE: We only need OnceLock if we're allowed to launch multiple Hercules program in
-    // parallel, if that's not necessary we can probably get away with using OnceCell
     #[allow(dead_code)]
     cpu_alloc: OnceLock<Vec<T>>,
     #[cfg(feature = "cuda")]
@@ -542,11 +547,143 @@ where
     }
 }
 
-pub trait HerculesBoxTo<'a, T> {
+enum HerculesMutBoxLocation {
+    CPU,
+    #[cfg(feature = "cuda")]
+    CUDA,
+}
+
+pub struct HerculesMutBox<T> {
+    loc: HerculesMutBoxLocation,
+
+    cpu_alloc: Option<Vec<T>>,
+    #[cfg(feature = "cuda")]
+    cuda_alloc: Option<CUDABox>,
+}
+
+impl<T: Clone> From<&mut [T]> for HerculesMutBox<T> {
+    fn from(value: &mut [T]) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Some(value.to_vec()),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: None,
+        }
+    }
+}
+
+impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<T> {
+    fn from(value: HerculesCPURef<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Some(value.as_slice().to_vec()),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: None,
+        }
+    }
+}
+
+impl<'a, T: Clone> From<HerculesCPURefMut<'a>> for HerculesMutBox<T> {
+    fn from(value: HerculesCPURefMut<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Some(value.as_slice().to_vec()),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: None,
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<T> {
+    fn from(value: HerculesCUDARef<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CUDA,
+            cpu_alloc: None,
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Some(CUDABox::from_cuda_ref(value)),
+        }
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<T> {
+    fn from(value: HerculesCUDARefMut<'a>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CUDA,
+            cpu_alloc: None,
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Some(CUDABox::from_cuda_ref(value.as_ref())),
+        }
+    }
+}
+
+impl<T> HerculesMutBox<T>
+where
+    T: Default
+{
+    pub fn as_slice(&mut self) -> &mut [T] {
+        self.as_cpu_ref().as_slice()
+    }
+
+    pub fn to_vec(mut self) -> Vec<T> {
+        // Bring to CPU (if needed)
+        let _ = self.as_cpu_ref();
+        self.cpu_alloc.unwrap()
+    }
+
+    pub fn as_cpu_ref<'a>(&'a mut self) -> HerculesCPURefMut<'a> {
+        match self.loc {
+            HerculesMutBoxLocation::CPU => {
+                HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap())
+            }
+            #[cfg(feature = "cuda")]
+            HerculesMutBoxLocation::CUDA => {
+                let cuda_alloc = self.cuda_alloc.as_ref().unwrap();
+                let elements = cuda_alloc.get_bytes() / size_of::<T>();
+
+                // Allocate host memory (if needed)
+                if self.cpu_alloc.is_none() || self.cpu_alloc.as_ref().unwrap().len() != elements {
+                    let mut alloc = Vec::new();
+                    alloc.resize_with(elements, Default::default);
+                    self.cpu_alloc = Some(alloc);
+                }
+
+                // Transfer data from CUDA device
+                let cpu_alloc = self.cpu_alloc.as_mut().unwrap();
+                let _ = cuda_alloc.get_ref().to_cpu_ref(cpu_alloc);
+
+                self.loc = HerculesMutBoxLocation::CPU;
+                HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap())
+            }
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    pub fn as_cuda_ref<'a>(&'a mut self) -> HerculesCUDARefMut<'a> {
+        match self.loc {
+            HerculesMutBoxLocation::CPU => {
+                // TODO: CUDABox does not provide an interface for copying data to it, so currently
+                // we just reallocate it
+                let cpu_ref = HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap());
+                let cuda_alloc = CUDABox::from_cpu_ref(cpu_ref);
+
+                self.cuda_alloc = Some(cuda_alloc);
+                self.loc = HerculesMutBoxLocation::CUDA;
+                self.cuda_alloc.as_mut().unwrap().get_ref_mut()
+            }
+            HerculesMutBoxLocation::CUDA => {
+                self.cuda_alloc.as_mut().unwrap().get_ref_mut()
+            }
+        }
+    }
+}
+
+pub trait HerculesImmBoxTo<'a, T> {
     fn to(&'a self) -> T;
 }
 
-impl<'a, T> HerculesBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
+impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
 where T: Default + Clone
 {
     fn to(&'a self) -> HerculesCPURef<'a> {
@@ -555,10 +692,31 @@ where T: Default + Clone
 }
 
 #[cfg(feature = "cuda")]
-impl<'a, T> HerculesBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
+impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
 where T: Default + Clone
 {
     fn to(&'a self) -> HerculesCUDARef<'a> {
         self.as_cuda_ref()
     }
 }
+
+pub trait HerculesMutBoxTo<'a, T> {
+    fn to(&'a mut self) -> T;
+}
+
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<T>
+where T: Default + Clone
+{
+    fn to(&'a mut self) -> HerculesCPURefMut<'a> {
+        self.as_cpu_ref()
+    }
+}
+
+#[cfg(feature = "cuda")]
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<T>
+where T: Default + Clone
+{
+    fn to(&'a mut self) -> HerculesCUDARefMut<'a> {
+        self.as_cuda_ref()
+    }
+}
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index 9d0f4702..18024a0f 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -8,7 +8,7 @@ use self::camera_model::*;
 use self::cava_rust::CHAN;
 use self::image_proc::*;
 
-use hercules_rt::{runner, HerculesImmBox, HerculesBoxTo};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
 
 use image::ImageError;
 
diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
index 80a334b7..c74e2098 100644
--- a/juno_samples/edge_detection/src/main.rs
+++ b/juno_samples/edge_detection/src/main.rs
@@ -2,7 +2,7 @@
 
 mod edge_detection_rust;
 
-use hercules_rt::{runner, HerculesImmBox, HerculesBoxTo};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
 
 use std::slice::from_raw_parts;
 
-- 
GitLab


From 573d79f910f7bcdfbf82420d486e1677e08b7741 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Thu, 6 Feb 2025 08:56:55 -0600
Subject: [PATCH 05/11] Add example of safe runner interface

---
 juno_samples/cava/src/main.rs | 52 ++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index 18024a0f..2c5ddc13 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -16,6 +16,28 @@ use clap::Parser;
 
 juno_build::juno!("cava");
 
+async fn safe_run<'a>(
+    runner: &'a mut HerculesRunner_cava, r: u64, c: u64, num_ctrl_pts: u64,
+    input: &'a HerculesImmBox<'a, u8>, tstw: &'a HerculesImmBox<'a, f32>,
+    ctrl_pts: &'a HerculesImmBox<'a, f32>, weights: &'a HerculesImmBox<'a, f32>,
+    coefs: &'a HerculesImmBox<'a, f32>, tonemap: &'a HerculesImmBox<'a, f32>,
+) -> HerculesImmBox<'a, u8> {
+    HerculesImmBox::from(
+        runner.run(
+            r,
+            c,
+            num_ctrl_pts,
+            input.to(),
+            tstw.to(),
+            ctrl_pts.to(),
+            weights.to(),
+            coefs.to(),
+            tonemap.to()
+        )
+        .await
+    )
+}
+
 fn run_cava(
     rows: usize,
     cols: usize,
@@ -43,22 +65,20 @@ fn run_cava(
 
     let mut r = runner!(cava);
 
-    HerculesImmBox::from(
-        async_std::task::block_on(async {
-            r.run(
-                rows as u64,
-                cols as u64,
-                num_ctrl_pts as u64,
-                image.to(),
-                tstw.to(),
-                ctrl_pts.to(),
-                weights.to(),
-                coefs.to(),
-                tonemap.to(),
-            )
-            .await
-        })
-    )
+    async_std::task::block_on(async {
+        safe_run(&mut r,
+                 rows as u64,
+                 cols as u64,
+                 num_ctrl_pts as u64,
+                 &image,
+                 &tstw,
+                 &ctrl_pts,
+                 &weights,
+                 &coefs,
+                 &tonemap,
+        )
+        .await
+    })
     .as_slice()
     .to_vec()
     .into_boxed_slice()
-- 
GitLab


From 3693e9c9a90fce83c156906b01bbbea007f42fd9 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Thu, 6 Feb 2025 12:14:03 -0600
Subject: [PATCH 06/11] Improve HerculesMutBox. Edge detection safe interface
 issues

---
 hercules_rt/src/lib.rs                  | 176 +++++++++++++++++-------
 juno_samples/cava/src/main.rs           |   6 +-
 juno_samples/edge_detection/src/main.rs |  64 ++++++---
 3 files changed, 174 insertions(+), 72 deletions(-)

diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index c287e093..f8fdf2ef 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -282,6 +282,14 @@ impl<'a> HerculesCUDARefMut<'a> {
         }
     }
 
+    pub fn dup(&'a mut self) -> Self {
+        HerculesCUDARefMut {
+            ptr: self.ptr,
+            size: self.size,
+            _phantom: PhantomData,
+        }
+    }
+
     pub unsafe fn __ptr(&self) -> *mut u8 {
         self.ptr.as_ptr()
     }
@@ -553,127 +561,195 @@ enum HerculesMutBoxLocation {
     CUDA,
 }
 
-pub struct HerculesMutBox<T> {
+enum Allocation<R, A> {
+    None,
+    Reference(R),
+    Allocation(A),
+}
+
+impl<R, A> Allocation<R, A> {
+    fn take(&mut self) -> Allocation<R, A> {
+        std::mem::replace(self, Allocation::None)
+    }
+}
+
+pub struct HerculesMutBox<'a, T> {
     loc: HerculesMutBoxLocation,
 
-    cpu_alloc: Option<Vec<T>>,
+    cpu_alloc: Allocation<&'a mut [T], Vec<T>>,
     #[cfg(feature = "cuda")]
-    cuda_alloc: Option<CUDABox>,
+    cuda_alloc: Allocation<HerculesCUDARefMut<'a>, CUDABox>,
 }
 
-impl<T: Clone> From<&mut [T]> for HerculesMutBox<T> {
-    fn from(value: &mut [T]) -> Self {
+impl<'a, T> From<&'a mut [T]> for HerculesMutBox<'a, T> {
+    fn from(value: &'a mut [T]) -> Self {
         HerculesMutBox {
             loc: HerculesMutBoxLocation::CPU,
-            cpu_alloc: Some(value.to_vec()),
+            cpu_alloc: Allocation::Reference(value),
             #[cfg(feature = "cuda")]
-            cuda_alloc: None,
+            cuda_alloc: Allocation::None,
         }
     }
 }
 
-impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<T> {
+impl<'a, T> From<Vec<T>> for HerculesMutBox<'a, T> {
+    fn from(value: Vec<T>) -> Self {
+        HerculesMutBox {
+            loc: HerculesMutBoxLocation::CPU,
+            cpu_alloc: Allocation::Allocation(value),
+            #[cfg(feature = "cuda")]
+            cuda_alloc: Allocation::None,
+        }
+    }
+}
+
+impl<'a, T: Clone> From<HerculesCPURef<'a>> for HerculesMutBox<'a, T> {
     fn from(value: HerculesCPURef<'a>) -> Self {
         HerculesMutBox {
             loc: HerculesMutBoxLocation::CPU,
-            cpu_alloc: Some(value.as_slice().to_vec()),
+            cpu_alloc: Allocation::Allocation(value.as_slice().to_vec()),
             #[cfg(feature = "cuda")]
-            cuda_alloc: None,
+            cuda_alloc: Allocation::None,
         }
     }
 }
 
-impl<'a, T: Clone> From<HerculesCPURefMut<'a>> for HerculesMutBox<T> {
+impl<'a, T> From<HerculesCPURefMut<'a>> for HerculesMutBox<'a, T> {
     fn from(value: HerculesCPURefMut<'a>) -> Self {
         HerculesMutBox {
             loc: HerculesMutBoxLocation::CPU,
-            cpu_alloc: Some(value.as_slice().to_vec()),
+            cpu_alloc: Allocation::Reference(value.as_slice()),
             #[cfg(feature = "cuda")]
-            cuda_alloc: None,
+            cuda_alloc: Allocation::None,
         }
     }
 }
 
 #[cfg(feature = "cuda")]
-impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<T> {
+impl<'a, T> From<HerculesCUDARef<'a>> for HerculesMutBox<'a, T> {
     fn from(value: HerculesCUDARef<'a>) -> Self {
         HerculesMutBox {
             loc: HerculesMutBoxLocation::CUDA,
-            cpu_alloc: None,
+            cpu_alloc: Allocation::None,
             #[cfg(feature = "cuda")]
-            cuda_alloc: Some(CUDABox::from_cuda_ref(value)),
+            cuda_alloc: Allocation::Allocation(CUDABox::from_cuda_ref(value)),
         }
     }
 }
 
 #[cfg(feature = "cuda")]
-impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<T> {
+impl<'a, T> From<HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T> {
     fn from(value: HerculesCUDARefMut<'a>) -> Self {
         HerculesMutBox {
             loc: HerculesMutBoxLocation::CUDA,
-            cpu_alloc: None,
+            cpu_alloc: Allocation::None,
             #[cfg(feature = "cuda")]
-            cuda_alloc: Some(CUDABox::from_cuda_ref(value.as_ref())),
+            cuda_alloc: Allocation::Reference(value),
         }
     }
 }
 
-impl<T> HerculesMutBox<T>
+impl<'a, T> HerculesMutBox<'a, T>
 where
-    T: Default
+    T: Default + Clone
 {
-    pub fn as_slice(&mut self) -> &mut [T] {
+    pub fn as_slice(&'a mut self) -> &'a mut [T] {
         self.as_cpu_ref().as_slice()
     }
 
-    pub fn to_vec(mut self) -> Vec<T> {
-        // Bring to CPU (if needed)
-        let _ = self.as_cpu_ref();
-        self.cpu_alloc.unwrap()
-    }
-
-    pub fn as_cpu_ref<'a>(&'a mut self) -> HerculesCPURefMut<'a> {
+    pub fn as_cpu_ref(&'a mut self) -> HerculesCPURefMut<'a> {
         match self.loc {
             HerculesMutBoxLocation::CPU => {
-                HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap())
+                match self.cpu_alloc {
+                    Allocation::None => panic!("No CPU reference"),
+                    Allocation::Reference(ref mut val) => HerculesCPURefMut::from_slice(*val),
+                    Allocation::Allocation(ref mut val) => HerculesCPURefMut::from_slice::<T>(val),
+                }
             }
             #[cfg(feature = "cuda")]
             HerculesMutBoxLocation::CUDA => {
-                let cuda_alloc = self.cuda_alloc.as_ref().unwrap();
-                let elements = cuda_alloc.get_bytes() / size_of::<T>();
+                let cuda_ref : HerculesCUDARef<'a> =
+                    match self.cuda_alloc {
+                        Allocation::None => panic!("No GPU reference"),
+                        Allocation::Reference(ref mut val) => val.dup().as_ref(),
+                        Allocation::Allocation(ref val) => val.get_ref(),
+                    };
+
+                let elements = unsafe { cuda_ref.__size() / size_of::<T>() };
 
                 // Allocate host memory (if needed)
-                if self.cpu_alloc.is_none() || self.cpu_alloc.as_ref().unwrap().len() != elements {
-                    let mut alloc = Vec::new();
-                    alloc.resize_with(elements, Default::default);
-                    self.cpu_alloc = Some(alloc);
-                }
+                let cpu_alloc : Allocation<&'a mut [T], Vec<T>> =
+                    match self.cpu_alloc.take() {
+                        Allocation::Reference(val)  if val.len() == elements => Allocation::Reference(val),
+                        Allocation::Allocation(val) if val.len() == elements => Allocation::Allocation(val),
+                        _ => {
+                            let mut alloc = Vec::new();
+                            alloc.resize_with(elements, Default::default);
+                            Allocation::Allocation(alloc)
+                        }
+                    };
+                self.cpu_alloc = cpu_alloc;
+                let cpu_ref : &'a mut [T] =
+                    match &mut self.cpu_alloc {
+                        Allocation::None => panic!(),
+                        Allocation::Reference(val)  => val,
+                        Allocation::Allocation(val) => val,
+                    };
 
                 // Transfer data from CUDA device
-                let cpu_alloc = self.cpu_alloc.as_mut().unwrap();
-                let _ = cuda_alloc.get_ref().to_cpu_ref(cpu_alloc);
+                let cpu_ref = cuda_ref.to_cpu_ref(cpu_ref);
 
                 self.loc = HerculesMutBoxLocation::CPU;
-                HerculesCPURefMut::from_slice(self.cpu_alloc.as_mut().unwrap())
+                cpu_ref
             }
         }
     }
 
     #[cfg(feature = "cuda")]
-    pub fn as_cuda_ref<'a>(&'a mut self) -> HerculesCUDARefMut<'a> {
+    pub fn as_cuda_ref(&'a mut self) -> HerculesCUDARefMut<'a> {
         match self.loc {
             HerculesMutBoxLocation::CPU => {
-                // TODO: CUDABox does not provide an interface for copying data to it, so currently
-                // we just reallocate it
-                let cpu_ref = HerculesCPURef::from_slice(self.cpu_alloc.as_ref().unwrap());
-                let cuda_alloc = CUDABox::from_cpu_ref(cpu_ref);
+                let cpu_ref : &'a [T] =
+                    match self.cpu_alloc {
+                        Allocation::None => panic!("No CPU reference"),
+                        Allocation::Reference(ref val) => val,
+                        Allocation::Allocation(ref val) => val,
+                    };
+
+                let size = cpu_ref.len() * size_of::<T>();
+                let (cuda_alloc, copied) =
+                    match self.cuda_alloc.take() {
+                        Allocation::Reference(val)  if unsafe { val.__size() == size } => (Allocation::Reference(val), false),
+                        Allocation::Allocation(val) if val.get_bytes() == size => (Allocation::Allocation(val), false),
+                        _ => {
+                            let alloc = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(cpu_ref));
+                            (Allocation::Allocation(alloc), true)
+                        }
+                    };
+                self.cuda_alloc = cuda_alloc;
+
+                let cuda_ref =
+                    match self.cuda_alloc {
+                        Allocation::None => panic!(),
+                        Allocation::Reference(ref mut val) => val.dup(),
+                        Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+                    };
+
+                if !copied {
+                    unsafe {
+                        __copy_cpu_to_cuda(cuda_ref.__ptr(), cpu_ref.as_ptr() as *mut u8, size);
+                    }
+                }
 
-                self.cuda_alloc = Some(cuda_alloc);
                 self.loc = HerculesMutBoxLocation::CUDA;
-                self.cuda_alloc.as_mut().unwrap().get_ref_mut()
+                cuda_ref
             }
             HerculesMutBoxLocation::CUDA => {
-                self.cuda_alloc.as_mut().unwrap().get_ref_mut()
+                match self.cuda_alloc {
+                    Allocation::None => panic!("No GPU reference"),
+                    Allocation::Reference(ref mut val) => val.dup(),
+                    Allocation::Allocation(ref mut val) => val.get_ref_mut(),
+                }
             }
         }
     }
@@ -704,7 +780,7 @@ pub trait HerculesMutBoxTo<'a, T> {
     fn to(&'a mut self) -> T;
 }
 
-impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<T>
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCPURefMut<'a>> for HerculesMutBox<'a, T>
 where T: Default + Clone
 {
     fn to(&'a mut self) -> HerculesCPURefMut<'a> {
@@ -713,7 +789,7 @@ where T: Default + Clone
 }
 
 #[cfg(feature = "cuda")]
-impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<T>
+impl<'a, T> HerculesMutBoxTo<'a, HerculesCUDARefMut<'a>> for HerculesMutBox<'a, T>
 where T: Default + Clone
 {
     fn to(&'a mut self) -> HerculesCUDARefMut<'a> {
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index 2c5ddc13..c1acbe3f 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -8,7 +8,7 @@ use self::camera_model::*;
 use self::cava_rust::CHAN;
 use self::image_proc::*;
 
-use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
 
 use image::ImageError;
 
@@ -21,8 +21,8 @@ async fn safe_run<'a>(
     input: &'a HerculesImmBox<'a, u8>, tstw: &'a HerculesImmBox<'a, f32>,
     ctrl_pts: &'a HerculesImmBox<'a, f32>, weights: &'a HerculesImmBox<'a, f32>,
     coefs: &'a HerculesImmBox<'a, f32>, tonemap: &'a HerculesImmBox<'a, f32>,
-) -> HerculesImmBox<'a, u8> {
-    HerculesImmBox::from(
+) -> HerculesMutBox<'a, u8> {
+    HerculesMutBox::from(
         runner.run(
             r,
             c,
diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
index c74e2098..32dfaba9 100644
--- a/juno_samples/edge_detection/src/main.rs
+++ b/juno_samples/edge_detection/src/main.rs
@@ -2,7 +2,7 @@
 
 mod edge_detection_rust;
 
-use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo};
+use hercules_rt::{runner, HerculesImmBox, HerculesImmBoxTo, HerculesMutBox};
 
 use std::slice::from_raw_parts;
 
@@ -84,6 +84,33 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat {
     converted
 }
 
+async fn safe_run<'a>(runner: &'a mut HerculesRunner_edge_detection,
+                  n: u64, m: u64, gs: u64, sz: u64, sb: u64,
+                  input: &'a HerculesImmBox<'a, f32>,
+                  gaussian_filter: &'a HerculesImmBox<'a, f32>,
+                  structure: &'a HerculesImmBox<'a, f32>,
+                  sx: &'a HerculesImmBox<'a, f32>,
+                  sy: &'a HerculesImmBox<'a, f32>,
+                  theta: f32
+) -> HerculesMutBox<'a, f32> {
+    HerculesMutBox::from(
+        runner.run(
+            n,
+            m,
+            gs,
+            sz,
+            sb,
+            input.to(),
+            gaussian_filter.to(),
+            structure.to(),
+            sx.to(),
+            sy.to(),
+            theta,
+        )
+        .await
+    )
+}
+
 fn edge_detection_harness(args: EdgeDetectionInputs) {
     let EdgeDetectionInputs {
         input,
@@ -184,24 +211,23 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
         let input_h = HerculesImmBox::from(input);
 
         let result =
-            HerculesImmBox::from(
-                async_std::task::block_on(async {
-                    r.run(
-                        height as u64,
-                        width as u64,
-                        gs as u64,
-                        sz as u64,
-                        sb as u64,
-                        input_h.to(),
-                        gaussian_filter_h.to(),
-                        structure_h.to(),
-                        sx_h.to(),
-                        sy_h.to(),
-                        theta,
-                    )
-                    .await
-                })
-            )
+            async_std::task::block_on(async {
+                safe_run(&mut r,
+                         height as u64,
+                         width as u64,
+                         gs as u64,
+                         sz as u64,
+                         sb as u64,
+                         &input_h,
+                         &gaussian_filter_h,
+                         &structure_h,
+                         &sx_h,
+                         &sy_h,
+                         theta,
+                )
+                .await
+            })
+            .as_slice()
             .to_vec();
 
         if display {
-- 
GitLab


From 81525f538a4012f7c4a590a96929fe7ad025de63 Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Thu, 6 Feb 2025 14:28:58 -0600
Subject: [PATCH 07/11] Fixing lifetime issues

---
 hercules_rt/src/lib.rs                  | 14 +++++++-------
 juno_samples/cava/src/main.rs           |  8 ++++----
 juno_samples/edge_detection/src/main.rs | 12 ++++++------
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index f8fdf2ef..848309e3 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -503,15 +503,15 @@ impl<'a, T> HerculesImmBox<'a, T>
 where 
     T: Default + Clone
 {
-    pub fn as_slice(&'a self) -> &'a [T] {
+    pub fn as_slice<'b>(&'b self) -> &'a [T] {
         self.as_cpu_ref().as_slice()
     }
 
-    pub fn to_vec(&'a self) -> Vec<T> {
+    pub fn to_vec<'b>(&'b self) -> Vec<T> {
         Vec::from(self.as_cpu_ref().as_slice())
     }
 
-    pub fn as_cpu_ref(&'a self) -> HerculesCPURef<'a> {
+    pub fn as_cpu_ref<'b>(&'b self) -> HerculesCPURef<'a> {
         if let Some(cpu_ref) = self.cpu_ref.get() {
             cpu_ref.clone()
         } else {
@@ -536,7 +536,7 @@ where
     }
 
     #[cfg(feature = "cuda")]
-    pub fn as_cuda_ref(&'a self) -> HerculesCUDARef<'a> {
+    pub fn as_cuda_ref<'b>(&'b self) -> HerculesCUDARef<'a> {
         if let Some(cuda_ref) = self.cuda_ref.get() {
             cuda_ref.clone()
         } else {
@@ -756,13 +756,13 @@ where
 }
 
 pub trait HerculesImmBoxTo<'a, T> {
-    fn to(&'a self) -> T;
+    fn to(&self) -> T;
 }
 
 impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
 where T: Default + Clone
 {
-    fn to(&'a self) -> HerculesCPURef<'a> {
+    fn to(&self) -> HerculesCPURef<'a> {
         self.as_cpu_ref()
     }
 }
@@ -771,7 +771,7 @@ where T: Default + Clone
 impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
 where T: Default + Clone
 {
-    fn to(&'a self) -> HerculesCUDARef<'a> {
+    fn to(&self) -> HerculesCUDARef<'a> {
         self.as_cuda_ref()
     }
 }
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index c1acbe3f..72a9d823 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -16,11 +16,11 @@ use clap::Parser;
 
 juno_build::juno!("cava");
 
-async fn safe_run<'a>(
+async fn safe_run<'a, 'b: 'a>(
     runner: &'a mut HerculesRunner_cava, r: u64, c: u64, num_ctrl_pts: u64,
-    input: &'a HerculesImmBox<'a, u8>, tstw: &'a HerculesImmBox<'a, f32>,
-    ctrl_pts: &'a HerculesImmBox<'a, f32>, weights: &'a HerculesImmBox<'a, f32>,
-    coefs: &'a HerculesImmBox<'a, f32>, tonemap: &'a HerculesImmBox<'a, f32>,
+    input: &HerculesImmBox<'b, u8>, tstw: &HerculesImmBox<'b, f32>,
+    ctrl_pts: &HerculesImmBox<'b, f32>, weights: &HerculesImmBox<'b, f32>,
+    coefs: &HerculesImmBox<'b, f32>, tonemap: &HerculesImmBox<'b, f32>,
 ) -> HerculesMutBox<'a, u8> {
     HerculesMutBox::from(
         runner.run(
diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
index 32dfaba9..7452a0ec 100644
--- a/juno_samples/edge_detection/src/main.rs
+++ b/juno_samples/edge_detection/src/main.rs
@@ -84,13 +84,13 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat {
     converted
 }
 
-async fn safe_run<'a>(runner: &'a mut HerculesRunner_edge_detection,
+async fn safe_run<'a, 'b: 'a>(runner: &'a mut HerculesRunner_edge_detection,
                   n: u64, m: u64, gs: u64, sz: u64, sb: u64,
-                  input: &'a HerculesImmBox<'a, f32>,
-                  gaussian_filter: &'a HerculesImmBox<'a, f32>,
-                  structure: &'a HerculesImmBox<'a, f32>,
-                  sx: &'a HerculesImmBox<'a, f32>,
-                  sy: &'a HerculesImmBox<'a, f32>,
+                  input: &HerculesImmBox<'b, f32>,
+                  gaussian_filter: &HerculesImmBox<'b, f32>,
+                  structure: &HerculesImmBox<'b, f32>,
+                  sx: &HerculesImmBox<'b, f32>,
+                  sy: &HerculesImmBox<'b, f32>,
                   theta: f32
 ) -> HerculesMutBox<'a, f32> {
     HerculesMutBox::from(
-- 
GitLab


From 255898ca0b050139e2e8c65887bcbfbe1d86fd8f Mon Sep 17 00:00:00 2001
From: Aaron Councilman <aaronjc4@illinois.edu>
Date: Thu, 6 Feb 2025 14:47:15 -0600
Subject: [PATCH 08/11] Actually fixed this time

---
 hercules_rt/src/lib.rs                  | 14 +++++++-------
 juno_samples/cava/src/main.rs           | 10 ++++++----
 juno_samples/edge_detection/src/main.rs | 20 ++++++++++++--------
 3 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 848309e3..f8fdf2ef 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -503,15 +503,15 @@ impl<'a, T> HerculesImmBox<'a, T>
 where 
     T: Default + Clone
 {
-    pub fn as_slice<'b>(&'b self) -> &'a [T] {
+    pub fn as_slice(&'a self) -> &'a [T] {
         self.as_cpu_ref().as_slice()
     }
 
-    pub fn to_vec<'b>(&'b self) -> Vec<T> {
+    pub fn to_vec(&'a self) -> Vec<T> {
         Vec::from(self.as_cpu_ref().as_slice())
     }
 
-    pub fn as_cpu_ref<'b>(&'b self) -> HerculesCPURef<'a> {
+    pub fn as_cpu_ref(&'a self) -> HerculesCPURef<'a> {
         if let Some(cpu_ref) = self.cpu_ref.get() {
             cpu_ref.clone()
         } else {
@@ -536,7 +536,7 @@ where
     }
 
     #[cfg(feature = "cuda")]
-    pub fn as_cuda_ref<'b>(&'b self) -> HerculesCUDARef<'a> {
+    pub fn as_cuda_ref(&'a self) -> HerculesCUDARef<'a> {
         if let Some(cuda_ref) = self.cuda_ref.get() {
             cuda_ref.clone()
         } else {
@@ -756,13 +756,13 @@ where
 }
 
 pub trait HerculesImmBoxTo<'a, T> {
-    fn to(&self) -> T;
+    fn to(&'a self) -> T;
 }
 
 impl<'a, T> HerculesImmBoxTo<'a, HerculesCPURef<'a>> for HerculesImmBox<'a, T>
 where T: Default + Clone
 {
-    fn to(&self) -> HerculesCPURef<'a> {
+    fn to(&'a self) -> HerculesCPURef<'a> {
         self.as_cpu_ref()
     }
 }
@@ -771,7 +771,7 @@ where T: Default + Clone
 impl<'a, T> HerculesImmBoxTo<'a, HerculesCUDARef<'a>> for HerculesImmBox<'a, T>
 where T: Default + Clone
 {
-    fn to(&self) -> HerculesCUDARef<'a> {
+    fn to(&'a self) -> HerculesCUDARef<'a> {
         self.as_cuda_ref()
     }
 }
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index 72a9d823..b4a0f6fd 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -16,11 +16,13 @@ use clap::Parser;
 
 juno_build::juno!("cava");
 
-async fn safe_run<'a, 'b: 'a>(
+// Individual lifetimes are not needed in this example but should probably be generated for
+// flexibility
+async fn safe_run<'a, 'b: 'a, 'c: 'a, 'd: 'a, 'e: 'a, 'f: 'a, 'g: 'a>(
     runner: &'a mut HerculesRunner_cava, r: u64, c: u64, num_ctrl_pts: u64,
-    input: &HerculesImmBox<'b, u8>, tstw: &HerculesImmBox<'b, f32>,
-    ctrl_pts: &HerculesImmBox<'b, f32>, weights: &HerculesImmBox<'b, f32>,
-    coefs: &HerculesImmBox<'b, f32>, tonemap: &HerculesImmBox<'b, f32>,
+    input: &'b HerculesImmBox<'b, u8>, tstw: &'c HerculesImmBox<'c, f32>,
+    ctrl_pts: &'d HerculesImmBox<'d, f32>, weights: &'e HerculesImmBox<'e, f32>,
+    coefs: &'f HerculesImmBox<'f, f32>, tonemap: &'g HerculesImmBox<'g, f32>,
 ) -> HerculesMutBox<'a, u8> {
     HerculesMutBox::from(
         runner.run(
diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
index 7452a0ec..9605e69d 100644
--- a/juno_samples/edge_detection/src/main.rs
+++ b/juno_samples/edge_detection/src/main.rs
@@ -84,14 +84,18 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat {
     converted
 }
 
-async fn safe_run<'a, 'b: 'a>(runner: &'a mut HerculesRunner_edge_detection,
-                  n: u64, m: u64, gs: u64, sz: u64, sb: u64,
-                  input: &HerculesImmBox<'b, f32>,
-                  gaussian_filter: &HerculesImmBox<'b, f32>,
-                  structure: &HerculesImmBox<'b, f32>,
-                  sx: &HerculesImmBox<'b, f32>,
-                  sy: &HerculesImmBox<'b, f32>,
-                  theta: f32
+// If all of the HerculesImmBox are given lifetimes 'b: &'b HerculesImmBox<'b, f32>
+// this fails to borrow-check (I think because the input is declared inside the loop while
+// everything else is declared outside it)
+async fn safe_run<'a, 'b: 'a, 'c: 'a, 'd: 'a, 'e: 'a, 'f: 'a>(
+    runner: &'a mut HerculesRunner_edge_detection,
+    n: u64, m: u64, gs: u64, sz: u64, sb: u64,
+    input: &'b HerculesImmBox<'b, f32>,
+    gaussian_filter: &'c HerculesImmBox<'c, f32>,
+    structure: &'d HerculesImmBox<'d, f32>,
+    sx: &'e HerculesImmBox<'e, f32>,
+    sy: &'f HerculesImmBox<'f, f32>,
+    theta: f32
 ) -> HerculesMutBox<'a, f32> {
     HerculesMutBox::from(
         runner.run(
-- 
GitLab


From 27ce61d09d94e20a1584876a5615fc6bfadd98bb Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Fri, 7 Feb 2025 10:15:26 -0600
Subject: [PATCH 09/11] Clarifying comment regarding run() lifetimes

---
 hercules_cg/src/rt.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 2c5f7c35..6ea9b45e 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -1001,6 +1001,9 @@ impl<'a> RTContext<'a> {
             )?;
         }
         write!(w, "}}}}")?;
+        // Every reference, including the runner and the Hercules Refs, have the
+        // same lifetime, since a returned object may come from backing memory
+        // or from one of the parameters.
         write!(w, "async fn run<'a>(&'a mut self")?;
         for idx in 0..func.num_dynamic_constants {
             write!(w, ", dc_p{}: u64", idx)?;
-- 
GitLab


From 3b1f9e24af05094d2de630ccf299728f4e139372 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 9 Feb 2025 09:25:32 -0600
Subject: [PATCH 10/11] Be more precise about lifetimes in RT backend

---
 hercules_cg/src/rt.rs          | 50 +++++++++++++++++++++++++++++-----
 hercules_ir/src/collections.rs |  6 +++-
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 30b1d22c..b79e4953 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -1013,10 +1013,28 @@ impl<'a> RTContext<'a> {
             )?;
         }
         write!(w, "}}}}")?;
-        // Every reference, including the runner and the Hercules Refs, have the
-        // same lifetime, since a returned object may come from backing memory
-        // or from one of the parameters.
-        write!(w, "async fn run<'a>(&'a mut self")?;
+
+        // Every reference that may be returned has the same lifetime. Every
+        // other reference gets its own unique lifetime.
+        let returned_origins: HashSet<_> = self.collection_objects[&self.func_id]
+            .returned_objects()
+            .into_iter()
+            .map(|obj| self.collection_objects[&self.func_id].origin(*obj))
+            .collect();
+
+        write!(w, "async fn run<'runner, 'returned")?;
+        for idx in 0..func.param_types.len() {
+            write!(w, ", 'p{}", idx)?;
+        }
+        write!(
+            w,
+            ">(&'{} mut self",
+            if returned_origins.iter().any(|origin| !origin.is_parameter()) {
+                "returned"
+            } else {
+                "runner"
+            }
+        )?;
         for idx in 0..func.num_dynamic_constants {
             write!(w, ", dc_p{}: u64", idx)?;
         }
@@ -1032,8 +1050,19 @@ impl<'a> RTContext<'a> {
                 let mutability = if param_muts[idx] { "Mut" } else { "" };
                 write!(
                     w,
-                    ", p{}: ::hercules_rt::Hercules{}Ref{}<'a>",
-                    idx, device, mutability
+                    ", p{}: ::hercules_rt::Hercules{}Ref{}<'{}>",
+                    idx,
+                    device,
+                    mutability,
+                    if returned_origins.iter().any(|origin| origin
+                        .try_parameter()
+                        .map(|oidx| idx == oidx)
+                        .unwrap_or(false))
+                    {
+                        "returned".to_string()
+                    } else {
+                        format!("p{}", idx)
+                    }
                 )?;
             }
         }
@@ -1048,10 +1077,13 @@ impl<'a> RTContext<'a> {
             let mutability = if return_mut { "Mut" } else { "" };
             write!(
                 w,
-                ") -> ::hercules_rt::Hercules{}Ref{}<'a> {{",
+                ") -> ::hercules_rt::Hercules{}Ref{}<'returned> {{",
                 device, mutability
             )?;
         }
+
+        // Start with possibly re-allocating the backing memory if it's not
+        // large enough.
         write!(w, "unsafe {{")?;
         for (device, (total, _)) in self.backing_allocations[&self.func_id].iter() {
             write!(w, "let size = ")?;
@@ -1087,6 +1119,8 @@ impl<'a> RTContext<'a> {
                 )?;
             }
         }
+
+        // Call the wrapped function.
         write!(w, "let ret = {}(", func.name)?;
         for (device, _) in self.backing_allocations[&self.func_id].iter() {
             write!(
@@ -1120,6 +1154,8 @@ impl<'a> RTContext<'a> {
             )?;
         }
         write!(w, "}}}}")?;
+
+        // De-allocate the backing memory on drop.
         write!(
             w,
             "}}impl Drop for HerculesRunner_{} {{#[allow(unused_unsafe)]fn drop(&mut self) {{unsafe {{",
diff --git a/hercules_ir/src/collections.rs b/hercules_ir/src/collections.rs
index 1bc650e9..d236d5b5 100644
--- a/hercules_ir/src/collections.rs
+++ b/hercules_ir/src/collections.rs
@@ -36,7 +36,7 @@ use crate::*;
  * - For each function, which collection objects may be returned?
  * - For each collection object, how was it originated?
  */
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum CollectionObjectOrigin {
     Parameter(usize),
     Constant(NodeID),
@@ -57,6 +57,10 @@ pub struct FunctionCollectionObjects {
 pub type CollectionObjects = BTreeMap<FunctionID, FunctionCollectionObjects>;
 
 impl CollectionObjectOrigin {
+    pub fn is_parameter(&self) -> bool {
+        self.try_parameter().is_some()
+    }
+
     pub fn try_parameter(&self) -> Option<usize> {
         match self {
             CollectionObjectOrigin::Parameter(index) => Some(*index),
-- 
GitLab


From 3d6432d7fa7a44ad5811a81e35041c7be3770a97 Mon Sep 17 00:00:00 2001
From: Russel Arbore <russel.jma@gmail.com>
Date: Sun, 9 Feb 2025 09:30:44 -0600
Subject: [PATCH 11/11] edge detection no longer needs outlives ('b: 'a)
 lifetime annotations

---
 juno_samples/edge_detection/src/main.rs | 80 +++++++++++++------------
 1 file changed, 41 insertions(+), 39 deletions(-)

diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
index 9605e69d..60ccb515 100644
--- a/juno_samples/edge_detection/src/main.rs
+++ b/juno_samples/edge_detection/src/main.rs
@@ -84,34 +84,36 @@ fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat {
     converted
 }
 
-// If all of the HerculesImmBox are given lifetimes 'b: &'b HerculesImmBox<'b, f32>
-// this fails to borrow-check (I think because the input is declared inside the loop while
-// everything else is declared outside it)
-async fn safe_run<'a, 'b: 'a, 'c: 'a, 'd: 'a, 'e: 'a, 'f: 'a>(
+async fn safe_run<'a, 'b, 'c, 'd, 'e, 'f>(
     runner: &'a mut HerculesRunner_edge_detection,
-    n: u64, m: u64, gs: u64, sz: u64, sb: u64,
+    n: u64,
+    m: u64,
+    gs: u64,
+    sz: u64,
+    sb: u64,
     input: &'b HerculesImmBox<'b, f32>,
     gaussian_filter: &'c HerculesImmBox<'c, f32>,
     structure: &'d HerculesImmBox<'d, f32>,
     sx: &'e HerculesImmBox<'e, f32>,
     sy: &'f HerculesImmBox<'f, f32>,
-    theta: f32
+    theta: f32,
 ) -> HerculesMutBox<'a, f32> {
     HerculesMutBox::from(
-        runner.run(
-            n,
-            m,
-            gs,
-            sz,
-            sb,
-            input.to(),
-            gaussian_filter.to(),
-            structure.to(),
-            sx.to(),
-            sy.to(),
-            theta,
-        )
-        .await
+        runner
+            .run(
+                n,
+                m,
+                gs,
+                sz,
+                sb,
+                input.to(),
+                gaussian_filter.to(),
+                structure.to(),
+                sx.to(),
+                sy.to(),
+                theta,
+            )
+            .await,
     )
 }
 
@@ -214,25 +216,25 @@ fn edge_detection_harness(args: EdgeDetectionInputs) {
 
         let input_h = HerculesImmBox::from(input);
 
-        let result =
-            async_std::task::block_on(async {
-                safe_run(&mut r,
-                         height as u64,
-                         width as u64,
-                         gs as u64,
-                         sz as u64,
-                         sb as u64,
-                         &input_h,
-                         &gaussian_filter_h,
-                         &structure_h,
-                         &sx_h,
-                         &sy_h,
-                         theta,
-                )
-                .await
-            })
-            .as_slice()
-            .to_vec();
+        let result = async_std::task::block_on(async {
+            safe_run(
+                &mut r,
+                height as u64,
+                width as u64,
+                gs as u64,
+                sz as u64,
+                sb as u64,
+                &input_h,
+                &gaussian_filter_h,
+                &structure_h,
+                &sx_h,
+                &sy_h,
+                theta,
+            )
+            .await
+        })
+        .as_slice()
+        .to_vec();
 
         if display {
             let result = frame_from_slice(&result, height, width);
-- 
GitLab