diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d8af3a03dd006708a95167fc8dc9f7d49381a372..b80dd590ab42b4019daa285a83ec89d978884006 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,9 +1,5 @@
-build-job:
-  stage: build
-  script:
-    - cargo build
-
 test-job:
   stage: test
   script:
     - cargo test
+    - cargo test --features=cuda
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 65f7121576c8b7c5d703a140ad90ea6262005cd8..13370c4580a9365f0a9013b573e558133656d407 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -295,10 +295,19 @@ impl<'a> RTContext<'a> {
                 ref dynamic_constants,
                 ref args,
             } => {
-                match self.devices[callee_id.idx()] {
-                    Device::LLVM => {
+                let device = self.devices[callee_id.idx()];
+                match device {
+                    // The device backends ensure that device functions have the
+                    // same C interface.
+                    Device::LLVM | Device::CUDA => {
                         let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap();
 
+                        let device = match device {
+                            Device::LLVM => "cpu",
+                            Device::CUDA => "cuda",
+                            _ => panic!(),
+                        };
+
                         // First, get the raw pointers to collections that the
                         // device function takes as input.
                         let callee_objs = &self.collection_objects[&callee_id];
@@ -308,16 +317,18 @@ impl<'a> RTContext<'a> {
                                 if callee_objs.is_mutated(obj) {
                                     write!(
                                         block,
-                                        "                let arg_tmp{} = unsafe {{ {}.__cpu_ptr_mut() }};\n",
+                                        "                let arg_tmp{} = unsafe {{ {}.__{}_ptr_mut() }};\n",
                                         idx,
-                                        self.get_value(*arg)
+                                        self.get_value(*arg),
+                                        device
                                     )?;
                                 } else {
                                     write!(
                                         block,
-                                        "                let arg_tmp{} = unsafe {{ {}.__cpu_ptr() }};\n",
+                                        "                let arg_tmp{} = unsafe {{ {}.__{}_ptr() }};\n",
                                         idx,
-                                        self.get_value(*arg)
+                                        self.get_value(*arg),
+                                        device
                                     )?;
                                 }
                             } else {
@@ -401,7 +412,6 @@ impl<'a> RTContext<'a> {
                         }
                         write!(block, ").await;\n")?;
                     }
-                    _ => todo!(),
                 }
             }
             _ => panic!(
diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs
index 3c5ed0bdfa9c6a7e7c9c55c9da16782ff5cc7cde..4fd0cf0b11a55beb3320e4211b2c52f7c0e4e38d 100644
--- a/hercules_ir/src/ir.rs
+++ b/hercules_ir/src/ir.rs
@@ -329,7 +329,7 @@ pub enum Schedule {
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub enum Device {
     LLVM,
-    NVVM,
+    CUDA,
     // Entry functions are lowered to async Rust code that calls device
     // functions (leaf nodes in the call graph), possibly concurrently.
     AsyncRust,
diff --git a/hercules_rt/Cargo.toml b/hercules_rt/Cargo.toml
index 7201f41127a8a307b1df6a361743da889a2b49b8..c4678b18918f323e601a75b0efa1f0289b9b8de9 100644
--- a/hercules_rt/Cargo.toml
+++ b/hercules_rt/Cargo.toml
@@ -4,5 +4,8 @@ version = "0.1.0"
 authors = ["Russel Arbore <rarbore2@illinois.edu>"]
 edition = "2021"
 
+[features]
+cuda = []
+
 [dependencies]
 
diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..15b9f6396d2c90eee66dfecb5a255cef2890726b
--- /dev/null
+++ b/hercules_rt/build.rs
@@ -0,0 +1,25 @@
+use std::env::var;
+use std::path::Path;
+use std::process::Command;
+
+fn main() {
+    if cfg!(feature = "cuda") {
+        let out_dir = var("OUT_DIR").unwrap();
+        Command::new("nvcc")
+            .args(&["src/rtdefs.cu", "-c", "-o"])
+            .arg(&format!("{}/rtdefs.o", out_dir))
+            .status()
+            .expect("PANIC: NVCC failed when building runtime. Is NVCC installed?");
+        Command::new("ar")
+            .args(&["crus", "librtdefs.a", "rtdefs.o"])
+            .current_dir(&Path::new(&out_dir))
+            .status()
+            .unwrap();
+
+        println!("cargo::rustc-link-search=native={}", out_dir);
+        println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
+        println!("cargo::rustc-link-lib=static=rtdefs");
+        println!("cargo::rustc-link-lib=cudart");
+        println!("cargo::rerun-if-changed=src/rtdefs.cu");
+    }
+}
diff --git a/hercules_rt/src/lib.rs b/hercules_rt/src/lib.rs
index 50ac260c160deb37c40707809438870a1f3acf57..2a96970e88e2b432c8c9c2ed896dc1c623781cbd 100644
--- a/hercules_rt/src/lib.rs
+++ b/hercules_rt/src/lib.rs
@@ -4,6 +4,16 @@ use std::mem::swap;
 use std::ptr::{copy_nonoverlapping, NonNull};
 use std::slice::from_raw_parts;
 
+#[cfg(feature = "cuda")]
+extern "C" {
+    fn cuda_alloc(size: usize) -> *mut u8;
+    fn cuda_alloc_zeroed(size: usize) -> *mut u8;
+    fn cuda_dealloc(ptr: *mut u8);
+    fn copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
+    fn copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
+    fn copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
+}
+
 /*
  * An in-memory collection object that can be used by functions compiled by the
  * Hercules compiler.
@@ -13,16 +23,23 @@ pub struct HerculesBox<'a> {
     cpu_exclusive: Option<NonNull<u8>>,
     cpu_owned: Option<NonNull<u8>>,
 
+    #[cfg(feature = "cuda")]
+    cuda_owned: Option<NonNull<u8>>,
+
     size: usize,
     _phantom: PhantomData<&'a u8>,
 }
 
-impl<'a> HerculesBox<'a> {
+impl<'b, 'a: 'b> HerculesBox<'a> {
     pub fn from_slice<T>(slice: &'a [T]) -> Self {
         HerculesBox {
             cpu_shared: Some(unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) }),
             cpu_exclusive: None,
             cpu_owned: None,
+
+            #[cfg(feature = "cuda")]
+            cuda_owned: None,
+
             size: slice.len() * size_of::<T>(),
             _phantom: PhantomData,
         }
@@ -33,36 +50,69 @@ impl<'a> HerculesBox<'a> {
             cpu_shared: None,
             cpu_exclusive: Some(unsafe { NonNull::new_unchecked(slice.as_mut_ptr() as *mut u8) }),
             cpu_owned: None,
+
+            #[cfg(feature = "cuda")]
+            cuda_owned: None,
+
             size: slice.len() * size_of::<T>(),
             _phantom: PhantomData,
         }
     }
 
-    pub fn as_slice<T>(&'a self) -> &'a [T] {
+    pub fn as_slice<T>(&'b mut self) -> &'b [T] {
         assert_eq!(self.size % size_of::<T>(), 0);
         unsafe { from_raw_parts(self.__cpu_ptr() as *const T, self.size / size_of::<T>()) }
     }
 
-    unsafe fn into_cpu(&self) -> NonNull<u8> {
-        self.cpu_shared
-            .or(self.cpu_exclusive)
-            .or(self.cpu_owned)
-            .unwrap()
+    unsafe fn get_cpu_ptr(&self) -> Option<NonNull<u8>> {
+        self.cpu_owned.or(self.cpu_exclusive).or(self.cpu_shared)
+    }
+
+    #[cfg(feature = "cuda")]
+    unsafe fn get_cuda_ptr(&self) -> Option<NonNull<u8>> {
+        self.cuda_owned
     }
 
-    unsafe fn into_cpu_mut(&mut self) -> NonNull<u8> {
-        if let Some(ptr) = self.cpu_exclusive.or(self.cpu_owned) {
+    unsafe fn allocate_cpu(&mut self) -> NonNull<u8> {
+        if let Some(ptr) = self.cpu_owned {
             ptr
         } else {
             let ptr =
                 NonNull::new(alloc(Layout::from_size_align_unchecked(self.size, 16))).unwrap();
-            copy_nonoverlapping(self.cpu_shared.unwrap().as_ptr(), ptr.as_ptr(), self.size);
             self.cpu_owned = Some(ptr);
-            self.cpu_shared = None;
             ptr
         }
     }
 
+    #[cfg(feature = "cuda")]
+    unsafe fn allocate_cuda(&mut self) -> NonNull<u8> {
+        if let Some(ptr) = self.cuda_owned {
+            ptr
+        } else {
+            let ptr = cuda_alloc(self.size);
+            self.cuda_owned = Some(NonNull::new(ptr).unwrap());
+            self.cuda_owned.unwrap()
+        }
+    }
+
+    unsafe fn deallocate_cpu(&mut self) {
+        if let Some(ptr) = self.cpu_owned {
+            dealloc(
+                ptr.as_ptr(),
+                Layout::from_size_align_unchecked(self.size, 16),
+            );
+            self.cpu_owned = None;
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    unsafe fn deallocate_cuda(&mut self) {
+        if let Some(ptr) = self.cuda_owned {
+            cuda_dealloc(ptr.as_ptr());
+            self.cuda_owned = None;
+        }
+    }
+
     pub unsafe fn __zeros(size: u64) -> Self {
         assert_ne!(size, 0);
         let size = size as usize;
@@ -72,6 +122,10 @@ impl<'a> HerculesBox<'a> {
             cpu_owned: Some(
                 NonNull::new(alloc_zeroed(Layout::from_size_align_unchecked(size, 16))).unwrap(),
             ),
+
+            #[cfg(feature = "cuda")]
+            cuda_owned: None,
+
             size: size,
             _phantom: PhantomData,
         }
@@ -82,6 +136,10 @@ impl<'a> HerculesBox<'a> {
             cpu_shared: None,
             cpu_exclusive: None,
             cpu_owned: None,
+
+            #[cfg(feature = "cuda")]
+            cuda_owned: None,
+
             size: 0,
             _phantom: PhantomData,
         }
@@ -93,24 +151,61 @@ impl<'a> HerculesBox<'a> {
         ret
     }
 
-    pub unsafe fn __cpu_ptr(&self) -> *mut u8 {
-        self.into_cpu().as_ptr()
+    pub unsafe fn __cpu_ptr(&mut self) -> *mut u8 {
+        if let Some(ptr) = self.get_cpu_ptr() {
+            return ptr.as_ptr();
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let cuda_ptr = self.get_cuda_ptr().unwrap();
+            let cpu_ptr = self.allocate_cpu();
+            copy_cuda_to_cpu(cpu_ptr.as_ptr(), cuda_ptr.as_ptr(), self.size);
+            return cpu_ptr.as_ptr();
+        }
+        panic!()
     }
 
     pub unsafe fn __cpu_ptr_mut(&mut self) -> *mut u8 {
-        self.into_cpu_mut().as_ptr()
+        let cpu_ptr = self.__cpu_ptr();
+        if Some(cpu_ptr) == self.cpu_shared.map(|nn| nn.as_ptr()) {
+            self.allocate_cpu();
+            copy_nonoverlapping(cpu_ptr, self.cpu_owned.unwrap().as_ptr(), self.size);
+        }
+        self.cpu_shared = None;
+        self.cpu_exclusive = None;
+        #[cfg(feature = "cuda")]
+        self.deallocate_cuda();
+        cpu_ptr
+    }
+
+    #[cfg(feature = "cuda")]
+    pub unsafe fn __cuda_ptr(&mut self) -> *mut u8 {
+        if let Some(ptr) = self.get_cuda_ptr() {
+            ptr.as_ptr()
+        } else {
+            let cpu_ptr = self.get_cpu_ptr().unwrap();
+            let cuda_ptr = self.allocate_cuda();
+            copy_cpu_to_cuda(cuda_ptr.as_ptr(), cpu_ptr.as_ptr(), self.size);
+            cuda_ptr.as_ptr()
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    pub unsafe fn __cuda_ptr_mut(&mut self) -> *mut u8 {
+        let cuda_ptr = self.__cuda_ptr();
+        self.cpu_shared = None;
+        self.cpu_exclusive = None;
+        self.deallocate_cpu();
+        cuda_ptr
     }
 }
 
 impl<'a> Drop for HerculesBox<'a> {
     fn drop(&mut self) {
-        if let Some(ptr) = self.cpu_owned {
-            unsafe {
-                dealloc(
-                    ptr.as_ptr(),
-                    Layout::from_size_align_unchecked(self.size, 16),
-                )
-            }
+        unsafe {
+            self.deallocate_cpu();
+            #[cfg(feature = "cuda")]
+            self.deallocate_cuda();
         }
     }
 }
diff --git a/hercules_rt/src/rtdefs.cu b/hercules_rt/src/rtdefs.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b7378d816c65666b445a989fc6b0530d821beb7b
--- /dev/null
+++ b/hercules_rt/src/rtdefs.cu
@@ -0,0 +1,38 @@
+extern "C" {
+	void *cuda_alloc(size_t size) {
+		void *ptr = NULL;
+		cudaError_t res = cudaMalloc(&ptr, size);
+		if (res != cudaSuccess) {
+			ptr = NULL;
+		}
+		return ptr;
+	}
+	
+	void *cuda_alloc_zeroed(size_t size) {
+		void *ptr = cuda_alloc(size);
+		if (!ptr) {
+			return NULL;
+		}
+		cudaError_t res = cudaMemset(ptr, 0, size);
+		if (res != cudaSuccess) {
+			return NULL;
+		}
+		return ptr;
+	}
+	
+	void cuda_dealloc(void *ptr) {
+		cudaFree(ptr);
+	}
+	
+	void copy_cpu_to_cuda(void *dst, void *src, size_t size) {
+		cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
+	}
+	
+	void copy_cuda_to_cpu(void *dst, void *src, size_t size) {
+		cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
+	}
+	
+	void copy_cuda_to_cuda(void *dst, void *src, size_t size) {
+		cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice);
+	}
+}
diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs
index 587109f88336f987450b24d1eb3c972fd58da2e3..767fda07195544ae291e0e9e731a1ee43bd1615b 100644
--- a/hercules_samples/matmul/src/main.rs
+++ b/hercules_samples/matmul/src/main.rs
@@ -23,7 +23,7 @@ fn main() {
         }
         let a = HerculesBox::from_slice_mut(&mut a);
         let b = HerculesBox::from_slice_mut(&mut b);
-        let c = matmul(I as u64, J as u64, K as u64, a, b).await;
+        let mut c = matmul(I as u64, J as u64, K as u64, a, b).await;
         assert_eq!(c.as_slice::<i32>(), &*correct_c);
     });
 }
diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs
index e6fb25436b2f7d901a475c726face0b28017bc1b..bace3765789010655ae058aa92f95862b93b3084 100644
--- a/juno_samples/matmul/src/main.rs
+++ b/juno_samples/matmul/src/main.rs
@@ -21,17 +21,17 @@ fn main() {
                 }
             }
         }
-        let c = {
+        let mut c = {
             let a = HerculesBox::from_slice(&a);
             let b = HerculesBox::from_slice(&b);
             matmul(I as u64, J as u64, K as u64, a, b).await
         };
-        let tiled_c = {
+        assert_eq!(c.as_slice::<i32>(), &*correct_c);
+        let mut tiled_c = {
             let a = HerculesBox::from_slice(&a);
             let b = HerculesBox::from_slice(&b);
             tiled_64_matmul(I as u64, J as u64, K as u64, a, b).await
         };
-        assert_eq!(c.as_slice::<i32>(), &*correct_c);
         assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
     });
 }