Commit 0187323d authored by rarbore2

Add CUDA support to HerculesBox

parent 9db302b1
1 merge request: !104 Add CUDA support to HerculesBox
build-job:
  stage: build
  script:
    - cargo build
test-job:
  stage: test
  script:
    - cargo test
    - cargo test --features=cuda
@@ -295,10 +295,19 @@ impl<'a> RTContext<'a> {
ref dynamic_constants,
ref args,
} => {
let device = self.devices[callee_id.idx()];
match device {
// The device backends ensure that device functions have the
// same C interface.
Device::LLVM | Device::CUDA => {
let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap();
let device = match device {
Device::LLVM => "cpu",
Device::CUDA => "cuda",
_ => panic!(),
};
// First, get the raw pointers to collections that the
// device function takes as input.
let callee_objs = &self.collection_objects[&callee_id];
@@ -308,16 +317,18 @@ impl<'a> RTContext<'a> {
if callee_objs.is_mutated(obj) {
write!(
block,
" let arg_tmp{} = unsafe {{ {}.__cpu_ptr_mut() }};\n",
" let arg_tmp{} = unsafe {{ {}.__{}_ptr_mut() }};\n",
idx,
self.get_value(*arg),
device
)?;
} else {
write!(
block,
" let arg_tmp{} = unsafe {{ {}.__cpu_ptr() }};\n",
" let arg_tmp{} = unsafe {{ {}.__{}_ptr() }};\n",
idx,
self.get_value(*arg),
device
)?;
}
} else {
@@ -401,7 +412,6 @@ impl<'a> RTContext<'a> {
}
write!(block, ").await;\n")?;
}
}
}
_ => panic!(
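
For intuition, here is a sketch of the glue code the runtime backend now generates when the callee is placed on a CUDA device. This is an illustrative reconstruction, not actual compiler output; the argument and callee names are invented:

    // Mutated argument: take a mutable device pointer, which invalidates
    // any CPU copies of the collection.
    let arg_tmp0 = unsafe { obj_a.__cuda_ptr_mut() };
    // Read-only argument: a shared device pointer; CPU copies stay valid.
    let arg_tmp1 = unsafe { obj_b.__cuda_ptr() };
    let ret = callee(arg_tmp0, arg_tmp1).await;

Because both backends expose the same C interface, the only difference from the CPU path is the accessor suffix ("cpu" vs. "cuda") spliced into the generated method names.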
@@ -329,7 +329,7 @@ pub enum Schedule {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Device {
LLVM,
CUDA,
// Entry functions are lowered to async Rust code that calls device
// functions (leaf nodes in the call graph), possibly concurrently.
AsyncRust,
@@ -4,5 +4,8 @@ version = "0.1.0"
authors = ["Russel Arbore <rarbore2@illinois.edu>"]
edition = "2021"
[features]
cuda = []

[dependencies]
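
For reference, a downstream crate opts into the new backend through an ordinary Cargo feature; a dependent manifest might contain the following (the crate name and path are illustrative):

    [dependencies]
    hercules_rt = { path = "../hercules_rt", features = ["cuda"] }

The new build script below compiles the CUDA runtime definitions with nvcc and links the resulting static library when the feature is active.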
use std::env::var;
use std::path::Path;
use std::process::Command;
fn main() {
    // Cargo exposes enabled features to build scripts through
    // CARGO_FEATURE_* environment variables; cfg!(feature = "cuda") is not
    // set when the build script itself is compiled.
    if var("CARGO_FEATURE_CUDA").is_ok() {
        let out_dir = var("OUT_DIR").unwrap();
        // Compile the CUDA runtime definitions to an object file in OUT_DIR.
        Command::new("nvcc")
            .args(&["src/rtdefs.cu", "-c", "-o"])
            .arg(&format!("{}/rtdefs.o", out_dir))
            .status()
            .expect("PANIC: NVCC failed when building runtime. Is NVCC installed?");
        // Archive the object into a static library that rustc can link.
        Command::new("ar")
            .args(&["crus", "librtdefs.a", "rtdefs.o"])
            .current_dir(&Path::new(&out_dir))
            .status()
            .unwrap();
        println!("cargo::rustc-link-search=native={}", out_dir);
        // Where libcudart typically lives on Debian/Ubuntu x86-64 systems.
        println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
        println!("cargo::rustc-link-lib=static=rtdefs");
        println!("cargo::rustc-link-lib=cudart");
        println!("cargo::rerun-if-changed=src/rtdefs.cu");
    }
}
@@ -4,6 +4,16 @@ use std::mem::swap;
use std::ptr::{copy_nonoverlapping, NonNull};
use std::slice::from_raw_parts;
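
// CUDA runtime helpers implemented in src/rtdefs.cu and linked in by the
// build script when the `cuda` feature is enabled.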
#[cfg(feature = "cuda")]
extern "C" {
fn cuda_alloc(size: usize) -> *mut u8;
fn cuda_alloc_zeroed(size: usize) -> *mut u8;
fn cuda_dealloc(ptr: *mut u8);
fn copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
fn copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
fn copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
}
/*
* An in-memory collection object that can be used by functions compiled by the
* Hercules compiler.
@@ -13,16 +23,23 @@ pub struct HerculesBox<'a> {
cpu_exclusive: Option<NonNull<u8>>,
cpu_owned: Option<NonNull<u8>>,
#[cfg(feature = "cuda")]
cuda_owned: Option<NonNull<u8>>,
size: usize,
_phantom: PhantomData<&'a u8>,
}
impl<'b, 'a: 'b> HerculesBox<'a> {
pub fn from_slice<T>(slice: &'a [T]) -> Self {
HerculesBox {
cpu_shared: Some(unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) }),
cpu_exclusive: None,
cpu_owned: None,
#[cfg(feature = "cuda")]
cuda_owned: None,
size: slice.len() * size_of::<T>(),
_phantom: PhantomData,
}
@@ -33,36 +50,69 @@ impl<'a> HerculesBox<'a> {
cpu_shared: None,
cpu_exclusive: Some(unsafe { NonNull::new_unchecked(slice.as_mut_ptr() as *mut u8) }),
cpu_owned: None,
#[cfg(feature = "cuda")]
cuda_owned: None,
size: slice.len() * size_of::<T>(),
_phantom: PhantomData,
}
}
pub fn as_slice<T>(&'b mut self) -> &'b [T] {
assert_eq!(self.size % size_of::<T>(), 0);
unsafe { from_raw_parts(self.__cpu_ptr() as *const T, self.size / size_of::<T>()) }
}
unsafe fn get_cpu_ptr(&self) -> Option<NonNull<u8>> {
self.cpu_owned.or(self.cpu_exclusive).or(self.cpu_shared)
}
#[cfg(feature = "cuda")]
unsafe fn get_cuda_ptr(&self) -> Option<NonNull<u8>> {
self.cuda_owned
}
unsafe fn allocate_cpu(&mut self) -> NonNull<u8> {
if let Some(ptr) = self.cpu_owned {
ptr
} else {
let ptr =
NonNull::new(alloc(Layout::from_size_align_unchecked(self.size, 16))).unwrap();
self.cpu_owned = Some(ptr);
ptr
}
}
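
// Allocate a device buffer for this collection on first use; later calls
// return the existing allocation.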
#[cfg(feature = "cuda")]
unsafe fn allocate_cuda(&mut self) -> NonNull<u8> {
if let Some(ptr) = self.cuda_owned {
ptr
} else {
let ptr = cuda_alloc(self.size);
self.cuda_owned = Some(NonNull::new(ptr).unwrap());
self.cuda_owned.unwrap()
}
}
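
// Free the owned CPU allocation, if any. Borrowed slices (cpu_shared,
// cpu_exclusive) are left untouched.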
unsafe fn deallocate_cpu(&mut self) {
if let Some(ptr) = self.cpu_owned {
dealloc(
ptr.as_ptr(),
Layout::from_size_align_unchecked(self.size, 16),
);
self.cpu_owned = None;
}
}
#[cfg(feature = "cuda")]
unsafe fn deallocate_cuda(&mut self) {
if let Some(ptr) = self.cuda_owned {
cuda_dealloc(ptr.as_ptr());
self.cuda_owned = None;
}
}
pub unsafe fn __zeros(size: u64) -> Self {
assert_ne!(size, 0);
let size = size as usize;
@@ -72,6 +122,10 @@ impl<'a> HerculesBox<'a> {
cpu_owned: Some(
NonNull::new(alloc_zeroed(Layout::from_size_align_unchecked(size, 16))).unwrap(),
),
#[cfg(feature = "cuda")]
cuda_owned: None,
size: size,
_phantom: PhantomData,
}
@@ -82,6 +136,10 @@ impl<'a> HerculesBox<'a> {
cpu_shared: None,
cpu_exclusive: None,
cpu_owned: None,
#[cfg(feature = "cuda")]
cuda_owned: None,
size: 0,
_phantom: PhantomData,
}
@@ -93,24 +151,61 @@ impl<'a> HerculesBox<'a> {
ret
}
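
// Return a pointer to the data on the CPU. If the only live copy is on
// the device, copy it back into a fresh CPU allocation first.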
pub unsafe fn __cpu_ptr(&mut self) -> *mut u8 {
if let Some(ptr) = self.get_cpu_ptr() {
return ptr.as_ptr();
}
#[cfg(feature = "cuda")]
{
let cuda_ptr = self.get_cuda_ptr().unwrap();
let cpu_ptr = self.allocate_cpu();
copy_cuda_to_cpu(cpu_ptr.as_ptr(), cuda_ptr.as_ptr(), self.size);
return cpu_ptr.as_ptr();
}
panic!()
}
pub unsafe fn __cpu_ptr_mut(&mut self) -> *mut u8 {
let mut cpu_ptr = self.__cpu_ptr();
// A shared slice cannot be handed out for mutation: copy it into an
// owned allocation first and return the owned pointer instead.
if Some(cpu_ptr) == self.cpu_shared.map(|nn| nn.as_ptr()) {
self.allocate_cpu();
copy_nonoverlapping(cpu_ptr, self.cpu_owned.unwrap().as_ptr(), self.size);
cpu_ptr = self.cpu_owned.unwrap().as_ptr();
}
// The caller may write through the pointer, so every other copy of the
// data is now stale and must be invalidated.
self.cpu_shared = None;
self.cpu_exclusive = None;
#[cfg(feature = "cuda")]
self.deallocate_cuda();
cpu_ptr
}
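
// Return a pointer to the data on the device, copying it over from the
// CPU on first access.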
#[cfg(feature = "cuda")]
pub unsafe fn __cuda_ptr(&mut self) -> *mut u8 {
if let Some(ptr) = self.get_cuda_ptr() {
ptr.as_ptr()
} else {
let cpu_ptr = self.get_cpu_ptr().unwrap();
let cuda_ptr = self.allocate_cuda();
copy_cpu_to_cuda(cuda_ptr.as_ptr(), cpu_ptr.as_ptr(), self.size);
cuda_ptr.as_ptr()
}
}
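
// Like __cuda_ptr, but invalidates all CPU copies, since the caller may
// mutate the data through the device pointer.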
#[cfg(feature = "cuda")]
pub unsafe fn __cuda_ptr_mut(&mut self) -> *mut u8 {
let cuda_ptr = self.__cuda_ptr();
self.cpu_shared = None;
self.cpu_exclusive = None;
self.deallocate_cpu();
cuda_ptr
}
}
impl<'a> Drop for HerculesBox<'a> {
fn drop(&mut self) {
unsafe {
self.deallocate_cpu();
#[cfg(feature = "cuda")]
self.deallocate_cuda();
}
}
}
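
As a minimal usage sketch of the resulting API (assuming the `cuda` feature is enabled; the data and pointer names are illustrative):

    let data: Vec<i32> = vec![1, 2, 3, 4];
    let mut hbox = HerculesBox::from_slice(&data);
    unsafe {
        // First device access allocates GPU memory and copies CPU -> CUDA.
        let dev_ptr = hbox.__cuda_ptr();
        // A device function could now read through dev_ptr.
        // The shared CPU slice is still valid, so no copy happens here.
        let host_ptr = hbox.__cpu_ptr();
        assert_eq!(host_ptr as *const u8, data.as_ptr() as *const u8);
        let _ = dev_ptr;
    }

The CUDA helpers declared in the extern "C" block above are implemented in src/rtdefs.cu: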
extern "C" {
void *cuda_alloc(size_t size) {
void *ptr = NULL;
cudaError_t res = cudaMalloc(&ptr, size);
if (res != cudaSuccess) {
ptr = NULL;
}
return ptr;
}
void *cuda_alloc_zeroed(size_t size) {
void *ptr = cuda_alloc(size);
if (!ptr) {
return NULL;
}
cudaError_t res = cudaMemset(ptr, 0, size);
if (res != cudaSuccess) {
return NULL;
}
return ptr;
}
void cuda_dealloc(void *ptr) {
cudaFree(ptr);
}
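
// Synchronous copy wrappers around cudaMemcpy. Errors from cudaMemcpy are
// ignored here; the Rust side assumes these copies succeed.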
void copy_cpu_to_cuda(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
}
void copy_cuda_to_cpu(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
}
void copy_cuda_to_cuda(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice);
}
}
@@ -23,7 +23,7 @@ fn main() {
}
let a = HerculesBox::from_slice_mut(&mut a);
let b = HerculesBox::from_slice_mut(&mut b);
let mut c = matmul(I as u64, J as u64, K as u64, a, b).await;
assert_eq!(c.as_slice::<i32>(), &*correct_c);
});
}
@@ -21,17 +21,17 @@ fn main() {
}
}
}
let mut c = {
let a = HerculesBox::from_slice(&a);
let b = HerculesBox::from_slice(&b);
matmul(I as u64, J as u64, K as u64, a, b).await
};
assert_eq!(c.as_slice::<i32>(), &*correct_c);
let mut tiled_c = {
let a = HerculesBox::from_slice(&a);
let b = HerculesBox::from_slice(&b);
tiled_64_matmul(I as u64, J as u64, K as u64, a, b).await
};
assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
});
}