From d4a8a9488da6f6d49e8a098f26e38a2a28e38e93 Mon Sep 17 00:00:00 2001
From: prrathi <prrathi10@gmail.com>
Date: Wed, 29 Jan 2025 21:58:34 -0600
Subject: [PATCH] WIP: add CPU/GPU schedules and CUDA run paths to samples (not fixed yet, switching machines)

---
 hercules_samples/call/build.rs                |  2 +
 hercules_samples/call/src/cpu.sch             | 19 +++++++++
 hercules_samples/call/src/gpu.sch             | 18 ++++++++
 hercules_samples/ccp/build.rs                 |  2 +
 hercules_samples/ccp/src/cpu.sch              | 19 +++++++++
 hercules_samples/ccp/src/gpu.sch              | 18 ++++++++
 hercules_samples/dot/src/gpu.sch              |  5 +--
 hercules_samples/fac/build.rs                 |  2 +
 hercules_samples/fac/src/cpu.sch              | 19 +++++++++
 hercules_samples/fac/src/gpu.sch              | 18 ++++++++
 hercules_samples/matmul/src/gpu.sch           |  5 +--
 hercules_samples/matmul/src/main.rs           | 10 ++---
 juno_samples/antideps/build.rs                |  9 ++++
 juno_samples/antideps/src/gpu.sch             | 18 ++++++++
 juno_samples/casts_and_intrinsics/build.rs    |  9 ++++
 juno_samples/casts_and_intrinsics/src/gpu.sch | 18 ++++++++
 juno_samples/cava/build.rs                    |  9 ++++
 juno_samples/cava/src/gpu.sch                 | 18 ++++++++
 juno_samples/concat/build.rs                  |  2 +
 juno_samples/concat/src/concat.jn             | 10 +----
 juno_samples/concat/src/cpu.sch               | 17 ++++++++
 juno_samples/concat/src/gpu.sch               | 17 ++++++++
 juno_samples/concat/src/main.rs               | 26 ++++++++++--
 juno_samples/cpu.sch                          | 19 +++++++++
 juno_samples/gpu.sch                          | 18 ++++++++
 juno_samples/implicit_clone/build.rs          |  9 ++++
 juno_samples/implicit_clone/src/gpu.sch       | 18 ++++++++
 juno_samples/matmul/build.rs                  |  9 ++++
 juno_samples/matmul/src/gpu.sch               | 18 ++++++++
 juno_samples/matmul/src/main.rs               | 41 ++++++++++++++-----
 juno_samples/nested_ccp/build.rs              |  9 ++++
 juno_samples/nested_ccp/src/gpu.sch           | 18 ++++++++
 juno_samples/nested_ccp/src/main.rs           | 35 +++++++++++-----
 juno_samples/schedule_test/src/main.rs        | 28 ++++++++++---
 juno_samples/simple3/build.rs                 |  9 ++++
 juno_samples/simple3/src/gpu.sch              | 18 ++++++++
 juno_samples/simple3/src/main.rs              | 24 ++++++++---
 37 files changed, 505 insertions(+), 58 deletions(-)
 create mode 100644 hercules_samples/call/src/cpu.sch
 create mode 100644 hercules_samples/call/src/gpu.sch
 create mode 100644 hercules_samples/ccp/src/cpu.sch
 create mode 100644 hercules_samples/ccp/src/gpu.sch
 create mode 100644 hercules_samples/fac/src/cpu.sch
 create mode 100644 hercules_samples/fac/src/gpu.sch
 create mode 100644 juno_samples/antideps/src/gpu.sch
 create mode 100644 juno_samples/casts_and_intrinsics/src/gpu.sch
 create mode 100644 juno_samples/cava/src/gpu.sch
 create mode 100644 juno_samples/concat/src/cpu.sch
 create mode 100644 juno_samples/concat/src/gpu.sch
 create mode 100644 juno_samples/cpu.sch
 create mode 100644 juno_samples/gpu.sch
 create mode 100644 juno_samples/implicit_clone/src/gpu.sch
 create mode 100644 juno_samples/matmul/src/gpu.sch
 create mode 100644 juno_samples/nested_ccp/src/gpu.sch
 create mode 100644 juno_samples/simple3/src/gpu.sch

diff --git a/hercules_samples/call/build.rs b/hercules_samples/call/build.rs
index af48fe64..7f5816ce 100644
--- a/hercules_samples/call/build.rs
+++ b/hercules_samples/call/build.rs
@@ -4,6 +4,8 @@ fn main() {
     JunoCompiler::new()
         .ir_in_src("call.hir")
         .unwrap()
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
+        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/hercules_samples/call/src/cpu.sch b/hercules_samples/call/src/cpu.sch
new file mode 100644
index 00000000..4c684da2
--- /dev/null
+++ b/hercules_samples/call/src/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+fork-split(*);
+unforkify(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/hercules_samples/call/src/gpu.sch b/hercules_samples/call/src/gpu.sch
new file mode 100644
index 00000000..1e654e22
--- /dev/null
+++ b/hercules_samples/call/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.add);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/hercules_samples/ccp/build.rs b/hercules_samples/ccp/build.rs
index f04d48c7..c98d0551 100644
--- a/hercules_samples/ccp/build.rs
+++ b/hercules_samples/ccp/build.rs
@@ -4,6 +4,8 @@ fn main() {
     JunoCompiler::new()
         .ir_in_src("ccp.hir")
         .unwrap()
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
+        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/hercules_samples/ccp/src/cpu.sch b/hercules_samples/ccp/src/cpu.sch
new file mode 100644
index 00000000..4c684da2
--- /dev/null
+++ b/hercules_samples/ccp/src/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+fork-split(*);
+unforkify(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/hercules_samples/ccp/src/gpu.sch b/hercules_samples/ccp/src/gpu.sch
new file mode 100644
index 00000000..d8f6a2d0
--- /dev/null
+++ b/hercules_samples/ccp/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.tricky);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch
index a1a51088..4adbf530 100644
--- a/hercules_samples/dot/src/gpu.sch
+++ b/hercules_samples/dot/src/gpu.sch
@@ -2,9 +2,8 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-auto-outline(*);
-gpu(*);
-host(dot);
+let out = auto-outline(*);
+gpu(out.dot);
 
 ip-sroa(*);
 sroa(*);
diff --git a/hercules_samples/fac/build.rs b/hercules_samples/fac/build.rs
index 4d8226f1..1986a746 100644
--- a/hercules_samples/fac/build.rs
+++ b/hercules_samples/fac/build.rs
@@ -4,6 +4,8 @@ fn main() {
     JunoCompiler::new()
         .ir_in_src("fac.hir")
         .unwrap()
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
+        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/hercules_samples/fac/src/cpu.sch b/hercules_samples/fac/src/cpu.sch
new file mode 100644
index 00000000..4c684da2
--- /dev/null
+++ b/hercules_samples/fac/src/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+fork-split(*);
+unforkify(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/hercules_samples/fac/src/gpu.sch b/hercules_samples/fac/src/gpu.sch
new file mode 100644
index 00000000..1885854c
--- /dev/null
+++ b/hercules_samples/fac/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.fac);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch
index c9d6b336..9a714789 100644
--- a/hercules_samples/matmul/src/gpu.sch
+++ b/hercules_samples/matmul/src/gpu.sch
@@ -2,9 +2,8 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-auto-outline(*);
-gpu(*);
-host(matmul);
+let out = auto-outline(*);
+gpu(out.matmul);
 
 ip-sroa(*);
 sroa(*);
diff --git a/hercules_samples/matmul/src/main.rs b/hercules_samples/matmul/src/main.rs
index 7b6cfe79..abd25ec9 100644
--- a/hercules_samples/matmul/src/main.rs
+++ b/hercules_samples/matmul/src/main.rs
@@ -33,15 +33,13 @@ fn main() {
         }
         #[cfg(feature = "cuda")]
         {
-            let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
-            let a = a_box.get_ref();
-            let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
-            let b = b_box.get_ref();
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
             let mut r = runner!(matmul);
-            let c = r.run(I as u64, J as u64, K as u64, a, b).await;
+            let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
             let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
             c.to_cpu_ref(&mut c_cpu);
-            assert_eq!(c_cpu.as_ref(), &*correct_c);
+            assert_eq!(&*c_cpu, &*correct_c);
         }
     });
 }
diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs
index 7ed716a4..92b30c43 100644
--- a/juno_samples/antideps/build.rs
+++ b/juno_samples/antideps/build.rs
@@ -1,6 +1,15 @@
 use juno_build::JunoCompiler;
 
 fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("antideps.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
     JunoCompiler::new()
         .file_in_src("antideps.jn")
         .unwrap()
diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch
new file mode 100644
index 00000000..d3f4a6c2
--- /dev/null
+++ b/juno_samples/antideps/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.simple_antideps, out.loop_antideps, out.complex_antideps1, out.complex_antideps2, out.very_complex_antideps, out.read_chains, out.array_of_structs);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs
index 16d5c7a4..e43a2ac8 100644
--- a/juno_samples/casts_and_intrinsics/build.rs
+++ b/juno_samples/casts_and_intrinsics/build.rs
@@ -1,6 +1,15 @@
 use juno_build::JunoCompiler;
 
 fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("casts_and_intrinsics.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
     JunoCompiler::new()
         .file_in_src("casts_and_intrinsics.jn")
         .unwrap()
diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch
new file mode 100644
index 00000000..b2fb3449
--- /dev/null
+++ b/juno_samples/casts_and_intrinsics/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.casts_and_intrinsics);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/cava/build.rs b/juno_samples/cava/build.rs
index 929d3eba..03d54160 100644
--- a/juno_samples/cava/build.rs
+++ b/juno_samples/cava/build.rs
@@ -2,6 +2,15 @@ extern crate juno_build;
 use juno_build::JunoCompiler;
 
 fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("cava.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
     JunoCompiler::new()
         .file_in_src("cava.jn")
         .unwrap()
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
new file mode 100644
index 00000000..07f71c99
--- /dev/null
+++ b/juno_samples/cava/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+gpu(scale, demosaic, medianMatrix, transform, gamut, tone_map, descale);
+
+ip-sroa(*);
+sroa(*);
+
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/concat/build.rs b/juno_samples/concat/build.rs
index f7784b99..c91df94e 100644
--- a/juno_samples/concat/build.rs
+++ b/juno_samples/concat/build.rs
@@ -4,6 +4,8 @@ fn main() {
     JunoCompiler::new()
         .file_in_src("concat.jn")
         .unwrap()
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
+        .unwrap()
         .build()
         .unwrap();
 }
diff --git a/juno_samples/concat/src/concat.jn b/juno_samples/concat/src/concat.jn
index 2471671e..b9806c93 100644
--- a/juno_samples/concat/src/concat.jn
+++ b/juno_samples/concat/src/concat.jn
@@ -18,15 +18,7 @@ fn sum<t : number, c : usize>(arr : t[c]) -> t {
 }
 
 #[entry]
-fn concat_entry(a : i32) -> i32 {
-  let arr1 : i32[3];
-  let arr2 : i32[6];
-  arr1[0] = a;
-  arr1[1] = a;
-  arr2[0] = a;
-  arr2[1] = a;
-  arr2[4] = a;
-  arr2[5] = a;
+fn concat_entry(arr1 : i32[3], arr2 : i32[6]) -> i32 {
   let arr3 = concat::<i32, 3, 6>(arr1, arr2);
   return sum::<i32, 9>(arr3);
 }
diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch
new file mode 100644
index 00000000..680adaeb
--- /dev/null
+++ b/juno_samples/concat/src/cpu.sch
@@ -0,0 +1,17 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+cpu(concat, sum);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch
new file mode 100644
index 00000000..8ee4ef0e
--- /dev/null
+++ b/juno_samples/concat/src/gpu.sch
@@ -0,0 +1,17 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+gpu(concat, sum);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs
index db3f37fd..d0929fbf 100644
--- a/juno_samples/concat/src/main.rs
+++ b/juno_samples/concat/src/main.rs
@@ -1,15 +1,35 @@
 #![feature(concat_idents)]
 
 use hercules_rt::runner;
+use hercules_rt::HerculesCPURef;
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("concat");
 
 fn main() {
     async_std::task::block_on(async {
         let mut r = runner!(concat_entry);
-        let output = r.run(7).await;
-        println!("{}", output);
-        assert_eq!(output, 42);
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a_data = [7, 7, 0]; // read-only input: a shared borrow is enough
+            let a = HerculesCPURef::from_slice(&a_data);
+            let b_data = [7, 7, 0, 0, 7, 7]; // with a_data: six 7s, so the sum is 42
+            let b = HerculesCPURef::from_slice(&b_data);
+            let output = r.run(a, b).await;
+            println!("{}", output);
+            assert_eq!(output, 42);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let a_data = [7, 7, 0]; // host copy is only read when building the CUDABox
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a_data));
+            let b_data = [7, 7, 0, 0, 7, 7]; // with a_data: six 7s, so the sum is 42
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b_data));
+            let output = r.run(a.get_ref(), b.get_ref()).await;
+            println!("{}", output);
+            assert_eq!(output, 42);
+        }
     });
 }
 
 
diff --git a/juno_samples/cpu.sch b/juno_samples/cpu.sch
new file mode 100644
index 00000000..4c684da2
--- /dev/null
+++ b/juno_samples/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+fork-split(*);
+unforkify(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/gpu.sch b/juno_samples/gpu.sch
new file mode 100644
index 00000000..9a714789
--- /dev/null
+++ b/juno_samples/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.matmul);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs
index 75c1afc4..dc134e59 100644
--- a/juno_samples/implicit_clone/build.rs
+++ b/juno_samples/implicit_clone/build.rs
@@ -1,6 +1,15 @@
 use juno_build::JunoCompiler;
 
 fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("implicit_clone.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
     JunoCompiler::new()
         .file_in_src("implicit_clone.jn")
         .unwrap()
diff --git a/juno_samples/implicit_clone/src/gpu.sch b/juno_samples/implicit_clone/src/gpu.sch
new file mode 100644
index 00000000..443fc778
--- /dev/null
+++ b/juno_samples/implicit_clone/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.simple_implicit_clone, out.loop_implicit_clone, out.double_loop_implicit_clone, out.tricky_loop_implicit_clone, out.tricky2_loop_implicit_clone, out.tricky3_loop_implicit_clone, out.no_implicit_clone, out.mirage_implicit_clone);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs
index 926fbc33..ff3e3d8c 100644
--- a/juno_samples/matmul/build.rs
+++ b/juno_samples/matmul/build.rs
@@ -1,6 +1,15 @@
 use juno_build::JunoCompiler;
 
 fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("matmul.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
     JunoCompiler::new()
         .file_in_src("matmul.jn")
         .unwrap()
diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
new file mode 100644
index 00000000..e85dafdf
--- /dev/null
+++ b/juno_samples/matmul/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.matmul, out.tiled_64_matmul);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/matmul/src/main.rs b/juno_samples/matmul/src/main.rs
index fa5d1f04..50fe1760 100644
--- a/juno_samples/matmul/src/main.rs
+++ b/juno_samples/matmul/src/main.rs
@@ -3,6 +3,8 @@
 use rand::random;
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("matmul");
 
@@ -11,8 +13,8 @@ fn main() {
         const I: usize = 256;
         const J: usize = 64;
         const K: usize = 128;
-        let a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect();
-        let b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect();
+        let mut a: Box<[i32]> = (0..I * J).map(|_| random::<i32>() % 100).collect();
+        let mut b: Box<[i32]> = (0..J * K).map(|_| random::<i32>() % 100).collect();
         let mut correct_c: Box<[i32]> = (0..I * K).map(|_| 0).collect();
         for i in 0..I {
             for k in 0..K {
@@ -21,14 +23,32 @@ fn main() {
                 }
             }
         }
-        let a = HerculesCPURef::from_slice(&a);
-        let b = HerculesCPURef::from_slice(&b);
-        let mut r = runner!(matmul);
-        let c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
-        assert_eq!(c.as_slice::<i32>(), &*correct_c);
-        let mut r = runner!(tiled_64_matmul);
-        let tiled_c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
-        assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            let b = HerculesCPURef::from_slice(&b);
+            let mut r = runner!(matmul);
+            let c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
+            assert_eq!(c.as_slice::<i32>(), &*correct_c);
+            let mut r = runner!(tiled_64_matmul);
+            let tiled_c = r.run(I as u64, J as u64, K as u64, a.clone(), b.clone()).await;
+            assert_eq!(tiled_c.as_slice::<i32>(), &*correct_c);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
+            let mut r = runner!(matmul);
+            let c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
+            let mut c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
+            c.to_cpu_ref(&mut c_cpu);
+            assert_eq!(&*c_cpu, &*correct_c);
+            let mut r = runner!(tiled_64_matmul);
+            let tiled_c = r.run(I as u64, J as u64, K as u64, a.get_ref(), b.get_ref()).await;
+            let mut tiled_c_cpu: Box<[i32]> = vec![0; correct_c.len()].into_boxed_slice();
+            tiled_c.to_cpu_ref(&mut tiled_c_cpu);
+            assert_eq!(&*tiled_c_cpu, &*correct_c);
+        }
     });
 }
 
@@ -36,4 +56,3 @@ fn main() {
 fn matmul_test() {
     main();
 }
-
diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs
index c5c7ca6a..2352ddef 100644
--- a/juno_samples/nested_ccp/build.rs
+++ b/juno_samples/nested_ccp/build.rs
@@ -1,6 +1,15 @@
 use juno_build::JunoCompiler;
 
 fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("nested_ccp.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
     JunoCompiler::new()
         .file_in_src("nested_ccp.jn")
         .unwrap()
diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch
new file mode 100644
index 00000000..021a05e3
--- /dev/null
+++ b/juno_samples/nested_ccp/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.ccp_example, out.median_array, out.no_underflow);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs
index 423b66fb..412d56a4 100644
--- a/juno_samples/nested_ccp/src/main.rs
+++ b/juno_samples/nested_ccp/src/main.rs
@@ -1,6 +1,8 @@
 #![feature(concat_idents)]
 
 use hercules_rt::{runner, HerculesCPURef, HerculesCPURefMut};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("nested_ccp");
 
@@ -8,19 +10,30 @@ fn main() {
     async_std::task::block_on(async {
         let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
         let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]);
-        let a = HerculesCPURef::from_slice(&a);
-        let b = HerculesCPURefMut::from_slice(&mut b);
-        let mut r = runner!(ccp_example);
-        let output_example = r.run(a).await;
-        let mut r = runner!(median_array);
-        let output_median = r.run(9, b).await;
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            let b = HerculesCPURefMut::from_slice(&mut b);
+            let mut r = runner!(ccp_example);
+            let output_example = r.run(a).await;
+            let mut r = runner!(median_array);
+            let output_median = r.run(9, b).await;
+            assert_eq!(output_example, 1.0);
+            assert_eq!(output_median, 18);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
+            let mut r = runner!(ccp_example);
+            let output_example = r.run(a.get_ref_mut()).await;
+            let mut r = runner!(median_array);
+            let output_median = r.run(9, b.get_ref_mut()).await;
+            assert_eq!(output_example, 1.0);
+            assert_eq!(output_median, 18);
+        }
         let mut r = runner!(no_underflow);
         let out_no_underflow = r.run().await;
-        println!("{}", output_example);
-        println!("{}", output_median);
-        println!("{}", out_no_underflow);
-        assert_eq!(output_example, 1.0);
-        assert_eq!(output_median, 18);
         assert_eq!(out_no_underflow, 7);
     });
 }
diff --git a/juno_samples/schedule_test/src/main.rs b/juno_samples/schedule_test/src/main.rs
index 2e63babf..1505d4e5 100644
--- a/juno_samples/schedule_test/src/main.rs
+++ b/juno_samples/schedule_test/src/main.rs
@@ -3,6 +3,8 @@
 use rand::random;
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("code");
 
@@ -26,12 +28,26 @@ fn main() {
             }
         }
 
-        let a = HerculesCPURef::from_slice(&a);
-        let b = HerculesCPURef::from_slice(&b);
-        let c = HerculesCPURef::from_slice(&c);
-        let mut r = runner!(test);
-        let res = r.run(N as u64, M as u64, K as u64, a, b, c).await;
-        assert_eq!(res.as_slice::<i32>(), &*correct_res);
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            let b = HerculesCPURef::from_slice(&b);
+            let c = HerculesCPURef::from_slice(&c);
+            let mut r = runner!(test);
+            let res = r.run(N as u64, M as u64, K as u64, a, b, c).await;
+            assert_eq!(res.as_slice::<i32>(), &*correct_res);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b));
+            let c = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&c));
+            let mut r = runner!(test);
+            let res = r.run(N as u64, M as u64, K as u64, a.get_ref(), b.get_ref(), c.get_ref()).await;
+            let mut res_cpu: Box<[i32]> = vec![0; correct_res.len()].into_boxed_slice();
+            res.to_cpu_ref(&mut res_cpu);
+            assert_eq!(&*res_cpu, &*correct_res);
+        }
     });
 }
 
diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs
index 94760025..a0874af7 100644
--- a/juno_samples/simple3/build.rs
+++ b/juno_samples/simple3/build.rs
@@ -1,6 +1,15 @@
 use juno_build::JunoCompiler;
 
 fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("simple3.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
     JunoCompiler::new()
         .file_in_src("simple3.jn")
         .unwrap()
diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch
new file mode 100644
index 00000000..e97627d4
--- /dev/null
+++ b/juno_samples/simple3/src/gpu.sch
@@ -0,0 +1,18 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+gpu(out.simple3);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+float-collections(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/simple3/src/main.rs b/juno_samples/simple3/src/main.rs
index 4f9fe6a7..8eb78f7c 100644
--- a/juno_samples/simple3/src/main.rs
+++ b/juno_samples/simple3/src/main.rs
@@ -1,6 +1,8 @@
 #![feature(concat_idents)]
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("simple3");
 
@@ -8,12 +10,22 @@ fn main() {
     async_std::task::block_on(async {
         let a: Box<[u32]> = Box::new([1, 2, 3, 4, 5, 6, 7, 8]);
         let b: Box<[u32]> = Box::new([8, 7, 6, 5, 4, 3, 2, 1]);
-        let a = HerculesCPURef::from_slice(&a);
-        let b = HerculesCPURef::from_slice(&b);
-        let mut r = runner!(simple3);
-        let c = r.run(8, a, b).await;
-        println!("{}", c);
-        assert_eq!(c, 120);
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&a);
+            let b = HerculesCPURef::from_slice(&b);
+            let mut r = runner!(simple3);
+            let c = r.run(8, a, b).await;
+            assert_eq!(c, 120);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b));
+            let mut r = runner!(simple3);
+            let c = r.run(8, a.get_ref(), b.get_ref()).await;
+            assert_eq!(c, 120);
+        }
     });
 }
 
-- 
GitLab