From 2fed8e6b88b726494ed568bc2b6c85c7360991a5 Mon Sep 17 00:00:00 2001
From: Russel Arbore <prathi3@illinois.edu>
Date: Fri, 31 Jan 2025 16:03:35 -0600
Subject: [PATCH] gpu non-tile works

---
 hercules_cg/src/gpu.rs                        | 84 +++++++++++++------
 hercules_samples/matmul/src/gpu.sch           |  1 +
 juno_samples/cava/src/gpu.sch                 |  2 -
 juno_samples/concat/src/main.rs               | 37 +++++++-
 juno_samples/implicit_clone/src/main.rs       |  8 --
 juno_samples/matmul/src/cpu.sch               | 20 -----
 juno_samples/matmul/src/gpu.sch               |  1 -
 juno_samples/nested_ccp/build.rs              | 25 ++++--
 juno_samples/nested_ccp/src/gpu.sch           |  2 -
 juno_samples/nested_ccp/src/main.rs           |  8 +-
 juno_samples/patterns/Cargo.toml              |  3 +
 juno_samples/patterns/build.rs                | 23 +++--
 .../src/cpu.sch => patterns/src/gpu.sch}      |  5 +-
 juno_samples/simple3/build.rs                 | 25 ++++--
 juno_samples/simple3/src/gpu.sch              |  2 -
 15 files changed, 155 insertions(+), 91 deletions(-)
 delete mode 100644 juno_samples/matmul/src/cpu.sch
 rename juno_samples/{nested_ccp/src/cpu.sch => patterns/src/gpu.sch} (73%)

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index 341e143e..6c62ed76 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -23,6 +23,7 @@ pub fn gpu_codegen<W: Write>(
     typing: &Vec<TypeID>,
     control_subgraph: &Subgraph,
     bbs: &BasicBlocks,
+    backing_allocation: &FunctionBackingAllocation,
     collection_objects: &FunctionCollectionObjects,
     def_use_map: &ImmutableDefUseMap,
     fork_join_map: &HashMap<NodeID, NodeID>,
@@ -168,6 +169,7 @@ pub fn gpu_codegen<W: Write>(
         typing,
         control_subgraph,
         bbs,
+        backing_allocation,
         collection_objects,
         def_use_map,
         fork_join_map,
@@ -196,6 +198,7 @@ struct GPUContext<'a> {
     typing: &'a Vec<TypeID>,
     control_subgraph: &'a Subgraph,
     bbs: &'a BasicBlocks,
+    backing_allocation: &'a FunctionBackingAllocation,
     collection_objects: &'a FunctionCollectionObjects,
     def_use_map: &'a ImmutableDefUseMap,
     fork_join_map: &'a HashMap<NodeID, NodeID>,
@@ -352,8 +355,14 @@ namespace cg = cooperative_groups;
             "__global__ void __launch_bounds__({}) {}_gpu(",
             self.kernel_params.max_num_threads, self.function.name
         )?;
-        // The first set of parameters are dynamic constants.
         let mut first_param = true;
+        // The first parameter is a pointer to GPU backing memory, if it's
+        // needed.
+        if self.backing_allocation.contains_key(&Device::CUDA) {
+            first_param = false;
+            write!(w, "char* backing")?;
+        }
+        // The second set of parameters are dynamic constants.
         for idx in 0..self.function.num_dynamic_constants {
             if first_param {
                 first_param = false;
@@ -362,7 +371,7 @@ namespace cg = cooperative_groups;
             }
             write!(w, "unsigned long long dc_p{}", idx)?;
         }
-        // The second set of parameters are normal arguments.
+        // The third set of parameters are normal arguments.
         for (idx, ty) in self.function.param_types.iter().enumerate() {
             if first_param {
                 first_param = false;
@@ -403,38 +412,46 @@ namespace cg = cooperative_groups;
      * Emit calculation of all dynamic constants
      */
     fn codegen_dynamic_constants(&self, w: &mut String) -> Result<(), Error> {
-        for dc in dynamic_constants_bottom_up(self.dynamic_constants) {
+        for dc in dynamic_constants_bottom_up(&self.dynamic_constants) {
             let dc_val = format!("unsigned long long dc{}", dc.idx());
-            match self.dynamic_constants[dc.idx()] {
+            match &self.dynamic_constants[dc.idx()] {
                 DynamicConstant::Constant(val) => write!(w, "\t{} = {}ull;\n", dc_val, val)?,
                 DynamicConstant::Parameter(idx) => {
-                    if idx < self.function.num_dynamic_constants as usize {
+                    if *idx < self.function.num_dynamic_constants as usize {
                         write!(w, "\t{} = dc_p{};\n", dc_val, idx)?
                     } else {
                         write!(w, "\t{} = 0;\n", dc_val)?
                     }
                 }
-                DynamicConstant::Add(left, right) => {
-                    write!(w, "\t{} = dc{} + dc{};\n", dc_val, left.idx(), right.idx())?
+                DynamicConstant::Add(args) => {
+                    let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" + ");
+                    write!(w, "\t{} = {};\n", dc_val, rhs)?
+                }
+                DynamicConstant::Mul(args) => {
+                    let rhs = args.iter().map(|arg| format!("dc{}", arg.idx())).collect::<Vec<_>>().join(" * ");
+                    write!(w, "\t{} = {};\n", dc_val, rhs)?
+                }
+                DynamicConstant::Min(args) => {
+                    let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("min(dc{}, ", arg.idx())).collect();
+                    let rhs_last = format!("dc{}", args.last().unwrap().idx());
+                    let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect();
+                    write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)?
+                }
+                DynamicConstant::Max(args) => {
+                    let rhs_but_last: String = args.iter().take(args.len() - 1).map(|arg| format!("max(dc{}, ", arg.idx())).collect();
+                    let rhs_last = format!("dc{}", args.last().unwrap().idx());
+                    let rhs_end: String = std::iter::repeat(")").take(args.len() - 1).collect();
+                    write!(w, "\t{} = {}{}{};\n", dc_val, rhs_but_last, rhs_last, rhs_end)?
                 }
                 DynamicConstant::Sub(left, right) => {
                     write!(w, "\t{} = dc{} - dc{};\n", dc_val, left.idx(), right.idx())?
                 }
-                DynamicConstant::Mul(left, right) => {
-                    write!(w, "\t{} = dc{} * dc{};\n", dc_val, left.idx(), right.idx())?
-                }
                 DynamicConstant::Div(left, right) => {
                     write!(w, "\t{} = dc{} / dc{};\n", dc_val, left.idx(), right.idx())?
                 }
                 DynamicConstant::Rem(left, right) => {
                     write!(w, "\t{} = dc{} % dc{};\n", dc_val, left.idx(), right.idx())?
                 }
-                DynamicConstant::Min(left, right) => {
-                    write!(w, "\t{} = min(dc{}, dc{});\n", dc_val, left.idx(), right.idx())?
-                }
-                DynamicConstant::Max(left, right) => {
-                    write!(w, "\t{} = max(dc{}, dc{});\n", dc_val, left.idx(), right.idx())?
-                }
             }
         }
         Ok(())
@@ -502,8 +519,15 @@ namespace cg = cooperative_groups;
         let mut pass_args = String::new();
         write!(w, "
 extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
-        // The first set of parameters are dynamic constants.
         let mut first_param = true;
+        // The first parameter is a pointer to GPU backing memory, if it's
+        // needed.
+        if self.backing_allocation.contains_key(&Device::CUDA) {
+            first_param = false;
+            write!(w, "char* backing")?;
+            write!(pass_args, "backing")?;
+        }
+        // The second set of parameters are dynamic constants.
         for idx in 0..self.function.num_dynamic_constants {
             if first_param {
                 first_param = false;
@@ -514,7 +538,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             write!(w, "unsigned long long dc_p{}", idx)?;
             write!(pass_args, "dc_p{}", idx)?;
         }
-        // The second set of parameters are normal arguments.
+        // The third set of parameters are normal arguments.
         for (idx, ty) in self.function.param_types.iter().enumerate() {
             if first_param {
                 first_param = false;
@@ -540,8 +564,13 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             write!(pass_args, "ret")?;
             write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
         }
+        write!(w, "\tcudaError_t err;\n")?;
+        write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?;
+        write!(w, "\terr = cudaGetLastError();\n")?;
+        write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error1: %s\\n\", cudaGetErrorString(err)); }}\n")?;
+        write!(w, "\tcudaDeviceSynchronize();\n")?;
+        write!(w, "\terr = cudaGetLastError();\n")?;
+        write!(w, "\tif (cudaSuccess != err) {{ printf(\"Error2: %s\\n\", cudaGetErrorString(err)); }}\n")?;
         if has_ret_var {
             // Copy return from device to host, whether it's primitive value or collection pointer
             write!(w, "\t{} host_ret;\n", ret_type)?;
@@ -979,17 +1008,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             Node::Parameter { index: _ } => {}
             // If the constant is primitive, it's stored in register so we repeat
             // for all threads. Otherwise, it can be inside or outside block fork.
-            // If inside, it's stored in shared memory so we only want to "allocate"
-            // and initialize it once. In either case, we then parallelize memset to 0.
+            // If inside, it's stored in shared memory, so we "allocate" it once and
+            // parallelize the memset to 0. If outside, we point it at its offset in the
+            // backing memory, skipping the memset on multi-block grids to avoid a grid-level sync.
             Node::Constant { id: cons_id } => {
                 let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
                 let cg_tile = match state {
                     KernelState::OutBlock | KernelState::InBlock => "block".to_string(),
                     KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId),
                 };
-                if !is_primitive && state == KernelState::OutBlock && is_block_parallel.is_some() && is_block_parallel.unwrap() {
-                    panic!("GPU can't memset collection for multi-block grid");
-                }
                 if !is_primitive && state != KernelState::OutBlock {
                     write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?;
                     *num_tabs += 1;
@@ -1007,9 +1034,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                 }
                 if !is_primitive && state != KernelState::OutBlock {
                     write!(w, "{}}}\n", tabs)?;
+                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
                     *num_tabs -= 1;
                 }
-                if !is_primitive {
+                if !is_primitive && state == KernelState::OutBlock {
+                    let (_, offsets) = &self.backing_allocation[&Device::CUDA];
+                    let offset = offsets[&id];
+                    write!(w, "{}{} = backing + dc{};\n", tabs, define_variable, offset.idx())?;
+                }
+                if !is_primitive && (state != KernelState::OutBlock || is_block_parallel.is_none() || !is_block_parallel.unwrap()) {
                     let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects));
                     write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
                     write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
@@ -1223,9 +1256,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                 let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()]));
                 let data_variable = self.get_value(*data, false, false);
                 let data_type_id = self.typing[data.idx()];
-                if KernelState::OutBlock == state && is_block_parallel.is_some() && is_block_parallel.unwrap() {
-                    panic!("GPU can't guarantee correctness for multi-block collection writes");
-                }
                 let cg_tile = match state {
                     KernelState::OutBlock | KernelState::InBlock => "block".to_string(),
                     KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId),
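
For reference, the variadic Min/Max cases above build a right-nested call chain out of
a flat argument list (e.g. min(dc0, min(dc1, dc2))). Below is a minimal standalone
sketch of that string construction in plain Rust; the emit_minmax helper and the bare
usize indices are illustrative only, since the codegen itself operates on
DynamicConstantID values and writes directly into the output buffer.

fn emit_minmax(op: &str, args: &[usize]) -> String {
    // Open one call per argument except the last ("min(dc0, min(dc1, "),
    // then append the final argument and close every call at once.
    let opens: String = args[..args.len() - 1]
        .iter()
        .map(|idx| format!("{}(dc{}, ", op, idx))
        .collect();
    let last = format!("dc{}", args.last().unwrap());
    let closes = ")".repeat(args.len() - 1);
    format!("{}{}{}", opens, last, closes)
}

fn main() {
    assert_eq!(emit_minmax("min", &[0, 1, 2]), "min(dc0, min(dc1, dc2))");
    // A single argument degenerates to just that constant.
    assert_eq!(emit_minmax("max", &[4]), "dc4");
}
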
diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch
index a4eb3240..c0a1a5ce 100644
--- a/hercules_samples/matmul/src/gpu.sch
+++ b/hercules_samples/matmul/src/gpu.sch
@@ -12,6 +12,7 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
+forkify(*);
 infer-schedules(*);
 
 gcm(*);
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index ace9082c..a5570b8d 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -17,5 +17,3 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-dce(*);
-gcm(*)
diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs
index 83534c9d..9674c2c5 100644
--- a/juno_samples/concat/src/main.rs
+++ b/juno_samples/concat/src/main.rs
@@ -2,16 +2,47 @@
 
 use hercules_rt::runner;
 use hercules_rt::HerculesCPURef;
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 juno_build::juno!("concat");
 
 fn main() {
     async_std::task::block_on(async {
         let mut r = runner!(concat_entry);
-        let output = r.run(7).await;
-        println!("{}", output);
-        assert_eq!(output, 42);
+        let mut a_data = [7, 7, 0];
+        let mut b_data = [7, 7, 0, 0, 7, 7];
+        #[cfg(not(feature = "cuda"))]
+        {
+            let a = HerculesCPURef::from_slice(&mut a_data);
+            let b = HerculesCPURef::from_slice(&mut b_data);
+            let output = r.run(3, 6, a, b).await;
+            assert_eq!(output, 42);
 
+            const N: usize = 3;
+            let arr: Box<[i32]> = (2..=4).collect();
+            let arr = HerculesCPURef::from_slice(&arr);
+
+            let mut r = runner!(concat_switch);
+            let output = r.run(N as u64, 50, arr.clone()).await;
+            let result = output.as_slice::<i32>();
+            println!("{:?}", result);
+            assert_eq!(result, [0, 1, 2, 3, 4]);
+
+            let output = r.run(N as u64, 30, arr).await;
+            let result = output.as_slice::<i32>();
+            println!("{:?}", result);
+            assert_eq!(result, [2, 3, 4, 0, 1]);
+        }
+        #[cfg(feature = "cuda")]
+        {
+            let mut a_data = [7, 7, 0];
+            let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data));
+            let mut b_data = [7, 7, 0, 0, 7, 7];
+            let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data));
+            let output = r.run(3, 6, a.get_ref(), b.get_ref()).await;
+            assert_eq!(output, 42);
+        }
     });
 }
 
diff --git a/juno_samples/implicit_clone/src/main.rs b/juno_samples/implicit_clone/src/main.rs
index 1e94ff89..c1f82528 100644
--- a/juno_samples/implicit_clone/src/main.rs
+++ b/juno_samples/implicit_clone/src/main.rs
@@ -8,42 +8,34 @@ fn main() {
     async_std::task::block_on(async {
         let mut r = runner!(simple_implicit_clone);
         let output = r.run(3).await;
-        println!("{}", output);
         assert_eq!(output, 11);
 
         let mut r = runner!(loop_implicit_clone);
         let output = r.run(100).await;
-        println!("{}", output);
         assert_eq!(output, 7);
 
         let mut r = runner!(double_loop_implicit_clone);
         let output = r.run(3).await;
-        println!("{}", output);
         assert_eq!(output, 42);
 
         let mut r = runner!(tricky_loop_implicit_clone);
         let output = r.run(2, 2).await;
-        println!("{}", output);
         assert_eq!(output, 130);
 
         let mut r = runner!(tricky2_loop_implicit_clone);
         let output = r.run(2, 3).await;
-        println!("{}", output);
         assert_eq!(output, 39);
 
         let mut r = runner!(tricky3_loop_implicit_clone);
         let output = r.run(5, 7).await;
-        println!("{}", output);
         assert_eq!(output, 7);
 
         let mut r = runner!(no_implicit_clone);
         let output = r.run(4).await;
-        println!("{}", output);
         assert_eq!(output, 13);
 
         let mut r = runner!(mirage_implicit_clone);
         let output = r.run(73).await;
-        println!("{}", output);
         assert_eq!(output, 843);
     });
 }
diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
deleted file mode 100644
index b256d73b..00000000
--- a/juno_samples/matmul/src/cpu.sch
+++ /dev/null
@@ -1,20 +0,0 @@
-gvn(*);
-phi-elim(*);
-dce(*);
-
-auto-outline(*);
-
-ip-sroa(*);
-sroa(*);
-dce(*);
-gvn(*);
-phi-elim(*);
-dce(*);
-
-forkify(*);
-infer-schedules(*);
-
-gcm(*);
-float-collections(*);
-dce(*);
-gcm(*);
diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
index 205dee9c..3d3f919c 100644
--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
@@ -12,7 +12,6 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-forkify(*);
 infer-schedules(*);
 
 gcm(*);
diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs
index ec111bc1..074937e7 100644
--- a/juno_samples/nested_ccp/build.rs
+++ b/juno_samples/nested_ccp/build.rs
@@ -1,11 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("nested_ccp.jn")
-        .unwrap()
-        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("nested_ccp.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("nested_ccp.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch
index 4f36ddd8..c56d046a 100644
--- a/juno_samples/nested_ccp/src/gpu.sch
+++ b/juno_samples/nested_ccp/src/gpu.sch
@@ -15,5 +15,3 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-dce(*);
-gcm(*);
diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs
index 99ef150d..bc99a4bd 100644
--- a/juno_samples/nested_ccp/src/main.rs
+++ b/juno_samples/nested_ccp/src/main.rs
@@ -8,11 +8,11 @@ juno_build::juno!("nested_ccp");
 
 fn main() {
     async_std::task::block_on(async {
-        let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
+        let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
         let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]);
         #[cfg(not(feature = "cuda"))]
         {
-            let a = HerculesCPURefMut::from_slice(&mut a);
+            let a = HerculesCPURef::from_slice(&a);
             let b = HerculesCPURefMut::from_slice(&mut b);
             let mut r = runner!(ccp_example);
             let output_example = r.run(a).await;
@@ -23,8 +23,8 @@ fn main() {
         }
         #[cfg(feature = "cuda")]
         {
-            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
-            let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
+            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&b));
             let mut r = runner!(ccp_example);
             let output_example = r.run(a.get_ref_mut()).await;
             let mut r = runner!(median_array);
diff --git a/juno_samples/patterns/Cargo.toml b/juno_samples/patterns/Cargo.toml
index a8dda157..bedaf7ca 100644
--- a/juno_samples/patterns/Cargo.toml
+++ b/juno_samples/patterns/Cargo.toml
@@ -8,6 +8,9 @@ edition = "2021"
 name = "juno_patterns"
 path = "src/main.rs"
 
+[features]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
 [build-dependencies]
 juno_build = { path = "../../juno_build" }
 
diff --git a/juno_samples/patterns/build.rs b/juno_samples/patterns/build.rs
index 8ac92f00..625da0a5 100644
--- a/juno_samples/patterns/build.rs
+++ b/juno_samples/patterns/build.rs
@@ -1,9 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("patterns.jn")
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("patterns.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("patterns.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/patterns/src/gpu.sch
similarity index 73%
rename from juno_samples/nested_ccp/src/cpu.sch
rename to juno_samples/patterns/src/gpu.sch
index 7e6be7ee..3d9c8c9e 100644
--- a/juno_samples/nested_ccp/src/cpu.sch
+++ b/juno_samples/patterns/src/gpu.sch
@@ -2,7 +2,8 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-auto-outline(*);
+let out = auto-outline(*);
+gpu(out.entry);
 
 ip-sroa(*);
 sroa(*);
@@ -14,5 +15,3 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-dce(*);
-gcm(*);
diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs
index bfd37cb5..58c2c5aa 100644
--- a/juno_samples/simple3/build.rs
+++ b/juno_samples/simple3/build.rs
@@ -1,11 +1,22 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    JunoCompiler::new()
-        .file_in_src("simple3.jn")
-        .unwrap()
-        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
-        .unwrap()
-        .build()
-        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    {
+        JunoCompiler::new()
+            .file_in_src("simple3.jn")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
+    #[cfg(feature = "cuda")]
+    {
+        JunoCompiler::new()
+            .file_in_src("simple3.jn")
+            .unwrap()
+            .schedule_in_src("gpu.sch")
+            .unwrap()
+            .build()
+            .unwrap();
+    }
 }
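
The nested_ccp, patterns, and simple3 build scripts above all follow the same shape:
compile the Juno source as-is for the CPU path, and attach the sample's gpu.sch
schedule only when the cuda feature is enabled. A condensed sketch of that pattern,
using the cfg!-based check the old build scripts relied on; the compile_sample helper
is hypothetical and not part of this patch.

use juno_build::JunoCompiler;

// Hypothetical helper illustrating the cfg-gated build pattern; the actual
// build scripts inline this logic per sample.
fn compile_sample(src: &str, gpu_sched: &str) {
    if cfg!(feature = "cuda") {
        // GPU builds attach the sample's GPU schedule.
        JunoCompiler::new()
            .file_in_src(src)
            .unwrap()
            .schedule_in_src(gpu_sched)
            .unwrap()
            .build()
            .unwrap();
    } else {
        // CPU builds use the default pipeline; no cpu.sch is needed.
        JunoCompiler::new()
            .file_in_src(src)
            .unwrap()
            .build()
            .unwrap();
    }
}

fn main() {
    compile_sample("simple3.jn", "gpu.sch");
}
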
diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch
index 93e85c48..d6c2a9d6 100644
--- a/juno_samples/simple3/src/gpu.sch
+++ b/juno_samples/simple3/src/gpu.sch
@@ -15,5 +15,3 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-dce(*);
-gcm(*);
-- 
GitLab