diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index a3a46d93943d10bbebd125df93f95685c3ad5ac0..85ee7d90f8e69e74a079c32816a5bf27be764f5b 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -631,7 +631,7 @@ extern \"C\" {} {}(", if ret_primitive { ret_type.clone() } else { "void".to_str
                 write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?;
                 write!(pass_args, ", ret")?;
             }
-            write!(w, "\t{}<<<{}_gpu, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?;
+            write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?;
             write!(w, "\tcudaDeviceSynchronize();\n")?;
             if ret_primitive {
                 write!(w, "\t{} host_ret;\n", ret_type)?;