diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b80dd590ab42b4019daa285a83ec89d978884006..4a3ca13d145f35e54c8ae8c30b00b2be1d69d2b9 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,5 +1,5 @@
 test-job:
   stage: test
   script:
-    - cargo test
-    - cargo test --features=cuda
+    - cargo test --features=opencv
+    - cargo test --features=cuda,opencv
diff --git a/Cargo.lock b/Cargo.lock
index 49630436a0f0b90d8252824046c29f0e18b78af2..b8bf227809ac5a475407f4fb53cabfbe5d11f38a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -422,6 +422,27 @@ dependencies = [
  "vob",
 ]
 
+[[package]]
+name = "clang"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84c044c781163c001b913cd018fc95a628c50d0d2dfea8bca77dad71edb16e37"
+dependencies = [
+ "clang-sys",
+ "libc",
+]
+
+[[package]]
+name = "clang-sys"
+version = "1.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
 [[package]]
 name = "clap"
 version = "4.5.27"
@@ -576,6 +597,12 @@ dependencies = [
  "with_builtin_macros",
 ]
 
+[[package]]
+name = "dunce"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
+
 [[package]]
 name = "either"
 version = "1.13.0"
@@ -777,6 +804,12 @@ dependencies = [
  "weezl",
 ]
 
+[[package]]
+name = "glob"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
+
 [[package]]
 name = "gloo-timers"
 version = "0.3.0"
@@ -1085,6 +1118,18 @@ dependencies = [
  "with_builtin_macros",
 ]
 
+[[package]]
+name = "juno_edge_detection"
+version = "0.1.0"
+dependencies = [
+ "async-std",
+ "clap",
+ "hercules_rt",
+ "juno_build",
+ "opencv",
+ "with_builtin_macros",
+]
+
 [[package]]
 name = "juno_frontend"
 version = "0.1.0"
@@ -1225,6 +1270,16 @@ dependencies = [
  "cc",
 ]
 
+[[package]]
+name = "libloading"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
+dependencies = [
+ "cfg-if",
+ "windows-targets 0.52.6",
+]
+
 [[package]]
 name = "libredox"
 version = "0.1.3"
@@ -1473,6 +1528,41 @@ version = "1.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
 
+[[package]]
+name = "opencv"
+version = "0.94.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f80fd7d018d20b1e49bdd65e72350f1f63cad6bc9c15f850c47c31a6ad8d0d20"
+dependencies = [
+ "cc",
+ "dunce",
+ "jobserver",
+ "libc",
+ "num-traits",
+ "once_cell",
+ "opencv-binding-generator",
+ "pkg-config",
+ "semver",
+ "shlex",
+ "vcpkg",
+ "windows",
+]
+
+[[package]]
+name = "opencv-binding-generator"
+version = "0.95.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7283829fe440be381fea73521f850b287fd44f994acd6453e1e19b3d479ef7fc"
+dependencies = [
+ "clang",
+ "clang-sys",
+ "dunce",
+ "once_cell",
+ "percent-encoding",
+ "regex",
+ "shlex",
+]
+
 [[package]]
 name = "ordered-float"
 version = "4.6.0"
@@ -1506,6 +1596,12 @@ version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
 
+[[package]]
+name = "percent-encoding"
+version = "2.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
+
 [[package]]
 name = "phf"
 version = "0.11.3"
@@ -2242,6 +2338,12 @@ version = "1.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3ef4c4aa54d5d05a279399bfa921ec387b7aba77caf7a682ae8d86785b8fdad2"
 
+[[package]]
+name = "vcpkg"
+version = "0.2.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
+
 [[package]]
 name = "vergen"
 version = "8.3.2"
@@ -2369,13 +2471,76 @@ version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082"
 
+[[package]]
+name = "windows"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f919aee0a93304be7f62e8e5027811bbba96bcb1de84d6618be56e43f8a32a1"
+dependencies = [
+ "windows-core",
+ "windows-targets 0.53.0",
+]
+
+[[package]]
+name = "windows-core"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "810ce18ed2112484b0d4e15d022e5f598113e220c53e373fb31e67e21670c1ce"
+dependencies = [
+ "windows-implement",
+ "windows-interface",
+ "windows-result",
+ "windows-strings",
+ "windows-targets 0.53.0",
+]
+
+[[package]]
+name = "windows-implement"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "83577b051e2f49a058c308f17f273b570a6a758386fc291b5f6a934dd84e48c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.96",
+]
+
+[[package]]
+name = "windows-interface"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb26fd936d991781ea39e87c3a27285081e3c0da5ca0fcbc02d368cc6f52ff01"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.96",
+]
+
+[[package]]
+name = "windows-result"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d08106ce80268c4067c0571ca55a9b4e9516518eaa1a1fe9b37ca403ae1d1a34"
+dependencies = [
+ "windows-targets 0.53.0",
+]
+
+[[package]]
+name = "windows-strings"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b888f919960b42ea4e11c2f408fadb55f78a9f236d5eef084103c8ce52893491"
+dependencies = [
+ "windows-targets 0.53.0",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.59.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
 dependencies = [
- "windows-targets",
+ "windows-targets 0.52.6",
 ]
 
 [[package]]
@@ -2384,14 +2549,30 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
 dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_gnullvm",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
+ "windows_aarch64_gnullvm 0.52.6",
+ "windows_aarch64_msvc 0.52.6",
+ "windows_i686_gnu 0.52.6",
+ "windows_i686_gnullvm 0.52.6",
+ "windows_i686_msvc 0.52.6",
+ "windows_x86_64_gnu 0.52.6",
+ "windows_x86_64_gnullvm 0.52.6",
+ "windows_x86_64_msvc 0.52.6",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b"
+dependencies = [
+ "windows_aarch64_gnullvm 0.53.0",
+ "windows_aarch64_msvc 0.53.0",
+ "windows_i686_gnu 0.53.0",
+ "windows_i686_gnullvm 0.53.0",
+ "windows_i686_msvc 0.53.0",
+ "windows_x86_64_gnu 0.53.0",
+ "windows_x86_64_gnullvm 0.53.0",
+ "windows_x86_64_msvc 0.53.0",
 ]
 
 [[package]]
@@ -2400,48 +2581,96 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764"
+
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c"
+
 [[package]]
 name = "windows_i686_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 
+[[package]]
+name = "windows_i686_gnu"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3"
+
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11"
+
 [[package]]
 name = "windows_i686_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 
+[[package]]
+name = "windows_i686_msvc"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d"
+
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba"
+
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57"
+
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.53.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486"
+
 [[package]]
 name = "winnow"
 version = "0.7.0"
diff --git a/Cargo.toml b/Cargo.toml
index ced011a96c96793891228876314debeabcb561ed..f4976106dfab468bc77e9d9aa2c933b4c6913490 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,7 +27,8 @@ members = [
 	"juno_samples/nested_ccp",
 	"juno_samples/antideps",
 	"juno_samples/implicit_clone",
-  	"juno_samples/cava",
+  "juno_samples/cava",
 	"juno_samples/concat",
-  	"juno_samples/schedule_test",
+  "juno_samples/schedule_test",
+  "juno_samples/edge_detection",
 ]
diff --git a/hercules_rt/build.rs b/hercules_rt/build.rs
index 8f82e86cab73b28f44222a8a3d8eaa4c49f04218..2a1538d600e1632e9311ef9464c38a83fdf44415 100644
--- a/hercules_rt/build.rs
+++ b/hercules_rt/build.rs
@@ -1,3 +1,5 @@
+#![feature(exit_status_error)]
+
 use std::env::var;
 use std::path::Path;
 use std::process::Command;
@@ -9,12 +11,16 @@ fn main() {
             .args(&["src/rtdefs.cu", "-c", "-o"])
             .arg(&format!("{}/rtdefs.o", out_dir))
             .status()
-            .expect("PANIC: NVCC failed when building runtime. Is NVCC installed?");
+            .expect("PANIC: NVCC failed when building runtime. Is NVCC installed?")
+            .exit_ok()
+            .expect("NVCC did not succeed");
         Command::new("ar")
             .current_dir(&Path::new(&out_dir))
             .args(&["crus", "librtdefs.a", "rtdefs.o"])
             .status()
-            .unwrap();
+            .unwrap()
+            .exit_ok()
+            .expect("ar did not succeed");
 
         println!("cargo::rustc-link-search=native={}", out_dir);
         println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
diff --git a/juno_samples/edge_detection/Cargo.toml b/juno_samples/edge_detection/Cargo.toml
new file mode 100644
index 0000000000000000000000000000000000000000..a3825eed9b6601ca4d06d8e81d332ed5986b3124
--- /dev/null
+++ b/juno_samples/edge_detection/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "juno_edge_detection"
+version = "0.1.0"
+authors = ["Aaron Councilman <aaronjc4@illinois.edu>"]
+edition = "2021"
+
+[features]
+opencv = ["dep:opencv"]
+cuda = ["juno_build/cuda", "hercules_rt/cuda"]
+
+[[bin]]
+name = "juno_edge_detection"
+path = "src/main.rs"
+required-features = ["opencv"]
+
+[build-dependencies]
+juno_build = { path = "../../juno_build" }
+
+[dependencies]
+juno_build = { path = "../../juno_build" }
+hercules_rt = { path = "../../hercules_rt" }
+async-std = "*"
+clap = { version = "*", features = ["derive"] }
+with_builtin_macros = "0.1.0"
+opencv = { version = "*", features = ["clang-runtime"], optional = true }
diff --git a/juno_samples/edge_detection/build.rs b/juno_samples/edge_detection/build.rs
new file mode 100644
index 0000000000000000000000000000000000000000..7071fae7612ab871757fddebdbb576a2e4a6073c
--- /dev/null
+++ b/juno_samples/edge_detection/build.rs
@@ -0,0 +1,19 @@
+extern crate juno_build;
+use juno_build::JunoCompiler;
+
+fn main() {
+    #[cfg(feature = "cuda")]
+    JunoCompiler::new()
+        .file_in_src("edge_detection.jn")
+        .unwrap()
+        .schedule_in_src("gpu.sch")
+        .unwrap()
+        .build()
+        .unwrap();
+    #[cfg(not(feature = "cuda"))]
+    JunoCompiler::new()
+        .file_in_src("edge_detection.jn")
+        .unwrap()
+        .build()
+        .unwrap();
+}
diff --git a/juno_samples/edge_detection/examples/formula1_scaled.mp4 b/juno_samples/edge_detection/examples/formula1_scaled.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..1e0e3979fe0f1d2ca299313a1f404beeee6f9f3f
Binary files /dev/null and b/juno_samples/edge_detection/examples/formula1_scaled.mp4 differ
diff --git a/juno_samples/edge_detection/src/edge_detection.jn b/juno_samples/edge_detection/src/edge_detection.jn
new file mode 100644
index 0000000000000000000000000000000000000000..d49258c5d328d021953aaf035f80fb328a4440af
--- /dev/null
+++ b/juno_samples/edge_detection/src/edge_detection.jn
@@ -0,0 +1,238 @@
+fn gaussian_smoothing<n, m, gs : usize>(
+  input: f32[n, m],
+  filter: f32[gs, gs],
+) -> f32[n, m] {
+  let result : f32[n, m];
+
+  // Define the gaussian radius as half the gaussian size
+  const gr = gs / 2;
+
+  for row = 0 to n {
+    for col = 0 to m {
+      let smoothed = 0.0;
+
+      for i = 0 to gs {
+        for j = 0 to gs {
+          let val = input[if row + i < gr               then 0
+                          else if row + i - gr > n - 1  then n - 1
+                                                        else row + i - gr,
+                          if col + j < gr               then 0
+                          else if col + j - gr > m - 1  then m - 1
+                                                        else col + j - gr];
+          smoothed += val * filter[i, j];
+        }
+      }
+
+      result[row, col] = smoothed;
+    }
+  }
+
+  return result;
+}
+
+const MIN_BR : f32 = 0;
+const MAX_BR : f32 = 1;
+
+fn laplacian_estimate<n, m, sz: usize>(
+  input: f32[n, m],
+  structure: f32[sz, sz],
+) -> f32[n, m] {
+  const r = sz / 2;
+
+  let result : f32[n, m];
+
+  for row = 0 to n {
+    for col = 0 to m {
+      // Copy data for dilation filter
+      let imageArea : f32[sz, sz];
+      for i = 0 to sz {
+        for j = 0 to sz {
+          imageArea[i, j] = if row + i < r              then MIN_BR
+                            else if row + i - r > n - 1 then MIN_BR
+                            else if col + j < r         then MIN_BR
+                            else if col + j - r > m - 1 then MIN_BR
+                                 else input[row + i - r, col + j - r];
+        }
+      }
+
+      // Compute pixel of dilated image
+      let dilated = MIN_BR;
+      for i = 0 to sz {
+        for j = 0 to sz {
+          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
+        }
+      }
+
+      // Data copy for erotion filter
+      let imageArea : f32[sz, sz];
+      for i = 0 to sz {
+        for j = 0 to sz {
+          imageArea[i, j] = if row + i < r              then MAX_BR
+                            else if row + i - r > n - 1 then MAX_BR
+                            else if col + j < r         then MAX_BR
+                            else if col + j - r > m - 1 then MAX_BR
+                                 else input[row + i - r, col + j - r];
+        }
+      }
+
+      // Compute pixel of eroded image
+      let eroded = MAX_BR;
+      for i = 0 to sz {
+        for j = 0 to sz {
+          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+        }
+      }
+
+      let laplacian = dilated + eroded - 2 * input[row, col];
+      result[row, col] = laplacian;
+    }
+  }
+
+  return result;
+}
+
+fn zero_crossings<n, m, sz: usize>(
+  input: f32[n, m],
+  structure: f32[sz, sz],
+) -> f32[n, m] {
+  const r = sz / 2;
+
+  let result : f32[n, m];
+
+  for row = 0 to n {
+    for col = 0 to m {
+      // Data copy for dilation filter
+      let imageArea : f32[sz, sz];
+      for i = 0 to sz {
+        for j = 0 to sz {
+          imageArea[i, j] = if row + i < r              then MIN_BR
+                            else if row + i - r > n - 1 then MIN_BR
+                            else if col + j < r         then MIN_BR
+                            else if col + j - r > m - 1 then MIN_BR
+                            else if input[row + i - r, col + j - r] > MIN_BR
+                            then MAX_BR
+                            else MIN_BR;
+        }
+      }
+
+      // Compute the pixel of dilated image
+      let dilated = MIN_BR;
+      for i = 0 to sz {
+        for j = 0 to sz {
+          dilated = max!(dilated, imageArea[i, j] * structure[i, j]);
+        }
+      }
+
+      // Data copy for erotion filter
+      let imageArea : f32[sz, sz];
+      for i = 0 to sz {
+        for j = 0 to sz {
+          imageArea[i, j] = if row + i < r              then MAX_BR
+                            else if row + i - r > n - 1 then MAX_BR
+                            else if col + j < r         then MAX_BR
+                            else if col + j - r > m - 1 then MAX_BR
+                            else if input[row + i - r, col + j - r] > MIN_BR
+                            then MAX_BR
+                            else MIN_BR;
+        }
+      }
+
+      // Compute the pixel of eroded image
+      let eroded = MAX_BR;
+      for i = 0 to sz {
+        for j = 0 to sz {
+          eroded = min!(eroded, imageArea[i, j] * structure[i, j]);
+        }
+      }
+
+      let sign = dilated - eroded;
+      result[row, col] = sign;
+    }
+  }
+
+  return result;
+}
+
+fn gradient<n, m, sb: usize>(
+  input: f32[n, m],
+  sx: f32[sb, sb],
+  sy: f32[sb, sb],
+) -> f32[n, m] {
+  const sbr = sb / 2;
+
+  let result : f32[n, m];
+
+  for row = 0 to n {
+    for col = 0 to m {
+
+      let gx = 0;
+      let gy = 0;
+
+      for i = 0 to sb {
+        for j = 0 to sb {
+          let val = input[if row + i < sbr              then 0
+                          else if row + i - sbr > n - 1 then n - 1
+                                                        else row + i - sbr,
+                          if col + j < sbr              then 0
+                          else if col + j - sbr > m - 1 then m - 1
+                                                        else col + j - sbr];
+          gx += val * sx[i, j];
+          gy += val * sy[i, j];
+        }
+      }
+
+      result[row, col] = sqrt!(gx * gx + gy * gy);
+    }
+  }
+
+  return result;
+}
+
+fn max_gradient<n, m: usize>(gradient: f32[n, m]) -> f32 {
+  let max = gradient[0, 0];
+
+  for i = 0 to n {
+    for j = 0 to m {
+      max = max!(max, gradient[i, j]);
+    }
+  }
+
+  return max;
+}
+
+fn reject_zero_crossings<n, m: usize>(
+  crossings: f32[n, m],
+  gradient: f32[n, m],
+  max_gradient: f32,
+  theta: f32,
+) -> f32[n, m] {
+  let result : f32[n, m];
+
+  for row = 0 to n {
+    for col = 0 to m {
+      result[row, col] =
+        if crossings[row, col] > 0 && gradient[row, col] > theta * max_gradient
+        then 1.0
+        else 0.0;
+    }
+  }
+
+  return result;
+}
+
+#[entry]
+fn edge_detection<n, m, gs, sz, sb: usize>(
+  input: f32[n, m],
+  gaussian_filter: f32[gs, gs],
+  structure: f32[sz, sz],
+  sx: f32[sb, sb],
+  sy: f32[sb, sb],
+  theta: f32,
+) -> f32[n, m] {
+  let smoothed  = gaussian_smoothing::<n, m, gs>(input, gaussian_filter);
+  let laplacian = laplacian_estimate::<n, m, sz>(smoothed, structure);
+  let zcs       = zero_crossings::<n, m, sz>(laplacian, structure);
+  let gradient  = gradient::<n, m, sb>(smoothed, sx, sy);
+  let maxgrad   = max_gradient::<n, m>(gradient);
+  return reject_zero_crossings::<n, m>(zcs, gradient, maxgrad, theta);
+}
diff --git a/juno_samples/edge_detection/src/edge_detection_rust.rs b/juno_samples/edge_detection/src/edge_detection_rust.rs
new file mode 100644
index 0000000000000000000000000000000000000000..cbe0a7b30afc0338b3e24f6d576a36e65ccaeee4
--- /dev/null
+++ b/juno_samples/edge_detection/src/edge_detection_rust.rs
@@ -0,0 +1,461 @@
+fn gaussian_smoothing(
+    rows: usize,
+    cols: usize,
+    gaussian_size: usize,
+    input: &[f32],
+    filter: &[f32],
+) -> Vec<f32> {
+    let mut result = vec![0.0; rows * cols];
+    let gaussian_radius: i64 = (gaussian_size / 2) as i64;
+
+    let rows = rows as i64;
+    let cols = cols as i64;
+
+    for gy in 0..rows {
+        for gx in 0..cols {
+            let gloc = gx + gy * cols;
+
+            let mut smoothed_val = 0.0;
+
+            for i in -gaussian_radius..=gaussian_radius {
+                for j in -gaussian_radius..=gaussian_radius {
+                    let mut load_offset = gloc + i * cols + j;
+
+                    if gy + i < 0 {
+                        // top contour
+                        load_offset = gx + j;
+                    } else if gy + i > rows - 1 {
+                        // bottom contour
+                        load_offset = (rows - 1) * cols + gx + j;
+                    }
+
+                    // Adjust so we are within image horizontally
+                    if gx + j < 0 {
+                        // left contour
+                        load_offset -= gx + j;
+                    } else if gx + j > cols - 1 {
+                        // right contour
+                        load_offset = load_offset - gx - j + cols - 1;
+                    }
+
+                    let gval = input[load_offset as usize];
+                    let factor_offset =
+                        (gaussian_radius + i) * (gaussian_size as i64) + gaussian_radius + j;
+                    smoothed_val += gval * filter[factor_offset as usize];
+                }
+            }
+
+            result[gloc as usize] = smoothed_val;
+        }
+    }
+
+    result
+}
+
+const MIN_BR: f32 = 0.0;
+const MAX_BR: f32 = 1.0;
+
+fn laplacian_estimate(
+    rows: usize,
+    cols: usize,
+    szb: usize,
+    input: &[f32],
+    structure: &[f32],
+) -> Vec<f32> {
+    assert!(szb == 3);
+
+    let mut result = vec![0.0; rows * cols];
+
+    for gy in 0..rows {
+        for gx in 0..cols {
+            let mut image_area = vec![0.0; szb * szb];
+
+            // Copy for dilation filter
+            image_area[1 * szb + 1] = input[gy * cols + gx];
+
+            if gx == 0 {
+                image_area[0 * szb + 0] = MIN_BR;
+                image_area[1 * szb + 0] = MIN_BR;
+                image_area[2 * szb + 0] = MIN_BR;
+            } else {
+                image_area[1 * szb + 0] = input[gy * cols + gx - 1];
+                image_area[0 * szb + 0] = if gy > 0 {
+                    input[(gy - 1) * cols + gx - 1]
+                } else {
+                    MIN_BR
+                };
+                image_area[2 * szb + 0] = if gy < rows - 1 {
+                    input[(gy + 1) * cols + gx - 1]
+                } else {
+                    MIN_BR
+                };
+            }
+
+            if gx == cols - 1 {
+                image_area[0 * szb + 2] = MIN_BR;
+                image_area[1 * szb + 2] = MIN_BR;
+                image_area[2 * szb + 2] = MIN_BR;
+            } else {
+                image_area[1 * szb + 2] = input[gy * cols + gx + 1];
+                image_area[0 * szb + 2] = if gy > 0 {
+                    input[(gy - 1) * cols + gx + 1]
+                } else {
+                    MIN_BR
+                };
+                image_area[2 * szb + 2] = if gy < rows - 1 {
+                    input[(gy + 1) * cols + gx + 1]
+                } else {
+                    MIN_BR
+                };
+            }
+
+            image_area[0 * szb + 1] = if gy > 0 {
+                input[(gy - 1) * cols + gx]
+            } else {
+                MIN_BR
+            };
+            image_area[2 * szb + 1] = if gy < rows - 1 {
+                input[(gy + 1) * cols + gx]
+            } else {
+                MIN_BR
+            };
+
+            // Compute pixel of dilated image
+            let mut dilated_pixel = MIN_BR;
+            for i in 0..szb {
+                for j in 0..szb {
+                    dilated_pixel = f32::max(
+                        dilated_pixel,
+                        image_area[i * szb + j] * structure[i * szb + j],
+                    );
+                }
+            }
+
+            // Data copy for erosion filter - only change the boundary conditions
+            if gx == 0 {
+                image_area[0 * szb + 0] = MAX_BR;
+                image_area[1 * szb + 0] = MAX_BR;
+                image_area[2 * szb + 0] = MAX_BR;
+            } else {
+                if gy == 0 {
+                    image_area[0 * szb + 0] = MAX_BR;
+                }
+                if gy == rows - 1 {
+                    image_area[2 * szb + 0] = MAX_BR;
+                }
+            }
+
+            if gx == cols - 1 {
+                image_area[0 * szb + 2] = MAX_BR;
+                image_area[1 * szb + 2] = MAX_BR;
+                image_area[2 * szb + 2] = MAX_BR;
+            } else {
+                if gy == 0 {
+                    image_area[0 * szb + 2] = MAX_BR;
+                }
+                if gy == rows - 1 {
+                    image_area[2 * szb + 2] = MAX_BR;
+                }
+            }
+
+            if gy == 0 {
+                image_area[0 * szb + 1] = MAX_BR;
+            }
+            if gy == rows - 1 {
+                image_area[2 * szb + 1] = MAX_BR;
+            }
+
+            // Compute pixel of eroded image
+            let mut eroded_pixel = MAX_BR;
+            for i in 0..szb {
+                for j in 0..szb {
+                    eroded_pixel = f32::min(
+                        eroded_pixel,
+                        image_area[i * szb + j] * structure[i * szb + j],
+                    );
+                }
+            }
+
+            let laplacian = dilated_pixel + eroded_pixel - 2.0 * image_area[1 * szb + 1];
+            result[gy * cols + gx] = laplacian;
+        }
+    }
+
+    result
+}
+
+fn compute_zero_crossings(
+    rows: usize,
+    cols: usize,
+    szb: usize,
+    input: &[f32],
+    structure: &[f32],
+) -> Vec<f32> {
+    assert!(szb == 3);
+
+    let mut result = vec![0.0; rows * cols];
+
+    for gy in 0..rows {
+        for gx in 0..cols {
+            let mut image_area = vec![vec![0.0, 0.0, 0.0]; szb];
+
+            // Copy for dilation filter
+            image_area[1][1] = if input[gy * cols + gx] > MIN_BR {
+                MAX_BR
+            } else {
+                MIN_BR
+            };
+
+            if gx == 0 {
+                image_area[0][0] = MIN_BR;
+                image_area[1][0] = MIN_BR;
+                image_area[2][0] = MIN_BR;
+            } else {
+                image_area[1][0] = if input[gy * cols + gx - 1] > MIN_BR {
+                    MAX_BR
+                } else {
+                    MIN_BR
+                };
+                image_area[0][0] = if gy > 0 {
+                    if input[(gy - 1) * cols + gx - 1] > MIN_BR {
+                        MAX_BR
+                    } else {
+                        MIN_BR
+                    }
+                } else {
+                    MIN_BR
+                };
+                image_area[2][0] = if gy < rows - 1 {
+                    if input[(gy + 1) * cols + gx - 1] > MIN_BR {
+                        MAX_BR
+                    } else {
+                        MIN_BR
+                    }
+                } else {
+                    MIN_BR
+                };
+            }
+
+            if gx == cols - 1 {
+                image_area[0][2] = MIN_BR;
+                image_area[1][2] = MIN_BR;
+                image_area[2][2] = MIN_BR;
+            } else {
+                image_area[1][2] = if input[gy * cols + gx + 1] > MIN_BR {
+                    MAX_BR
+                } else {
+                    MIN_BR
+                };
+                image_area[0][2] = if gy > 0 {
+                    if input[(gy - 1) * cols + gx + 1] > MIN_BR {
+                        MAX_BR
+                    } else {
+                        MIN_BR
+                    }
+                } else {
+                    MIN_BR
+                };
+                image_area[2][2] = if gy < rows - 1 {
+                    if input[(gy + 1) * cols + gx + 1] > MIN_BR {
+                        MAX_BR
+                    } else {
+                        MIN_BR
+                    }
+                } else {
+                    MIN_BR
+                };
+            }
+
+            image_area[0][1] = if gy > 0 {
+                if input[(gy - 1) * cols + gx] > MIN_BR {
+                    MAX_BR
+                } else {
+                    MIN_BR
+                }
+            } else {
+                MIN_BR
+            };
+            image_area[2][1] = if gy < rows - 1 {
+                if input[(gy + 1) * cols + gx] > MIN_BR {
+                    MAX_BR
+                } else {
+                    MIN_BR
+                }
+            } else {
+                MIN_BR
+            };
+
+            // Compute pixel of dilated image
+            let mut dilated_pixel = MIN_BR;
+            for i in 0..szb {
+                for j in 0..szb {
+                    dilated_pixel =
+                        f32::max(dilated_pixel, image_area[i][j] * structure[i * szb + j]);
+                }
+            }
+
+            // Data copy for erosion filter - only change the boundary conditions
+            if gx == 0 {
+                image_area[0][0] = MAX_BR;
+                image_area[1][0] = MAX_BR;
+                image_area[2][0] = MAX_BR;
+            } else {
+                if gy == 0 {
+                    image_area[0][0] = MAX_BR;
+                }
+                if gy == rows - 1 {
+                    image_area[2][0] = MAX_BR;
+                }
+            }
+
+            if gx == cols - 1 {
+                image_area[0][2] = MAX_BR;
+                image_area[1][2] = MAX_BR;
+                image_area[2][2] = MAX_BR;
+            } else {
+                if gy == 0 {
+                    image_area[0][2] = MAX_BR;
+                }
+                if gy == rows - 1 {
+                    image_area[2][2] = MAX_BR;
+                }
+            }
+
+            if gy == 0 {
+                image_area[0][1] = MAX_BR;
+            }
+            if gy == rows - 1 {
+                image_area[2][1] = MAX_BR;
+            }
+
+            // Compute pixel of eroded image
+            let mut eroded_pixel = MAX_BR;
+            for i in 0..szb {
+                for j in 0..szb {
+                    eroded_pixel =
+                        f32::min(eroded_pixel, image_area[i][j] * structure[i * szb + j]);
+                }
+            }
+
+            let pixel_sign = dilated_pixel - eroded_pixel;
+            result[gy * cols + gx] = pixel_sign;
+        }
+    }
+
+    result
+}
+
+fn compute_gradient(
+    rows: usize,
+    cols: usize,
+    sobel_size: usize,
+    input: &[f32],
+    sx: &[f32],
+    sy: &[f32],
+) -> Vec<f32> {
+    assert!(sobel_size == 3);
+
+    let mut result = vec![0.0; rows * cols];
+    let sobel_radius = (sobel_size / 2) as i64;
+    let rows = rows as i64;
+    let cols = cols as i64;
+
+    for gy in 0..rows {
+        for gx in 0..cols {
+            let gloc = gx + gy * cols;
+
+            let mut gradient_x = 0.0;
+            let mut gradient_y = 0.0;
+
+            for i in -sobel_radius..=sobel_radius {
+                for j in -sobel_radius..=sobel_radius {
+                    let mut load_offset = gloc + i * cols + j;
+
+                    if gy + i < 0 {
+                        // top contour
+                        load_offset = gx + j;
+                    } else if gy + i > rows - 1 {
+                        // bottom contour
+                        load_offset = (rows - 1) * cols + gx + j;
+                    }
+
+                    // Adjust so we are within image horizontally
+                    if gx + j < 0 {
+                        // left contour
+                        load_offset -= gx + j;
+                    } else if gx + j > cols - 1 {
+                        // right contour
+                        load_offset = load_offset - gx - j + cols - 1;
+                    }
+
+                    let gval = input[load_offset as usize];
+                    let s_offset = (sobel_radius + i) * (sobel_size as i64) + sobel_radius + j;
+                    gradient_x += gval * sx[s_offset as usize];
+                    gradient_y += gval * sy[s_offset as usize];
+                }
+            }
+
+            result[gloc as usize] = f32::sqrt(gradient_x * gradient_x + gradient_y * gradient_y);
+        }
+    }
+
+    result
+}
+
+fn compute_max_gradient(rows: usize, cols: usize, gradient: &[f32]) -> f32 {
+    let mut max = gradient[0];
+
+    for i in 1..rows * cols {
+        if max < gradient[i] {
+            max = gradient[i];
+        }
+    }
+
+    max
+}
+
+fn reject_zero_crossings(
+    rows: usize,
+    cols: usize,
+    zero_crossings: &[f32],
+    gradient: &[f32],
+    max_gradient: f32,
+    theta: f32,
+) -> Vec<f32> {
+    let mut result = vec![0.0; rows * cols];
+
+    for gy in 0..rows {
+        for gx in 0..cols {
+            result[gy * cols + gx] = if zero_crossings[gy * cols + gx] > 0.0
+                && gradient[gy * cols + gx] > theta * max_gradient
+            {
+                1.0
+            } else {
+                0.0
+            };
+        }
+    }
+
+    result
+}
+
+pub fn edge_detection(
+    rows: usize,
+    cols: usize,
+    gaussian_size: usize,
+    szb: usize,
+    soebel_size: usize,
+    input: &[f32],
+    gaussian_filter: &[f32],
+    structure: &[f32],
+    sx: &[f32],
+    sy: &[f32],
+    theta: f32,
+) -> Vec<f32> {
+    let smoothed = gaussian_smoothing(rows, cols, gaussian_size, input, gaussian_filter);
+    let laplacian = laplacian_estimate(rows, cols, szb, &smoothed, structure);
+    let zcs = compute_zero_crossings(rows, cols, szb, &laplacian, structure);
+    let gradient = compute_gradient(rows, cols, soebel_size, &smoothed, sx, sy);
+    let max_gradient = compute_max_gradient(rows, cols, &gradient);
+    reject_zero_crossings(rows, cols, &zcs, &gradient, max_gradient, theta)
+}
diff --git a/juno_samples/edge_detection/src/gpu.sch b/juno_samples/edge_detection/src/gpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..d7330c679edc6a890efe5c0134369ca411ac7d67
--- /dev/null
+++ b/juno_samples/edge_detection/src/gpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+inline(*);
+let out = auto-outline(*);
+gpu(out.edge_detection);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+//forkify(*);
+infer-schedules(*);
+
+gcm(*);
diff --git a/juno_samples/edge_detection/src/main.rs b/juno_samples/edge_detection/src/main.rs
new file mode 100644
index 0000000000000000000000000000000000000000..eda65016c60de26a5cd1fe21d8d95dcaff826cf9
--- /dev/null
+++ b/juno_samples/edge_detection/src/main.rs
@@ -0,0 +1,307 @@
+#![feature(concat_idents)]
+
+mod edge_detection_rust;
+
+use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
+
+use std::slice::from_raw_parts;
+
+use clap::Parser;
+
+use opencv::core::{Mat, Size, CV_32F, CV_8U};
+use opencv::highgui::{imshow, wait_key};
+use opencv::imgproc::{cvt_color_def, ColorConversionCodes};
+use opencv::prelude::{MatTraitConst, VideoCaptureTrait, VideoCaptureTraitConst};
+use opencv::videoio::{VideoCapture, VideoCaptureProperties, VideoWriter, VideoWriterTrait};
+
+juno_build::juno!("edge_detection");
+
+#[derive(Parser)]
+#[clap(author, version, about, long_about = None)]
+struct EdgeDetectionInputs {
+    input: String,
+    #[clap(short, long)]
+    display: bool,
+    #[clap(short, long, value_name = "PATH")]
+    output: Option<String>,
+    #[clap(short, long)]
+    verify: bool,
+    #[clap(long = "display-verify")]
+    display_verify: bool,
+    #[clap(long = "output-verify", value_name = "PATH")]
+    output_verify: Option<String>,
+    #[clap(short, long, value_name = "COUNT")]
+    frames: Option<usize>,
+}
+
+fn load_frame(video: &mut VideoCapture) -> Mat {
+    let mut frame = Mat::default();
+
+    let Ok(true) = video.read(&mut frame) else {
+        panic!("Failed to load frame");
+    };
+    let result = if frame.channels() == 3 {
+        let mut converted = Mat::default();
+        let () = cvt_color_def(
+            &frame,
+            &mut converted,
+            ColorConversionCodes::COLOR_BGR2GRAY.into(),
+        )
+        .expect("Failure in conversion to grayscale");
+        let mut result = Mat::default();
+        let () = converted
+            .convert_to(&mut result, CV_32F, 1.0 / 255.0, 0.0)
+            .expect("Failure in conversion to f32");
+        result
+    } else if frame.channels() == 1 {
+        let mut result = Mat::default();
+        let () = frame
+            .convert_to(&mut result, CV_32F, 1.0 / 255.0, 0.0)
+            .expect("Failure in conversion to f32");
+        result
+    } else {
+        panic!("Expected either RGB or grayscale image");
+    };
+
+    assert!(result.is_continuous());
+    result
+}
+
+fn frame_from_slice(frame: &[f32], height: usize, width: usize) -> Mat {
+    let result = Mat::from_slice(frame)
+        .expect("Failed to create matrix from result")
+        .reshape(1, height as i32)
+        .expect("Failed to reshape result matrix")
+        .clone_pointee();
+    assert!(result.cols() == width as i32);
+
+    // Convert to u8 since the VideoWriter seems to require that
+    let mut converted = Mat::default();
+    let () = result
+        .convert_to(&mut converted, CV_8U, 255.0, 0.0)
+        .expect("Failure in conversion to u8");
+
+    converted
+}
+
+fn edge_detection_harness(args: EdgeDetectionInputs) {
+    let EdgeDetectionInputs {
+        input,
+        display,
+        output,
+        verify,
+        display_verify,
+        output_verify,
+        frames,
+    } = args;
+
+    let gs: usize = 7;
+    let gaussian_filter: Vec<f32> = vec![
+        0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036, 0.000363, 0.003676,
+        0.014662, 0.023226, 0.014662, 0.003676, 0.000363, 0.001446, 0.014662, 0.058488, 0.092651,
+        0.058488, 0.014662, 0.001446, 0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226,
+        0.002291, 0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446, 0.000363,
+        0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363, 0.000036, 0.000363, 0.001446,
+        0.002291, 0.001446, 0.000363, 0.000036,
+    ];
+    #[cfg(not(feature = "cuda"))]
+    let gaussian_filter_h = HerculesCPURef::from_slice(&gaussian_filter);
+    #[cfg(feature = "cuda")]
+    let gaussian_filter_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&gaussian_filter));
+    #[cfg(feature = "cuda")]
+    let gaussian_filter_h = gaussian_filter_cuda.get_ref();
+
+    let sz: usize = 3;
+    let structure: Vec<f32> = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
+    #[cfg(not(feature = "cuda"))]
+    let structure_h = HerculesCPURef::from_slice(&structure);
+    #[cfg(feature = "cuda")]
+    let structure_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&structure));
+    #[cfg(feature = "cuda")]
+    let structure_h = structure_cuda.get_ref();
+
+    let sb: usize = 3;
+    let sx: Vec<f32> = vec![-1.0, 0.0, 1.0, -2.0, 0.0, 2.0, -1.0, 0.0, 1.0];
+    #[cfg(not(feature = "cuda"))]
+    let sx_h = HerculesCPURef::from_slice(&sx);
+    #[cfg(feature = "cuda")]
+    let sx_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sx));
+    #[cfg(feature = "cuda")]
+    let sx_h = sx_cuda.get_ref();
+
+    let sy: Vec<f32> = vec![-1.0, -2.0, -1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 1.0];
+    #[cfg(not(feature = "cuda"))]
+    let sy_h = HerculesCPURef::from_slice(&sy);
+    #[cfg(feature = "cuda")]
+    let sy_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&sy));
+    #[cfg(feature = "cuda")]
+    let sy_h = sy_cuda.get_ref();
+
+    let theta: f32 = 0.1;
+
+    let mut video = VideoCapture::from_file_def(&input).expect("Error loading video");
+    assert!(video.is_opened().unwrap());
+
+    let fps = video
+        .get(VideoCaptureProperties::CAP_PROP_FPS.into())
+        .expect("Error getting fps");
+
+    let num_frames = video
+        .get(VideoCaptureProperties::CAP_PROP_FRAME_COUNT.into())
+        .expect("Error getting number of frames") as usize;
+    let width = video
+        .get(VideoCaptureProperties::CAP_PROP_FRAME_WIDTH.into())
+        .expect("Error getting width") as usize;
+    let height = video
+        .get(VideoCaptureProperties::CAP_PROP_FRAME_HEIGHT.into())
+        .expect("Error getting height") as usize;
+
+    let num_frames = if let Some(frames) = frames {
+        usize::min(frames, num_frames)
+    } else {
+        num_frames
+    };
+
+    let mut r = runner!(edge_detection);
+
+    let mut output = output.map(|filename| {
+        VideoWriter::new(
+            &filename,
+            VideoWriter::fourcc('m', 'p', '4', 'v').unwrap(),
+            fps,
+            Size {
+                width: width as i32,
+                height: height as i32,
+            },
+            false,
+        )
+        .expect("Error opening output video")
+    });
+
+    let mut output_verify = output_verify.map(|filename| {
+        VideoWriter::new(
+            &filename,
+            VideoWriter::fourcc('m', 'p', '4', 'v').unwrap(),
+            fps,
+            Size {
+                width: width as i32,
+                height: height as i32,
+            },
+            false,
+        )
+        .expect("Error opening output video")
+    });
+
+    for i in 0..num_frames {
+        let frame = load_frame(&mut video);
+        let ptr = frame.ptr_def().unwrap() as *const f32;
+
+        assert!(frame.rows() as usize == height);
+        assert!(frame.cols() as usize == width);
+
+        let input = unsafe { from_raw_parts(ptr, height * width) };
+
+        #[cfg(not(feature = "cuda"))]
+        let input_h = HerculesCPURef::from_slice(input);
+        #[cfg(feature = "cuda")]
+        let input_cuda = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(input));
+        #[cfg(feature = "cuda")]
+        let input_h = input_cuda.get_ref();
+
+        let result = async_std::task::block_on(async {
+            r.run(
+                height as u64,
+                width as u64,
+                gs as u64,
+                sz as u64,
+                sb as u64,
+                input_h,
+                gaussian_filter_h.clone(),
+                structure_h.clone(),
+                sx_h.clone(),
+                sy_h.clone(),
+                theta,
+            )
+            .await
+        });
+
+        #[cfg(not(feature = "cuda"))]
+        let result : Box<[f32]> = result.as_slice::<f32>().to_vec().into_boxed_slice();
+        #[cfg(feature = "cuda")]
+        let result : Box<[f32]> = {
+            let num_out = unsafe { result.__size() / std::mem::size_of::<f32>() };
+            let mut res_cpu: Box<[f32]> = vec![0.0; num_out].into_boxed_slice();
+            result.to_cpu_ref(&mut res_cpu);
+            res_cpu
+        };
+
+        if display {
+            let result = frame_from_slice(&result, height, width);
+            let () = imshow("Juno", &result).expect("Failure in displaying image");
+        }
+        if let Some(ref mut output) = output {
+            let result = frame_from_slice(&result, height, width);
+            let () = output.write(&result).expect("Failure in writing frame");
+        }
+
+        if verify {
+            let rust_result = edge_detection_rust::edge_detection(
+                height,
+                width,
+                gs,
+                sz,
+                sb,
+                input,
+                &gaussian_filter,
+                &structure,
+                &sx,
+                &sy,
+                theta,
+            );
+
+            assert_eq!(result.as_ref(), <Vec<f32> as AsRef<[f32]>>::as_ref(&rust_result));
+            println!("Frames {} match", i);
+
+            if display_verify {
+                let rust_result = frame_from_slice(&rust_result, height, width);
+                let () = imshow("Rust", &rust_result).expect("Failure in displaying image");
+            }
+            if let Some(ref mut output) = output_verify {
+                let result = frame_from_slice(&rust_result, height, width);
+                let () = output.write(&result).expect("Failure in writing frame");
+            }
+        }
+
+        if display || (verify && display_verify) {
+            let _ = wait_key(0);
+        }
+    }
+
+    if let Some(mut output) = output {
+        let () = output.release().expect("Failure releasing output video");
+    }
+    if let Some(mut output) = output_verify {
+        let () = output.release().expect("Failure releasing output video");
+    }
+}
+
+fn main() {
+    let args = EdgeDetectionInputs::parse();
+    edge_detection_harness(args);
+}
+
+#[test]
+fn edge_detection_test() {
+    edge_detection_harness(EdgeDetectionInputs {
+        input: "examples/formula1_scaled.mp4".to_string(),
+        display: false,
+        output: None,
+        verify: true,
+        display_verify: false,
+        output_verify: None,
+        // Limit frames to keep runtime low
+        frames: Some(2),
+    });
+}