From f2ce75095348fa2ab5491676ca36c03063c05cb6 Mon Sep 17 00:00:00 2001 From: prrathi <prrathi10@gmail.com> Date: Sun, 19 Jan 2025 16:40:21 +0000 Subject: [PATCH] yay --- hercules_cg/src/gpu.rs | 242 +++++++++++++--------------------- hercules_rt/src/rtdefs.cu | 11 +- juno_samples/cava/src/main.rs | 12 +- 3 files changed, 106 insertions(+), 159 deletions(-) diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index b14a136f..bb28db8a 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -292,13 +292,9 @@ enum CGType { impl GPUContext<'_> { fn codegen_function<W: Write>(&self, w: &mut W) -> Result<(), Error> { - // If run_debug, wrapping C host code is self-contained with malloc, etc, - // else it only does kernel launch. - let run_debug = false; - // Emit all code up to the "goto" to Start's block let mut top = String::new(); - self.codegen_kernel_begin(run_debug, &mut top)?; + self.codegen_kernel_begin(&mut top)?; let mut dynamic_shared_offset = "0".to_string(); self.codegen_dynamic_constants(&mut top)?; self.codegen_declare_data(&mut top)?; @@ -344,6 +340,7 @@ impl GPUContext<'_> { &fork_thread_quota_map, &extra_dim_collects, &mut dynamic_shared_offset, + num_blocks, num_threads, &mut gotos, )?; @@ -358,14 +355,14 @@ impl GPUContext<'_> { // Emit host launch code let mut host_launch = String::new(); - self.codegen_launch_code(run_debug, num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; + self.codegen_launch_code(num_blocks, num_threads, &dynamic_shared_offset, &mut host_launch)?; write!(w, "{}", host_launch)?; Ok(()) } // Emit kernel headers, signature, arguments, and dynamic shared memory declaration - fn codegen_kernel_begin(&self, run_debug: bool, w: &mut String) -> Result<(), Error> { + fn codegen_kernel_begin(&self, w: &mut String) -> Result<(), Error> { write!(w, " #include <assert.h> #include <stdio.h> @@ -390,8 +387,8 @@ namespace cg = cooperative_groups; write!( w, - "__global__ void __launch_bounds__({}) {}{}(", - self.kernel_params.max_num_threads, self.function.name, if run_debug { "" } else { "_gpu" } + "__global__ void __launch_bounds__({}) {}_gpu(", + self.kernel_params.max_num_threads, self.function.name )?; // The first set of parameters are dynamic constants. let mut first_param = true; @@ -534,129 +531,59 @@ namespace cg = cooperative_groups; Ok(()) } - fn codegen_launch_code(&self, run_debug: bool, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { + fn codegen_launch_code(&self, num_blocks: usize, num_threads: usize, dynamic_shared_offset: &str, w: &mut String) -> Result<(), Error> { // The following steps are for host-side C function arguments, but we also // need to pass arguments to kernel, so we keep track of the arguments here. let mut pass_args = String::new(); - if run_debug { - write!(w, " -int main() {{ -")?; - // The first set of parameters are dynamic constants. - let mut first_param = true; - for idx in 0..self.function.num_dynamic_constants { - if first_param { - first_param = false; - } else { - write!(pass_args, ", ")?; - } - write!(w, "\tunsigned long long dc_p{} = 1ull;\n", idx)?; - write!(pass_args, "dc_p{}", idx)?; - } - self.codegen_dynamic_constants(w)?; - // The second set of parameters are normal arguments. - for (idx, ty) in self.function.param_types.iter().enumerate() { - if first_param { - first_param = false; - } else { - write!(pass_args, ", ")?; - } - let param_type = self.get_type(*ty, false); - if self.types[ty.idx()].is_primitive() { - write!(w, "\t{} p{} = 1;\n", param_type, idx)?; - } else { - let param_size = self.get_size(*ty, None, None); - write!(w, "\t{} p{};\n", param_type, idx)?; - write!(w, "\tif (cudaMalloc((void**)&p{}, {}) != cudaSuccess) {{\n", idx, param_size)?; - write!(w, "\t\tprintf(\"Error allocating memory for parameter %d\\n\", {});\n", idx)?; - write!(w, "\t\treturn -1;\n")?; - write!(w, "\t}}\n")?; - } - write!(pass_args, "p{}", idx)?; - } - // Pull primitive return to a pointer parameter - if self.types[self.return_type_id.idx()].is_primitive() { - let ret_type_no_pnt = self.get_type(*self.return_type_id, false); - let ret_type = self.get_type(*self.return_type_id, true); - write!(w, "\t{} ret;\n", ret_type)?; - write!(w, "\tif (cudaMalloc((void**)&ret, sizeof({})) != cudaSuccess) {{\n", ret_type_no_pnt)?; - write!(w, "\t\tprintf(\"Error allocating memory for return value\\n\");\n")?; - write!(w, "\t\treturn -1;\n")?; - write!(w, "\t}}\n")?; - write!(pass_args, ", ret")?; - } - write!(w, "\t{}<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; - write!(w, "\tbool skip = false;\n")?; - write!(w, "\tcudaError_t err = cudaGetLastError();\n")?; - write!(w, "\tif (err != cudaSuccess) {{\n")?; - write!(w, "\t\tprintf(\"Error launching kernel: %s\\n\", cudaGetErrorString(err));\n")?; - write!(w, "\t\tskip = true;\n")?; - write!(w, "\t}}\n")?; - write!(w, "\tif (cudaDeviceSynchronize() != cudaSuccess && !skip) {{\n")?; - write!(w, "\t\tprintf(\"Error synchronizing device\\n\");\n")?; - write!(w, "\t\tskip = true;\n")?; - write!(w, "\t}}\n")?; - for (idx, ty) in self.function.param_types.iter().enumerate() { - if !self.types[ty.idx()].is_primitive() { - write!(w, "\tcudaFree(p{});\n", idx)?; - } - } - if self.types[self.return_type_id.idx()].is_primitive() { - write!(w, "\tcudaFree(ret);\n")?; - } - } - - else { - let ret_primitive = self.types[self.return_type_id.idx()].is_primitive(); - let ret_type = self.get_type(*self.return_type_id, false); - write!(w, " + let ret_primitive = self.types[self.return_type_id.idx()].is_primitive(); + let ret_type = self.get_type(*self.return_type_id, false); + write!(w, " extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; - // The first set of parameters are dynamic constants. - let mut first_param = true; - for idx in 0..self.function.num_dynamic_constants { - if first_param { - first_param = false; - } else { - write!(w, ", ")?; - write!(pass_args, ", ")?; - } - write!(w, "unsigned long long dc_p{}", idx)?; - write!(pass_args, "dc_p{}", idx)?; - } - // The second set of parameters are normal arguments. - for (idx, ty) in self.function.param_types.iter().enumerate() { - if first_param { - first_param = false; - } else { - write!(w, ", ")?; - write!(pass_args, ", ")?; - } - let param_type = self.get_type(*ty, false); - write!(w, "{} p{}", param_type, idx)?; - write!(pass_args, "p{}", idx)?; - } - write!(w, ") {{\n")?; - // Pull primitive return as pointer parameter for kernel - if ret_primitive { - let ret_type_pnt = self.get_type(*self.return_type_id, true); - write!(w, "\t{} ret;\n", ret_type_pnt)?; - write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; - if !first_param { - write!(pass_args, ", ")?; - } - write!(pass_args, "ret")?; + // The first set of parameters are dynamic constants. + let mut first_param = true; + for idx in 0..self.function.num_dynamic_constants { + if first_param { + first_param = false; + } else { + write!(w, ", ")?; + write!(pass_args, ", ")?; } - write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; - write!(w, "\tcudaDeviceSynchronize();\n")?; - if ret_primitive { - write!(w, "\t{} host_ret;\n", ret_type)?; - write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; - write!(w, "\treturn host_ret;\n")?; + write!(w, "unsigned long long dc_p{}", idx)?; + write!(pass_args, "dc_p{}", idx)?; + } + // The second set of parameters are normal arguments. + for (idx, ty) in self.function.param_types.iter().enumerate() { + if first_param { + first_param = false; } else { - write!(w, "\treturn p{};\n", self.return_param_idx.unwrap())?; + write!(w, ", ")?; + write!(pass_args, ", ")?; } + let param_type = self.get_type(*ty, false); + write!(w, "{} p{}", param_type, idx)?; + write!(pass_args, "p{}", idx)?; + } + write!(w, ") {{\n")?; + // Pull primitive return as pointer parameter for kernel + if ret_primitive { + let ret_type_pnt = self.get_type(*self.return_type_id, true); + write!(w, "\t{} ret;\n", ret_type_pnt)?; + write!(w, "\tcudaMalloc((void**)&ret, sizeof({}));\n", ret_type)?; + if !first_param { + write!(pass_args, ", ")?; + } + write!(pass_args, "ret")?; + } + write!(w, "\t{}_gpu<<<{}, {}, {}>>>({});\n", self.function.name, num_blocks, num_threads, dynamic_shared_offset, pass_args)?; + write!(w, "\tcudaDeviceSynchronize();\n")?; + write!(w, "\tfflush(stdout);\n")?; + if ret_primitive { + write!(w, "\t{} host_ret;\n", ret_type)?; + write!(w, "\tcudaMemcpy(&host_ret, ret, sizeof({}), cudaMemcpyDeviceToHost);\n", ret_type)?; + write!(w, "\treturn host_ret;\n")?; + } else { + write!(w, "\treturn p{};\n", self.return_param_idx.unwrap())?; } - write!(w, "}}\n")?; Ok(()) } @@ -903,7 +830,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, KernelState::OutBlock, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + self.codegen_data_node(*data, KernelState::OutBlock, Some(1), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } Ok(()) }) @@ -921,6 +848,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; fork_thread_quota_map: &HashMap<NodeID, (usize, usize, usize)>, extra_dim_collects: &HashSet<TypeID>, dynamic_shared_offset: &mut String, + num_blocks: usize, num_threads: usize, gotos: &mut BTreeMap<NodeID, CudaGoto>, ) -> Result<(), Error> { @@ -935,7 +863,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, None, None, None, init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + self.codegen_data_node(*data, state, Some(num_blocks), None, None, None, false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } // Then generate data and control for the single block fork if it exists @@ -949,7 +877,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let term = &mut goto.term; let mut tabs = self.codegen_control_node(*control, Some(num_threads), Some(num_threads), Some(1), init, post_init, term)?; for data in self.bbs.1[control.idx()].iter() { - self.codegen_data_node(*data, state, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; + self.codegen_data_node(*data, state, None, Some(num_threads), None, Some(block_fork.unwrap()), false, extra_dim_collects, dynamic_shared_offset, body, &mut tabs)?; } } } @@ -1020,6 +948,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; self.codegen_data_node( *data, state, + None, Some(use_thread_quota), parallel_factor, Some(curr_fork), @@ -1052,6 +981,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; &self, id: NodeID, state: KernelState, + num_blocks: Option<usize>, use_thread_quota: Option<usize>, parallel_factor: Option<usize>, nesting_fork: Option<NodeID>, @@ -1348,17 +1278,18 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; write!(w, "{}{} = *({});\n", tabs, define_variable, collect_with_indices)?; } } else { - // Divide up "elements", which are collection size divided - // by element size, among threads. + if KernelState::OutBlock == state && num_blocks.unwrap() > 1 { + panic!("GPU can't guarantee correctness for multi-block collection reads"); + } let cg_tile = match state { - KernelState::OutBlock => "grid".to_string(), - KernelState::InBlock => "block".to_string(), + KernelState::OutBlock | KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), }; - let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); - let data_type = self.get_type(data_type_id, false); - let num_elements = format!("(({}) / sizeof({}))", data_size, data_type.strip_suffix('*').unwrap()); - write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {} / {}.size()) {{\n", tabs, cg_tile, num_elements, num_elements, cg_tile)?; + // Divide up "elements", which are collection size divided + // by element size, among threads. + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects), Some(true)); + let num_elements = format!("({})", data_size); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, define_variable, collect_with_indices)?; write!(w, "{}}}\n", tabs)?; write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; @@ -1378,9 +1309,11 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let collect_with_indices = self.codegen_collect(*collect, indices, is_char, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; + if KernelState::OutBlock == state && num_blocks.unwrap() > 1 { + panic!("GPU can't guarantee correctness for multi-block collection writes"); + } let cg_tile = match state { - KernelState::OutBlock => "grid".to_string(), - KernelState::InBlock => "block".to_string(), + KernelState::OutBlock | KernelState::InBlock => "block".to_string(), KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), }; if self.types[data_type_id.idx()].is_primitive() { @@ -1393,17 +1326,16 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; } write!(w, "{}}}\n", tabs)?; } else { - let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects)); - let data_type = self.get_type(data_type_id, false); - let num_elements = format!("(({}) / sizeof({}))", data_size, data_type.strip_suffix('*').unwrap()); - write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {} / {}.size()) {{\n", tabs, cg_tile, num_elements, num_elements, cg_tile)?; + let data_size = self.get_size(data_type_id, None, Some(extra_dim_collects), Some(true)); + let num_elements = format!("({})", data_size); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; write!(w, "{}\t*({} + i) = *({} + i);\n", tabs, collect_with_indices, data_variable)?; write!(w, "{}}}\n", tabs)?; write!(w, "{}if ({}.thread_rank() < {} % {}.size()) {{\n", tabs, cg_tile, num_elements, cg_tile)?; write!(w, "{}\t*({} + {}.size() * ({} / {}.size()) + {}.thread_rank()) = *({} + {}.size() * ({} / {}.size()) + {}.thread_rank());\n", tabs, collect_with_indices, cg_tile, num_elements, cg_tile, cg_tile, data_variable, cg_tile, num_elements, cg_tile, cg_tile)?; write!(w, "{}}}\n", tabs)?; - write!(w, "{}{}.sync();\n", tabs, cg_tile)?; } + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; let collect_variable = self.get_value(*collect, false, false); write!(w, "{}{} = {};\n", tabs, define_variable, collect_variable)?; } @@ -1587,7 +1519,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; for index in indices { match index { Index::Field(field) => { - self.get_size(type_id, Some(*field), None); + self.get_size(type_id, Some(*field), None, None); } // Variants of summations have zero offset Index::Variant(_) => {} @@ -1619,7 +1551,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; ")".repeat(array_indices.len() - if has_extra_dim { 1 } else { 0 }) )); if is_char { - let element_size = self.get_size(*element_type, None, None); + let element_size = self.get_size(*element_type, None, None, None); index_ptr.push_str(&format!(" * {}", element_size)); } } @@ -1668,7 +1600,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; Constant::Product(type_id, constant_fields) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects); + let size = self.get_size(*type_id, None, extra_dim_collects, None); *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; @@ -1680,7 +1612,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; for i in 0..constant_fields.len() { // For each field update offset and issue recursive call let field_type = self.get_type(type_fields[i], true); - let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects); + let offset = self.get_size(type_fields[i], Some(i), extra_dim_collects, None); let field_constant = &self.constants[constant_fields[i].idx()]; if field_constant.is_scalar() { self.codegen_constant( @@ -1700,7 +1632,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; Constant::Summation(type_id, variant, field) => { if allow_allocate { let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects); + let size = self.get_size(*type_id, None, extra_dim_collects, None); *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; write!(w, "{}{} = dynamic_shared + dynamic_shared_offset;\n", tabs, name)?; @@ -1735,7 +1667,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; panic!("Nested array constant should not be re-allocated"); } let alignment = self.get_alignment(*type_id); - let size = self.get_size(*type_id, None, extra_dim_collects); + let size = self.get_size(*type_id, None, extra_dim_collects, None); let element_type = self.get_type(*element_type, true); *dynamic_shared_offset = format!("(({} + {} - 1) / {}) * {}", dynamic_shared_offset, alignment, alignment, alignment); write!(w, "{}dynamic_shared_offset = {};\n", tabs, dynamic_shared_offset)?; @@ -1752,11 +1684,15 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; * and offset to 2nd field. This is useful for constant initialization and read/write * index math. */ - fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>) -> String { + fn get_size(&self, type_id: TypeID, num_fields: Option<usize>, extra_dim_collects: Option<&HashSet<TypeID>>, exclude_element_size: Option<bool>) -> String { match &self.types[type_id.idx()] { Type::Array(element_type, extents) => { let array_size = multiply_dcs(if extra_dim_collects.is_some() && extra_dim_collects.unwrap().contains(&type_id) { &extents[1..] } else { extents }); - format!("{} * {}", self.get_alignment(*element_type), array_size) + if exclude_element_size.unwrap_or(false) { + array_size + } else { + format!("{} * {}", self.get_alignment(*element_type), array_size) + } } Type::Product(fields) => { let num_fields = &num_fields.unwrap_or(fields.len()); @@ -1764,7 +1700,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; .iter() .enumerate() .filter(|(i, _)| i < num_fields) - .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects), self.get_alignment(*id))) + .map(|(_, id)| (self.get_size(*id, None, extra_dim_collects, None), self.get_alignment(*id))) .fold(String::from("0"), |acc, (size, align)| { if acc == "0" { size @@ -1779,7 +1715,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; format!( "{} - {}", with_field, - self.get_size(fields[*num_fields], None, extra_dim_collects) + self.get_size(fields[*num_fields], None, extra_dim_collects, None) ) } else { with_field @@ -1789,7 +1725,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // The argmax variant by size is not guaranteed to be same as // argmax variant by alignment, eg product of 3 4-byte primitives // vs 1 8-byte primitive, so we need to calculate both. - let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects)).fold( + let max_size = variants.iter().map(|id| self.get_size(*id, None, extra_dim_collects, None)).fold( String::from("0"), |acc, x| { if acc == "0" { diff --git a/hercules_rt/src/rtdefs.cu b/hercules_rt/src/rtdefs.cu index b7378d81..6c59abe2 100644 --- a/hercules_rt/src/rtdefs.cu +++ b/hercules_rt/src/rtdefs.cu @@ -7,7 +7,7 @@ extern "C" { } return ptr; } - + void *cuda_alloc_zeroed(size_t size) { void *ptr = cuda_alloc(size); if (!ptr) { @@ -15,23 +15,24 @@ extern "C" { } cudaError_t res = cudaMemset(ptr, 0, size); if (res != cudaSuccess) { + cuda_dealloc(ptr); return NULL; } return ptr; } - + void cuda_dealloc(void *ptr) { cudaFree(ptr); } - + void copy_cpu_to_cuda(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); } - + void copy_cuda_to_cpu(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost); } - + void copy_cuda_to_cuda(void *dst, void *src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice); } diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index 9c2f99a8..a36d8826 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -171,7 +171,17 @@ fn cava_harness(args: CavaInputs) { .expect("Error saving verification image"); } - assert_eq!(result, cpu_result.into(), "Verification failed, mismatch"); + let max_diff = result.iter() + .zip(cpu_result.iter()) + .map(|(a, b)| (*a as i16 - *b as i16).abs()) + .max() + .unwrap_or(0); + + assert!( + max_diff <= 3, + "Verification failed: maximum pixel difference of {} exceeds threshold of 3", + max_diff + ); } } -- GitLab