diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
index b3bfb5a6d76e1f50698328f3d6ed6f0624b69cc6..413e2f2bbf6cec280b522ffe2e42af7c5622bcea 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h
@@ -19,14 +19,12 @@ void *handleTensorFftApproximationTuples(
     case GPUNodeConfiguration::APPROX::FP32: {
       void *t_out;
       RC->resume_profiler();
-      t_out = tensorFft(input); // TODO: correct name here
+      t_out = tensorFft(input);
       RC->pause_profiler();
       std::pair<double, double> pinfo = RC->get_time_energy();
       RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime(
-          "tensorFft", pinfo.first); // and here
-      RC->addToCurrentIterationComputeEnergy(
-          "tensorFft", pinfo.second); // and here
+      RC->addToCurrentIterationComputeTime("tensorFft", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorFft", pinfo.second);
       return t_out;
     }
     default:
@@ -55,29 +53,36 @@ void *handleTensorReduceApproximationTuples(
     case GPUNodeConfiguration::APPROX::FP32: {
       void *t_out;
       RC->resume_profiler();
-      t_out = tensorReduce(input, axis, func); // TODO: correct name here
+      t_out = tensorReduce(input, axis, func, false);
       RC->pause_profiler();
       std::pair<double, double> pinfo = RC->get_time_energy();
       RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime(
-          "tensorReduce", pinfo.first); // and here
-      RC->addToCurrentIterationComputeEnergy(
-          "tensorReduce", pinfo.second); // and here
+      RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorReduce(input, axis, func, true);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorReduceHalf", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorReduceHalf", pinfo.second);
       return t_out;
     }
     case GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING: {
       void *t_out;
       RC->resume_profiler();
-      t_out = tensorReductionSamplingReduce(input, axis, func, param); // TODO: correct name here
+      t_out = tensorReductionSamplingReduce(input, axis, func, param);
       RC->pause_profiler();
       std::pair<double, double> pinfo = RC->get_time_energy();
       RC->reset_profiler();
       RC->addToCurrentIterationComputeTime(
-          "tensorReductionSamplingReduce",
-          pinfo.first); // and here
+          "tensorReductionSamplingReduce", pinfo.first);
       RC->addToCurrentIterationComputeEnergy(
-          "tensorReductionSamplingReduce",
-          pinfo.second); // and here
+          "tensorReductionSamplingReduce", pinfo.second);
       return t_out;
     }
     default:
@@ -106,14 +111,12 @@ void *handleTensorProjectiveTApproximationTuples(
     case GPUNodeConfiguration::APPROX::FP32: {
       void *t_out;
       RC->resume_profiler();
-      t_out = tensorProjectiveT(input, transformation); // TODO: correct name here
+      t_out = tensorProjectiveT(input, transformation);
       RC->pause_profiler();
       std::pair<double, double> pinfo = RC->get_time_energy();
       RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime(
-          "tensorProjectiveT", pinfo.first); // and here
-      RC->addToCurrentIterationComputeEnergy(
-          "tensorProjectiveT", pinfo.second); // and here
+      RC->addToCurrentIterationComputeTime("tensorProjectiveT", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorProjectiveT", pinfo.second);
       return t_out;
     }
     default:
@@ -142,14 +145,23 @@ void *handleTensorMap1ApproximationTuples(
     case GPUNodeConfiguration::APPROX::FP32: {
       void *t_out;
       RC->resume_profiler();
-      t_out = tensorMap1(func, input); // TODO: correct name here
+      t_out = tensorMap1(func, input, false);
       RC->pause_profiler();
       std::pair<double, double> pinfo = RC->get_time_energy();
       RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime(
-          "tensorMap1", pinfo.first); // and here
-      RC->addToCurrentIterationComputeEnergy(
-          "tensorMap1", pinfo.second); // and here
+      RC->addToCurrentIterationComputeTime("tensorMap1", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorMap1", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorMap1(func, input, true);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorMap1Half", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorMap1Half", pinfo.second);
       return t_out;
     }
     default:
@@ -178,14 +190,23 @@ void *handleTensorMap2ApproximationTuples(
     case GPUNodeConfiguration::APPROX::FP32: {
       void *t_out;
       RC->resume_profiler();
-      t_out = tensorMap2(func, input1, input2); // TODO: correct name here
+      t_out = tensorMap2(func, input1, input2, false);
       RC->pause_profiler();
       std::pair<double, double> pinfo = RC->get_time_energy();
       RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime(
-          "tensorMap2", pinfo.first); // and here
-      RC->addToCurrentIterationComputeEnergy(
-          "tensorMap2", pinfo.second); // and here
+      RC->addToCurrentIterationComputeTime("tensorMap2", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorMap2", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorMap2(func, input1, input2, true);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorMap2Half", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorMap2Half", pinfo.second);
       return t_out;
     }
     default:
@@ -214,14 +235,23 @@ void *handleTensorMap3ApproximationTuples(
     case GPUNodeConfiguration::APPROX::FP32: {
       void *t_out;
       RC->resume_profiler();
-      t_out = tensorMap3(func, input1, input2, input3); // TODO: correct name here
+      t_out = tensorMap3(func, input1, input2, input3, false);
       RC->pause_profiler();
       std::pair<double, double> pinfo = RC->get_time_energy();
       RC->reset_profiler();
-      RC->addToCurrentIterationComputeTime(
-          "tensorMap3", pinfo.first); // and here
-      RC->addToCurrentIterationComputeEnergy(
-          "tensorMap3", pinfo.second); // and here
+      RC->addToCurrentIterationComputeTime("tensorMap3", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorMap3", pinfo.second);
+      return t_out;
+    }
+    case GPUNodeConfiguration::APPROX::FP16: {
+      void *t_out;
+      RC->resume_profiler();
+      t_out = tensorMap3(func, input1, input2, input3, true);
+      RC->pause_profiler();
+      std::pair<double, double> pinfo = RC->get_time_energy();
+      RC->reset_profiler();
+      RC->addToCurrentIterationComputeTime("tensorMap3Half", pinfo.first);
+      RC->addToCurrentIterationComputeEnergy("tensorMap3Half", pinfo.second);
       return t_out;
     }
     default:
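
Each case in these handler functions follows the same profile–call–record sequence; the new FP16 cases differ only in the boolean passed to the tensor op and in the name used for time/energy bookkeeping. A minimal sketch of that shared pattern, assuming the existing RC profiler API; the profiledCall helper itself is hypothetical and not part of this patch:

    template <typename F>
    void *profiledCall(const char *op_name, F &&run_op) {
      // Same sequence every case above repeats around its tensor call.
      RC->resume_profiler();
      void *t_out = run_op(); // e.g. [] { return tensorMap1(func, input, true); }
      RC->pause_profiler();
      std::pair<double, double> pinfo = RC->get_time_energy();
      RC->reset_profiler();
      RC->addToCurrentIterationComputeTime(op_name, pinfo.first);
      RC->addToCurrentIterationComputeEnergy(op_name, pinfo.second);
      return t_out;
    }
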
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h
index 813351da7b0b1fa53252c097bde3d2630efe012c..ac92b0fece1999557e959117fa30195a4949c6be 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/debug.h
@@ -48,10 +48,14 @@ void _checkCUBLAS(cublasStatus_t error, const char *file, int line);
 
 void _checkCUFFT(cufftResult error, const char *file, int line);
 
+void _checkCUDA(cudaError_t err, const char *file, int line);
+
 #define checkCUBLAS(err) _checkCUBLAS(err, __FILE__, __LINE__)
 
 #define checkCUFFT(err) _checkCUFFT(err, __FILE__, __LINE__)
 
+#define checkCUDA(err) _checkCUDA(err, __FILE__, __LINE__)
+
 void INFO(const char* format, ...){
   if(!LOG_INFO) // Don't print if logging info is disabled
     return;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
index 520c1954b237039aee681dccc44acdb9b94dc443..26e8e71bfc277fb57b92301f4db199f5aada0b60 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
@@ -1,6 +1,7 @@
 #ifndef IMAGE_PROCESSING_COMMON_H
 #define IMAGE_PROCESSING_COMMON_H
 
+#include <cuda_fp16.h>
 #include <cudnn.h>
 #include <device_launch_parameters.h>
 #include <vector>
@@ -19,6 +20,16 @@ template <typename T> __host__ T resolve_func_ptr(void *func_symbol_ptr) {
   return (T)v_func_ptr;
 }
 
+template <typename T> T *convertAndGetGPUData(Tensor *t) {
+  if (t->data_type == CUDNN_DATA_HALF)
+    throw std::runtime_error("Half tensor cannot be converted to this type");
+  return static_cast<T *>(t->gpu_data);
+}
+
+template <> float *convertAndGetGPUData<float>(Tensor *t);
+
+template <> half *convertAndGetGPUData<half>(Tensor *t);
+
 std::vector<size_t> sizes(Tensor *t);
 
 std::vector<size_t> sizes(const Dimension &dim);
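
convertAndGetGPUData<T> centralizes the FP32/FP16 handling that the map and reduce kernels below rely on: the float and half specializations (defined in common.cpp later in this patch) first convert the tensor's device data to the requested precision, while the generic template only reinterprets gpu_data and rejects half tensors. A small usage sketch, assuming t points at a valid Tensor:

    Tensor *t = /* ... */;
    float *f32 = convertAndGetGPUData<float>(t); // converts to FP32 if needed
    half *f16 = convertAndGetGPUData<half>(t);   // converts to FP16 if needed
    cufftComplex *c =
        convertAndGetGPUData<cufftComplex>(t);   // generic: raw gpu_data cast
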
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
index 1375fc8476304af1e74bf6d6be9349cdd802e98d..30ec8569755a77ae63293aa9ca09c34967c229a1 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
@@ -42,7 +42,7 @@ std::tuple<size_t *, Scalar **> make_cuda_params(
   std::array<Scalar *, N> gpu_datas;
   std::transform(srcs.begin(), srcs.end(), gpu_datas.begin(), [](Tensor *t) {
     hostToDeviceCopy(t);
-    return (Scalar *)t->gpu_data;
+    return convertAndGetGPUData<Scalar>(t);
   });
   size_t *cuda_strides;
   Scalar **cuda_gpu_data;
@@ -80,7 +80,8 @@ mapGeneral(void *host_func_ptr, const std::array<Tensor *, N> &srcs) {
   unsigned threads = std::min(max_threads, n_elem);
   unsigned grids = std::min(max_grid, ceilDiv(n_elem, threads));
   kernelMapBroadcast<Scalar, N><<<grids, threads>>>(
-      (Scalar *)target->gpu_data, n_elem, func_ptr, gpu_data, cuda_strides);
+      convertAndGetGPUData<Scalar>(target), n_elem, func_ptr, gpu_data,
+      cuda_strides);
   cudaDeviceSynchronize();
   checkCudaErrors(cudaGetLastError());
   return target;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
index 51d8c4da3f41bdbff079288f743ee8222d123292..4f9424c04d2cc462c64de00ec4d4fb6d01e1da37 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
@@ -133,7 +133,8 @@ __host__ Tensor *reduceDim(
   std::vector<size_t> in_sizes = sizes(src), out_sizes = in_sizes;
   out_sizes[axis] = 1;
   auto *target = (Tensor *)create4DTensor(
-      CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, out_sizes[0], out_sizes[1], out_sizes[2], out_sizes[3]);
+      CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, out_sizes[0], out_sizes[1],
+      out_sizes[2], out_sizes[3]);
   changeTensorPlacement(target, DEVICE);
 
   // Calculate schedule parameters
@@ -144,7 +145,7 @@ __host__ Tensor *reduceDim(
       in_sizes.begin() + axis + 1, in_sizes.end(), 1, std::multiplies<>());
   size_t num_rows = num_irows * num_orows;
 
-  // Calculate approximation parameters  
+  // Calculate approximation parameters
   if (skip_rate != 0.0f)
     INFO("Approximation happening...\n");
   size_t approx_row_size = (size_t)((1 - skip_rate) * row_size);
@@ -162,8 +163,8 @@ __host__ Tensor *reduceDim(
     dim3 threads(AlongDimTh, CrossDimTh);
     dim3 grid(grid_x, grid_y);
     kernelReduceDimParallel<Scalar, bin_float_op><<<grid, threads>>>(
-        (Scalar *)target->gpu_data, (Scalar *)src->gpu_data, init, func_ptr,
-        num_irows, num_orows, row_size, approx_row_size);
+        convertAndGetGPUData<Scalar>(target), convertAndGetGPUData<Scalar>(src),
+        init, func_ptr, num_irows, num_orows, row_size, approx_row_size);
   } else {
     DEBUG(
         "Reducing sequentially, row size = %lu, actually using %lu\n", row_size,
@@ -175,10 +176,10 @@ __host__ Tensor *reduceDim(
         std::min(MaxBlocksPerDim, num_orows), ceilDiv(MaxNBlocks, grid_x));
     dim3 grid(grid_x, grid_y);
     kernelReduceDimSeq<Scalar, bin_float_op><<<grid, threads>>>(
-        (Scalar *)target->gpu_data, (Scalar *)src->gpu_data, init, func_ptr,
-        num_irows, num_orows, row_size, approx_row_size);
+        convertAndGetGPUData<Scalar>(target), convertAndGetGPUData<Scalar>(src),
+        init, func_ptr, num_irows, num_orows, row_size, approx_row_size);
   }
   cudaDeviceSynchronize();
-  checkCudaErrors(cudaGetLastError());
+  checkCUDA(cudaGetLastError());
   return target;
 }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
index 69982e656d6eaf989d15939432e28b235eb9cb6e..e329de391b5a64ceb81b1aab6d071d0b8d852da4 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
@@ -1,19 +1,19 @@
 #ifndef IMG_TENSOR_RUNTIME_H
 #define IMG_TENSOR_RUNTIME_H
 
-#include <cstddef>
-#include "img_tensor_utils.h"
 #include "device_math.h"
+#include "img_tensor_utils.h"
+#include <cstddef>
 
 // ***                        Runtime declaration                        *** //
 void *tensorFft(void *input);
-void *tensorReduce(void *input, size_t axis, void *func);
+void *tensorReduce(void *input, size_t axis, void *func, bool fp16);
 void *tensorReductionSamplingReduce(
     void *input, size_t axis, void *func, int skip_level);
 void *tensorProjectiveT(void *input, void *transformation);
-void *tensorMap1(void *f, void *i);
-void *tensorMap2(void *f2, void *i1, void *i2);
-void *tensorMap3(void *f3, void *i1, void *i2, void *i3);
+void *tensorMap1(void *f, void *i, bool fp16);
+void *tensorMap2(void *f2, void *i1, void *i2, bool fp16);
+void *tensorMap3(void *f3, void *i1, void *i2, void *i3, bool fp16);
 
 // ***                      Wrapper API declaration                      *** //
 extern "C" {
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
index c30ddf25e5aeffef8123897af863b520846b62ce..f812460d175f42cbcf93c0f851b7ffc69fe3a14c 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/common.cpp
@@ -1,7 +1,17 @@
 #include "functional/common.h"
 
-#include <numeric>
 #include <functional>
+#include <numeric>
+
+template <> float *convertAndGetGPUData<float>(Tensor *t) {
+  convertToFP32(t);
+  return static_cast<float *>(t->gpu_data);
+}
+
+template <> half *convertAndGetGPUData<half>(Tensor *t) {
+  convertToFP16(t);
+  return static_cast<half *>(t->gpu_half_data);
+}
 
 std::vector<size_t> sizes(const Dimension &dim) {
   return std::vector<size_t>(dim.dim_sizes, dim.dim_sizes + dim.num_dims);
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
index 633c5a6655d024e9d25257abb2c2a4c8c9a9e05f..c4589b741cd05487814063cc60065bece5a229f1 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
@@ -32,14 +32,14 @@ void *tensorFft(void *input) {
   int n_batch = int(all_dims[0]) * int(all_dims[1]);
   // Prepare input data
   hostToDeviceCopy(t_input);
-  auto *input_cuda = (cufftReal *)t_input->gpu_data;
+  auto *input_cuda = convertAndGetGPUData<cufftReal>(t_input);
   // Define output data
   // FIXME: make a flag for float2_; not CUDNN_DATA_FLOAT.
   auto *out_tensor = (Tensor *)create4DTensor(
       CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, all_dims[0], all_dims[1], width,
       (height / 2 + 1));
   changeTensorPlacement(out_tensor, DEVICE);
-  auto *output_cuda = (cufftComplex *)out_tensor->gpu_data;
+  auto *output_cuda = convertAndGetGPUData<cufftComplex>(out_tensor);
   // Create a 2D FFT plan
   cufftHandle plan;
   checkCUFFT(cufftPlanMany(
@@ -53,7 +53,7 @@ void *tensorFft(void *input) {
   return out_tensor;
 }
 
-void *tensorReduce(void *input, size_t axis, void *func) {
+void *tensorReduce(void *input, size_t axis, void *func, bool fp16) {
   INFO("Reduce\n");
   auto *src = (Tensor *)input;
   if (axis >= src->dims.num_dims)
@@ -62,7 +62,10 @@ void *tensorReduce(void *input, size_t axis, void *func) {
     throw std::runtime_error("Not supported");
 
   // Skip 0% of sample
-  return reduceDim<float>(src, 0.0f, func, axis, 0.0f);
+  if (fp16)
+    return reduceDim<half>(src, 0.0f, func, axis, 0.0f);
+  else
+    return reduceDim<float>(src, 0.0f, func, axis, 0.0f);
 }
 
 void *tensorReductionSamplingReduce(
@@ -91,22 +94,31 @@ void *tensorProjectiveT(void *input, void *transformation) {
   abort();
 }
 
-void *tensorMap1(void *f, void *i) {
+void *tensorMap1(void *f, void *i, bool fp16) {
   INFO("Map1\n");
   auto *src = (Tensor *)i;
-  return mapGeneral<float, 1>(f, {src});
+  if (fp16)
+    return mapGeneral<half, 1>(f, {src});
+  else
+    return mapGeneral<float, 1>(f, {src});
 }
 
-void *tensorMap2(void *f2, void *i1, void *i2) {
+void *tensorMap2(void *f2, void *i1, void *i2, bool fp16) {
   INFO("Map2\n");
   auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2;
-  return mapGeneral<float, 2>(f2, {src1, src2});
+  if (fp16)
+    return mapGeneral<half, 2>(f2, {src1, src2});
+  else
+    return mapGeneral<float, 2>(f2, {src1, src2});
 }
 
-void *tensorMap3(void *f3, void *i1, void *i2, void *i3) {
+void *tensorMap3(void *f3, void *i1, void *i2, void *i3, bool fp16) {
   INFO("Map3\n");
   auto *src1 = (Tensor *)i1, *src2 = (Tensor *)i2, *src3 = (Tensor *)i3;
-  return mapGeneral<float, 3>(f3, {src1, src2, src3});
+  if (fp16)
+    return mapGeneral<half, 3>(f3, {src1, src2, src3});
+  else
+    return mapGeneral<float, 3>(f3, {src1, src2, src3});
 }
 
 // ***                     Wrapper API implementation                    *** //
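
With the new trailing fp16 flag, callers choose the precision per call; the PSNR code below keeps FP32 by passing false. A short sketch of the updated call sites, assuming t1 and t2 are valid Tensor pointers and reusing the device::fadd_ptrptr symbol already used in img_tensor_utils.cpp:

    void *sum32 = tensorMap2(device::fadd_ptrptr, t1, t2, /*fp16=*/false);
    void *sum16 = tensorMap2(device::fadd_ptrptr, t1, t2, /*fp16=*/true);
    void *rowsum16 =
        tensorReduce(t1, /*axis=*/3, device::fadd_ptrptr, /*fp16=*/true);
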
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
index 6d909a1736482efcadb0797c1b47475c0084ce3c..adcce5ee28db75077c6285a619a9ef74d8b53b97 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
@@ -240,12 +240,14 @@ std::vector<float> PSNR(void *gold_ptr, void *approx_ptr) {
       (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 1, 1, 1);
   std::memcpy(image_size_tensor->host_data, &image_size_f, sizeof(float));
 
-  auto *diff = tensorMap2(device::fsub_ptrptr, gold_tensor, approx_tensor);
-  auto *diffsqr = tensorMap2(device::fmul_ptrptr, diff, diff);
-  auto *mse_sum_1d = tensorReduce(diffsqr, 3, device::fadd_ptrptr);
-  auto *mse_sum = tensorReduce(mse_sum_1d, 2, device::fadd_ptrptr);
-  auto *mse_avg = tensorMap2(device::fdiv_ptrptr, mse_sum, image_size_tensor);
-  auto *psnr_val = (Tensor *)tensorMap1(psnr_ptrptr, mse_avg);
+  auto *diff = tensorMap2(
+      device::fsub_ptrptr, gold_tensor, approx_tensor, false);
+  auto *diffsqr = tensorMap2(device::fmul_ptrptr, diff, diff, false);
+  auto *mse_sum_1d = tensorReduce(diffsqr, 3, device::fadd_ptrptr, false);
+  auto *mse_sum = tensorReduce(mse_sum_1d, 2, device::fadd_ptrptr, false);
+  auto *mse_avg = tensorMap2(
+      device::fdiv_ptrptr, mse_sum, image_size_tensor, false);
+  auto *psnr_val = (Tensor *)tensorMap1(psnr_ptrptr, mse_avg, false);
 
   auto *float_data = (float *)psnr_val->host_data;
   return std::vector<float>(float_data, float_data + batch_dim);