Commit 18bc775b authored by Yifan Zhao

Added util functions

parent ad0fbba6
#include <device_launch_parameters.h>
namespace device {
__device__ float atan2(float y, float x);
__device__ float sqrt(float x);
__device__ float hypot(float x, float y);
__device__ float add(float x, float y);
__device__ float sub(float x, float y);
__device__ float mul(float x, float y);
__device__ float avg3(float x);
extern void *fhypot_ptrptr;
extern void *atan2_ptrptr;
extern void *fadd_ptrptr;
extern void *fsub_ptrptr;
extern void *fmul_ptrptr;
extern void *fdiv_ptrptr;
extern void *sqrt_ptrptr;
extern void *fmax_ptrptr;
extern void *fmin_ptrptr;
extern void *favg3_ptrptr;
extern void *hhypot_ptrptr;
extern void *hadd_ptrptr;
extern void *hmax_ptrptr;
extern void *hdiv_ptrptr;
} // namespace device
#define DEF_FUNC(func) \
__device__ void *func##_ptr = (void *)func; \
void *func##_ptrptr = (void *)&func##_ptr;
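DEF_FUNC automates the handle pattern spelled out by hand at the bottom of device_math.cu: a __device__ pointer capturing the function's device address, plus a host-visible pointer to that symbol. A sketch of registering a hypothetical new op (`triple` is not part of this commit; the one real use below is DEF_FUNC(psnr)):

__device__ float triple(float x) { return 3.0f * x; } // hypothetical op
DEF_FUNC(triple) // defines triple_ptr (__device__) and triple_ptrptr (host)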
@@ -2,6 +2,8 @@
#define IMG_TENSOR_RUNTIME_H
#include <cstddef>
#include "img_tensor_utils.h"
#include "device_math.h"
// *** Runtime declaration *** //
void *tensorFft(void *input);
......
#ifndef IMG_TENSOR_UTILS
#define IMG_TENSOR_UTILS
#include <string>
#include <vector>
#include "tensor.h"
const size_t N_RGB_CHAN = 3;
// Loader constructor
void *loadAsImage(const char *filename, size_t n_color = N_RGB_CHAN);
void saveToImage(const char *filename, Tensor *tensor);
Tensor *readDataSet(const char *path, size_t n_color = N_RGB_CHAN);
void saveDataSet(const char *path, const char *prefix, Tensor *batch);
// Kernel constructor
void *createFilterFromData(int data_type, void *data, size_t w, size_t h, size_t n_chan);
std::vector<float> PSNR(void *gold_ptr, void *approx_ptr);
#endif
#include "device_math.h"
#include <cmath>
#include <cstdint>
#include <cuda_fp16.h>
#include <limits>
using uint_t = uint32_t;
const float HALF_PI = M_PI / 2;
const float EPS = std::numeric_limits<float>::epsilon();
const float NAN_ = std::numeric_limits<float>::quiet_NaN();
const float POS_INF = std::numeric_limits<float>::infinity();
#define SQRT_MAX_ITER 100
namespace _internal {
// Series
__device__ float
atan_series_order_calc(float x, float x_pow, const uint_t order) {
float s1m4 = (order - 1) * 4;
float p1 = (s1m4 - 1) * x_pow, p2 = (s1m4 + 1) * x_pow * x;
  return __fdividef(1.0f, p1) - __fdividef(1.0f, p2);
}
__device__ float atan_series_order(
float x, float x_pow, const uint_t order, const uint_t max_order) {
if (order == 1) {
    float rec_x = __fdividef(1.0f, x);
    // x * x * x avoids promoting to double-precision pow() in device code
    return HALF_PI - rec_x +
           atan_series_order(x * x, x * x * x, order + 1, max_order);
}
// NOTE: x changes to x*x for order > 1
if (order < max_order) {
return atan_series_order_calc(x, x_pow, order) +
atan_series_order(x, x_pow * x * x, order + 1, max_order);
} else {
// order == max_order
return atan_series_order_calc(x, x_pow, order);
}
}
__device__ float atan_series_main(float x) {
  return (
      x < 3.0f    ? atan_series_order(x, x, 1U, 10U) : // O(1/x^39)
      x < 4.0f    ? atan_series_order(x, x, 1U, 9U)  : // O(1/x^35)
      x < 5.0f    ? atan_series_order(x, x, 1U, 8U)  : // O(1/x^31)
      x < 7.0f    ? atan_series_order(x, x, 1U, 7U)  : // O(1/x^27)
      x < 11.0f   ? atan_series_order(x, x, 1U, 6U)  : // O(1/x^23)
      x < 25.0f   ? atan_series_order(x, x, 1U, 5U)  : // O(1/x^19)
      x < 100.0f  ? atan_series_order(x, x, 1U, 4U)  : // O(1/x^15)
      x < 1000.0f ? atan_series_order(x, x, 1U, 3U)  : // O(1/x^11)
                    atan_series_order(x, x, 1U, 2U));  // O(1/x^7)
}
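atan_series_main evaluates the large-argument asymptotic expansion; order 1 contributes $\pi/2 - 1/x$ and every later order one $\pm$ pair of terms, which is why max_order shrinks as x grows:

$$\arctan x = \frac{\pi}{2} - \frac{1}{x} + \frac{1}{3x^{3}} - \frac{1}{5x^{5}} + \frac{1}{7x^{7}} - \cdots \qquad (x > 0)$$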
// CF
__device__ float
atan_cf_recur(float xx, const uint_t depth, const uint_t max_depth) {
if (depth < max_depth) {
float next_cf = atan_cf_recur(xx, depth + 1, max_depth);
return float(2 * depth - 1) + __fdividef(depth * depth * xx, next_cf);
} else
return float(2 * depth - 1);
}
__device__ float atan_cf_main(float x) {
if (x < 0.5f)
return __fdividef(x, atan_cf_recur(x * x, 1U, 15U));
if (x < 1.0f)
return __fdividef(x, atan_cf_recur(x * x, 1U, 25U));
if (x < 1.5f)
return __fdividef(x, atan_cf_recur(x * x, 1U, 35U));
if (x < 2.0f)
return __fdividef(x, atan_cf_recur(x * x, 1U, 45U));
return __fdividef(x, atan_cf_recur(x * x, 1U, 52U));
}
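atan_cf_main evaluates Gauss's continued fraction for arctan, truncated deeper as x approaches the series cutoff at 2.5:

$$\arctan x = \cfrac{x}{1 + \cfrac{1^{2} x^{2}}{3 + \cfrac{2^{2} x^{2}}{5 + \cfrac{3^{2} x^{2}}{7 + \cdots}}}}$$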
__device__ float atan_begin(float x) {
return (x > 2.5f ? atan_series_main(x) : atan_cf_main(x));
}
__device__ float atan(float x) {
if (std::isnan(x))
return NAN_;
// indistinguishable from zero
if (EPS > abs(x))
return 0.0f;
// negative or positive
return x < 0.0f ? -atan_begin(-x) : atan_begin(x);
}
__device__ float sqrt_recur(float x, float xn, int count) {
float x_xn = __fdividef(x, xn);
float comp_eps = __fdividef(abs(xn - x_xn), (1.0f + xn));
if (comp_eps < EPS)
return xn;
if (count < SQRT_MAX_ITER)
return sqrt_recur(x, 0.5f * (xn + x_xn), count + 1);
else
return xn;
}
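sqrt_recur is the Babylonian (Newton) iteration for $\sqrt{x}$, seeded at $x/2$ and stopped once the relative step falls below machine epsilon or SQRT_MAX_ITER is reached:

$$x_{n+1} = \tfrac{1}{2}\left(x_n + \frac{x}{x_n}\right), \qquad x_0 = \frac{x}{2}$$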
__device__ float sqrt_check(float x, float m_val) {
if (std::isnan(x))
return NAN_;
if (x < 0.0f)
return NAN_;
if (x == POS_INF)
return x;
// indistinguishable from zero or one
if (EPS > abs(x))
return 0;
if (EPS > abs(x - 1.0f))
return x;
if (x > 4.0f)
return sqrt_check(__fdividef(x, 4.0f), 2.0f * m_val);
else
return m_val * sqrt_recur(x, __fdividef(x, 2.0f), 0);
}
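sqrt_check applies the range reduction $\sqrt{x} = 2\sqrt{x/4}$ until the argument lands in $(0, 4]$, so the Newton iteration always starts from a well-conditioned seed.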
} // namespace _internal
__device__ float device::atan2(float y, float x) {
if (std::isnan(y) || std::isnan(x))
return NAN_;
  if (std::abs(x) < EPS) {
    if (std::abs(y) < EPS) {
      // `y == -0.0f` also matches +0.0f; use signbit to tell signed zeros apart
      if (std::signbit(y))
        return std::signbit(x) ? -M_PI : -0.0f;
      else
        return std::signbit(x) ? M_PI : 0.0f;
    }
    // x ~ 0 with y != 0: the limit is +/- pi/2, not +/- pi
    return y > 0.0f ? HALF_PI : -HALF_PI;
  }
float y_over_x = __fdividef(y, x);
if (x < 0.0f)
return y < 0.0f ? _internal::atan(y_over_x) - M_PI
: _internal::atan(y_over_x) + M_PI;
return _internal::atan(y_over_x);
}
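A minimal spot-check kernel for the hand-rolled atan2 (a sketch, not part of this commit; it assumes device_math.h is on the include path and compares against CUDA's built-in atan2f):

__global__ void
checkAtan2(const float *ys, const float *xs, float *err, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) // absolute error vs. the CUDA math library reference
    err[i] = fabsf(device::atan2(ys[i], xs[i]) - atan2f(ys[i], xs[i]));
}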
__device__ float device::sqrt(float x) {
return _internal::sqrt_check(x, 1.0f);
}
__device__ float device::hypot(float x, float y) {
  // Naive form; x * x + y * y can overflow for large |x| or |y|
  return device::sqrt(x * x + y * y);
}
__device__ float device::add(float x, float y) { return x + y; }
__device__ float device::sub(float x, float y) { return x - y; }
__device__ float device::mul(float x, float y) { return x * y; }
__device__ float device::avg3(float x) { return __fdividef(x, 3.0f); }
__device__ half hhypot(half x, half y) { return hsqrt(x * x + y * y); }
__device__ half hmax(half x, half y) { return x >= y ? x : y; }
__device__ void *hypot_ptr = (void *)device::hypot;
__device__ void *atan2_ptr = (void *)device::atan2;
__device__ void *fadd_ptr = (void *)device::add;
__device__ void *fsub_ptr = (void *)device::sub;
__device__ void *fmul_ptr = (void *)device::mul;
__device__ void *fdiv_ptr = (void *)__fdividef;
__device__ void *sqrt_ptr = (void *)device::sqrt;
__device__ void *fmax_ptr = (void *)(float (*)(float, float))fmax;
__device__ void *fmin_ptr = (void *)(float (*)(float, float))fmin;
__device__ void *favg3_ptr = (void *)device::avg3;
__device__ void *hhypot_ptr = (void *)hhypot;
__device__ void *hadd_ptr = (void *)(half(*)(half, half))__hadd;
__device__ void *hmax_ptr = (void *)(half(*)(half, half))hmax;
__device__ void *hdiv_ptr = (void *)__hdiv;
namespace device {
void *fhypot_ptrptr = (void *)&hypot_ptr;
void *atan2_ptrptr = (void *)&atan2_ptr;
void *fadd_ptrptr = (void *)&fadd_ptr;
void *fsub_ptrptr = (void *)&fsub_ptr;
void *fmul_ptrptr = (void *)&fmul_ptr;
void *fdiv_ptrptr = (void *)&fdiv_ptr;
void *sqrt_ptrptr = (void *)&sqrt_ptr;
void *fmax_ptrptr = (void *)&fmax_ptr;
void *fmin_ptrptr = (void *)&fmin_ptr;
void *favg3_ptrptr = (void *)&favg3_ptr;
void *hhypot_ptrptr = (void *)&hhypot_ptr;
void *hadd_ptrptr = (void *)&hadd_ptr;
void *hmax_ptrptr = (void *)&hmax_ptr;
void *hdiv_ptrptr = (void *)&hdiv_ptr;
} // namespace device
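How the runtime consumes these handles is an assumption, but the layout suggests the usual symbol-copy pattern: each *_ptrptr is the host-side address of a __device__ void* holding a device function pointer, which cudaMemcpyFromSymbol can read back and later hand to a map/reduce kernel. A sketch (resolveDeviceFunc is hypothetical; error checking omitted; requires <cuda_runtime.h>):

static void *resolveDeviceFunc(void *ptrptr) {
  void *fn = nullptr;
  // ptrptr is the address of the __device__ symbol; copy its value to the host
  cudaMemcpyFromSymbol(&fn, ptrptr, sizeof(void *));
  return fn; // only meaningful when called from device code
}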
@@ -11,6 +11,8 @@
// FIXME: really just a hack to compile into a single .o
#include "common.cpp"
#include "debug.cpp"
#include "img_tensor_utils.cpp"
#include "device_math.cu"
// *** Runtime implementation *** //
void *tensorFft(void *input) {
@@ -79,11 +81,14 @@ void *tensorReductionSamplingReduce(
return reduceDim<float>(src, 0.0f, func, axis, 0.2f);
case 2:
return reduceDim<float>(src, 0.0f, func, axis, 0.4f);
default:
throw std::runtime_error("Skip level not understood");
}
}
void *tensorProjectiveT(void *input, void *transformation) {
ERROR("ProjectiveT operation currently unsupported.\n");
abort();
}
void *tensorMap1(void *f, void *i) {
......
#include <cstring>
#include <dirent.h>
#include <string>
#include <sys/stat.h>
#include "debug.h"
#include "img_tensor_utils.h"
#include "img_tensor_runtime.h"
#include "device_math.h"
// Image I/O utilities
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "image/stb_image.h"
#include "image/stb_image_write.h"
static inline bool isRegFile(const char *path, dirent *dp) {
if (dp->d_type == DT_REG)
return true;
if (dp->d_type != DT_UNKNOWN)
return false;
struct stat sb {};
if (lstat(path, &sb) == -1) {
INFO("lstat failed for file %s", path);
return false;
}
mode_t type = sb.st_mode & S_IFMT;
return type == S_IFREG;
}
static inline std::string sample_file(const char *path) {
  auto dirp = opendir(path);
  dirent *dp = nullptr;
  while ((dp = readdir(dirp)) != nullptr) {
    std::string filename = std::string(path) + "/" + dp->d_name;
    if (isRegFile(filename.c_str(), dp)) {
      (void)closedir(dirp); // don't leak the directory handle on early return
      return filename;
    }
  }
  (void)closedir(dirp);
  return "";
}
static inline size_t count_file(const char *path) {
auto dirp = opendir(path);
size_t count = 0;
dirent *dp = nullptr;
while ((dp = readdir(dirp)) != nullptr) {
std::string filename = std::string(path) + "/" + dp->d_name;
if (isRegFile(filename.c_str(), dp))
count++;
}
(void)closedir(dirp);
return count;
}
static inline uint8_t *float_to_uint8(const float *fl, size_t count) {
  auto *ret = new uint8_t[count];
  float max_v = 0;
  for (size_t i = 0; i < count; i++)
    max_v = std::max(max_v, fl[i]);
  if (max_v == 0.0f)
    max_v = 1.0f; // all-zero image; avoid dividing by zero below
  for (size_t i = 0; i < count; i++)
    ret[i] = uint8_t(fl[i] * 255 / max_v);
  return ret;
}
static inline float *uint8_to_float(const uint8_t *ui, size_t len) {
auto *ret = new float[len];
for (size_t i = 0; i < len; i++)
ret[i] = float(ui[i]) / 255;
return ret;
}
static Tensor *to_nhwc(Tensor *t) {
if (t->data_format == CUDNN_TENSOR_NHWC) {
DEBUG("Tensor already in NHWC format, no conversion needed");
return t;
} else if (t->data_format != CUDNN_TENSOR_NCHW) {
throw std::runtime_error(
"Unknown tensor format: " + std::to_string(t->data_format));
} else {
DEBUG("Converting to NHWC format");
}
size_t *dim_arr = t->dims.dim_sizes;
size_t n = dim_arr[0], c = dim_arr[1], h = dim_arr[2], w = dim_arr[3];
auto *out_tensor =
(Tensor *)create4DTensor(t->data_type, CUDNN_TENSOR_NHWC, n, c, h, w);
size_t nhwc_offset = 0;
size_t element_size = getTypeSize(t->data_type);
char *out_data = (char *)(out_tensor->host_data),
*in_data = (char *)(t->host_data);
for (int n0 = 0; n0 < n; n0++)
for (int h0 = 0; h0 < h; h0++)
for (int w0 = 0; w0 < w; w0++)
for (int c0 = 0; c0 < c; c0++) {
size_t nc = n0 * c + c0, nch = nc * h + h0, nchw_idx = nch * w + w0,
nchw_offset = nchw_idx * element_size;
std::memcpy(
out_data + nhwc_offset, in_data + nchw_offset, element_size);
nhwc_offset += element_size;
}
return out_tensor;
}
static Tensor *to_nchw(Tensor *t) {
if (t->data_format == CUDNN_TENSOR_NCHW) {
DEBUG("Tensor already in NCHW format, no conversion needed");
return t;
} else if (t->data_format != CUDNN_TENSOR_NHWC) {
throw std::runtime_error(
"Unknown tensor format: " + std::to_string(t->data_format));
} else {
DEBUG("Converting to NCHW format");
}
size_t *dim_arr = t->dims.dim_sizes;
size_t n = dim_arr[0], h = dim_arr[1], w = dim_arr[2], c = dim_arr[3];
Tensor *out_tensor =
(Tensor *)create4DTensor(t->data_type, CUDNN_TENSOR_NCHW, n, c, h, w);
size_t nchw_offset = 0;
size_t element_size = getTypeSize(t->data_type);
char *out_data = (char *)(out_tensor->host_data),
*in_data = (char *)(t->host_data);
for (int n0 = 0; n0 < n; n0++)
for (int c0 = 0; c0 < c; c0++)
for (int h0 = 0; h0 < h; h0++)
for (int w0 = 0; w0 < w; w0++) {
size_t nh = n0 * h + h0, nhw = nh * w + w0, nhwc_idx = nhw * c + c0,
nhwc_offset = nhwc_idx * element_size;
std::memcpy(
out_data + nchw_offset, in_data + nhwc_offset, element_size);
nchw_offset += element_size;
}
return out_tensor;
}
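Both converters rely on the standard row-major linearization; to_nhwc walks the output in NHWC order and gathers from the NCHW index, and to_nchw does the inverse:

$$\text{idx}_{\text{NCHW}} = ((n C + c) H + h) W + w, \qquad \text{idx}_{\text{NHWC}} = ((n H + h) W + w) C + c$$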
Tensor *readDataSet(const char *path, size_t n_color) {
INFO("Loading image dataset from path %s", path);
auto *first_image = (Tensor *)loadAsImage(sample_file(path).c_str(), n_color);
std::vector<size_t> sizes = ::sizes(first_image);
delete first_image;
size_t h = sizes[2], w = sizes[3];
size_t count = count_file(path);
DEBUG("Counted %d images in path.", count);
auto *batch = (Tensor *)create4DTensor(
CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC, count, n_color, h, w);
size_t n_floats = n_color * h * w;
auto *base_data = (float *)batch->host_data;
auto dirp = opendir(path);
dirent *dp = nullptr;
  while ((dp = readdir(dirp)) != nullptr) {
    std::string entry_path = std::string(path) + "/" + dp->d_name;
    // Use the same regular-file test as count_file; checking d_type alone
    // breaks on filesystems that report DT_UNKNOWN
    if (!isRegFile(entry_path.c_str(), dp))
      continue;
    int x, y, n; // x = width, y = height, n = # 8-bit components per pixel
    uint8_t *data = stbi_load(entry_path.c_str(), &x, &y, &n, (int)n_color);
if (data == nullptr)
throw std::runtime_error("Image load failed");
float *converted = uint8_to_float(data, n_floats);
stbi_image_free(data);
memcpy(base_data, converted, n_floats * sizeof(float));
delete[] converted;
base_data += n_floats;
}
(void)closedir(dirp);
auto *nchw_batch = to_nchw(batch);
delete batch;
DEBUG("Loaded all images.");
return nchw_batch;
}
void saveDataSet(
const char *path, const char *prefix, Tensor *batch) {
INFO("Saving image dataset to path %s", path);
DEBUG("Copying to CPU before printing");
deviceToHostCopy(batch);
Tensor *converted_batch = batch;
if (batch->data_format == CUDNN_TENSOR_NCHW) {
DEBUG("Copy-converting to NHWC format");
converted_batch = to_nhwc(batch);
}
std::vector<size_t> sizes = ::sizes(converted_batch);
size_t h = sizes[1], w = sizes[2], c = sizes[3];
auto *base_data = (float *)converted_batch->host_data; // read the NHWC copy, not the raw batch
for (size_t i = 0; i < sizes[0]; i++) {
std::string name = path;
name += "/";
name += prefix;
name += std::to_string(i);
name += ".png";
uint8_t *ldr_data = float_to_uint8(base_data, h * w * c);
stbi_write_png(name.c_str(), w, h, c, ldr_data, 0);
delete[] ldr_data;
base_data += h * w * c;
}
if (batch != converted_batch) {
delete converted_batch;
}
}
void *loadAsImage(const char *filename, size_t n_color) {
  INFO("Loading image from path=%s", filename);
  int x, y, n; // x = width, y = height, n = # 8-bit components per pixel
  uint8_t *data = stbi_load(filename, &x, &y, &n, (int)n_color);
  if (data == nullptr)
    throw std::runtime_error("Image load failed");
  // stbi_load returns n_color channels when a count is requested;
  // n is the channel count of the file itself, so size by n_color
  float *converted = uint8_to_float(data, x * y * n_color);
  auto *image = (Tensor *)create4DTensor(
      CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC, 1, n_color, y, x);
  memcpy(image->host_data, converted, x * y * n_color * sizeof(float));
  delete[] converted;
  auto *nchw_image = to_nchw(image);
  stbi_image_free(data);
  delete image;
  return nchw_image;
}
void saveToImage(const char *filename, Tensor *tensor) {
INFO("Saving image data to path=%s", filename);
deviceToHostCopy(tensor);
Tensor *converted_tensor = tensor;
if (tensor->data_format == CUDNN_TENSOR_NCHW) {
DEBUG("Copy-converting to NHWC format");
converted_tensor = to_nhwc(tensor);
}
auto *hdr_data = (float *)converted_tensor->host_data;
size_t *dims = converted_tensor->dims.dim_sizes;
size_t w = dims[2], h = dims[1], c = dims[3];
uint8_t *ldr = float_to_uint8(hdr_data, w * h * c);
stbi_write_png(filename, w, h, c, ldr, 0);
delete[] ldr;
if (tensor != converted_tensor) {
delete converted_tensor;
}
}
void *createFilterFromData(
int data_type, void *data, size_t w, size_t h, size_t n_chan) {
DEBUG("Creating filter from data");
auto *tensor =
(Tensor *)create4DTensor(data_type, CUDNN_TENSOR_NCHW, 1, n_chan, h, w);
char *tensor_data;
if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_FLOAT)
tensor_data = (char *)tensor->host_data;
else {
throw std::runtime_error("Data type unsupported as filter");
}
size_t channel_sz = tensor->size_in_bytes / n_chan;
for (size_t i = 0; i < n_chan; i++, tensor_data += channel_sz) {
std::memcpy(tensor_data, data, channel_sz);
}
return tensor;
}
__device__ float psnr(float x) {
return -10 * log10(x);
}
DEF_FUNC(psnr)
std::vector<float> PSNR(void *gold_ptr, void *approx_ptr) {
auto *gold_tensor = (Tensor *) gold_ptr, *approx_tensor = (Tensor *) approx_ptr;
size_t *dim_sizes = gold_tensor->dims.dim_sizes;
size_t batch_dim = dim_sizes[0];
size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
float image_size_f = image_size;
DEBUG("batch_dim = %lu, image_size = %lu", batch_dim, image_size);
auto *image_size_tensor = (Tensor *)create4DTensor(
CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 1, 1, 1
);
memcpy(image_size_tensor->host_data, &image_size_f, sizeof(float));
auto *diff = tensorMap2(device::fsub_ptrptr, gold_tensor, approx_tensor);
auto *diffsqr = tensorMap2(device::fmul_ptrptr, diff, diff);
auto *mse_sum_1d = tensorReduce(diffsqr, 3, device::fadd_ptrptr);
auto *mse_sum = tensorReduce(mse_sum_1d, 2, device::fadd_ptrptr);
auto *mse_avg = tensorMap2(device::fdiv_ptrptr, mse_sum, image_size_tensor);
auto *psnr_val = (Tensor *)tensorMap1(psnr_ptrptr, mse_avg);
deviceToHostCopy(psnr_val); // make results host-visible before reading
auto *float_data = (float *)psnr_val->host_data;
return std::vector<float>(float_data, float_data + batch_dim);
}
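For reference, the same per-image figure computed on the host; a plain C++ cross-check of the GPU pipeline above, assuming images normalized to [0, 1] (so the peak value is 1 and PSNR = -10 * log10(MSE), matching the device psnr above):

#include <cmath>
#include <cstddef>

// Host-side PSNR of one gold/approx image pair of n floats.
static float psnrHost(const float *gold, const float *approx, size_t n) {
  float mse = 0.0f;
  for (size_t i = 0; i < n; i++) {
    float d = gold[i] - approx[i];
    mse += d * d;
  }
  mse /= (float)n;
  return -10.0f * std::log10(mse);
}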