Commit 18bc775b authored by Yifan Zhao

Added util functions

parent ad0fbba6
#include <device_launch_parameters.h>
namespace device {
__device__ float atan2(float y, float x);
__device__ float sqrt(float x);
__device__ float hypot(float x, float y);
__device__ float add(float x, float y);
__device__ float sub(float x, float y);
__device__ float mul(float x, float y);
__device__ float avg3(float x);
extern void *fhypot_ptrptr;
extern void *atan2_ptrptr;
extern void *fadd_ptrptr;
extern void *fsub_ptrptr;
extern void *fmul_ptrptr;
extern void *fdiv_ptrptr;
extern void *sqrt_ptrptr;
extern void *fmax_ptrptr;
extern void *fmin_ptrptr;
extern void *favg3_ptrptr;
extern void *hhypot_ptrptr;
extern void *hadd_ptrptr;
extern void *hmax_ptrptr;
extern void *hdiv_ptrptr;
} // namespace device
#define DEF_FUNC(func) \
__device__ void *func##_ptr = (void *)func; \
void *func##_ptrptr = (void *)&func##_ptr;
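DEF_FUNC automates the handle pattern spelled out by hand at the bottom of device_math.cu: a __device__ pointer capturing the function's device address, plus a host-visible pointer to that symbol. A sketch of registering a hypothetical new op (`triple` is not part of this commit; the one real use below is DEF_FUNC(psnr)):

__device__ float triple(float x) { return 3.0f * x; } // hypothetical op
DEF_FUNC(triple) // defines triple_ptr (__device__) and triple_ptrptr (host)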
@@ -2,6 +2,8 @@
#define IMG_TENSOR_RUNTIME_H
#include <cstddef>
#include "img_tensor_utils.h"
#include "device_math.h"
// *** Runtime declaration *** //
void *tensorFft(void *input);
......
#ifndef IMG_TENSOR_UTILS
#define IMG_TENSOR_UTILS
#include <string>
#include <vector>
#include "tensor.h"
const size_t N_RGB_CHAN = 3;
// Loader constructor
void *loadAsImage(const char *filename, size_t n_color = N_RGB_CHAN);
void saveToImage(const char *filename, Tensor *tensor);
Tensor *readDataSet(const char *path, size_t n_color = N_RGB_CHAN);
void saveDataSet(const char *path, const char *prefix, Tensor *batch);
// Kernel constructor
void *createFilterFromData(int data_type, void *data, size_t w, size_t h, size_t n_chan);
std::vector<float> PSNR(void *gold_ptr, void *approx_ptr);
#endif
#include "device_math.h"
#include <cmath>
#include <cstdint>
#include <cuda_fp16.h>
#include <limits>
using uint_t = uint32_t;
const float HALF_PI = M_PI / 2;
const float EPS = std::numeric_limits<float>::epsilon();
const float NAN_ = std::numeric_limits<float>::quiet_NaN();
const float POS_INF = std::numeric_limits<float>::infinity();
#define SQRT_MAX_ITER 100
namespace _internal {
// Series
__device__ float
atan_series_order_calc(float x, float x_pow, const uint_t order) {
float s1m4 = (order - 1) * 4;
float p1 = (s1m4 - 1) * x_pow, p2 = (s1m4 + 1) * x_pow * x;
  return __fdividef(1.0f, p1) - __fdividef(1.0f, p2);
}
__device__ float atan_series_order(
float x, float x_pow, const uint_t order, const uint_t max_order) {
if (order == 1) {
    float rec_x = __fdividef(1.0f, x);
    // x * x * x avoids promoting to double-precision pow() in device code
    return HALF_PI - rec_x +
           atan_series_order(x * x, x * x * x, order + 1, max_order);
}
// NOTE: x changes to x*x for order > 1
if (order < max_order) {
return atan_series_order_calc(x, x_pow, order) +
atan_series_order(x, x_pow * x * x, order + 1, max_order);
} else {
// order == max_order
return atan_series_order_calc(x, x_pow, order);
}
}
__device__ float atan_series_main(float x) {
  return (
      x < 3.0f    ? atan_series_order(x, x, 1U, 10U) : // O(1/x^39)
      x < 4.0f    ? atan_series_order(x, x, 1U, 9U)  : // O(1/x^35)
      x < 5.0f    ? atan_series_order(x, x, 1U, 8U)  : // O(1/x^31)
      x < 7.0f    ? atan_series_order(x, x, 1U, 7U)  : // O(1/x^27)
      x < 11.0f   ? atan_series_order(x, x, 1U, 6U)  : // O(1/x^23)
      x < 25.0f   ? atan_series_order(x, x, 1U, 5U)  : // O(1/x^19)
      x < 100.0f  ? atan_series_order(x, x, 1U, 4U)  : // O(1/x^15)
      x < 1000.0f ? atan_series_order(x, x, 1U, 3U)  : // O(1/x^11)
                    atan_series_order(x, x, 1U, 2U));  // O(1/x^7)
}
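atan_series_main evaluates the large-argument asymptotic expansion; order 1 contributes $\pi/2 - 1/x$ and every later order one $\pm$ pair of terms, which is why max_order shrinks as x grows:

$$\arctan x = \frac{\pi}{2} - \frac{1}{x} + \frac{1}{3x^{3}} - \frac{1}{5x^{5}} + \frac{1}{7x^{7}} - \cdots \qquad (x > 0)$$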
// CF
__device__ float
atan_cf_recur(float xx, const uint_t depth, const uint_t max_depth) {
if (depth < max_depth) {
float next_cf = atan_cf_recur(xx, depth + 1, max_depth);
return float(2 * depth - 1) + __fdividef(depth * depth * xx, next_cf);
} else
return float(2 * depth - 1);
}
__device__ float atan_cf_main(float x) {
if (x < 0.5f)
return __fdividef(x, atan_cf_recur(x * x, 1U, 15U));
if (x < 1.0f)
return __fdividef(x, atan_cf_recur(x * x, 1U, 25U));
if (x < 1.5f)
return __fdividef(x, atan_cf_recur(x * x, 1U, 35U));
if (x < 2.0f)
return __fdividef(x, atan_cf_recur(x * x, 1U, 45U));
return __fdividef(x, atan_cf_recur(x * x, 1U, 52U));
}
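atan_cf_main evaluates Gauss's continued fraction for arctan, truncated deeper as x approaches the series cutoff at 2.5:

$$\arctan x = \cfrac{x}{1 + \cfrac{1^{2} x^{2}}{3 + \cfrac{2^{2} x^{2}}{5 + \cfrac{3^{2} x^{2}}{7 + \cdots}}}}$$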
__device__ float atan_begin(float x) {
return (x > 2.5f ? atan_series_main(x) : atan_cf_main(x));
}
__device__ float atan(float x) {
if (std::isnan(x))
return NAN_;
// indistinguishable from zero
if (EPS > abs(x))
return 0.0f;
// negative or positive
return x < 0.0f ? -atan_begin(-x) : atan_begin(x);
}
__device__ float sqrt_recur(float x, float xn, int count) {
float x_xn = __fdividef(x, xn);
float comp_eps = __fdividef(abs(xn - x_xn), (1.0f + xn));
if (comp_eps < EPS)
return xn;
if (count < SQRT_MAX_ITER)
return sqrt_recur(x, 0.5f * (xn + x_xn), count + 1);
else
return xn;
}
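sqrt_recur is the Babylonian (Newton) iteration for $\sqrt{x}$, seeded at $x/2$ and stopped once the relative step falls below machine epsilon or SQRT_MAX_ITER is reached:

$$x_{n+1} = \tfrac{1}{2}\left(x_n + \frac{x}{x_n}\right), \qquad x_0 = \frac{x}{2}$$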
__device__ float sqrt_check(float x, float m_val) {
if (std::isnan(x))
return NAN_;
if (x < 0.0f)
return NAN_;
if (x == POS_INF)
return x;
// indistinguishable from zero or one
if (EPS > abs(x))
return 0;
if (EPS > abs(x - 1.0f))
return x;
if (x > 4.0f)
return sqrt_check(__fdividef(x, 4.0f), 2.0f * m_val);
else
return m_val * sqrt_recur(x, __fdividef(x, 2.0f), 0);
}
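sqrt_check applies the range reduction $\sqrt{x} = 2\sqrt{x/4}$ until the argument lands in $(0, 4]$, so the Newton iteration always starts from a well-conditioned seed.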
} // namespace _internal
__device__ float device::atan2(float y, float x) {
if (std::isnan(y) || std::isnan(x))
return NAN_;
  if (std::abs(x) < EPS) {
    if (std::abs(y) < EPS) {
      // `y == -0.0f` also matches +0.0f; use signbit to tell signed zeros apart
      if (std::signbit(y))
        return std::signbit(x) ? -M_PI : -0.0f;
      else
        return std::signbit(x) ? M_PI : 0.0f;
    }
    // x ~ 0 with y != 0: the limit is +/- pi/2, not +/- pi
    return y > 0.0f ? HALF_PI : -HALF_PI;
  }
float y_over_x = __fdividef(y, x);
if (x < 0.0f)
return y < 0.0f ? _internal::atan(y_over_x) - M_PI
: _internal::atan(y_over_x) + M_PI;
return _internal::atan(y_over_x);
}
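A minimal spot-check kernel for the hand-rolled atan2 (a sketch, not part of this commit; it assumes device_math.h is on the include path and compares against CUDA's built-in atan2f):

__global__ void
checkAtan2(const float *ys, const float *xs, float *err, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) // absolute error vs. the CUDA math library reference
    err[i] = fabsf(device::atan2(ys[i], xs[i]) - atan2f(ys[i], xs[i]));
}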
__device__ float device::sqrt(float x) {
return _internal::sqrt_check(x, 1.0f);
}
__device__ float device::hypot(float x, float y) {
  // Naive form; x * x + y * y can overflow for large |x| or |y|
  return device::sqrt(x * x + y * y);
}
__device__ float device::add(float x, float y) { return x + y; }
__device__ float device::sub(float x, float y) { return x - y; }
__device__ float device::mul(float x, float y) { return x * y; }
__device__ float device::avg3(float x) { return __fdividef(x, 3.0f); }
__device__ half hhypot(half x, half y) { return hsqrt(x * x + y * y); }
__device__ half hmax(half x, half y) { return x >= y ? x : y; }
__device__ void *hypot_ptr = (void *)device::hypot;
__device__ void *atan2_ptr = (void *)device::atan2;
__device__ void *fadd_ptr = (void *)device::add;
__device__ void *fsub_ptr = (void *)device::sub;
__device__ void *fmul_ptr = (void *)device::mul;
__device__ void *fdiv_ptr = (void *)__fdividef;
__device__ void *sqrt_ptr = (void *)device::sqrt;
__device__ void *fmax_ptr = (void *)(float (*)(float, float))fmax;
__device__ void *fmin_ptr = (void *)(float (*)(float, float))fmin;
__device__ void *favg3_ptr = (void *)device::avg3;
__device__ void *hhypot_ptr = (void *)hhypot;
__device__ void *hadd_ptr = (void *)(half(*)(half, half))__hadd;
__device__ void *hmax_ptr = (void *)(half(*)(half, half))hmax;
__device__ void *hdiv_ptr = (void *)__hdiv;
namespace device {
void *fhypot_ptrptr = (void *)&hypot_ptr;
void *atan2_ptrptr = (void *)&atan2_ptr;
void *fadd_ptrptr = (void *)&fadd_ptr;
void *fsub_ptrptr = (void *)&fsub_ptr;
void *fmul_ptrptr = (void *)&fmul_ptr;
void *fdiv_ptrptr = (void *)&fdiv_ptr;
void *sqrt_ptrptr = (void *)&sqrt_ptr;
void *fmax_ptrptr = (void *)&fmax_ptr;
void *fmin_ptrptr = (void *)&fmin_ptr;
void *favg3_ptrptr = (void *)&favg3_ptr;
void *hhypot_ptrptr = (void *)&hhypot_ptr;
void *hadd_ptrptr = (void *)&hadd_ptr;
void *hmax_ptrptr = (void *)&hmax_ptr;
void *hdiv_ptrptr = (void *)&hdiv_ptr;
} // namespace device
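How the runtime consumes these handles is an assumption, but the layout suggests the usual symbol-copy pattern: each *_ptrptr is the host-side address of a __device__ void* holding a device function pointer, which cudaMemcpyFromSymbol can read back and later hand to a map/reduce kernel. A sketch (resolveDeviceFunc is hypothetical; error checking omitted; requires <cuda_runtime.h>):

static void *resolveDeviceFunc(void *ptrptr) {
  void *fn = nullptr;
  // ptrptr is the address of the __device__ symbol; copy its value to the host
  cudaMemcpyFromSymbol(&fn, ptrptr, sizeof(void *));
  return fn; // only meaningful when called from device code
}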
@@ -11,6 +11,8 @@
// FIXME: really just a hack to compile into a single .o
#include "common.cpp"
#include "debug.cpp"
#include "img_tensor_utils.cpp"
#include "device_math.cu"
// *** Runtime implementation *** //
void *tensorFft(void *input) {
@@ -79,11 +81,14 @@ void *tensorReductionSamplingReduce(
return reduceDim<float>(src, 0.0f, func, axis, 0.2f);
case 2:
return reduceDim<float>(src, 0.0f, func, axis, 0.4f);
default:
throw std::runtime_error("Skip level not understood");
}
}
void *tensorProjectiveT(void *input, void *transformation) {
ERROR("ProjectiveT operation currently unsupported.\n");
abort();
}
void *tensorMap1(void *f, void *i) {
......
#include <cstring>
#include <dirent.h>
#include <string>
#include <sys/stat.h>
#include "debug.h"
#include "img_tensor_utils.h"
#include "img_tensor_runtime.h"
#include "device_math.h"
// Image I/O utilities
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_WRITE_IMPLEMENTATION
#include "image/stb_image.h"
#include "image/stb_image_write.h"
static inline bool isRegFile(const char *path, dirent *dp) {
if (dp->d_type == DT_REG)
return true;
if (dp->d_type != DT_UNKNOWN)
return false;
struct stat sb {};
if (lstat(path, &sb) == -1) {
INFO("lstat failed for file %s", path);
return false;
}
mode_t type = sb.st_mode & S_IFMT;
return type == S_IFREG;
}
static inline std::string sample_file(const char *path) {
  auto dirp = opendir(path);
  dirent *dp = nullptr;
  while ((dp = readdir(dirp)) != nullptr) {
    std::string filename = std::string(path) + "/" + dp->d_name;
    if (isRegFile(filename.c_str(), dp)) {
      (void)closedir(dirp); // don't leak the directory handle on early return
      return filename;
    }
  }
  (void)closedir(dirp);
  return "";
}
static inline size_t count_file(const char *path) {
auto dirp = opendir(path);
size_t count = 0;
dirent *dp = nullptr;
while ((dp = readdir(dirp)) != nullptr) {
std::string filename = std::string(path) + "/" + dp->d_name;
if (isRegFile(filename.c_str(), dp))
count++;
}
(void)closedir(dirp);
return count;
}
static inline uint8_t *float_to_uint8(const float *fl, size_t count) {
  auto *ret = new uint8_t[count];
  float max_v = 0;
  for (size_t i = 0; i < count; i++)
    max_v = std::max(max_v, fl[i]);
  if (max_v == 0.0f)
    max_v = 1.0f; // all-zero image; avoid dividing by zero below
  for (size_t i = 0; i < count; i++)
    ret[i] = uint8_t(fl[i] * 255 / max_v);
  return ret;
}
static inline float *uint8_to_float(const uint8_t *ui, size_t len) {
auto *ret = new float[len];
for (size_t i = 0; i < len; i++)
ret[i] = float(ui[i]) / 255;
return ret;
}
static Tensor *to_nhwc(Tensor *t) {
if (t->data_format == CUDNN_TENSOR_NHWC) {
DEBUG("Tensor already in NHWC format, no conversion needed");
return t;
} else if (t->data_format != CUDNN_TENSOR_NCHW) {
throw std::runtime_error(
"Unknown tensor format: " + std::to_string(t->data_format));
} else {
DEBUG("Converting to NHWC format");
}
size_t *dim_arr = t->dims.dim_sizes;
size_t n = dim_arr[0], c = dim_arr[1], h = dim_arr[2], w = dim_arr[3];
auto *out_tensor =
(Tensor *)create4DTensor(t->data_type, CUDNN_TENSOR_NHWC, n, c, h, w);
size_t nhwc_offset = 0;
size_t element_size = getTypeSize(t->data_type);
char *out_data = (char *)(out_tensor->host_data),
*in_data = (char *)(t->host_data);
for (int n0 = 0; n0 < n; n0++)
for (int h0 = 0; h0 < h; h0++)
for (int w0 = 0; w0 < w; w0++)
for (int c0 = 0; c0 < c; c0++) {
size_t nc = n0 * c + c0, nch = nc * h + h0, nchw_idx = nch * w + w0,
nchw_offset = nchw_idx * element_size;
std::memcpy(
out_data + nhwc_offset, in_data + nchw_offset, element_size);
nhwc_offset += element_size;
}
return out_tensor;
}
static Tensor *to_nchw(Tensor *t) {
if (t->data_format == CUDNN_TENSOR_NCHW) {
DEBUG("Tensor already in NCHW format, no conversion needed");
return t;
} else if (t->data_format != CUDNN_TENSOR_NHWC) {
throw std::runtime_error(
"Unknown tensor format: " + std::to_string(t->data_format));
} else {
DEBUG("Converting to NCHW format");
}
size_t *dim_arr = t->dims.dim_sizes;
size_t n = dim_arr[0], h = dim_arr[1], w = dim_arr[2], c = dim_arr[3];
Tensor *out_tensor =
(Tensor *)create4DTensor(t->data_type, CUDNN_TENSOR_NCHW, n, c, h, w);
size_t nchw_offset = 0;
size_t element_size = getTypeSize(t->data_type);
char *out_data = (char *)(out_tensor->host_data),
*in_data = (char *)(t->host_data);
for (int n0 = 0; n0 < n; n0++)
for (int c0 = 0; c0 < c; c0++)
for (int h0 = 0; h0 < h; h0++)
for (int w0 = 0; w0 < w; w0++) {
size_t nh = n0 * h + h0, nhw = nh * w + w0, nhwc_idx = nhw * c + c0,
nhwc_offset = nhwc_idx * element_size;
std::memcpy(
out_data + nchw_offset, in_data + nhwc_offset, element_size);
nchw_offset += element_size;
}
return out_tensor;
}
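Both converters rely on the standard row-major linearization; to_nhwc walks the output in NHWC order and gathers from the NCHW index, and to_nchw does the inverse:

$$\text{idx}_{\text{NCHW}} = ((n C + c) H + h) W + w, \qquad \text{idx}_{\text{NHWC}} = ((n H + h) W + w) C + c$$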
Tensor *readDataSet(const char *path, size_t n_color) {
INFO("Loading image dataset from path %s", path);
auto *first_image = (Tensor *)loadAsImage(sample_file(path).c_str(), n_color);
std::vector<size_t> sizes = ::sizes(first_image);
delete first_image;
size_t h = sizes[2], w = sizes[3];
size_t count = count_file(path);
DEBUG("Counted %d images in path.", count);
auto *batch = (Tensor *)create4DTensor(
CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC, count, n_color, h, w);
size_t n_floats = n_color * h * w;
auto *base_data = (float *)batch->host_data;
auto dirp = opendir(path);
dirent *dp = nullptr;
  while ((dp = readdir(dirp)) != nullptr) {
    std::string entry_path = std::string(path) + "/" + dp->d_name;
    // Use the same regular-file test as count_file; checking d_type alone
    // breaks on filesystems that report DT_UNKNOWN
    if (!isRegFile(entry_path.c_str(), dp))
      continue;
    int x, y, n; // x = width, y = height, n = # 8-bit components per pixel
    uint8_t *data = stbi_load(entry_path.c_str(), &x, &y, &n, (int)n_color);
if (data == nullptr)
throw std::runtime_error("Image load failed");
float *converted = uint8_to_float(data, n_floats);
stbi_image_free(data);
memcpy(base_data, converted, n_floats * sizeof(float));
delete[] converted;
base_data += n_floats;
}
(void)closedir(dirp);
auto *nchw_batch = to_nchw(batch);
delete batch;
DEBUG("Loaded all images.");
return nchw_batch;
}
void saveDataSet(
const char *path, const char *prefix, Tensor *batch) {
INFO("Saving image dataset to path %s", path);
DEBUG("Copying to CPU before printing");
deviceToHostCopy(batch);
Tensor *converted_batch = batch;
if (batch->data_format == CUDNN_TENSOR_NCHW) {
DEBUG("Copy-converting to NHWC format");
converted_batch = to_nhwc(batch);
}
std::vector<size_t> sizes = ::sizes(converted_batch);
size_t h = sizes[1], w = sizes[2], c = sizes[3];
auto *base_data = (float *)converted_batch->host_data; // read the NHWC copy, not the raw batch
for (size_t i = 0; i < sizes[0]; i++) {
std::string name = path;
name += "/";
name += prefix;
name += std::to_string(i);
name += ".png";
uint8_t *ldr_data = float_to_uint8(base_data, h * w * c);
stbi_write_png(name.c_str(), w, h, c, ldr_data, 0);
delete[] ldr_data;
base_data += h * w * c;
}
if (batch != converted_batch) {
delete converted_batch;
}
}
void *loadAsImage(const char *filename, size_t n_color) {
  INFO("Loading image from path=%s", filename);
  int x, y, n; // x = width, y = height, n = # 8-bit components per pixel
  uint8_t *data = stbi_load(filename, &x, &y, &n, (int)n_color);
  if (data == nullptr)
    throw std::runtime_error("Image load failed");
  // stbi_load returns n_color channels when a count is requested;
  // n is the channel count of the file itself, so size by n_color
  float *converted = uint8_to_float(data, x * y * n_color);
  auto *image = (Tensor *)create4DTensor(
      CUDNN_DATA_FLOAT, CUDNN_TENSOR_NHWC, 1, n_color, y, x);
  memcpy(image->host_data, converted, x * y * n_color * sizeof(float));
  delete[] converted;
  auto *nchw_image = to_nchw(image);
  stbi_image_free(data);
  delete image;
  return nchw_image;
}
void saveToImage(const char *filename, Tensor *tensor) {
INFO("Saving image data to path=%s", filename);
deviceToHostCopy(tensor);
Tensor *converted_tensor = tensor;
if (tensor->data_format == CUDNN_TENSOR_NCHW) {
DEBUG("Copy-converting to NHWC format");
converted_tensor = to_nhwc(tensor);
}
auto *hdr_data = (float *)converted_tensor->host_data;
size_t *dims = converted_tensor->dims.dim_sizes;
size_t w = dims[2], h = dims[1], c = dims[3];
uint8_t *ldr = float_to_uint8(hdr_data, w * h * c);
stbi_write_png(filename, w, h, c, ldr, 0);
delete[] ldr;
if (tensor != converted_tensor) {
delete converted_tensor;
}
}
void *createFilterFromData(
int data_type, void *data, size_t w, size_t h, size_t n_chan) {
DEBUG("Creating filter from data");
auto *tensor =
(Tensor *)create4DTensor(data_type, CUDNN_TENSOR_NCHW, 1, n_chan, h, w);
char *tensor_data;
if (data_type == CUDNN_DATA_HALF || data_type == CUDNN_DATA_FLOAT)
tensor_data = (char *)tensor->host_data;
else {
throw std::runtime_error("Data type unsupported as filter");
}
size_t channel_sz = tensor->size_in_bytes / n_chan;
for (size_t i = 0; i < n_chan; i++, tensor_data += channel_sz) {
std::memcpy(tensor_data, data, channel_sz);
}
return tensor;
}
__device__ float psnr(float x) {
return -10 * log10(x);
}
DEF_FUNC(psnr)
std::vector<float> PSNR(void *gold_ptr, void *approx_ptr) {
auto *gold_tensor = (Tensor *) gold_ptr, *approx_tensor = (Tensor *) approx_ptr;
size_t *dim_sizes = gold_tensor->dims.dim_sizes;
size_t batch_dim = dim_sizes[0];
size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3];
float image_size_f = image_size;
DEBUG("batch_dim = %lu, image_size = %lu", batch_dim, image_size);
auto *image_size_tensor = (Tensor *)create4DTensor(
CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 1, 1, 1
);
memcpy(image_size_tensor->host_data, &image_size_f, sizeof(float));
auto *diff = tensorMap2(device::fsub_ptrptr, gold_tensor, approx_tensor);
auto *diffsqr = tensorMap2(device::fmul_ptrptr, diff, diff);
auto *mse_sum_1d = tensorReduce(diffsqr, 3, device::fadd_ptrptr);
auto *mse_sum = tensorReduce(mse_sum_1d, 2, device::fadd_ptrptr);
auto *mse_avg = tensorMap2(device::fdiv_ptrptr, mse_sum, image_size_tensor);
auto *psnr_val = (Tensor *)tensorMap1(psnr_ptrptr, mse_avg);
deviceToHostCopy(psnr_val); // make results host-visible before reading
auto *float_data = (float *)psnr_val->host_data;
return std::vector<float>(float_data, float_data + batch_dim);
}
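For reference, the same per-image figure computed on the host; a plain C++ cross-check of the GPU pipeline above, assuming images normalized to [0, 1] (so the peak value is 1 and PSNR = -10 * log10(MSE), matching the device psnr above):

#include <cmath>
#include <cstddef>

// Host-side PSNR of one gold/approx image pair of n floats.
static float psnrHost(const float *gold, const float *approx, size_t n) {
  float mse = 0.0f;
  for (size_t i = 0; i < n; i++) {
    float d = gold[i] - approx[i];
    mse += d * d;
  }
  mse /= (float)n;
  return -10.0f * std::log10(mse);
}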