Commit 8d8857d7 authored by hashimsharif

Adding CPU tensor operations and approximations

parent 8febde45
@@ -22,17 +22,18 @@ extern "C"{
// NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations
// NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width)
void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size,
                     size_t dim3_size, size_t dim4_size, bool freeMemory = true);

void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes);

/********** Tensor Operation API ******/

// NOTE: For conv_mode, only value '1' is supported
void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr,
                           int vertical_pad, int horizontal_pad,
                           int vertical_stride, int horizontal_stride,
                           int conv_mode, int compute_precision,
                           int row, int col, int skip_every, int start);

void* tensorPoolingCPU(void* input,
                       int poolFunction,
...
/* This file includes the API implementation of the HPVM tensor runtime built for CPU
**
** Author: Hashim Sharif
** Email: hsharif3@illinois.edu
*/

#include <algorithm>
#include <cfloat>
@@ -15,81 +14,88 @@
#include <iostream>
#include <limits>
#include <map>
#include <cmath>
#include <memory>
#include <vector>
#include <sstream>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <math.h>
#include <bits/stdc++.h>
#include <pthread.h>
#include <omp.h>
// Tensor runtime header files
#include "tensor_cpu.h"
#include "tensor_cpu_runtime.h"
extern "C"{ void llvm_hpvm_initTensorRt(int) {
void llvm_hpvm_initTensorRt(int gpuid) {
// NOTE: Do Nothing // NOTE: Do Nothing
} }
void llvm_hpvm_cleanupTensorRt() { void llvm_hpvm_cleanupTensorRt() {
// NOTE: Do Nothing // NOTE: Do Nothing
} }
void hpvm_request_tensor(void *tensor, int destination) { void hpvm_request_tensor(void *tensor, int destination) {
// NOTE: Do Nothing // NOTE: Do Nothing
} }
std::vector<void *> PtrVect;

void freeBatchMemory() {
  for(auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
    free(*it);
  }
  PtrVect.erase(PtrVect.begin(), PtrVect.end());
}

inline int getTypeSize(int data_type) {
  return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
}
void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline));
inline void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) {
  int type_size = getTypeSize(data_type);
  size_t size_in_bytes = type_size * num_elems;
  tensor->size_in_bytes = size_in_bytes;
}
void allocateMemCPU(struct Tensor *tensor, int data_type,
                    size_t num_elems, bool freeMemory = true) __attribute__((always_inline));
inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, bool freeMemory) {
  setSizeInBytes(tensor, data_type, num_elems);
  tensor->data_type = data_type;
  tensor->num_elems = num_elems;
  tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
  if(freeMemory)
    PtrVect.push_back(tensor->host_data);
}
void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
  Tensor *tensor = (Tensor *)tensor_ptr;
  if (tensor->size_in_bytes != size_in_bytes) {
    printf("The destination and source sizes don't match");
  }
  memcpy(tensor->host_data, data_ptr, size_in_bytes); // Is this efficient enough?
}
//void *create4DTensor(int data_type, int data_format, size_t dim1_size,
//                     size_t dim2_size, size_t dim3_size, size_t dim4_size,
//                     bool freeMemory = true) __attribute__((always_inline));
inline void *create4DTensor(int data_type, int data_format, size_t dim1_size,
                            size_t dim2_size, size_t dim3_size,
                            size_t dim4_size, bool freeMemory) {
  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;

  if(freeMemory)
    PtrVect.push_back(tensor);

  allocateMemCPU(tensor, data_type, num_elems, freeMemory);

  // Setting the tensor dimensions
  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
  dim_sizes[0] = dim1_size;
@@ -98,368 +104,872 @@ extern "C"{
  dim_sizes[3] = dim4_size;
  tensor->dims.dim_sizes = dim_sizes;
  tensor->dims.num_dims = 4;

  return tensor;
}
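
// Baseline (exact) convolution: unrolls each input patch into a temporary
// im2col buffer (host_data), then multiplies the patches against the filters
// as a per-batch matrix multiply.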
void* tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
                                  int horizontal_pad, int vertical_stride,
                                  int horizontal_stride, int conv_mode,
                                  int compute_precision) {
  Tensor *input = (Tensor *)input_ptr;
  Tensor *filter = (Tensor *)filter_ptr;

  float * __restrict__ host_image = (float *)input->host_data;
  float * __restrict__ host_filter = (float *)filter->host_data;

  int batch_size = input->dims.dim_sizes[0];
  int channels = input->dims.dim_sizes[1];
  int image_height = input->dims.dim_sizes[2];
  int image_width = input->dims.dim_sizes[3];
  int num_filters = filter->dims.dim_sizes[0];
  int kernel_height = filter->dims.dim_sizes[2];
  int kernel_width = filter->dims.dim_sizes[3];
  int output_height =
      1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
int output_width =
1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
int num_filter_elem = kernel_height * kernel_width * channels;
int output_size = output_width * output_height;
Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters,
output_height, output_width);
float * __restrict__ output_data = (float *)output->host_data;
long int conv_data_size =
sizeof(float) * num_filter_elem * output_height * output_width * batch_size;
float *host_data = (float *) malloc(conv_data_size);
omp_set_num_threads(4);
#pragma omp parallel for
for(int b = 0; b < batch_size; b++) {
for(int ch = 0; ch < channels; ch++) {
for(int h = 0; h < output_height; h++) {
for(int w = 0; w < output_width; w++) {
const int inH = h * vertical_stride - vertical_pad;
const int inW = w * horizontal_stride - horizontal_pad;
for(int i = 0; i < kernel_height; i++) {
for(int j = 0; j < kernel_width; j++) {
const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j;
const int output_index = h * output_width + w;
const int out_index = b * num_filter_elem * output_size
+ output_index * num_filter_elem + filter_elem_num;
if(inH + i >= 0 && inH + i < image_height
&& inW + j >= 0 && inW + j < image_width) {
host_data[out_index] =
host_image[((b * channels + ch) * image_height
+ (inH + i)) * image_width + (inW + j)];
} else {
host_data[out_index] = 0;
}
}
}
}
}
}
// Tensor Multiply
for (int p = 0; p < num_filters; ++p) {
for (int m = 0; m < output_size; ++m) {
float sum = 0;
#pragma omp simd reduction(+:sum)
for (int k = 0; k < num_filter_elem; ++k) {
int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
}
output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
}
}
  }
  free(host_data);

  return output;
}
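
// Approximate convolution with regular filter sampling: every skip_every-th
// filter element (offset by start) is dropped, and the remaining weights are
// scaled by fac = skip_every / (skip_every - 1), presumably to keep the output
// magnitude roughly unchanged. The dispatcher below selects this path when
// kernel_height * kernel_width is divisible by skip_every.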
void* tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
                                                int vertical_pad, int horizontal_pad,
                                                int vertical_stride, int horizontal_stride,
                                                int conv_mode, int compute_precision,
                                                int skip_every, int start) {
  Tensor *input = (Tensor *)input_ptr;
  Tensor *filter = (Tensor *)filter_ptr;

  float * __restrict__ host_image = (float *)input->host_data;
float * __restrict__ host_filter = (float *)filter->host_data;
const int batch_size = input->dims.dim_sizes[0];
const int channels = input->dims.dim_sizes[1];
const int image_height = input->dims.dim_sizes[2];
const int image_width = input->dims.dim_sizes[3];
const int num_filters = filter->dims.dim_sizes[0];
const int kernel_height = filter->dims.dim_sizes[2];
const int kernel_width = filter->dims.dim_sizes[3];
const int output_height =
1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
const int output_width =
1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
const int num_filter_elem = kernel_height * kernel_width * channels;
const int remainder = ((num_filter_elem - start) % skip_every > 0);
const int reduced_num_filter_elem =
num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
const int output_size = output_width * output_height;
Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters,
output_height, output_width);
float * __restrict__ output_data = (float *)output->host_data;
const long int host_data_size = sizeof(float) * reduced_num_filter_elem
* output_height * output_width * batch_size;
float *host_data = (float *) malloc(host_data_size);
const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem;
float *reduced_kernels = (float *) malloc(reduced_filer_size);
float fac = ((float) skip_every) / ((float) skip_every - 1);
int reduced_filter_dim = reduced_num_filter_elem / channels;
// Create reduced filter
omp_set_num_threads(4);
#pragma omp parallel for
for(int f = 0; f < num_filters; f++) {
for(int i = 0; i < reduced_num_filter_elem; i++) {
int ch = i / reduced_filter_dim;
int offset = (start + ch) % skip_every;
int in_index;
if(i < offset) {
in_index = i;
} else {
in_index = ((i - offset + 1) * skip_every) / (skip_every - 1)
+ (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset -1;
}
reduced_kernels[f * reduced_num_filter_elem + i] =
fac * host_filter[num_filter_elem * f + in_index];
}
  }
#pragma omp parallel for
for(int b = 0; b < batch_size; b++) {
for(int h = 0; h < output_height; h++) {
for(int w = 0; w < output_width; w++) {
const int inH = h * vertical_stride - vertical_pad;
const int inW = w * horizontal_stride - horizontal_pad;
for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
int in_index;
const int ch = fi / reduced_filter_dim;
const int offset = (start + ch) % skip_every;
if(fi < offset) {
in_index = fi;
} else {
in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+ (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
}
const int i = (in_index % (kernel_width * kernel_height)) / kernel_width;
const int j = in_index % kernel_width;
const int output_index = h * output_width + w;
const int out_index = b * reduced_num_filter_elem * output_size
+ output_index * reduced_num_filter_elem + fi;
if(inH + i >= 0 && inH + i < image_height
&& inW + j >= 0 && inW + j < image_width) {
host_data[out_index] =
host_image[((b * channels + ch) * image_height
+ (inH + i)) * image_width + (inW + j)];
} else {
host_data[out_index] = 0;
}
}
}
}
// Tensor Multiply
for (int p = 0; p < num_filters; ++p) {
for (int m = 0; m < output_size; ++m) {
float sum = 0;
#pragma omp simd reduction(+:sum)
for (int k = 0; k < reduced_num_filter_elem; ++k) {
int input_index = k + reduced_num_filter_elem * m
+ b * reduced_num_filter_elem * output_size;
sum += host_data[input_index]
* reduced_kernels[p * reduced_num_filter_elem + k];
}
output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
}
}
  }
  free(reduced_kernels);
  free(host_data);

  return output;
}
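
// Filter-sampling variant for the case where kernel_height * kernel_width is
// not divisible by skip_every: the first 'start' filter elements are copied
// unchanged and the same sampling-plus-scaling scheme is applied to the rest.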
void* tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
                                                  int vertical_pad, int horizontal_pad,
                                                  int vertical_stride, int horizontal_stride,
                                                  int conv_mode, int compute_precision,
                                                  int skip_every, int start) {
  Tensor *input = (Tensor *)input_ptr;
  Tensor *filter = (Tensor *)filter_ptr;

  float * __restrict__ host_image = (float *)input->host_data;
  float * __restrict__ host_filter = (float *)filter->host_data;

  const int batch_size = input->dims.dim_sizes[0];
  const int channels = input->dims.dim_sizes[1];
  const int image_height = input->dims.dim_sizes[2];
  const int image_width = input->dims.dim_sizes[3];
  const int num_filters = filter->dims.dim_sizes[0];
  const int kernel_height = filter->dims.dim_sizes[2];
  const int kernel_width = filter->dims.dim_sizes[3];
const int output_height =
1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
const int output_width =
1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
const int num_filter_elem = kernel_height * kernel_width * channels;
const int remainder = ((num_filter_elem - start) % skip_every > 0);
const int reduced_num_filter_elem =
num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
const int output_size = output_width * output_height;
Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters,
output_height, output_width);
float * __restrict__ output_data = (float *)output->host_data;
const long int host_data_size = sizeof(float) * reduced_num_filter_elem
* output_height * output_width * batch_size;
float *host_data = (float *) malloc(host_data_size);
const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem;
float *reduced_kernels = (float *) malloc(reduced_filer_size);
float fac = ((float) skip_every) / ((float) skip_every - 1);
int reduced_filter_dim = reduced_num_filter_elem / channels;
// Create Reduced filter
omp_set_num_threads(4);
#pragma omp parallel for
for(int f = 0; f < num_filters; f++) {
for(int i = 0; i < start; i++) {
reduced_kernels[f * reduced_num_filter_elem + i] =
host_filter[num_filter_elem * f + i];
}
#pragma omp simd
for(int i = start; i < reduced_num_filter_elem; i++) {
int in_index = ((i - start + 1) * skip_every) / (skip_every - 1)
+ (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + start - 1;
reduced_kernels[f * reduced_num_filter_elem + i] =
fac * host_filter[num_filter_elem * f + in_index];
}
  }

  #pragma omp parallel for
  for(int b = 0; b < batch_size; b++) {
for(int h = 0; h < output_height; h++) {
for(int w = 0; w < output_width; w++) {
const int inH = h * vertical_stride - vertical_pad;
const int inW = w * horizontal_stride - horizontal_pad;
for(int fi = 0; fi < reduced_num_filter_elem; fi++) {
int in_index;
int offset = start;
if(fi < offset) {
in_index = fi;
} else {
in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1)
+ (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1;
}
const int ch = in_index / (kernel_width * kernel_height);
const int i = (in_index % (kernel_width * kernel_height)) / kernel_width;
const int j = in_index % kernel_width;
const int output_index = h * output_width + w;
const int out_index = b * reduced_num_filter_elem * output_size
+ output_index * reduced_num_filter_elem + fi;
if(inH + i >= 0 && inH + i < image_height
&& inW + j >= 0 && inW + j < image_width) {
host_data[out_index] =
host_image[((b * channels + ch) * image_height
+ (inH + i)) * image_width + (inW + j)];
} else {
host_data[out_index] = 0;
}
}
}
}
// Tensor Multiply
for (int p = 0; p < num_filters; ++p) {
for (int m = 0; m < output_size; ++m) {
float sum = 0;
#pragma omp simd reduction(+:sum)
for (int k = 0; k < reduced_num_filter_elem; ++k) {
int input_index = k + reduced_num_filter_elem * m
+ b * reduced_num_filter_elem * output_size;
sum += host_data[input_index]
* reduced_kernels[p * reduced_num_filter_elem + k];
}
output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
}
}
  }
  free(reduced_kernels);
  free(host_data);

  return output;
}
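
// Row-perforated convolution: only a subset of output rows is computed
// (rows are skipped every 'row' steps after 'start'); the missing rows are
// then filled in by copying the nearest computed row or averaging the two
// neighboring computed rows.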
void* tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
                                  int horizontal_pad, int vertical_stride, int horizontal_stride,
                                  int conv_mode, int compute_precision, int row, int start) {

  Tensor *input = (Tensor *)input_ptr;
  Tensor *filter = (Tensor *)filter_ptr;

  float * __restrict__ host_image = (float *)input->host_data;
  float * __restrict__ host_filter = (float *)filter->host_data;

  int batch_size = input->dims.dim_sizes[0];
  int channels = input->dims.dim_sizes[1];
  int image_height = input->dims.dim_sizes[2];
  int image_width = input->dims.dim_sizes[3];
  int num_filters = filter->dims.dim_sizes[0];
  int kernel_height = filter->dims.dim_sizes[2];
  int kernel_width = filter->dims.dim_sizes[3];
int full_output_height =
1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
int full_output_width =
1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
int num_filter_elem = kernel_height * kernel_width * channels;
int full_output_size = full_output_height * full_output_width;
Tensor *full_output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters,
full_output_height, full_output_width);
float * __restrict__ full_output_data = (float *)full_output->host_data;
int remainder = (full_output_height - start) % row > 0;
int output_height =
full_output_height - ((full_output_height - start) / row) - remainder;
int output_width = full_output_width;
float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters
* output_height * output_width);
int output_size = output_width * output_height;
long int host_data_size = sizeof(float) * num_filter_elem * output_height
* output_width * batch_size;
float *host_data = (float *) malloc(host_data_size);
omp_set_num_threads(4);
#pragma omp parallel for
for(int b = 0; b < batch_size; b++) {
for(int ch = 0; ch < channels; ch++) {
for(int h = 0; h < output_height; h++) {
int inH;
if(h < start) {
inH = h * vertical_stride - vertical_pad;
} else {
int h_index = ((h - start + 1) * row) / (row - 1)
+ (((h - start + 1) * row) % (row - 1) > 0) + start - 1;
inH = h_index * vertical_stride - vertical_pad;
}
for(int w = 0; w < output_width; w++) {
int inW = w * horizontal_stride - horizontal_pad;
for(int i = 0; i < kernel_height; i++) {
for(int j = 0; j < kernel_width; j++) {
const int filter_elem_num =
(ch * kernel_height + i) * kernel_width + j;
const int output_index = h * output_width + w;
const int out_index = b * num_filter_elem * output_size
+ output_index * num_filter_elem + filter_elem_num;
if(inH + i >= 0 && inH + i < image_height
&& inW + j >= 0 && inW + j < image_width) {
host_data[out_index] =
host_image[((b * channels + ch) * image_height
+ (inH + i)) * image_width + (inW + j)];
} else {
host_data[out_index] = 0;
}
}
}
}
}
}
// Tensor Multiply
for (int p = 0; p < num_filters; ++p) {
for (int m = 0; m < output_size; ++m) {
float sum = 0;
#pragma omp simd reduction(+:sum)
for (int k = 0; k < num_filter_elem; ++k) {
int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size;
sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
}
output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
}
}
// Interpolate
for (int p = 0; p < num_filters; ++p) {
for(int h = 0; h < full_output_height; h++) {
for(int w = 0; w < full_output_width; w++) {
int full_output_index = b * num_filters * full_output_size
+ p * full_output_size + h * full_output_width + w;
if(h < start) {
int output_index = b * num_filters * output_size
+ p * output_size + h * output_width + w;
full_output_data[full_output_index] = output_data[output_index];
} else if(h == full_output_height - 1) {
int output_index = b * num_filters * output_size + p * output_size
+ (output_height - 1) * output_width + w;
full_output_data[full_output_index] = output_data[output_index];
} else if(h == 0) {
int output_index = b * num_filters * output_size
+ p * output_size + 0 * output_width + w;
full_output_data[full_output_index] = output_data[output_index];
} else if((h - start) % row == 0) {
int row_index = h - ((h + 1 - start) / row);
int output_index = b * num_filters * output_size + p * output_size
+ row_index * output_width + w;
full_output_data[full_output_index] =
(output_data[output_index] + output_data[output_index - output_width]) / 2;
} else {
int remainder = ((h + 1 - start) % row) > 0;
int row_index = h - ((h + 1 - start) / row) - remainder;
int output_index = b * num_filters * output_size + p * output_size
+ row_index * output_width + w;
full_output_data[full_output_index] = output_data[output_index];
}
}
}
}
  }
  free(output_data);
  free(host_data);

  return full_output;
}
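
// Column-perforated convolution: the column analogue of the row perforation
// above; skipped output columns are reconstructed by copying or averaging
// neighboring computed columns.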
void* tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
                                  int horizontal_pad, int vertical_stride, int horizontal_stride,
                                  int conv_mode, int compute_precision, int col, int start) {

  Tensor *input = (Tensor *)input_ptr;
  Tensor *filter = (Tensor *)filter_ptr;

  float * __restrict__ host_image = (float *)input->host_data;
  float * __restrict__ host_filter = (float *)filter->host_data;
int batch_size = input->dims.dim_sizes[0];
int channels = input->dims.dim_sizes[1];
int image_height = input->dims.dim_sizes[2];
int image_width = input->dims.dim_sizes[3];
int num_filters = filter->dims.dim_sizes[0];
int kernel_height = filter->dims.dim_sizes[2];
int kernel_width = filter->dims.dim_sizes[3];
int full_output_height =
1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride);
int full_output_width =
1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride);
int num_filter_elem = kernel_height * kernel_width * channels;
int full_output_size = full_output_height * full_output_width;
Tensor *full_output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters,
full_output_height, full_output_width);
float * __restrict__ full_output_data = (float *)full_output->host_data;
int remainder = (full_output_width - start) % col > 0;
int output_width = full_output_width - ((full_output_width - start) / col) - remainder;
int output_height = full_output_height;
float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters
* output_height * output_width);
int output_size = output_width * output_height;
long int host_data_size = sizeof(float) * num_filter_elem * output_height
* output_width * batch_size;
float *host_data = (float *) malloc(host_data_size);
omp_set_num_threads(4);
#pragma omp parallel for
for(int b = 0; b < batch_size; b++) {
for(int ch = 0; ch < channels; ch++) {
for(int h = 0; h < output_height; h++) {
int inH = h * vertical_stride - vertical_pad;
for(int w = 0; w < output_width; w++) {
int inW;
if(w < start) {
inW = w * horizontal_stride - horizontal_pad;
} else {
int w_index = ((w - start + 1) * col) / (col - 1)
+ (((w - start + 1) * col) % (col - 1) > 0) + start - 1;
inW = w_index * horizontal_stride - horizontal_pad;
}
for(int i = 0; i < kernel_height; i++) {
for(int j = 0; j < kernel_width; j++) {
const int filter_elem_num =
(ch * kernel_height + i) * kernel_width + j;
const int output_index = h * output_width + w;
const int out_index = b * num_filter_elem * output_size
+ output_index * num_filter_elem + filter_elem_num;
if(inH + i >= 0 && inH + i < image_height
&& inW + j >= 0 && inW + j < image_width) {
host_data[out_index] =
host_image[((b * channels + ch) * image_height
+ (inH + i)) * image_width + (inW + j)];
} else {
host_data[out_index] = 0;
}
}
}
}
}
}
// Tensor Multiply
for (int p = 0; p < num_filters; ++p) {
for (int m = 0; m < output_size; ++m) {
float sum = 0;
#pragma omp simd reduction(+:sum)
for (int k = 0; k < num_filter_elem; ++k) {
int input_index = k + num_filter_elem * m
+ b * num_filter_elem * output_size;
sum += host_data[input_index] * host_filter[p * num_filter_elem + k];
}
output_data[b * (output_size * num_filters) + p * output_size + m] = sum;
}
}
// Interpolate
for (int p = 0; p < num_filters; ++p) {
for(int h = 0; h < full_output_height; h++) {
for(int w = 0; w < full_output_width; w++) {
int full_output_index = b * num_filters * full_output_size
+ p * full_output_size + h * full_output_width + w;
if(w < start) {
int output_index = b * num_filters * output_size
+ p * output_size + h * output_width + w;
full_output_data[full_output_index] = output_data[output_index];
} else if(w == full_output_width - 1) {
int output_index = b * num_filters * output_size + p * output_size
+ h * output_width + output_width - 1;
full_output_data[full_output_index] = output_data[output_index];
} else if(w == 0) {
int output_index = b * num_filters * output_size + p * output_size
+ h * output_width + 0;
full_output_data[full_output_index] = output_data[output_index];
} else if((w - start) % col == 0) {
int col_index = w - ((w + 1 - start) / col);
int output_index = b * num_filters * output_size + p * output_size
+ h * output_width + col_index;
full_output_data[full_output_index] =
(output_data[output_index] + output_data[output_index - 1]) / 2;
} else {
int remainder = ((w + 1 - start) % col) > 0;
int col_index = w - ((w + 1 - start) / col) - remainder;
int output_index = b * num_filters * output_size + p * output_size
+ h * output_width + col_index;
full_output_data[full_output_index] = output_data[output_index];
}
}
}
}
  }
  free(output_data);
  free(host_data);

  return full_output;
}
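
// Dispatcher for CPU convolution: the row/col/skip_every knobs select one of
// the approximate variants above (row or column perforation, or filter
// sampling); otherwise the exact convolution is used.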
void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr,
                           int vertical_pad, int horizontal_pad,
                           int vertical_stride, int horizontal_stride,
                           int conv_mode, int compute_precision,
                           int row, int col, int skip_every, int start) {
  if(row > 1) {
    return tensorRowPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
                                       horizontal_pad, vertical_stride, horizontal_stride,
                                       conv_mode, compute_precision, row, start);
  }
if(col > 1) {
return tensorColPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
horizontal_pad, vertical_stride, horizontal_stride, conv_mode,
compute_precision, col, start);
}
if(skip_every > 1) {
Tensor *input = (Tensor *)input_ptr;
Tensor *filter = (Tensor *)filter_ptr;
const int kernel_height = filter->dims.dim_sizes[2];
const int kernel_width = filter->dims.dim_sizes[3];
if(!(kernel_height * kernel_width % skip_every)) {
return tensorRegularFilterSamplingConvolutionCPU(input_ptr, filter_ptr,
vertical_pad, horizontal_pad, vertical_stride,
horizontal_stride, conv_mode,
compute_precision, skip_every, start);
}
return tensorIrregularFilterSamplingConvolutionCPU(input_ptr, filter_ptr,
vertical_pad, horizontal_pad, vertical_stride,
horizontal_stride, conv_mode,
compute_precision, skip_every, start);
  }
return tensorRegularConvolutionCPU(input_ptr, filter_ptr, vertical_pad,
horizontal_pad, vertical_stride,
horizontal_stride, conv_mode, compute_precision);
}
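
// Elementwise add: when both tensors have the same number of elements the bias
// is added elementwise; otherwise bias is treated as a per-channel value and
// broadcast across the batch and spatial dimensions.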
void* tensorAddCPU(void *x_ptr, void *bias_ptr) {
  Tensor *x = (Tensor *)x_ptr;
Tensor *bias = (Tensor *)bias_ptr;
float * __restrict__ x_data = (float *)x->host_data;
float * __restrict__ bias_data = (float *)bias->host_data;
int n = x->dims.dim_sizes[0];
int c = x->dims.dim_sizes[1];
int h = x->dims.dim_sizes[2];
int w = x->dims.dim_sizes[3];
if(x->num_elems == bias->num_elems) {
int const1 = c * h * w;
int const2 = h * w;
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < n; i++) {
for (int j = 0; j < c; j++) {
#pragma omp simd collapse(2)
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
x_data[i * const1 + j * const2 + (k * w) + l] +=
bias_data[i * const1 + j * const2 + (k*w) + l];
}
}
}
}
} else {
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < n; i++) {
for (int j = 0; j < c; j++) {
#pragma omp simd collapse(2)
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j];
}
}
}
}
}
return x;
}
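
// Pooling over NCHW input: poolFunction == 0 selects max pooling,
// poolFunction == 1 selects average pooling.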
float maximum(float v1, float v2) __attribute__((always_inline));
inline float maximum(float v1, float v2){
  return (v1 < v2) ? v2 : v1;
}

void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
                       int window_width, int vertical_pad, int horizontal_pad,
                       int vertical_stride, int horizontal_stride) {
  Tensor *input = (Tensor *)input_ptr;
  float * __restrict__ input_data = (float *)input->host_data;
  int batch_size = input->dims.dim_sizes[0];
  int channels = input->dims.dim_sizes[1];
  int image_height = input->dims.dim_sizes[2];
  int image_width = input->dims.dim_sizes[3];

  int output_height =
      1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride);
  int output_width =
      1 + ((image_width - window_width + 2 * horizontal_pad) / horizontal_stride);

  int center_x = (window_width - 1) / 2 - horizontal_pad;
  int center_y = (window_height - 1) / 2 - vertical_pad;
  int x_radius = (window_width - 1) / 2;
  int y_radius = (window_height - 1) / 2;
Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, channels,
output_height, output_width);
float * __restrict__ output_data = (float *)output->host_data;
omp_set_num_threads(4);
#pragma omp parallel for
for (int b = 0; b < batch_size; b++) {
for (int ch = 0; ch < channels; ch++) {
int ii = 0, jj = 0;
for (int r = center_y; r < image_height + vertical_pad - y_radius;
r += vertical_stride) {
for (int c = center_x; c < image_width + horizontal_pad - x_radius;
c += horizontal_stride) {
float val = (poolFunction == 0) ? -3.40282e+38 : 0;
int y_radius_var = y_radius - r;
int y_radius_var_max = y_radius_var + image_height;
int x_radius_var = x_radius - c;
int x_radius_var_max = x_radius_var + image_width;
int ki_min = (y_radius_var > 0) ?
((y_radius_var < window_height) ? y_radius_var : -1) : 0;
int ki_max = (y_radius_var_max < window_height) ?
((y_radius_var_max >= 0) ? y_radius_var_max : -1) : window_height;
int kj_min = (x_radius_var > 0) ?
((x_radius_var < window_width) ? x_radius_var : -1) : 0;
int kj_max = (x_radius_var_max < window_width) ?
((x_radius_var_max >= 0) ? x_radius_var_max : -1) : window_width;
if(ki_min != ki_max && kj_min != kj_max && ki_min != -1
&& ki_max != -1 && kj_min != -1 && kj_max != -1) {
if(!poolFunction) {
for (int ki = 0; ki < window_height; ki++) {
for (int kj = 0; kj < window_width; kj++) {
val = maximum(
val,
input_data[b * (channels * image_height * image_width) +
ch * (image_height * image_width) +
(r - y_radius + ki) * image_width + (c - x_radius + kj)]);
}
}
} else {
for (int ki = 0; ki < window_height; ki++) {
for (int kj = 0; kj < window_width; kj++) {
val += input_data[b * (channels * image_height * image_width)
+ ch * (image_height * image_width) +
(r - y_radius + ki) * image_width + (c - x_radius + kj)];
}
}
}
}
if (poolFunction == 1) {
val /= window_height * window_width;
}
output_data[b * (channels * output_height * output_width) +
ch * (output_height * output_width) + ii * output_width + jj] = val;
jj++;
if (jj == output_width) {
jj = 0;
ii++;
}
}
}
}
}
return output;
}
void *tensorTanhCPU(void *input_ptr) {
Tensor *input = (Tensor *)input_ptr;
float *input_data = (float *)input->host_data;
size_t num_elems = input->num_elems;
omp_set_num_threads(4);
#pragma omp parallel for
for (size_t i = 0; i < num_elems; i++) {
input_data[i] = tanhf(input_data[i]);
}
return input;
}
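
// Fully-connected layer as a GEMM: the non-batch dimensions of lhs are
// flattened into k, rhs is transposed into tran_rhs for contiguous accesses,
// and the result is computed as parallelized dot products.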
void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
  Tensor *lhs = (Tensor *)lhs_ptr;
  Tensor *rhs = (Tensor *)rhs_ptr;

  // 'm' holds the batch dimension - assuming NCHW format Tensors
  int m = lhs->dims.dim_sizes[0];
  // The rhs must be a 2D tensor
  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];

  Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);

  float * __restrict__ lhs_arr = (float *)lhs->host_data;
  float * __restrict__ rhs_arr = (float *)rhs->host_data;
  float * __restrict__ output_arr = (float *)output->host_data;

  int k = 1;
  #pragma unroll 2 // Can we unroll more???
  for (int j = 1; j < lhs->dims.num_dims; j++) {
    k = k * lhs->dims.dim_sizes[j]; // input neurons
  }
float *tran_rhs = (float *) malloc(sizeof(float) * k * n);
omp_set_num_threads(4);
#pragma omp parallel for simd
for (int l = 0; l < k; l++) {
for (int j = 0; j < n; j++) {
tran_rhs[j * k + l] = rhs_arr[l * n + j];
}
}
#pragma omp parallel for
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
float sum = 0.0;
#pragma omp simd reduction(+:sum)
for (int l = 0; l < k; l++) {
sum += lhs_arr[i * k + l] * tran_rhs[j * k + l];
}
output_arr[i * n + j] = sum;
}
}
free(tran_rhs);
  return output;
}
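
// Row-wise softmax over the channel dimension: exponentiate in place,
// accumulate the per-row sum, then normalize.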
void *tensorSoftmaxCPU(void *input_ptr) {
  Tensor *input = (Tensor *)input_ptr;
  float *logits = (float *)input->host_data;
  int n = input->dims.dim_sizes[0];
  int c = input->dims.dim_sizes[1];
omp_set_num_threads(4);
#pragma omp parallel for
  for (int i = 0; i < n; i++) {
    float x = 0;
    for(int j = i*c; j < c + i*c; j++) {
      logits[j] = expf(logits[j]);
    }

    #pragma omp simd reduction(+:x)
for(int j = i*c; j < i*c+c; j++) {
x += logits[j];
}
#pragma omp simd
for(int j = i*c; j < i*c + c; j++) {
logits[j] /= x;
}
  }

  return input;
}
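
// The elementwise activations below (ReLU and clipped ReLU) modify the input
// tensor in place.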
void *tensorReluCPU(void *input_ptr) {
  Tensor *input = (Tensor *)input_ptr;
  float *input_data = (float *)input->host_data;
  size_t num_elems = input->num_elems;

  #pragma omp simd
  for (size_t i = 0; i < num_elems; i++) {
    input_data[i] = (input_data[i] < 0) ? 0 : input_data[i];
  }

  return input;
}
void *tensorRelu2CPU(void *input_ptr, float min, float max) {
Tensor *input = (Tensor *)input_ptr;
float *input_data = (float *)input->host_data;
size_t num_elems = input->num_elems;
#pragma omp simd
for (size_t i = 0; i < num_elems; i++) {
input_data[i] = (input_data[i] < min) ? min : ((input_data[i] > max) ?
max : input_data[i]);
}
  return input;
}