diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h index 42969d27712d92c451ae066a44fac0c03cedf170..845a1a008c42124b4e40ea19eaf2dfda2777f061 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h @@ -22,17 +22,18 @@ extern "C"{ // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width) void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size, - size_t dim3_size, size_t dim4_size); + size_t dim3_size, size_t dim4_size, bool freeMemory = true); void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes); /********** Tensor Operation API ******/ // NOTE: For conv_mode, only value '1' is supported - void* tensorConvolutionCPU(void* input, void* filter, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision); +void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int row, int col, int skip_every, int start); void* tensorPoolingCPU(void* input, int poolFunction, diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc index 7397e78013c1e0284314ba8e47012c435345da59..c98b2946a5c2bdede825ba3b3d3fb6dfa03feb4e 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc @@ -1,9 +1,8 @@ -/* This file includes the API implementation of the HPVM tensor runtime built on - *cublas, cudnn - ** - ** Author: Hashim Sharif - ** Email: hsharif3@illinois.edu - */ +/* This file includes the API implementation of the HPVM tensor runtime built for CPU +** +** Author: Hashim Sharif +** Email: hsharif3@illinois.edu +*/ #include <algorithm> #include <cfloat> @@ -15,81 +14,88 @@ #include <iostream> #include <limits> #include <map> +#include <cmath> #include <memory> +#include <vector> #include <sstream> #include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <string> #include <vector> +#include <math.h> +#include<bits/stdc++.h> +#include <pthread.h> +#include <omp.h> // Tensor runtime header files -#include "../include/tensor_cpu.h" -#include "../include/tensor_cpu_runtime.h" - +#include "tensor_cpu.h" +#include "tensor_cpu_runtime.h" -extern "C"{ - - void llvm_hpvm_initTensorRt(int gpuid) { +void llvm_hpvm_initTensorRt(int) { // NOTE: Do Nothing - } +} - void llvm_hpvm_cleanupTensorRt() { +void llvm_hpvm_cleanupTensorRt() { // NOTE: Do Nothing - } +} - void hpvm_request_tensor(void *tensor, int destination) { +void hpvm_request_tensor(void *tensor, int destination) { // NOTE: Do Nothing - } - - // Returns the size of the target cudnn datatype - int getTypeSize(int data_type) __attribute__((always_inline)); - inline int getTypeSize(int data_type) { - // Float/Int data type - Full Precision - if (data_type == 0) - return 4; - // Half data type - if (data_type == 1) - return 2; - - return 1; - } - - void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline)); - inline void 
setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) { +} + +std::vector<void *> PtrVect; + +void freeBatchMemory() { + for(auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) { + free(*it); + } + PtrVect.erase(PtrVect.begin(), PtrVect.end()); +} + +inline int getTypeSize(int data_type) { + return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1); +} + +void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline)); +inline void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) { int type_size = getTypeSize(data_type); size_t size_in_bytes = type_size * num_elems; tensor->size_in_bytes = size_in_bytes; - } +} - void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline)); - inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems) { +void allocateMemCPU(struct Tensor *tensor, int data_type, + size_t num_elems, bool freeMemory = true) __attribute__((always_inline)); +inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, bool freeMemory) { setSizeInBytes(tensor, data_type, num_elems); tensor->data_type = data_type; tensor->num_elems = num_elems; - tensor->host_data = - (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host - } - - void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { + tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host + if(freeMemory) + PtrVect.push_back(tensor->host_data); +} +void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { Tensor *tensor = (Tensor *)tensor_ptr; if (tensor->size_in_bytes != size_in_bytes) { - printf("The destination and source sizes don't match"); + printf("The destination and source sizes don't match"); } - memcpy(tensor->host_data, data_ptr, size_in_bytes); - } + memcpy(tensor->host_data, data_ptr, size_in_bytes); // Is this efficient enough? 
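+  // Note: for a contiguous malloc'd host buffer a single memcpy is typically already
+  // optimal; the main further saving would be to avoid the copy altogether by letting
+  // callers hand ownership of data_ptr to the tensor, which this patch does not do.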
+} - - void *create4DTensorInternal(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size) __attribute__((always_inline)); - inline void *create4DTensorInternal(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size) { +//void *create4DTensor(int data_type, int data_format, size_t dim1_size, + // size_t dim2_size, size_t dim3_size, size_t dim4_size, + //bool freeMemory = true) __attribute__((always_inline)); +inline void *create4DTensor(int data_type, int data_format, size_t dim1_size, + size_t dim2_size, size_t dim3_size, + size_t dim4_size, bool freeMemory) { struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - - allocateMemCPU(tensor, data_type, num_elems); + if(freeMemory) + PtrVect.push_back(tensor); + allocateMemCPU(tensor, data_type, num_elems, freeMemory); + // Setting the tensor dimensions size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); dim_sizes[0] = dim1_size; @@ -98,368 +104,872 @@ extern "C"{ dim_sizes[3] = dim4_size; tensor->dims.dim_sizes = dim_sizes; tensor->dims.num_dims = 4; - + return tensor; - } - - void* create4DTensor(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size) { - - return create4DTensorInternal(data_type, data_format, dim1_size, dim2_size, dim3_size, dim4_size); - } - - - void* __attribute__((always_inline)) tensorAddCPU(void *x_ptr, void *bias_ptr) { - - Tensor *x = (Tensor *)x_ptr; - Tensor *bias = (Tensor *)bias_ptr; - - float *x_data = (float *)x->host_data; - float *bias_data = (float *)bias->host_data; - - int n = x->dims.dim_sizes[0]; - int c = x->dims.dim_sizes[1]; - int h = x->dims.dim_sizes[2]; - int w = x->dims.dim_sizes[3]; - - size_t num_elems = x->num_elems; - size_t num_elems2 = bias->num_elems; +} - if (num_elems == num_elems2) { - for (size_t i = 0; i < num_elems; i++) { - x_data[i] += bias_data[i]; - } - } else { +void* tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int compute_precision) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; - } - } - } - } + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int output_size = output_width * output_height; + + Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, + output_height, output_width); + float * __restrict__ output_data = (float *)output->host_data; + + long int conv_data_size = + sizeof(float) * num_filter_elem * output_height * 
output_width * batch_size; + float *host_data = (float *) malloc(conv_data_size); + + omp_set_num_threads(4); + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int ch = 0; ch < channels; ch++) { + for(int h = 0; h < output_height; h++) { + for(int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for(int i = 0; i < kernel_height; i++) { + for(int j = 0; j < kernel_width; j++) { + const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + filter_elem_num; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } + } + } + } + + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } + } } + free(host_data); - return x; - } - - void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { - - Tensor *lhs = (Tensor *)lhs_ptr; - Tensor *rhs = (Tensor *)rhs_ptr; + return output; +} - // 'm' holds the batch dimension - assuming NCHW format Tensors - int m = lhs->dims.dim_sizes[0]; - // The rhs must be a 2D tensor - int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons - int k = 1; - // Flattening the dimensions after the batch dimension - // NOTE: Allowing any number of dimensions > 2 for lhs - for (int j = 1; j < lhs->dims.num_dims; j++) { - k = k * lhs->dims.dim_sizes[j]; // input neurons +void* tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int skip_every, int start) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int num_filter_elem = kernel_height * kernel_width * channels; + + const int remainder = ((num_filter_elem - start) % skip_every > 0); + const int reduced_num_filter_elem = + num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, + output_height, output_width); + float * __restrict__ 
output_data = (float *)output->host_data; + + const long int host_data_size = sizeof(float) * reduced_num_filter_elem + * output_height * output_width * batch_size; + float *host_data = (float *) malloc(host_data_size); + + const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem; + float *reduced_kernels = (float *) malloc(reduced_filer_size); + + float fac = ((float) skip_every) / ((float) skip_every - 1); + int reduced_filter_dim = reduced_num_filter_elem / channels; + + // Create reduced filter + omp_set_num_threads(4); + #pragma omp parallel for + for(int f = 0; f < num_filters; f++) { + for(int i = 0; i < reduced_num_filter_elem; i++) { + int ch = i / reduced_filter_dim; + int offset = (start + ch) % skip_every; + int in_index; + if(i < offset) { + in_index = i; + } else { + in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) + + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset -1; + } + reduced_kernels[f * reduced_num_filter_elem + i] = + fac * host_filter[num_filter_elem * f + in_index]; + } } + + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int h = 0; h < output_height; h++) { + for(int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for(int fi = 0; fi < reduced_num_filter_elem; fi++) { + int in_index; + const int ch = fi / reduced_filter_dim; + const int offset = (start + ch) % skip_every; + if(fi < offset) { + in_index = fi; + } else { + in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; + } + const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; + const int j = in_index % kernel_width; + const int output_index = h * output_width + w; + const int out_index = b * reduced_num_filter_elem * output_size + + output_index * reduced_num_filter_elem + fi; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } + } + + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < reduced_num_filter_elem; ++k) { + int input_index = k + reduced_num_filter_elem * m + + b * reduced_num_filter_elem * output_size; + sum += host_data[input_index] + * reduced_kernels[p * reduced_num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } + } - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; - - // NOTE: Creating a 4D tensor to be compatible with later called cuDNN - // routines - Tensor *output = (Tensor *)create4DTensorInternal(0, 0, m, n, 1, 1); - - float *lhs_arr = (float *)lhs->host_data; - float *rhs_arr = (float *)rhs->host_data; - float *output_arr = (float *)output->host_data; - - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - float sum = 0.0; - for (int l = 0; l < k; l++) { - float mul = lhs_arr[i * k + l] * rhs_arr[l * n + j]; - sum = sum + mul; - } - output_arr[i * n + j] = sum; - } } - + free(reduced_kernels); + free(host_data); + return output; - } - - float power(float num, int exp) __attribute__((always_inline)); - inline float power(float num, int exp){ - bool neg = false; - if (exp < 0) { - neg = true; - exp = -1 * 
exp; - } +} - float pow = 1; - for (int i = 0; i < exp; i++) { - pow = pow * num; - } - - if(neg) - return 1 / pow; - else - return pow; - } - - float epow(float x) __attribute__((always_inline)); - inline float epow(float x){ - - bool neg = false; - if (x < 0) { - x = -1 * x; - neg = true; +void* tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int skip_every, int start) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int num_filter_elem = kernel_height * kernel_width * channels; + + const int remainder = ((num_filter_elem - start) % skip_every > 0); + const int reduced_num_filter_elem = + num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, + output_height, output_width); + float * __restrict__ output_data = (float *)output->host_data; + + const long int host_data_size = sizeof(float) * reduced_num_filter_elem + * output_height * output_width * batch_size; + float *host_data = (float *) malloc(host_data_size); + + const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem; + float *reduced_kernels = (float *) malloc(reduced_filer_size); + + float fac = ((float) skip_every) / ((float) skip_every - 1); + int reduced_filter_dim = reduced_num_filter_elem / channels; + + // Create Reduced filter + omp_set_num_threads(4); + #pragma omp parallel for + for(int f = 0; f < num_filters; f++) { + for(int i = 0; i < start; i++) { + reduced_kernels[f * reduced_num_filter_elem + i] = + host_filter[num_filter_elem * f + i]; + } + #pragma omp simd + for(int i = start; i < reduced_num_filter_elem; i++) { + int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) + + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + start - 1; + reduced_kernels[f * reduced_num_filter_elem + i] = + fac * host_filter[num_filter_elem * f + in_index]; + } } - float sum = 0.0; - float fac = 1; + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int h = 0; h < output_height; h++) { + for(int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for(int fi = 0; fi < reduced_num_filter_elem; fi++) { + int in_index; + int offset = start; + if(fi < offset) { + in_index = fi; + } else { + in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; + } + const int ch = in_index / (kernel_width * kernel_height); + const int i = (in_index % (kernel_width * 
kernel_height)) / kernel_width; + const int j = in_index % kernel_width; + const int output_index = h * output_width + w; + const int out_index = b * reduced_num_filter_elem * output_size + + output_index * reduced_num_filter_elem + fi; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } + } + + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < reduced_num_filter_elem; ++k) { + int input_index = k + reduced_num_filter_elem * m + + b * reduced_num_filter_elem * output_size; + sum += host_data[input_index] + * reduced_kernels[p * reduced_num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } + } - //whole number part - float pow = 1; - for (int i = x; i > 0; i--,x--) { - pow = pow * 2.71828; } - //x++; + free(reduced_kernels); + free(host_data); - // First 15 terms in Taylor Series - for (int i = 0; i < 15; i++) { - sum = sum + power(x, i) / fac; - fac = fac * (i + 1); - } + return output; +} - if(neg) - return 1 / (sum * pow); - else - return sum * pow; - } - - float custom_tanh(float num) __attribute__((always_inline)); - inline float custom_tanh(float num){ - float value = epow(2 * num); - value = (value - 1) / (value + 1); - return value; - } - - float max(float v1, float v2) __attribute__((always_inline)); - inline float max(float v1, float v2){ - if (v1 < v2) - return v2; - else - return v1; - } - - void *tensorReluCPU(void *input_ptr) { +void* tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, int row, int start) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - for (size_t i = 0; i < num_elems; i++) { - if (input_data[i] < 0) { - input_data[i] = 0; - } + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + + int full_output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int full_output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int full_output_size = full_output_height * full_output_width; + + Tensor *full_output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, + full_output_height, full_output_width); + float * __restrict__ full_output_data = (float *)full_output->host_data; + + int remainder = (full_output_height - start) % row > 0; + int output_height = + full_output_height - ((full_output_height - start) / row) - remainder; + + int output_width = full_output_width; + float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters + * output_height * output_width); + int output_size 
= output_width * output_height; + long int host_data_size = sizeof(float) * num_filter_elem * output_height + * output_width * batch_size; + float *host_data = (float *) malloc(host_data_size); + + omp_set_num_threads(4); + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int ch = 0; ch < channels; ch++) { + for(int h = 0; h < output_height; h++) { + int inH; + if(h < start) { + inH = h * vertical_stride - vertical_pad; + } else { + int h_index = ((h - start + 1) * row) / (row - 1) + + (((h - start + 1) * row) % (row - 1) > 0) + start - 1; + inH = h_index * vertical_stride - vertical_pad; + } + for(int w = 0; w < output_width; w++) { + int inW = w * horizontal_stride - horizontal_pad; + for(int i = 0; i < kernel_height; i++) { + for(int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + filter_elem_num; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } + } + } + } + + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } + } + + // Interpolate + for (int p = 0; p < num_filters; ++p) { + for(int h = 0; h < full_output_height; h++) { + for(int w = 0; w < full_output_width; w++) { + int full_output_index = b * num_filters * full_output_size + + p * full_output_size + h * full_output_width + w; + if(h < start) { + int output_index = b * num_filters * output_size + + p * output_size + h * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if(h == full_output_height - 1) { + int output_index = b * num_filters * output_size + p * output_size + + (output_height - 1) * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if(h == 0) { + int output_index = b * num_filters * output_size + + p * output_size + 0 * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if((h - start) % row == 0) { + int row_index = h - ((h + 1 - start) / row); + int output_index = b * num_filters * output_size + p * output_size + + row_index * output_width + w; + full_output_data[full_output_index] = + (output_data[output_index] + output_data[output_index - output_width]) / 2; + } else { + int remainder = ((h + 1 - start) % row) > 0; + int row_index = h - ((h + 1 - start) / row) - remainder; + int output_index = b * num_filters * output_size + p * output_size + + row_index * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } + } + } + } } + free(output_data); + free(host_data); - return input; - } + return full_output; +} - void *tensorTanhCPU(void *input_ptr) { +void* tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, int col, 
int start) { + Tensor *input = (Tensor *)input_ptr; - - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - for (size_t i = 0; i < num_elems; i++) { - input_data[i] = custom_tanh(input_data[i]); + Tensor *filter = (Tensor *)filter_ptr; + + float * __restrict__ host_image = (float *)input->host_data; + float * __restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + int full_output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int full_output_width = + 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int full_output_size = full_output_height * full_output_width; + + Tensor *full_output = (Tensor *) create4DTensor(0, 0, batch_size, num_filters, + full_output_height, full_output_width); + float * __restrict__ full_output_data = (float *)full_output->host_data; + + int remainder = (full_output_width - start) % col > 0; + int output_width = full_output_width - ((full_output_width - start) / col) - remainder; + + int output_height = full_output_height; + float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters + * output_height * output_width); + int output_size = output_width * output_height; + long int host_data_size = sizeof(float) * num_filter_elem * output_height + * output_width * batch_size; + float *host_data = (float *) malloc(host_data_size); + + omp_set_num_threads(4); + #pragma omp parallel for + for(int b = 0; b < batch_size; b++) { + for(int ch = 0; ch < channels; ch++) { + for(int h = 0; h < output_height; h++) { + int inH = h * vertical_stride - vertical_pad; + for(int w = 0; w < output_width; w++) { + int inW; + if(w < start) { + inW = w * horizontal_stride - horizontal_pad; + } else { + int w_index = ((w - start + 1) * col) / (col - 1) + + (((w - start + 1) * col) % (col - 1) > 0) + start - 1; + inW = w_index * horizontal_stride - horizontal_pad; + } + for(int i = 0; i < kernel_height; i++) { + for(int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + filter_elem_num; + if(inH + i >= 0 && inH + i < image_height + && inW + j >= 0 && inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * image_width + (inW + j)]; + } else { + host_data[out_index] = 0; + } + } + } + } + } + } + + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; + #pragma omp simd reduction(+:sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = k + num_filter_elem * m + + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; + } + output_data[b * (output_size * num_filters) + p * output_size + m] = sum; + } + } + + // Interpolate + for (int p = 0; p < num_filters; ++p) { + for(int h = 0; h < full_output_height; h++) { + for(int w = 0; w < full_output_width; w++) { + int full_output_index = b * num_filters 
* full_output_size + + p * full_output_size + h * full_output_width + w; + if(w < start) { + int output_index = b * num_filters * output_size + + p * output_size + h * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if(w == full_output_width - 1) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + output_width - 1; + full_output_data[full_output_index] = output_data[output_index]; + } else if(w == 0) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + 0; + full_output_data[full_output_index] = output_data[output_index]; + } else if((w - start) % col == 0) { + int col_index = w - ((w + 1 - start) / col); + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + col_index; + full_output_data[full_output_index] = + (output_data[output_index] + output_data[output_index - 1]) / 2; + } else { + int remainder = ((w + 1 - start) % col) > 0; + int col_index = w - ((w + 1 - start) / col) - remainder; + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + col_index; + full_output_data[full_output_index] = output_data[output_index]; + } + } + } + } } + free(output_data); + free(host_data); - return input; - } + return full_output; +} - void *tensorRelu2CPU(void *input_ptr, float min, float max) { - Tensor *input = (Tensor *)input_ptr; - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - for (size_t i = 0; i < num_elems; i++) { - if (input_data[i] < min) { - input_data[i] = min; - } - if (input_data[i] > max) { - input_data[i] = max; - } +void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int row, int col, int skip_every, int start) { + if(row > 1) { + return tensorRowPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad, + horizontal_pad, vertical_stride, horizontal_stride, conv_mode, + compute_precision, row, start); + } + if(col > 1) { + return tensorColPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad, + horizontal_pad, vertical_stride, horizontal_stride, conv_mode, + compute_precision, col, start); + } + if(skip_every > 1) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + + if(!(kernel_height * kernel_width % skip_every)) { + return tensorRegularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, + vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, + compute_precision, skip_every, start); + } + return tensorIrregularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, + vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, + compute_precision, skip_every, start); } + return tensorRegularConvolutionCPU(input_ptr, filter_ptr, vertical_pad, + horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision); +} - return input; - } +void* tensorAddCPU(void *x_ptr, void *bias_ptr) { + Tensor *x = (Tensor *)x_ptr; + Tensor *bias = (Tensor *)bias_ptr; + + float * __restrict__ x_data = (float *)x->host_data; + float * __restrict__ bias_data = (float *)bias->host_data; + int n = x->dims.dim_sizes[0]; + int c = x->dims.dim_sizes[1]; + int h = x->dims.dim_sizes[2]; + int w = x->dims.dim_sizes[3]; + + if(x->num_elems == 
bias->num_elems) { + int const1 = c * h * w; + int const2 = h * w; + omp_set_num_threads(4); + #pragma omp parallel for + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + #pragma omp simd collapse(2) + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + x_data[i * const1 + j * const2 + (k * w) + l] += + bias_data[i * const1 + j * const2 + (k*w) + l]; + } + } + } + } + } else { + omp_set_num_threads(4); + #pragma omp parallel for + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + #pragma omp simd collapse(2) + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; + } + } + } + } + } + + return x; +} - void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, - int window_width, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride) { - - Tensor *input = (Tensor *)input_ptr; - float *input_data = (float *)input->host_data; +float max(float v1, float v2) __attribute__((always_inline)); +inline float maximum(float v1, float v2){ + return (v1 < v2) ? v2 : v1; +} +void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { + + Tensor *input = (Tensor *)input_ptr; + float * __restrict__ input_data = (float *)input->host_data; + int batch_size = input->dims.dim_sizes[0]; int channels = input->dims.dim_sizes[1]; int image_height = input->dims.dim_sizes[2]; int image_width = input->dims.dim_sizes[3]; - - int output_height = - 1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride); - int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) / - horizontal_stride); - + + int output_height = + 1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride); + int output_width = + 1 + ((image_width - window_width + 2 * horizontal_pad) / horizontal_stride); + int center_x = (window_width - 1) / 2 - horizontal_pad; int center_y = (window_height - 1) / 2 - vertical_pad; int x_radius = (window_width - 1) / 2; int y_radius = (window_height - 1) / 2; + + Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, channels, + output_height, output_width); + float * __restrict__ output_data = (float *)output->host_data; + + omp_set_num_threads(4); + #pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + int ii = 0, jj = 0; + for (int r = center_y; r < image_height + vertical_pad - y_radius; + r += vertical_stride) { + for (int c = center_x; c < image_width + horizontal_pad - x_radius; + c += horizontal_stride) { + float val = (poolFunction == 0) ? -3.40282e+38 : 0; + int y_radius_var = y_radius - r; + int y_radius_var_max = y_radius_var + image_height; + int x_radius_var = x_radius - c; + int x_radius_var_max = x_radius_var + image_width; + int ki_min = (y_radius_var > 0) ? + ((y_radius_var < window_height) ? y_radius_var : -1) : 0; + int ki_max = (y_radius_var_max < window_height) ? + ((y_radius_var_max >= 0) ? y_radius_var_max : -1) : window_height; + int kj_min = (x_radius_var > 0) ? + ((x_radius_var < window_width) ? x_radius_var : -1) : 0; + int kj_max = (x_radius_var_max < window_width) ? + ((x_radius_var_max >= 0) ? 
x_radius_var_max : -1) : window_width; + + if(ki_min != ki_max && kj_min != kj_max && ki_min != -1 + && ki_max != -1 && kj_min != -1 && kj_max != -1) { + if(!poolFunction) { + for (int ki = 0; ki < window_height; ki++) { + for (int kj = 0; kj < window_width; kj++) { + val = maximum( + val, + input_data[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + (r - y_radius + ki) * image_width + (c - x_radius + kj)]); + } + } + } else { + for (int ki = 0; ki < window_height; ki++) { + for (int kj = 0; kj < window_width; kj++) { + val += input_data[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + (r - y_radius + ki) * image_width + (c - x_radius + kj)]; + } + } + } + } + if (poolFunction == 1) { + val /= window_height * window_width; + } + output_data[b * (channels * output_height * output_width) + + ch * (output_height * output_width) + ii * output_width + jj] = val; + jj++; + if (jj == output_width) { + jj = 0; + ii++; + } + } + } + } + } + + return output; +} + +void *tensorTanhCPU(void *input_ptr) { + Tensor *input = (Tensor *)input_ptr; + + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + + omp_set_num_threads(4); + #pragma omp parallel for + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = tanhf(input_data[i]); + } + + return input; +} - Tensor *output = (Tensor *) create4DTensorInternal(0, 0, batch_size, channels, - output_height, output_width); - float *output_data = (float *)output->host_data; +void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; - for (int b = 0; b < batch_size; b++) { - for (int ch = 0; ch < channels; ch++) { - int ii = 0, jj = 0; - for (int r = center_y; r < image_height + vertical_pad - y_radius; - r += vertical_stride) { - for (int c = center_x; c < image_width + horizontal_pad - x_radius; - c += horizontal_stride) { - float val; - if (poolFunction == 0) - val = -3.40282e+38; // assuming values never fall below min value of float - else - val = 0; - - for (int i = r - y_radius, ki = 0; ki < window_height; i++, ki++) { - for (int j = c - x_radius, kj = 0; kj < window_width; j++, kj++) { - if (i >= 0 && j >= 0 && i < image_height && j < image_width) { - if (poolFunction == 0) - val = max( - val, - input_data[b * (channels * image_height * image_width) + - ch * (image_height * image_width) + - i * image_width + j]); - else - val += - input_data[b * (channels * image_height * image_width) + - ch * (image_height * image_width) + - i * image_width + j]; - } - } - } - if (poolFunction == 1) - val /= window_height * window_width; - - output_data[b * (channels * output_height * output_width) + - ch * (output_height * output_width) + ii * output_width + - jj] = val; - jj++; - if (jj == output_width) { - jj = 0; - ii++; - } - } - } - } + // 'm' holds the batch dimension - assuming NCHW format Tensors + int m = lhs->dims.dim_sizes[0]; + // The rhs must be a 2D tensor + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; + + Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1); + + float * __restrict__ lhs_arr = (float *)lhs->host_data; + float * __restrict__ rhs_arr = (float *)rhs->host_data; + float * __restrict__ output_arr = (float *)output->host_data; + + int k = 1; + #pragma unroll 2 // Can we unroll more??? 
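+  // This loop only multiplies the (at most three) non-batch dimensions of a 4-D
+  // tensor to obtain k, so unrolling deeper buys essentially nothing; the expensive
+  // part is the GEMM below, which is why rhs is first transposed into tran_rhs so
+  // that the inner reduction reads both operands contiguously.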
+ for (int j = 1; j < lhs->dims.num_dims; j++) { + k = k * lhs->dims.dim_sizes[j]; // input neurons } + + float *tran_rhs = (float *) malloc(sizeof(float) * k * n); + omp_set_num_threads(4); + #pragma omp parallel for simd + for (int l = 0; l < k; l++) { + for (int j = 0; j < n; j++) { + tran_rhs[j * k + l] = rhs_arr[l * n + j]; + } + } + #pragma omp parallel for + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sum = 0.0; + #pragma omp simd reduction(+:sum) + for (int l = 0; l < k; l++) { + sum += lhs_arr[i * k + l] * tran_rhs[j * k + l]; + } + output_arr[i * n + j] = sum; + } + } + free(tran_rhs); return output; - } +} - void *tensorSoftmaxCPU(void *input_ptr) { +void *tensorSoftmaxCPU(void *input_ptr) { Tensor *input = (Tensor *)input_ptr; - + float *logits = (float *)input->host_data; - int n = input->dims.dim_sizes[0]; int c = input->dims.dim_sizes[1]; - + + omp_set_num_threads(4); + #pragma omp parallel for for (int i = 0; i < n; i++) { - float x = 0; - for (int j = 0; j < c; j++) - x += epow(logits[i * c + j]); - - for (int j = 0; j < c; j++) - logits[i * c + j] = epow(logits[i * c + j]) / x; + float x = 0; + for(int j = i*c; j < c + i*c; j++) { + logits[j] = expf(logits[j]); + } + + #pragma omp simd reduction(+:x) + for(int j = i*c; j < i*c+c; j++) { + x += logits[j]; + } + + #pragma omp simd + for(int j = i*c; j < i*c + c; j++) { + logits[j] /= x; + } } return input; - } - - void* __attribute__((always_inline)) tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, - int compute_precision) { - - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float *image = (float *)input->host_data; - float *kernels = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - - // kernel centers - int center_x = (kernel_width - 1) / 2 - horizontal_pad; - int center_y = (kernel_height - 1) / 2 - vertical_pad; - - int x_radius = (kernel_width - 1) / 2; - int y_radius = (kernel_height - 1) / 2; - int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) / - horizontal_stride); - - Tensor *output = (Tensor *) create4DTensorInternal(0, 0, batch_size, num_filters, - output_height, output_width); - float *output_data = (float *)output->host_data; +} - for (int b = 0; b < batch_size; b++) { - for (int f = 0; f < num_filters; f++) { - int ii = 0, jj = 0; - for (int r = center_y; r < image_height + vertical_pad - y_radius; - r += vertical_stride) { - for (int c = center_x; c < image_width + horizontal_pad - x_radius; - c += horizontal_stride) { - - float sum = 0; - for (int ch = 0; ch < channels; ch++) { - for (int i = r - y_radius, ki = 0; ki < kernel_height; i++, ki++) { - for (int j = c - x_radius, kj = 0; kj < kernel_width; j++, kj++) { - if (i >= 0 && j >= 0 && i < image_height && j < image_width) { - sum += image[b * (channels * image_height * image_width) + - ch * (image_height * image_width) + - i * image_width + j] * - kernels[f * (channels * kernel_height * kernel_width) + - ch * (kernel_height * kernel_width) + - ki * kernel_width + 
kj]; - } - } - } - } - output_data[b * (num_filters * output_height * output_width) + - f * (output_height * output_width) + ii * output_width + - jj] = sum; - jj++; - if (jj == output_width) { - jj = 0; - ii++; - } - } - } - } + void *tensorReluCPU(void *input_ptr) { + Tensor *input = (Tensor *)input_ptr; + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + + #pragma omp simd + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = (input_data[i] < 0) ? 0 : input_data[i]; } - return output; - } + return input; +} +void *tensorRelu2CPU(void *input_ptr, float min, float max) { + Tensor *input = (Tensor *)input_ptr; + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + + #pragma omp simd + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = (input_data[i] < min) ? min : ((input_data[i] > max) ? + max : input_data[i]); + } -} + return input; +}
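For reference, a minimal usage sketch of the new entry points follows. It is not part of the patch: it assumes tensor_cpu.h / tensor_cpu_runtime.h are on the include path, that freeBatchMemory() is visible to callers, and the shapes and knob values are illustrative only. Passing row = col = skip_every = 1 selects the dense tensorRegularConvolutionCPU path; skip_every > 1 switches to filter sampling (the regular kernel when kernel_height * kernel_width is divisible by skip_every, the irregular one otherwise); row > 1 or col > 1 selects the perforated paths that skip output rows or columns and interpolate them back.

// usage_sketch.cc -- illustrative only; not part of the patch.
#include <vector>

#include "tensor_cpu.h"
#include "tensor_cpu_runtime.h"

int main() {
  // NCHW float tensors (data_type 0 = float, data_format 0 = NCHW).
  void *input  = create4DTensor(0, 0, 8, 3, 32, 32);   // batch 8, 3 channels, 32x32
  void *filter = create4DTensor(0, 0, 16, 3, 3, 3);    // 16 filters of shape 3x3x3
  void *bias   = create4DTensor(0, 0, 1, 16, 1, 1);    // one bias value per filter

  std::vector<float> image(8 * 3 * 32 * 32, 1.0f);
  std::vector<float> weights(16 * 3 * 3 * 3, 0.5f);
  std::vector<float> biases(16, 0.1f);
  initTensorData(input,  image.data(),   image.size()   * sizeof(float));
  initTensorData(filter, weights.data(), weights.size() * sizeof(float));
  initTensorData(bias,   biases.data(),  biases.size()  * sizeof(float));

  // row = col = skip_every = 1, start = 0: dense path (tensorRegularConvolutionCPU).
  // skip_every = 2 would sample filter elements instead; row = 2 or col = 2 would
  // perforate output rows/columns and interpolate the skipped ones.
  void *conv = tensorConvolutionCPU(input, filter,
                                    /*vertical_pad=*/1, /*horizontal_pad=*/1,
                                    /*vertical_stride=*/1, /*horizontal_stride=*/1,
                                    /*conv_mode=*/1, /*compute_precision=*/0,
                                    /*row=*/1, /*col=*/1, /*skip_every=*/1, /*start=*/0);

  void *biased = tensorAddCPU(conv, bias);   // broadcasts the per-filter bias
  void *relu   = tensorReluCPU(biased);
  void *pooled = tensorPoolingCPU(relu, /*poolFunction=*/0 /*max*/, 2, 2, 0, 0, 2, 2);
  (void)pooled;

  // Every allocation made with freeMemory = true (the default) was recorded in
  // PtrVect, so one call releases the whole batch.
  freeBatchMemory();
  return 0;
}

With row = 2, for example, roughly every second output row after `start` is skipped during the im2col and GEMM work and later reconstructed by averaging the two adjacent computed rows, trading a small accuracy loss for reduced compute; the column-perforated and filter-sampling paths make the analogous trade along the width and filter-element dimensions.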