diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc index d3a037ea152c279c1528d8a4ec24d32f28df64f9..89148bd5e05e350c29ac49c2eed0a5b93696d38a 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc @@ -34,9 +34,11 @@ #include <omp.h> // Tensor runtime header files -//#include "tensor_cpu.h" #include "tensor.h" +#include "tensor_runtime.h" #include "tensor_cpu_runtime.h" +#include "approx_api.h" + void llvm_hpvm_initTensorRtCPU() { // NOTE: Do Nothing @@ -50,78 +52,6 @@ void hpvm_request_tensorCPU(void *tensor, int destination) { // NOTE: Do Nothing } -std::vector<void *> PtrVect; - -void freeBatchMemory() { - for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) { - free(*it); - } - PtrVect.erase(PtrVect.begin(), PtrVect.end()); -} - -int getTypeSizeCPU(int data_type) __attribute__((always_inline)); -inline int getTypeSizeCPU(int data_type) { - return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1); -} - -void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) - __attribute__((always_inline)); -inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type, - size_t num_elems) { - int type_size = getTypeSizeCPU(data_type); - size_t size_in_bytes = type_size * num_elems; - tensor->size_in_bytes = size_in_bytes; -} - -void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, - bool freeMemory = true) __attribute__((always_inline)); -inline void allocateMemCPU(struct Tensor *tensor, int data_type, - size_t num_elems, bool freeMemory) { - setSizeInBytesCPU(tensor, data_type, num_elems); - tensor->data_type = data_type; - tensor->num_elems = num_elems; - tensor->host_data = - (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host - if (freeMemory) - PtrVect.push_back(tensor->host_data); -} - -void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) - __attribute__((always_inline)); -inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr, - size_t size_in_bytes) { - Tensor *tensor = (Tensor *)tensor_ptr; - if (tensor->size_in_bytes != size_in_bytes) { - printf("The destination and source sizes don't match"); - } - memcpy(tensor->host_data, data_ptr, - size_in_bytes); // Is this efficient enough? -} - -void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size, - bool freeMemory = true) __attribute__((always_inline)); -inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, - size_t dim4_size, bool freeMemory) { - struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - if (freeMemory) - PtrVect.push_back(tensor); - allocateMemCPU(tensor, data_type, num_elems, freeMemory); - - // Setting the tensor dimensions - size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - dim_sizes[2] = dim3_size; - dim_sizes[3] = dim4_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 4; - tensor->data_placement = HOST; - return tensor; -} - void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, @@ -146,7 +76,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int num_filter_elem = kernel_height * kernel_width * channels; int output_size = output_width * output_height; printf("--CREATE 4D TENSOR\n"); - Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters, output_height, output_width); float *__restrict__ output_data = (float *)output->host_data; printf("CREATED 4D TENSOR\n"); @@ -235,7 +165,7 @@ void *tensorRegularFilterSamplingConvolutionCPU( num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; const int output_size = output_width * output_height; - Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters, output_height, output_width); float *__restrict__ output_data = (float *)output->host_data; @@ -359,7 +289,7 @@ void *tensorIrregularFilterSamplingConvolutionCPU( num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; const int output_size = output_width * output_height; - Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters, output_height, output_width); float *__restrict__ output_data = (float *)output->host_data; @@ -478,7 +408,7 @@ void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int num_filter_elem = kernel_height * kernel_width * channels; int full_output_size = full_output_height * full_output_width; - Tensor *full_output = (Tensor *)create4DTensorCPU( + Tensor *full_output = (Tensor *)create4DTensor( 0, 0, batch_size, num_filters, full_output_height, full_output_width); float *__restrict__ full_output_data = (float *)full_output->host_data; @@ -619,7 +549,7 @@ void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int num_filter_elem = kernel_height * kernel_width * channels; int full_output_size = full_output_height * full_output_width; - Tensor *full_output = (Tensor *)create4DTensorCPU( + Tensor *full_output = (Tensor *)create4DTensor( 0, 0, batch_size, num_filters, full_output_height, full_output_width); float *__restrict__ full_output_data = (float *)full_output->host_data; @@ -785,7 +715,6 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, const int channels = input->dims.dim_sizes[1]; const int image_height = input->dims.dim_sizes[2]; const int image_width = input->dims.dim_sizes[3]; - const int num_filters = filter->dims.dim_sizes[0]; const int kernel_height = filter->dims.dim_sizes[2]; const int kernel_width = filter->dims.dim_sizes[3]; const int output_height = @@ -797,8 +726,8 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, const int num_filter_elem = filter_dim * channels; const int output_size = output_width * output_height; - Tensor *output = (Tensor *)create4DTensorCPU( - 0, 0, batch_size, num_filters, channels, output_height * output_width); + Tensor *output = (Tensor *)create4DTensor( + 0, 0, batch_size, channels, output_height, output_width); float *__restrict__ output_data = (float *)output->host_data; const long int conv_data_size = sizeof(float) * num_filter_elem * @@ -836,22 +765,18 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, } } } - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { + for (int m = 0; m < output_size; ++m) { for (int ch = 0; ch < channels; ch++) { float sum = 0; #pragma omp simd reduction(+ : sum) for (int k = 0; k < filter_dim; ++k) { int input_index = k + ch * filter_dim + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * - host_filter[p * num_filter_elem + ch * filter_dim + k]; + sum += host_data[input_index] * host_filter[ch * filter_dim + k]; } - output_data[b * (output_size * num_filters * channels) + - p * output_size * channels + ch * output_size + m] = sum; + output_data[b * (output_size * channels) + ch * output_size + m] = sum; } - } - } + } } free(host_data); @@ -928,7 +853,7 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, int x_radius = (window_width - 1) / 2; int y_radius = (window_height - 1) / 2; - Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels, + Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels, output_height, output_width); float *__restrict__ output_data = (float *)output->host_data; @@ -1026,7 +951,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { int m = lhs->dims.dim_sizes[0]; int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons - Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1); + Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1); float *__restrict__ lhs_arr = (float *)lhs->host_data; float *__restrict__ rhs_arr = (float *)rhs->host_data;