Commit e6190395 authored by Akash Kothari

Get rid of create4DTensorCPU internal API

parent 8a92297d
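This commit drops the CPU-only tensor helpers (PtrVect/freeBatchMemory, getTypeSizeCPU, setSizeInBytesCPU, allocateMemCPU, initTensorDataCPU, and create4DTensorCPU) and points every call site at the shared create4DTensor API instead. A minimal sketch of the call-site change, with the argument order (data type, data format, then the four dimension sizes) taken from the call sites in the diff; the guess that create4DTensor comes from the newly included tensor_runtime.h is mine, not stated in the diff:

    // Before this commit: CPU-only helper, removed below
    Tensor *out = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
                                              output_height, output_width);

    // After this commit: shared runtime API (assumed to be declared in tensor_runtime.h)
    Tensor *out = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
                                           output_height, output_width);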
@@ -34,9 +34,11 @@
 #include <omp.h>
 
 // Tensor runtime header files
-//#include "tensor_cpu.h"
 #include "tensor.h"
+#include "tensor_runtime.h"
 #include "tensor_cpu_runtime.h"
+#include "approx_api.h"
 
 void llvm_hpvm_initTensorRtCPU() {
   // NOTE: Do Nothing
@@ -50,78 +52,6 @@ void hpvm_request_tensorCPU(void *tensor, int destination) {
   // NOTE: Do Nothing
 }
 
-std::vector<void *> PtrVect;
-
-void freeBatchMemory() {
-  for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
-    free(*it);
-  }
-  PtrVect.erase(PtrVect.begin(), PtrVect.end());
-}
-
-int getTypeSizeCPU(int data_type) __attribute__((always_inline));
-inline int getTypeSizeCPU(int data_type) {
-  return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1);
-}
-
-void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems)
-    __attribute__((always_inline));
-inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type,
-                              size_t num_elems) {
-  int type_size = getTypeSizeCPU(data_type);
-  size_t size_in_bytes = type_size * num_elems;
-  tensor->size_in_bytes = size_in_bytes;
-}
-
-void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems,
-                    bool freeMemory = true) __attribute__((always_inline));
-inline void allocateMemCPU(struct Tensor *tensor, int data_type,
-                           size_t num_elems, bool freeMemory) {
-  setSizeInBytesCPU(tensor, data_type, num_elems);
-  tensor->data_type = data_type;
-  tensor->num_elems = num_elems;
-  tensor->host_data =
-      (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
-  if (freeMemory)
-    PtrVect.push_back(tensor->host_data);
-}
-
-void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes)
-    __attribute__((always_inline));
-inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr,
-                              size_t size_in_bytes) {
-  Tensor *tensor = (Tensor *)tensor_ptr;
-  if (tensor->size_in_bytes != size_in_bytes) {
-    printf("The destination and source sizes don't match");
-  }
-  memcpy(tensor->host_data, data_ptr,
-         size_in_bytes); // Is this efficient enough?
-}
-
-void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
-                        size_t dim2_size, size_t dim3_size, size_t dim4_size,
-                        bool freeMemory = true) __attribute__((always_inline));
-inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
-                               size_t dim2_size, size_t dim3_size,
-                               size_t dim4_size, bool freeMemory) {
-  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
-  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
-  if (freeMemory)
-    PtrVect.push_back(tensor);
-  allocateMemCPU(tensor, data_type, num_elems, freeMemory);
-  // Setting the tensor dimensions
-  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
-  dim_sizes[0] = dim1_size;
-  dim_sizes[1] = dim2_size;
-  dim_sizes[2] = dim3_size;
-  dim_sizes[3] = dim4_size;
-  tensor->dims.dim_sizes = dim_sizes;
-  tensor->dims.num_dims = 4;
-  tensor->data_placement = HOST;
-  return tensor;
-}
-
 void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
                                   int vertical_pad, int horizontal_pad,
                                   int vertical_stride, int horizontal_stride,
@@ -146,7 +76,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
   int num_filter_elem = kernel_height * kernel_width * channels;
   int output_size = output_width * output_height;
   printf("--CREATE 4D TENSOR\n");
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
                                             output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
   printf("CREATED 4D TENSOR\n");
@@ -235,7 +165,7 @@ void *tensorRegularFilterSamplingConvolutionCPU(
       num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
   const int output_size = output_width * output_height;
 
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
                                             output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
@@ -359,7 +289,7 @@ void *tensorIrregularFilterSamplingConvolutionCPU(
       num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
   const int output_size = output_width * output_height;
 
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
                                             output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
@@ -478,7 +408,7 @@ void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
   int num_filter_elem = kernel_height * kernel_width * channels;
   int full_output_size = full_output_height * full_output_width;
 
-  Tensor *full_output = (Tensor *)create4DTensorCPU(
+  Tensor *full_output = (Tensor *)create4DTensor(
       0, 0, batch_size, num_filters, full_output_height, full_output_width);
   float *__restrict__ full_output_data = (float *)full_output->host_data;
@@ -619,7 +549,7 @@ void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
   int num_filter_elem = kernel_height * kernel_width * channels;
   int full_output_size = full_output_height * full_output_width;
 
-  Tensor *full_output = (Tensor *)create4DTensorCPU(
+  Tensor *full_output = (Tensor *)create4DTensor(
       0, 0, batch_size, num_filters, full_output_height, full_output_width);
   float *__restrict__ full_output_data = (float *)full_output->host_data;
@@ -785,7 +715,6 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
   const int channels = input->dims.dim_sizes[1];
   const int image_height = input->dims.dim_sizes[2];
   const int image_width = input->dims.dim_sizes[3];
-  const int num_filters = filter->dims.dim_sizes[0];
   const int kernel_height = filter->dims.dim_sizes[2];
   const int kernel_width = filter->dims.dim_sizes[3];
   const int output_height =
@@ -797,8 +726,8 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
   const int num_filter_elem = filter_dim * channels;
   const int output_size = output_width * output_height;
 
-  Tensor *output = (Tensor *)create4DTensorCPU(
-      0, 0, batch_size, num_filters, channels, output_height * output_width);
+  Tensor *output = (Tensor *)create4DTensor(
+      0, 0, batch_size, channels, output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
   const long int conv_data_size = sizeof(float) * num_filter_elem *
@@ -836,22 +765,18 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
       }
     }
   }
-    for (int p = 0; p < num_filters; ++p) {
-      for (int m = 0; m < output_size; ++m) {
-        for (int ch = 0; ch < channels; ch++) {
-          float sum = 0;
-#pragma omp simd reduction(+ : sum)
-          for (int k = 0; k < filter_dim; ++k) {
-            int input_index = k + ch * filter_dim + num_filter_elem * m +
-                              b * num_filter_elem * output_size;
-            sum += host_data[input_index] *
-                   host_filter[p * num_filter_elem + ch * filter_dim + k];
-          }
-          output_data[b * (output_size * num_filters * channels) +
-                      p * output_size * channels + ch * output_size + m] = sum;
-        }
-      }
-    }
+    for (int m = 0; m < output_size; ++m) {
+      for (int ch = 0; ch < channels; ch++) {
+        float sum = 0;
+#pragma omp simd reduction(+ : sum)
+        for (int k = 0; k < filter_dim; ++k) {
+          int input_index = k + ch * filter_dim + num_filter_elem * m +
+                            b * num_filter_elem * output_size;
+          sum += host_data[input_index] * host_filter[ch * filter_dim + k];
+        }
+        output_data[b * (output_size * channels) + ch * output_size + m] = sum;
+      }
+    }
   }
   free(host_data);
@@ -928,7 +853,7 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
   int x_radius = (window_width - 1) / 2;
   int y_radius = (window_height - 1) / 2;
 
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels,
+  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels,
                                             output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
@@ -1026,7 +951,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
   int m = lhs->dims.dim_sizes[0];
   int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
 
-  Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
+  Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
   float *__restrict__ lhs_arr = (float *)lhs->host_data;
   float *__restrict__ rhs_arr = (float *)rhs->host_data;