Skip to content
Snippets Groups Projects
Commit 37ae3dca authored by Yifan Zhao's avatar Yifan Zhao
Browse files

Merge remote-tracking branch 'origin/approx_hpvm_reorg_akash' into approx_hpvm_reorg

parents 6caf9a1c e6190395
No related branches found
No related tags found
No related merge requests found
......@@ -34,9 +34,11 @@
#include <omp.h>
// Tensor runtime header files
//#include "tensor_cpu.h"
#include "tensor.h"
#include "tensor_runtime.h"
#include "tensor_cpu_runtime.h"
#include "approx_api.h"
void llvm_hpvm_initTensorRtCPU() {
// NOTE: Do Nothing
......@@ -50,78 +52,6 @@ void hpvm_request_tensorCPU(void *tensor, int destination) {
// NOTE: Do Nothing
}
// Registry of heap pointers allocated for the current batch; everything
// pushed here is released in one shot by freeBatchMemory().
std::vector<void *> PtrVect;

// Frees every pointer registered in PtrVect, in reverse order of
// registration (so allocations pushed later — e.g. a tensor's data buffer —
// are released before the allocations pushed earlier that own them),
// then empties the registry for the next batch.
void freeBatchMemory() {
  for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) {
    free(*it);
  }
  // clear() is the idiomatic equivalent of erase(begin(), end()).
  PtrVect.clear();
}
int getTypeSizeCPU(int data_type) __attribute__((always_inline));
// Maps a tensor element-type code to its element size in bytes:
// code 0 -> 4 bytes, code 1 -> 2 bytes, any other code -> 1 byte.
inline int getTypeSizeCPU(int data_type) {
  if (data_type == 0)
    return 4;
  if (data_type == 1)
    return 2;
  return 1;
}
void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems)
    __attribute__((always_inline));
// Computes and stores the total byte size of `tensor` from its element-type
// code (see getTypeSizeCPU) and its element count.
inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type,
                              size_t num_elems) {
  const size_t elem_size = (size_t)getTypeSizeCPU(data_type);
  tensor->size_in_bytes = elem_size * num_elems;
}
void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems,
                    bool freeMemory = true) __attribute__((always_inline));
// Allocates host storage for `tensor`: records the element type and count,
// computes size_in_bytes, and mallocs the data buffer. When `freeMemory` is
// true the buffer is registered in PtrVect so freeBatchMemory() releases it.
inline void allocateMemCPU(struct Tensor *tensor, int data_type,
                           size_t num_elems, bool freeMemory) {
  setSizeInBytesCPU(tensor, data_type, num_elems);
  tensor->data_type = data_type;
  tensor->num_elems = num_elems;
  tensor->host_data =
      (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host
  // Check the allocation before handing the buffer to callers (the original
  // left host_data unchecked; a failed malloc would surface later as a
  // NULL-dereference inside a compute kernel).
  if (tensor->host_data == NULL) {
    printf("ERROR: allocateMemCPU failed to allocate %zu bytes\n",
           tensor->size_in_bytes);
    return;
  }
  if (freeMemory)
    PtrVect.push_back(tensor->host_data);
}
void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes)
    __attribute__((always_inline));
// Copies `size_in_bytes` bytes from `data_ptr` into the tensor's host
// buffer. On a size mismatch it warns and clamps the copy to the
// destination's capacity, so the memcpy can never write past host_data.
inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr,
                              size_t size_in_bytes) {
  Tensor *tensor = (Tensor *)tensor_ptr;
  size_t copy_bytes = size_in_bytes;
  if (tensor->size_in_bytes != size_in_bytes) {
    printf("The destination and source sizes don't match");
    // BUG FIX: the original copied the full source size even when it
    // exceeded the destination buffer — an out-of-bounds write.
    if (tensor->size_in_bytes < copy_bytes)
      copy_bytes = tensor->size_in_bytes;
  }
  memcpy(tensor->host_data, data_ptr,
         copy_bytes); // Is this efficient enough?
}
void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
                        size_t dim2_size, size_t dim3_size, size_t dim4_size,
                        bool freeMemory = true) __attribute__((always_inline));
// Creates a 4-D host tensor with the given element-type code and dimension
// sizes, allocating both the Tensor struct and its data buffer. When
// `freeMemory` is true every allocation is registered in PtrVect so that
// freeBatchMemory() reclaims it. Returns the Tensor as an opaque void*
// (NULL if the struct allocation fails).
// NOTE(review): `data_format` is accepted but never read here — presumably
// reserved for NCHW/NHWC selection; confirm against callers.
inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size,
                               size_t dim2_size, size_t dim3_size,
                               size_t dim4_size, bool freeMemory) {
  struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
  if (tensor == NULL) {
    printf("ERROR: create4DTensorCPU failed to allocate Tensor struct\n");
    return NULL;
  }
  size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
  if (freeMemory)
    PtrVect.push_back(tensor);
  allocateMemCPU(tensor, data_type, num_elems, freeMemory);

  // Setting the tensor dimensions
  size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4);
  if (dim_sizes == NULL) {
    printf("ERROR: create4DTensorCPU failed to allocate dim_sizes\n");
    return NULL;
  }
  // BUG FIX: the original never registered dim_sizes in PtrVect, so it
  // leaked across freeBatchMemory(). Registered last, it is freed first
  // by the reverse-order sweep, before host_data and the struct itself.
  if (freeMemory)
    PtrVect.push_back(dim_sizes);
  dim_sizes[0] = dim1_size;
  dim_sizes[1] = dim2_size;
  dim_sizes[2] = dim3_size;
  dim_sizes[3] = dim4_size;
  tensor->dims.dim_sizes = dim_sizes;
  tensor->dims.num_dims = 4;
  tensor->data_placement = HOST;
  return tensor;
}
void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
int vertical_pad, int horizontal_pad,
int vertical_stride, int horizontal_stride,
......@@ -146,7 +76,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
int num_filter_elem = kernel_height * kernel_width * channels;
int output_size = output_width * output_height;
printf("--CREATE 4D TENSOR\n");
Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
output_height, output_width);
float *__restrict__ output_data = (float *)output->host_data;
printf("CREATED 4D TENSOR\n");
......@@ -235,7 +165,7 @@ void *tensorRegularFilterSamplingConvolutionCPU(
num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
const int output_size = output_width * output_height;
Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
output_height, output_width);
float *__restrict__ output_data = (float *)output->host_data;
......@@ -359,7 +289,7 @@ void *tensorIrregularFilterSamplingConvolutionCPU(
num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder;
const int output_size = output_width * output_height;
Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters,
Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
output_height, output_width);
float *__restrict__ output_data = (float *)output->host_data;
......@@ -478,7 +408,7 @@ void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
int num_filter_elem = kernel_height * kernel_width * channels;
int full_output_size = full_output_height * full_output_width;
Tensor *full_output = (Tensor *)create4DTensorCPU(
Tensor *full_output = (Tensor *)create4DTensor(
0, 0, batch_size, num_filters, full_output_height, full_output_width);
float *__restrict__ full_output_data = (float *)full_output->host_data;
......@@ -619,7 +549,7 @@ void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr,
int num_filter_elem = kernel_height * kernel_width * channels;
int full_output_size = full_output_height * full_output_width;
Tensor *full_output = (Tensor *)create4DTensorCPU(
Tensor *full_output = (Tensor *)create4DTensor(
0, 0, batch_size, num_filters, full_output_height, full_output_width);
float *__restrict__ full_output_data = (float *)full_output->host_data;
......@@ -785,7 +715,6 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
const int channels = input->dims.dim_sizes[1];
const int image_height = input->dims.dim_sizes[2];
const int image_width = input->dims.dim_sizes[3];
const int num_filters = filter->dims.dim_sizes[0];
const int kernel_height = filter->dims.dim_sizes[2];
const int kernel_width = filter->dims.dim_sizes[3];
const int output_height =
......@@ -797,8 +726,8 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
const int num_filter_elem = filter_dim * channels;
const int output_size = output_width * output_height;
Tensor *output = (Tensor *)create4DTensorCPU(
0, 0, batch_size, num_filters, channels, output_height * output_width);
Tensor *output = (Tensor *)create4DTensor(
0, 0, batch_size, channels, output_height, output_width);
float *__restrict__ output_data = (float *)output->host_data;
const long int conv_data_size = sizeof(float) * num_filter_elem *
......@@ -836,22 +765,18 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
}
}
}
for (int p = 0; p < num_filters; ++p) {
for (int m = 0; m < output_size; ++m) {
for (int m = 0; m < output_size; ++m) {
for (int ch = 0; ch < channels; ch++) {
float sum = 0;
#pragma omp simd reduction(+ : sum)
for (int k = 0; k < filter_dim; ++k) {
int input_index = k + ch * filter_dim + num_filter_elem * m +
b * num_filter_elem * output_size;
sum += host_data[input_index] *
host_filter[p * num_filter_elem + ch * filter_dim + k];
sum += host_data[input_index] * host_filter[ch * filter_dim + k];
}
output_data[b * (output_size * num_filters * channels) +
p * output_size * channels + ch * output_size + m] = sum;
output_data[b * (output_size * channels) + ch * output_size + m] = sum;
}
}
}
}
}
free(host_data);
......@@ -928,7 +853,7 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
int x_radius = (window_width - 1) / 2;
int y_radius = (window_height - 1) / 2;
Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels,
Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels,
output_height, output_width);
float *__restrict__ output_data = (float *)output->host_data;
......@@ -1026,7 +951,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
int m = lhs->dims.dim_sizes[0];
int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1);
Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
float *__restrict__ lhs_arr = (float *)lhs->host_data;
float *__restrict__ rhs_arr = (float *)rhs->host_data;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment