Skip to content
Snippets Groups Projects
Commit f84274d5 authored by Hashim Sharif's avatar Hashim Sharif
Browse files

Adding necessary deviceToHostCopy(tensor) calls for CPU tensor RT routines

parent cb200682
No related branches found
No related tags found
No related merge requests found
......@@ -38,6 +38,7 @@
#include "tensor_runtime.h"
#include "tensor_cpu_runtime.h"
#include "approx_api.h"
#include "tensor_utils.h"
void llvm_hpvm_initTensorRtCPU() {
......@@ -56,6 +57,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
int vertical_pad, int horizontal_pad,
int vertical_stride, int horizontal_stride,
int conv_mode, int compute_precision) {
Tensor *input = (Tensor *)input_ptr;
Tensor *filter = (Tensor *)filter_ptr;
......@@ -135,12 +137,14 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
return output;
}
void *tensorRegularFilterSamplingConvolutionCPU(
void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
int vertical_stride, int horizontal_stride, int conv_mode,
int compute_precision, int skip_every, int start) {
Tensor *input = (Tensor *)input_ptr;
Tensor *filter = (Tensor *)filter_ptr;
void *tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
int vertical_pad, int horizontal_pad,
int vertical_stride, int horizontal_stride,
int conv_mode, int compute_precision,
int skip_every, int start) {
Tensor *input = (Tensor *) input_ptr;
Tensor *filter = (Tensor *) filter_ptr;
float *__restrict__ host_image = (float *)input->host_data;
float *__restrict__ host_filter = (float *)filter->host_data;
......@@ -259,10 +263,12 @@ void *tensorRegularFilterSamplingConvolutionCPU(
return output;
}
void *tensorIrregularFilterSamplingConvolutionCPU(
void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
int vertical_stride, int horizontal_stride, int conv_mode,
int compute_precision, int skip_every, int start) {
void *tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
int vertical_pad, int horizontal_pad,
int vertical_stride, int horizontal_stride,
int conv_mode, int compute_precision,
int skip_every, int start) {
Tensor *input = (Tensor *)input_ptr;
Tensor *filter = (Tensor *)filter_ptr;
......@@ -665,20 +671,24 @@ void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
int horizontal_stride, int conv_mode,
int compute_precision, int row, int col,
int skip_every, int start) {
Tensor *input = (Tensor *) input_ptr;
Tensor *filter = (Tensor *) filter_ptr;
deviceToHostCopy(input);
deviceToHostCopy(filter);
if (row > 1) {
//printf("ROW PERFORATION\n");
return tensorRowPerfConvolutionCPU(
input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
horizontal_stride, conv_mode, compute_precision, row, start);
}
if (col > 1) {
//printf("COL PERFORATION\n");
return tensorColPerfConvolutionCPU(
input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
horizontal_stride, conv_mode, compute_precision, col, start);
}
if (skip_every > 1) {
//printf("INPUT FILTERING\n");
Tensor *filter = (Tensor *)filter_ptr;
const int kernel_height = filter->dims.dim_sizes[2];
......@@ -707,6 +717,9 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
Tensor *input = (Tensor *)input_ptr;
Tensor *filter = (Tensor *)filter_ptr;
deviceToHostCopy(input);
deviceToHostCopy(filter);
float *__restrict__ host_image = (float *)input->host_data;
float *__restrict__ host_filter = (float *)filter->host_data;
......@@ -783,8 +796,12 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
}
void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
Tensor *x = (Tensor *)x_ptr;
Tensor *bias = (Tensor *)bias_ptr;
Tensor *x = (Tensor *) x_ptr;
Tensor *bias = (Tensor *) bias_ptr;
deviceToHostCopy(x);
deviceToHostCopy(bias);
float *__restrict__ x_data = (float *)x->host_data;
float *__restrict__ bias_data = (float *)bias->host_data;
......@@ -793,6 +810,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
int h = x->dims.dim_sizes[2];
int w = x->dims.dim_sizes[3];
if (x->num_elems == bias->num_elems) {
int const1 = c * h * w;
int const2 = h * w;
......@@ -824,6 +842,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
}
}
return x;
}
......@@ -835,6 +854,8 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
int vertical_stride, int horizontal_stride) {
Tensor *input = (Tensor *)input_ptr;
deviceToHostCopy(input);
float *__restrict__ input_data = (float *)input->host_data;
int batch_size = input->dims.dim_sizes[0];
......@@ -929,13 +950,15 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
}
void *tensorTanhCPU(void *input_ptr) {
Tensor *input = (Tensor *)input_ptr;
Tensor *input = (Tensor *)input_ptr;
deviceToHostCopy(input);
float *input_data = (float *)input->host_data;
size_t num_elems = input->num_elems;
omp_set_num_threads(4);
#pragma omp parallel for
#pragma omp parallel for
for (size_t i = 0; i < num_elems; i++) {
input_data[i] = tanhf(input_data[i]);
}
......@@ -944,9 +967,13 @@ void *tensorTanhCPU(void *input_ptr) {
}
void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
Tensor *lhs = (Tensor *)lhs_ptr;
Tensor *rhs = (Tensor *)rhs_ptr;
deviceToHostCopy(lhs);
deviceToHostCopy(rhs);
int m = lhs->dims.dim_sizes[0];
int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
......@@ -963,6 +990,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
}
float *tran_rhs = (float *)malloc(sizeof(float) * k * n);
omp_set_num_threads(4);
#pragma omp parallel for simd
for (int l = 0; l < k; l++) {
for (int j = 0; j < n; j++) {
......@@ -986,8 +1014,11 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
}
void *tensorSoftmaxCPU(void *input_ptr) {
Tensor *input = (Tensor *)input_ptr;
deviceToHostCopy(input);
float *logits = (float *)input->host_data;
int n = input->dims.dim_sizes[0];
int c = input->dims.dim_sizes[1];
......@@ -1016,12 +1047,19 @@ void *tensorSoftmaxCPU(void *input_ptr) {
void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
void *mean_ptr, void *variance_ptr, double epsilon) {
Tensor *input = (Tensor *)input_ptr;
Tensor *gamma = (Tensor *)gamma_ptr;
Tensor *beta = (Tensor *)beta_ptr;
Tensor *mean = (Tensor *)mean_ptr;
Tensor *variance = (Tensor *)variance_ptr;
Tensor *input = (Tensor *) input_ptr;
Tensor *gamma = (Tensor *) gamma_ptr;
Tensor *beta = (Tensor *) beta_ptr;
Tensor *mean = (Tensor *) mean_ptr;
Tensor *variance = (Tensor *) variance_ptr;
deviceToHostCopy(input);
deviceToHostCopy(gamma);
deviceToHostCopy(beta);
deviceToHostCopy(mean);
deviceToHostCopy(variance);
float *__restrict__ host_image = (float *)input->host_data;
float *__restrict__ host_beta = (float *)beta->host_data;
float *__restrict__ host_gamma = (float *)gamma->host_data;
......@@ -1055,7 +1093,10 @@ void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
}
void *tensorReluCPU(void *input_ptr) {
Tensor *input = (Tensor *)input_ptr;
deviceToHostCopy(input);
float *input_data = (float *)input->host_data;
size_t num_elems = input->num_elems;
......@@ -1068,7 +1109,10 @@ void *tensorReluCPU(void *input_ptr) {
}
void *tensorRelu2CPU(void *input_ptr, float min, float max) {
Tensor *input = (Tensor *)input_ptr;
deviceToHostCopy(input);
float *input_data = (float *)input->host_data;
size_t num_elems = input->num_elems;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment