diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc index 24f8c7d370a2922ba117eb1ec573dfc6af4742cc..95c17752fd3fd0e441f2c6335591eeb741105db9 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc @@ -38,6 +38,7 @@ #include "tensor_runtime.h" #include "tensor_cpu_runtime.h" #include "approx_api.h" +#include "tensor_utils.h" void llvm_hpvm_initTensorRtCPU() { @@ -56,6 +57,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, int conv_mode, int compute_precision) { + Tensor *input = (Tensor *)input_ptr; Tensor *filter = (Tensor *)filter_ptr; @@ -135,12 +137,14 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, return output; } -void *tensorRegularFilterSamplingConvolutionCPU( - void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, int conv_mode, - int compute_precision, int skip_every, int start) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; +void *tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int skip_every, int start) { + + Tensor *input = (Tensor *) input_ptr; + Tensor *filter = (Tensor *) filter_ptr; float *__restrict__ host_image = (float *)input->host_data; float *__restrict__ host_filter = (float *)filter->host_data; @@ -259,10 +263,12 @@ void *tensorRegularFilterSamplingConvolutionCPU( return output; } -void *tensorIrregularFilterSamplingConvolutionCPU( - void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, int conv_mode, - int compute_precision, int skip_every, int start) { +void *tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, + int skip_every, int start) { + Tensor *input = (Tensor *)input_ptr; Tensor *filter = (Tensor *)filter_ptr; @@ -665,20 +671,24 @@ void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_stride, int conv_mode, int compute_precision, int row, int col, int skip_every, int start) { + + Tensor *input = (Tensor *) input_ptr; + Tensor *filter = (Tensor *) filter_ptr; + + deviceToHostCopy(input); + deviceToHostCopy(filter); + if (row > 1) { - //printf("ROW PERFORATION\n"); return tensorRowPerfConvolutionCPU( input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, conv_mode, compute_precision, row, start); } if (col > 1) { - //printf("COL PERFORATION\n"); return tensorColPerfConvolutionCPU( input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, horizontal_stride, conv_mode, compute_precision, col, start); } if (skip_every > 1) { - //printf("INPUT FILTERING\n"); Tensor *filter = (Tensor *)filter_ptr; const int kernel_height = filter->dims.dim_sizes[2]; @@ -707,6 +717,9 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, Tensor *input = (Tensor *)input_ptr; Tensor *filter = (Tensor *)filter_ptr; + deviceToHostCopy(input); + deviceToHostCopy(filter); + float *__restrict__ host_image = (float *)input->host_data; float *__restrict__ host_filter = (float *)filter->host_data; @@ -783,8 +796,12 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, } void *tensorAddCPU(void *x_ptr, void *bias_ptr) { - Tensor *x = (Tensor *)x_ptr; - Tensor *bias = (Tensor *)bias_ptr; + + Tensor *x = (Tensor *) x_ptr; + Tensor *bias = (Tensor *) bias_ptr; + + deviceToHostCopy(x); + deviceToHostCopy(bias); float *__restrict__ x_data = (float *)x->host_data; float *__restrict__ bias_data = (float *)bias->host_data; @@ -793,6 +810,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) { int h = x->dims.dim_sizes[2]; int w = x->dims.dim_sizes[3]; + if (x->num_elems == bias->num_elems) { int const1 = c * h * w; int const2 = h * w; @@ -824,6 +842,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) { } } + return x; } @@ -835,6 +854,8 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, int vertical_stride, int horizontal_stride) { Tensor *input = (Tensor *)input_ptr; + deviceToHostCopy(input); + float *__restrict__ input_data = (float *)input->host_data; int batch_size = input->dims.dim_sizes[0]; @@ -929,13 +950,15 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, } void *tensorTanhCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; + Tensor *input = (Tensor *)input_ptr; + deviceToHostCopy(input); + float *input_data = (float *)input->host_data; size_t num_elems = input->num_elems; omp_set_num_threads(4); -#pragma omp parallel for + #pragma omp parallel for for (size_t i = 0; i < num_elems; i++) { input_data[i] = tanhf(input_data[i]); } @@ -944,9 +967,13 @@ void *tensorTanhCPU(void *input_ptr) { } void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { + Tensor *lhs = (Tensor *)lhs_ptr; Tensor *rhs = (Tensor *)rhs_ptr; + deviceToHostCopy(lhs); + deviceToHostCopy(rhs); + int m = lhs->dims.dim_sizes[0]; int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons @@ -963,6 +990,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { } float *tran_rhs = (float *)malloc(sizeof(float) * k * n); omp_set_num_threads(4); + #pragma omp parallel for simd for (int l = 0; l < k; l++) { for (int j = 0; j < n; j++) { @@ -986,8 +1014,11 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { } void *tensorSoftmaxCPU(void *input_ptr) { + Tensor *input = (Tensor *)input_ptr; + deviceToHostCopy(input); + float *logits = (float *)input->host_data; int n = input->dims.dim_sizes[0]; int c = input->dims.dim_sizes[1]; @@ -1016,12 +1047,19 @@ void *tensorSoftmaxCPU(void *input_ptr) { void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr, void *variance_ptr, double epsilon) { - Tensor *input = (Tensor *)input_ptr; - Tensor *gamma = (Tensor *)gamma_ptr; - Tensor *beta = (Tensor *)beta_ptr; - Tensor *mean = (Tensor *)mean_ptr; - Tensor *variance = (Tensor *)variance_ptr; - + Tensor *input = (Tensor *) input_ptr; + Tensor *gamma = (Tensor *) gamma_ptr; + Tensor *beta = (Tensor *) beta_ptr; + Tensor *mean = (Tensor *) mean_ptr; + Tensor *variance = (Tensor *) variance_ptr; + + deviceToHostCopy(input); + deviceToHostCopy(gamma); + deviceToHostCopy(beta); + deviceToHostCopy(mean); + deviceToHostCopy(variance); + + float *__restrict__ host_image = (float *)input->host_data; float *__restrict__ host_beta = (float *)beta->host_data; float *__restrict__ host_gamma = (float *)gamma->host_data; @@ -1055,7 +1093,10 @@ void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr, } void *tensorReluCPU(void *input_ptr) { + Tensor *input = (Tensor *)input_ptr; + deviceToHostCopy(input); + float *input_data = (float *)input->host_data; size_t num_elems = input->num_elems; @@ -1068,7 +1109,10 @@ void *tensorReluCPU(void *input_ptr) { } void *tensorRelu2CPU(void *input_ptr, float min, float max) { + Tensor *input = (Tensor *)input_ptr; + deviceToHostCopy(input); + float *input_data = (float *)input->host_data; size_t num_elems = input->num_elems;