diff --git a/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp b/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
index c0dbd3899b0f6f31e0cb3d58a501aef7771b8281..1aec81d69ba89167489b3d7be161e8baa79b3bb0 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_WrapperAPI/DFG2LLVM_WrapperAPI.cpp
@@ -1243,8 +1243,7 @@ private:
   // HPVM Runtime API and Tensor runtime API
   FunctionCallee llvm_hpvm_initApproxhpvmRt;
   FunctionCallee llvm_hpvm_cleanupApproxhpvmRt;
-  FunctionCallee hpvm_request_tensor;
-
+
   FunctionCallee llvm_hpvm_initializeRuntimeController;
   FunctionCallee llvm_hpvm_clearRuntimeController;
@@ -1286,11 +1285,9 @@ void CGT_WrapperAPI::initRuntimeAPI() {
   // Get or insert Global declarations for
   // - initialization
   // - cleanup
-  // - request a tensor
   DECLARE(llvm_hpvm_initApproxhpvmRt);
   DECLARE(llvm_hpvm_cleanupApproxhpvmRt);
-  DECLARE(hpvm_request_tensor);
-
+
   DECLARE(llvm_hpvm_initializeRuntimeController);
   DECLARE(llvm_hpvm_clearRuntimeController);
@@ -1403,15 +1400,6 @@ void CGT_WrapperAPI::codeGen(DFLeafNode *N) {
   ConstantInt *TargetDeviceID =
       ConstantInt::get(Type::getInt32Ty(M.getContext()), 1);
 
-  for (Function::arg_iterator ai = F_wrapper_api->arg_begin(),
-                              ae = F_wrapper_api->arg_end();
-       ai != ae; ++ai) {
-    Argument *Arg = &*ai;
-    if (Arg->getType()->isPointerTy()) {
-      Value *Args[] = {Arg, TargetDeviceID};
-      CallInst::Create(hpvm_request_tensor, ArrayRef<Value *>(Args, 2), "", FI);
-    }
-  }
 
   CodeGenStateMachine CGM(&M, runtimeModule.get());
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
index c318a8fb6aba604282cf709d09b6a6ef1a771f0e..8dfa287afc0b33c654931d038d6743241f72a5ed 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h
@@ -111,9 +111,10 @@ void *handleTensorConvApproximationTuples_CPU(
   }
   case CPUNodeConfiguration::APPROX::PERFORATION: {
     PerfParams params = perfParamSet->getPerfParams(param);
-    INFO("perforation param = %i\n", param);
-    INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-         params.row, params.col, params.skip_offset);
+
+    DEBUG("perforation param = %i\n", param);
+    DEBUG("params.row = %i, params.col = %i, params.skip_offset = %i\n", params.row, params.col, params.skip_offset);
+
     void *t_out;
     RC->resume_profiler();
     t_out = tensorConvApproxCPU(
@@ -131,9 +132,10 @@ void *handleTensorConvApproximationTuples_CPU(
   }
   case CPUNodeConfiguration::APPROX::INPUT_SAMPLING: {
     SampParams params = sampParamSet->getSampParams(param);
-    INFO("sampling param = %i\n", param);
-    INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
-         params.skip_offset);
+
+    DEBUG("sampling param = %i\n", param);
+    DEBUG("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate, params.skip_offset);
+
     void *t_out;
     RC->resume_profiler();
     t_out = tensorConvApproxCPU(input, filter, conv_pad_h, conv_pad_w,
@@ -536,9 +538,10 @@ void *handleTensorConvApproximationTuples(
   case GPUNodeConfiguration::APPROX::PERFORATION:
   case GPUNodeConfiguration::APPROX::PERFORATION_HP: {
     PerfParams params = perfParamSet->getPerfParams(param);
-    INFO("perforation param = %i\n", param);
-    INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n",
-         params.row, params.col, params.skip_offset);
+
+    DEBUG("perforation param = %i\n", param);
+    DEBUG("params.row = %i, params.col = %i, params.skip_offset = %i\n", params.row, params.col, params.skip_offset);
+
     void *t_out;
     RC->resume_profiler();
     t_out = tensorConvApproxHalf2(
@@ -557,9 +560,11 @@ void *handleTensorConvApproximationTuples(
   case GPUNodeConfiguration::APPROX::INPUT_SAMPLING:
   case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP: {
     SampParams params = sampParamSet->getSampParams(param);
-    INFO("sampling param = %i\n", param);
-    INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
+
+    DEBUG("sampling param = %i\n", param);
+    DEBUG("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate,
          params.skip_offset);
+
     void *t_out;
     RC->resume_profiler();
     t_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w,
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
index 6e81c7a3fbfbe4cae3cd1c40f43a4c7d5ea2d7c8..4d89d38ad193164027c2e7fde78764df3cdd7a92 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor.h
@@ -33,14 +33,11 @@ struct Tensor {
   int data_type;
   int cur_type;
   int data_format;
-  data_location_t
-      data_placement; // Maintains the location of the tensor {host, device...}
-  cudnnTensorDescriptor_t tensor_desc;
-  cudnnFilterDescriptor_t
-      filter_desc; // FIXIT: Rethink if this should be in tensor struct
-  cudnnTensorDescriptor_t tensor_half_desc;
-  cudnnFilterDescriptor_t
-      filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
+  data_location_t data_placement; // Maintains the location of the tensor {host, device...}
+  cudnnTensorDescriptor_t tensor_desc;
+  cudnnFilterDescriptor_t filter_desc; // FIXIT: Rethink if this should be in tensor struct
+  cudnnTensorDescriptor_t tensor_half_desc;
+  cudnnFilterDescriptor_t filter_half_desc; // FIXIT: Rethink if this should be in tensor struct
   void *host_data;
   void *gpu_data;      // Pointer to GPU FP32 data
   void *gpu_half_data; // Pointer to GPU FP16 data
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
index d4961d19b9326daa4571d066dfe2b3177f6a78d4..698ab026dac0324cb456f69b4398111ac708412d 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensorUtils.h
@@ -43,13 +43,13 @@ void dumpWeightsToFile(char *file_name, void *weights_ptr) {
     abort();
   }
 
-  // printf("size_in_bytes = %lu \n", weights->size_in_bytes);
   size_t bytes_written =
       fwrite(weights->host_data, 1, weights->size_in_bytes, fp);
-  // printf("bytes_written = %lu \n", bytes_written);
+
   fclose(fp);
 }
 
+
 void fillTensorWithOnes(void *tensor_ptr) {
 
   struct Tensor *tensor = (struct Tensor *)tensor_ptr;
@@ -201,8 +201,7 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
   long int size_in_bytes =
       type_size * dim1_size * dim2_size * dim3_size * dim4_size;
   float *tensor_data = (float *)malloc(sizeof(float) * num_elems);
-  printf("size_in_bytes = %lu \n", size_in_bytes);
-
+
   int file_header_size = 0;
 
   FILE *file = fopen(file_name, "rb");
@@ -214,21 +213,19 @@ struct Tensor *readTrainedWeights(const char *file_name, int data_type,
   fseek(file, file_header_size, SEEK_CUR); // Skipping the file header
   size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
 
-  // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes,
-  //        bytes_read);
-
   fclose(file);
 
   struct Tensor *weights = (struct Tensor *)create4DTensor(
       data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
   initTensorData(weights, tensor_data, size_in_bytes);
-  // compareValues(weights, tensor_data, num_elems);
+
   free(tensor_data);
 
   return weights;
 }
 
+
 struct Tensor *readInputBatch(const char *file_name, long data_type,
                               long start, long end, long dim2_size,
                               long dim3_size, long dim4_size) {
@@ -254,10 +251,8 @@ struct Tensor *readInputBatch(const char *file_name, long data_type,
 
   fclose(file);
 
-  // printf ("FIXED input BATCH read \n");
-
-  struct Tensor *weights = (struct Tensor *)create4DTensor(
-      data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
+  struct Tensor *weights = (struct Tensor *) create4DTensor(data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size);
 
   initTensorData(weights, tensor_data, size_in_bytes);
   free(tensor_data);
@@ -593,3 +588,4 @@ void dumpOutput(void *output_ptr, const char *file_name) {
 }
 
 #endif
+
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
index 6a3fcc12e014205aaf81e2cae0906ed6cfbff33e..e42b2cbc06d0c51050f8fa858aae349f35e7fa71 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu
@@ -218,7 +218,7 @@ void *tensorConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
                         int horizontal_pad, int vertical_stride,
                         int horizontal_stride, int conv_mode, int conv_groups) {
 
-  INFO("*** TensorConvolution \n");
+  //INFO("*** TensorGroupConvolution \n");
   profileEvent("Conv");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -290,7 +290,7 @@ void *tensorConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
     hostToDeviceCopy(input);
     hostToDeviceCopy(filter);
 
-    INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
+    DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride,
          horizontal_stride);
 
     checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
@@ -386,7 +386,7 @@ void *tensorHalfConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad,
                             int horizontal_stride, int conv_mode,
                             int conv_groups) {
 
-  INFO("*** TensorHConvolution \n");
+  DEBUG("*** TensorHConvolution \n");
   profileEvent("#Conv");
 
   Tensor *input = (Tensor *)input_ptr;
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
index 0332313c573bcd28215a4277cd788e63a7820b2a..2d0ba3288b6e440046655a0acef9454b335ce55c 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp
@@ -688,7 +688,6 @@ void RuntimeController::readConfigurationFile(const char *str) {
 
       firstTensorID += NodeConf->getApproxChoices().size();
     } else if (tokens[1] == "cpu") {
-      INFO("---------Found cpu configuration\n");
 
       // There must be at least one operation, with an approximation option
       CUSTOM_ASSERT((tokens.size() >= 5) &&
@@ -700,8 +699,7 @@ void RuntimeController::readConfigurationFile(const char *str) {
       InitialConfigurations.back().idConfigMap.insert(
           std::make_pair(firstTensorID, NodeConf));
 
-      INFO("*** firstTensorID = %d \n\n", firstTensorID);
-      INFO("FIXED CPU ID ISSUE\n");
+
       unsigned idx = 2;
       while (idx < tokens.size()) {
         if (tokens[idx] == "add") {
@@ -1193,7 +1191,7 @@ static int num_executations = 0;
 
 float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
 
-  struct Tensor *result = (struct Tensor *)result_ptr;
+  struct Tensor *result = (struct Tensor *) result_ptr;
 
   size_t batch_dim = result->dims.dim_sizes[0];
   size_t num_classes = result->dims.dim_sizes[1];
@@ -1206,6 +1204,7 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) {
     int chosen = 0;
     for (int id = 1; id < num_classes; ++id) {
+      //printf(" check = %f \n ", data[i * num_classes + id]);
       if (data[i * num_classes + chosen] < data[i * num_classes + id])
         chosen = id;
     }
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
index 89148bd5e05e350c29ac49c2eed0a5b93696d38a..700793529f1a7e1e9d8c887c28e7aefbd9afba93 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc
@@ -38,6 +38,7 @@
 #include "tensor_runtime.h"
 #include "tensor_cpu_runtime.h"
 #include "approx_api.h"
+#include "tensor_utils.h"
 
 void llvm_hpvm_initTensorRtCPU() {
 
@@ -56,6 +57,7 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
                                   int vertical_pad, int horizontal_pad,
                                   int vertical_stride, int horizontal_stride,
                                   int conv_mode, int compute_precision) {
+
   Tensor *input = (Tensor *)input_ptr;
   Tensor *filter = (Tensor *)filter_ptr;
 
@@ -75,17 +77,15 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
                           horizontal_stride);
   int num_filter_elem = kernel_height * kernel_width * channels;
   int output_size = output_width * output_height;
-  printf("--CREATE 4D TENSOR\n");
+
   Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, num_filters,
                                             output_height, output_width);
   float *__restrict__ output_data = (float *)output->host_data;
-  printf("CREATED 4D TENSOR\n");
+
   long int conv_data_size = sizeof(float) * num_filter_elem * output_height *
                             output_width * batch_size;
   float *host_data = (float *)malloc(conv_data_size);
-  printf("host data: %p\n", host_data);
-  printf("conv_data_size: %d\n", conv_data_size);
-  printf("number of batches: %d\n", batch_size);
+
   omp_set_num_threads(4);
 #pragma omp parallel for
   for (int b = 0; b < batch_size; b++) {
@@ -131,17 +131,20 @@ void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr,
       }
     }
   }
+
   free(host_data);
-  printf("END: %p\n", output);
+
   return output;
 }
 
-void *tensorRegularFilterSamplingConvolutionCPU(
-    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
-    int vertical_stride, int horizontal_stride, int conv_mode,
-    int compute_precision, int skip_every, int start) {
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *filter = (Tensor *)filter_ptr;
+void *tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                                int vertical_pad, int horizontal_pad,
+                                                int vertical_stride, int horizontal_stride,
+                                                int conv_mode, int compute_precision,
+                                                int skip_every, int start) {
+
+  Tensor *input = (Tensor *) input_ptr;
+  Tensor *filter = (Tensor *) filter_ptr;
 
   float *__restrict__ host_image = (float *)input->host_data;
   float *__restrict__ host_filter = (float *)filter->host_data;
@@ -260,10 +263,12 @@ void *tensorRegularFilterSamplingConvolutionCPU(
   return output;
 }
 
-void *tensorIrregularFilterSamplingConvolutionCPU(
-    void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad,
-    int vertical_stride, int horizontal_stride, int conv_mode,
-    int compute_precision, int skip_every, int start) {
+void *tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr,
+                                                  int vertical_pad, int horizontal_pad,
+                                                  int vertical_stride, int horizontal_stride,
+                                                  int conv_mode, int compute_precision,
+                                                  int skip_every, int start) {
+
   Tensor *input = (Tensor *)input_ptr;
   Tensor *filter = (Tensor *)filter_ptr;
 
@@ -666,20 +671,24 @@ void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
                           int horizontal_stride, int conv_mode,
                           int compute_precision, int row, int col,
                           int skip_every, int start) {
+
+  Tensor *input = (Tensor *) input_ptr;
+  Tensor *filter = (Tensor *) filter_ptr;
+
+  deviceToHostCopy(input);
+  deviceToHostCopy(filter);
+
   if (row > 1) {
-    printf("ROW PERFORATION\n");
     return tensorRowPerfConvolutionCPU(
         input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
         horizontal_stride, conv_mode, compute_precision, row, start);
   }
   if (col > 1) {
-    printf("COL PERFORATION\n");
     return tensorColPerfConvolutionCPU(
         input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
         horizontal_stride, conv_mode, compute_precision, col, start);
   }
   if (skip_every > 1) {
-    printf("INPUT FILTERING\n");
     Tensor *filter = (Tensor *)filter_ptr;
 
     const int kernel_height = filter->dims.dim_sizes[2];
@@ -694,7 +703,7 @@ void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
         input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
         horizontal_stride, conv_mode, compute_precision, skip_every, start);
   }
-  printf("---REGULAR CONV\n");
+
   return tensorRegularConvolutionCPU(
       input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride,
       horizontal_stride, conv_mode, compute_precision);
@@ -708,6 +717,9 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
   Tensor *input = (Tensor *)input_ptr;
   Tensor *filter = (Tensor *)filter_ptr;
 
+  deviceToHostCopy(input);
+  deviceToHostCopy(filter);
+
   float *__restrict__ host_image = (float *)input->host_data;
   float *__restrict__ host_filter = (float *)filter->host_data;
 
@@ -784,8 +796,12 @@ void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad,
 }
 
 void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
-  Tensor *x = (Tensor *)x_ptr;
-  Tensor *bias = (Tensor *)bias_ptr;
+
+  Tensor *x = (Tensor *) x_ptr;
+  Tensor *bias = (Tensor *) bias_ptr;
+
+  deviceToHostCopy(x);
+  deviceToHostCopy(bias);
 
   float *__restrict__ x_data = (float *)x->host_data;
   float *__restrict__ bias_data = (float *)bias->host_data;
@@ -794,6 +810,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
   int h = x->dims.dim_sizes[2];
   int w = x->dims.dim_sizes[3];
 
+
   if (x->num_elems == bias->num_elems) {
     int const1 = c * h * w;
     int const2 = h * w;
@@ -825,6 +842,7 @@ void *tensorAddCPU(void *x_ptr, void *bias_ptr) {
     }
   }
 
+
   return x;
 }
 
@@ -836,6 +854,8 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
                        int vertical_stride, int horizontal_stride) {
 
   Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *__restrict__ input_data = (float *)input->host_data;
 
   int batch_size = input->dims.dim_sizes[0];
@@ -853,8 +873,9 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
   int x_radius = (window_width - 1) / 2;
   int y_radius = (window_height - 1) / 2;
 
-  Tensor *output = (Tensor *)create4DTensor(0, 0, batch_size, channels,
+  Tensor *output = (Tensor *) create4DTensor(0, 0, batch_size, channels,
                                             output_height, output_width);
+
   float *__restrict__ output_data = (float *)output->host_data;
 
   omp_set_num_threads(4);
@@ -871,10 +892,10 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
         int y_radius_var_max = y_radius_var + image_height;
         int x_radius_var = x_radius - c;
         int x_radius_var_max = x_radius_var + image_width;
-        int ki_min =
-            (y_radius_var > 0)
-                ? ((y_radius_var < window_height) ? y_radius_var : -1)
-                : 0;
+        int ki_min = (y_radius_var > 0)
+                         ? ((y_radius_var < window_height) ? y_radius_var : -1)
+                         : 0;
+
         int ki_max = (y_radius_var_max < window_height)
                          ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1)
                          : window_height;
@@ -930,13 +951,15 @@ void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height,
 }
 
 void *tensorTanhCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
+  Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *input_data = (float *)input->host_data;
   size_t num_elems = input->num_elems;
 
   omp_set_num_threads(4);
-#pragma omp parallel for
+  #pragma omp parallel for
   for (size_t i = 0; i < num_elems; i++) {
     input_data[i] = tanhf(input_data[i]);
   }
@@ -945,17 +968,21 @@ void *tensorTanhCPU(void *input_ptr) {
 }
 
 void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
+
   Tensor *lhs = (Tensor *)lhs_ptr;
   Tensor *rhs = (Tensor *)rhs_ptr;
 
+  deviceToHostCopy(lhs);
+  deviceToHostCopy(rhs);
+
   int m = lhs->dims.dim_sizes[0];
   int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
 
-  Tensor *output = (Tensor *)create4DTensor(0, 0, m, n, 1, 1);
+  Tensor *output = (Tensor *) create4DTensor(0, 0, m, n, 1, 1);
 
-  float *__restrict__ lhs_arr = (float *)lhs->host_data;
-  float *__restrict__ rhs_arr = (float *)rhs->host_data;
-  float *__restrict__ output_arr = (float *)output->host_data;
+  float *__restrict__ lhs_arr = (float *) lhs->host_data;
+  float *__restrict__ rhs_arr = (float *) rhs->host_data;
+  float *__restrict__ output_arr = (float *) output->host_data;
 
   int k = 1;
 #pragma unroll 4 // Can we unroll more???
@@ -964,6 +991,7 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
   }
   float *tran_rhs = (float *)malloc(sizeof(float) * k * n);
   omp_set_num_threads(4);
+
 #pragma omp parallel for simd
   for (int l = 0; l < k; l++) {
     for (int j = 0; j < n; j++) {
@@ -982,47 +1010,71 @@ void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) {
       output_arr[i * n + j] = sum;
     }
   }
+
   free(tran_rhs);
+
   return output;
 }
 
 void *tensorSoftmaxCPU(void *input_ptr) {
-  Tensor *input = (Tensor *)input_ptr;
-  float *logits = (float *)input->host_data;
+  Tensor *input = (Tensor *) input_ptr;
+
+  deviceToHostCopy(input);
+
+  float *logits = (float *) input->host_data;
   int n = input->dims.dim_sizes[0];
   int c = input->dims.dim_sizes[1];
 
+
   omp_set_num_threads(4);
 #pragma omp parallel for
   for (int i = 0; i < n; i++) {
-    float x = 0;
-    for (int j = i * c; j < c + i * c; j++) {
-      logits[j] = expf(logits[j]);
+
+    float max = logits[i * c];
+    for (unsigned int k = i * c; k < c + i * c; k++){
+      if (logits[k] > max){
+        max = logits[k];
+      }
+    }
+
+    double x = 0;
+    for (int j = i * c; j < c + i * c; j++) {
+      logits[j] = exp( logits[j] - max );
     }
 
 #pragma omp simd reduction(+ : x)
     for (int j = i * c; j < i * c + c; j++) {
       x += logits[j];
     }
-
+
 #pragma omp simd
     for (int j = i * c; j < i * c + c; j++) {
       logits[j] /= x;
     }
+
+    //printf("logits[i * c] = %f \n ", logits[i * c]);
   }
 
+
   return input;
 }
 
 void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
                          void *mean_ptr, void *variance_ptr, double epsilon) {
 
-  Tensor *input = (Tensor *)input_ptr;
-  Tensor *gamma = (Tensor *)gamma_ptr;
-  Tensor *beta = (Tensor *)beta_ptr;
-  Tensor *mean = (Tensor *)mean_ptr;
-  Tensor *variance = (Tensor *)variance_ptr;
-
+  Tensor *input = (Tensor *) input_ptr;
+  Tensor *gamma = (Tensor *) gamma_ptr;
+  Tensor *beta = (Tensor *) beta_ptr;
+  Tensor *mean = (Tensor *) mean_ptr;
+  Tensor *variance = (Tensor *) variance_ptr;
+
+  deviceToHostCopy(input);
+  deviceToHostCopy(gamma);
+  deviceToHostCopy(beta);
+  deviceToHostCopy(mean);
+  deviceToHostCopy(variance);
+
+
   float *__restrict__ host_image = (float *)input->host_data;
   float *__restrict__ host_beta = (float *)beta->host_data;
   float *__restrict__ host_gamma = (float *)gamma->host_data;
@@ -1056,7 +1108,10 @@ void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr,
 }
 
 void *tensorReluCPU(void *input_ptr) {
+
   Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *input_data = (float *)input->host_data;
   size_t num_elems = input->num_elems;
 
@@ -1069,7 +1124,10 @@ void *tensorReluCPU(void *input_ptr) {
 }
 
 void *tensorRelu2CPU(void *input_ptr, float min, float max) {
+
   Tensor *input = (Tensor *)input_ptr;
+  deviceToHostCopy(input);
+
   float *input_data = (float *)input->host_data;
   size_t num_elems = input->num_elems;
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 253f7614337908e72c82ba986f860dd58c7c9b3f..9f69e54437766fcb8b549b7cbec1510bd6badbd7 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -52,7 +52,7 @@ void *tensorAdd(void *x_ptr, void *bias_ptr) {
   Tensor *x = (Tensor *)x_ptr;
   Tensor *bias = (Tensor *)bias_ptr;
 
-  INFO("*** TensorAdd \n");
+  //INFO("*** TensorAdd \n");
   profileEvent("Add");
 
   float alpha = 1.0f;
@@ -85,7 +85,8 @@ void *tensorConvolution(void *input_ptr, void *filter_ptr, int vertical_pad,
                         int horizontal_pad, int vertical_stride,
                         int horizontal_stride, int conv_mode, int conv_groups) {
 
-  INFO("*** TensorConvolution \n");
+
+  //INFO("*** TensorConvolution \n");
   profileEvent("Conv");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -213,7 +214,6 @@ void *tensorPooling(void *input_ptr, int poolFunction, int window_height,
                     int window_width, int vertical_pad, int horizontal_pad,
                     int vertical_stride, int horizontal_stride) {
 
-  INFO("*** TensorPooling \n");
   profileEvent("Pool");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -272,7 +272,7 @@ void *tensorPooling(void *input_ptr, int poolFunction, int window_height,
 * https://gist.github.com/peterwittek/6303527
 */
 void *tensorGemmGPU(void *lhs_ptr, void *rhs_ptr) {
-  INFO("*** TensorGemmGPU \n");
+  //INFO("*** TensorGemmGPU \n");
   profileEvent("Mul");
 
   Tensor *lhs = (Tensor *)lhs_ptr;
@@ -364,7 +364,7 @@ void *tensorRelu(void *input_ptr) {
 // Think: Should Softmax be broken into multiple IR operations?
 void *tensorSoftmax(void *input_ptr) {
 
-  INFO("*** TensorSoftmax \n");
+  //INFO("*** TensorSoftmax \n");
   profileEvent("Softmax");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -386,7 +386,7 @@ void *tensorSoftmax(void *input_ptr) {
 
 void *tensorRelu2(void *input_ptr, float min, float max) {
 
-  INFO("*** TensorClippedRelu *** \n");
+  //INFO("*** TensorClippedRelu *** \n");
   profileEvent("Relu");
 
   cudnnActivationDescriptor_t reluDesc;
@@ -413,7 +413,7 @@ void *tensorRelu2(void *input_ptr, float min, float max) {
 
 void *tensorTanh(void *input_ptr) {
 
-  INFO("*** TensorTanh \n");
+  //INFO("*** TensorTanh \n");
   profileEvent("Tanh");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -441,7 +441,7 @@ void *tensorTanh(void *input_ptr) {
 void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
                       void *mean_ptr, void *variance_ptr, double epsilon) {
 
-  INFO("*** TensorBatchNorm \n");
+  // INFO("*** TensorBatchNorm \n");
   profileEvent("BatchNorm");
 
   Tensor *input = (Tensor *)input_ptr;
@@ -477,7 +477,7 @@ void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr,
 // TODO: benchmark performance of tensorSplit
 void **tensorSplit(void *tensor_ptr, int num_splits, int split_dim) {
 
-  INFO("*** TensorSplit \n");
+  //INFO("*** TensorSplit \n");
   profileEvent("tensorSplit");
 
   Tensor *tensor = (Tensor *)tensor_ptr;
@@ -533,7 +533,7 @@ void **tensorSplit(void *tensor_ptr, int num_splits, int split_dim) {
 
 void *tensorConcat(void **tensors_ptr, int num_splits, int split_dim) {
 
-  INFO("*** TensorConcat \n");
+  //INFO("*** TensorConcat \n");
   profileEvent("tensorConcat");
 
   Tensor **tensors = (Tensor **)tensors_ptr;
@@ -595,7 +595,7 @@ void *tensorConcat(void **tensors_ptr, int num_splits, int split_dim) {
 void *tensorLRN(void *input_ptr, unsigned int LRN_window, double LRN_alpha,
                 double LRN_beta, double LRN_k) {
 
-  INFO("*** TensorLRN \n");
+  //INFO("*** TensorLRN \n");
   profileEvent("tensorLRN");
 
   Tensor *input = (Tensor *)input_ptr;
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
index f6bfe700b44c88fea06c6a76267b49af4a523716..4934f5834f07a37d91575a7b5821ec243762b52a 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu
@@ -289,6 +289,7 @@ void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size,
 
 void *create4DTensor(int data_type, int data_format, size_t dim1_size,
                      size_t dim2_size, size_t dim3_size, size_t dim4_size) {
+
   struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor));
   size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
   allocateMem(tensor, data_type, num_elems);
@@ -308,15 +309,16 @@ void *create4DTensor(int data_type, int data_format, size_t dim1_size,
   set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size,
                         dim4_size);
 
+  changeTensorPlacement(tensor, HOST);
+
   return tensor;
 }
 
 void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
-  Tensor *tensor = (Tensor *)tensor_ptr;
-
+  Tensor *tensor = (Tensor *) tensor_ptr;
   size_t host_size_in_bytes = tensor->num_elems * 4;
-  // if(tensor->size_in_bytes != size_in_bytes){
+
   if (host_size_in_bytes != size_in_bytes) {
     ERROR("The destination and source sizes don't match");
   }
@@ -330,29 +332,33 @@ void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) {
 
 void hostToDeviceCopy(struct Tensor *tensor) {
 
+  DEBUG("** HostToDevice *** \n");
   if (tensor->data_placement != DEVICE) {
     cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes,
                cudaMemcpyHostToDevice);
     DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
     tensor->data_placement = DEVICE;
-  } else {
+  }
+  else {
     DEBUG("No data movement required - Data on Device \n");
   }
 }
 
 void deviceToHostCopy(struct Tensor *tensor) {
 
+  DEBUG("*** DeviceToHost *** ");
   if (tensor->data_placement != HOST) {
     cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes,
               cudaMemcpyDeviceToHost);
     DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
     tensor->data_placement = HOST;
-  } else {
+  }
+  else {
     DEBUG("No data movement required - Data on Host \n");
   }
 }
 
-// void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){
+
 void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
 
@@ -364,7 +370,8 @@ void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) {
                srcTensor->size_in_bytes);
     DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes);
     dstTensor->data_placement = HOST;
-  } else if (srcTensor->data_placement == DEVICE) {
+  }
+  else if (srcTensor->data_placement == DEVICE) {
     cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data,
                srcTensor->size_in_bytes, cudaMemcpyDeviceToDevice);
     DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes);
@@ -382,7 +389,8 @@ void hpvm_request_tensor(void *tensor_ptr, int destination) {
                  cudaMemcpyDeviceToHost);
       DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes);
       tensor->data_placement = HOST;
-    } else {
+    }
+    else {
      DEBUG("No data movement required - Data on Host \n");
     }
   }
@@ -394,7 +402,8 @@ void hpvm_request_tensor(void *tensor_ptr, int destination) {
                  cudaMemcpyHostToDevice);
      DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes);
      tensor->data_placement = DEVICE;
-    } else {
+    }
+    else {
      DEBUG("No data movement required - Data on Device \n");
     }
   }
diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
index c2e116a56fbf038628396eeb611711295a4a9170..a972d097bf29963eae34f9e3521ee6b76d1c5a9e 100644
--- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
+++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu
@@ -75,9 +75,7 @@ extern "C" {
 
-
-void *
-wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
+  void* wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
                   void *bias, int conv_pad_h, int conv_pad_w, int conv_stride_h,
                   int conv_stride_w, int pool_id, int pool_size,
                   int activation_id,
@@ -123,7 +121,6 @@ wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
     void *activation_out;
     switch (activation_id) {
     case -1: { // No activation
-      // INFO("No activation Function\n");
       activation_out = add_out;
     } break;
     case 0: { // TanH activation
@@ -204,7 +201,7 @@ wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
     std::vector< std::pair< CPUNodeConfiguration::TENSOR_OP,
                  std::vector< std::pair<CPUNodeConfiguration::APPROX,
-                                        int> > > > &ApproxChoices = CPUConf->getApproxChoices();
+                                        int> > > > &ApproxChoices = CPUConf->getApproxChoices();
 
     // Check for convolution as first operation
     CUSTOM_ASSERT((ApproxChoices.size() >= 1) &&
@@ -230,7 +227,6 @@ wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
 
     switch (activation_id) {
     case -1: { // No activation
-      INFO("No activation Function\n");
       activation_out = add_out;
     } break;
@@ -325,15 +321,14 @@ wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter,
   return NULL;
 }
 
-void *wrapper_ConvLayer2(
-    const char *hpvm_node_id, void *input, void *filter, void *bias,
-    int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
-    int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v,
-    int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id,
-    // NOTE: out_min, out_max are only relevant for ClippedRelu
-    float out_min, float out_max) {
+void *wrapper_ConvLayer2(const char *hpvm_node_id, void *input, void *filter, void *bias,
+                         int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
+                         int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v,
+                         int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id,
+                         // NOTE: out_min, out_max are only relevant for ClippedRelu
+                         float out_min, float out_max) {
 
-  //INFO("*** ------Conv Layer \n");
+  INFO("*** ConvLayer \n");
 
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
   if (NodeConf->isGPUNodeConfiguration()) {
@@ -345,11 +340,6 @@ void *wrapper_ConvLayer2(
             std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
         &ApproxChoices = GPUConf->getApproxChoices();
 
-    // printf("*** Convolution \n ApproxChoice = %d \n BatchNorm = %d \n CONV =
-    // %d \n", ApproxChoices[0].first,
-    // GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
-    // GPUNodeConfiguration::TENSOR_OP::CONV);
-
     // Check for convolution as first operation
     CUSTOM_ASSERT(
         (ApproxChoices.size() >= 1) &&
@@ -377,7 +367,6 @@ void *wrapper_ConvLayer2(
     void *activation_out;
     switch (activation_id) {
     case -1: { // No activation
-      // INFO("No activation Function\n");
       activation_out = add_out;
     } break;
     case 0: { // TanH activation
@@ -531,7 +520,6 @@ void *wrapper_ConvLayer2(
     }
 
     void* pool_out;
-
     if (pool_size_v > 0) {
       switch (pool_id) {
       case 0:
@@ -599,6 +587,8 @@ wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
                 // NOTE: out_min and out_max are only relevant for ClippedRelu
                 float out_min, float out_max) {
 
+  INFO("*** DenseLayer \n");
+
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
   if (NodeConf->isGPUNodeConfiguration()) {
     DEBUG("GPU Configuration for FCLayer\n");
@@ -628,7 +618,6 @@ wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
       CUSTOM_ASSERT(
           (ApproxChoices.size() == 2) &&
           "Incorrect number of operations in provided FC layer configuration");
-      // INFO("No activation Function\n");
       activation_out = add_out;
     } break;
     case 0: { // TanH activation
@@ -739,7 +728,7 @@ wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights,
 
 void *wrapper_tensorRelu(const char *hpvm_node_id, void *input_ptr) {
 
-  INFO("*** Relu Operation \n");
+  INFO("*** TensorRelu \n");
 
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
@@ -893,10 +882,8 @@ void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr,
                               void *gamma_ptr, void *beta_ptr, void *mean_ptr,
                               void *variance_ptr, double epsilon) {
 
-  INFO("*** BatchNorm Operation \n");
-
+  INFO("*** TensorBatchNorm \n");
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
-
   if (NodeConf->isGPUNodeConfiguration()) {
     // mapped to GPU - get a GPU configuration
@@ -907,14 +894,8 @@ void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr,
             std::pair<GPUNodeConfiguration::TENSOR_OP,
                       std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
         &ApproxChoices =
-        GPUConf->getApproxChoices();
-    // printf("*** BatchNorm \n ApproxChoice = %d \n BatchNorm = %d \n CONV = %d
-    // \n", ApproxChoices[0].first,
-    // GPUNodeConfiguration::TENSOR_OP::BATCHNORM,
-    // GPUNodeConfiguration::TENSOR_OP::CONV);
-
+        GPUConf->getApproxChoices();
     // Approximation choices must be for a batchnorm operation
     CUSTOM_ASSERT(
         ApproxChoices.size() == 1 &&
@@ -955,6 +936,8 @@ void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr,
 
 void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr,
                         void *bias_ptr) {
+
+  INFO("*** TensorAdd \n");
 
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
 
   if (NodeConf->isGPUNodeConfiguration()) {
@@ -981,7 +964,7 @@ void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr,
   } else if (NodeConf->isCPUNodeConfiguration()) {
     DEBUG("Add operation: CPU Configuration\n");
     // Mapped to CPU - get a CPU configuration
-    CPUNodeConfiguration *CPUConf = (CPUNodeConfiguration *)NodeConf;
+    CPUNodeConfiguration *CPUConf = (CPUNodeConfiguration *) NodeConf;
     std::vector< std::pair< CPUNodeConfiguration::TENSOR_OP,
                  std::vector< std::pair<CPUNodeConfiguration::APPROX,
@@ -994,7 +977,7 @@ void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr,
         "Invalid configuration generated for tensor add wrapper operation");
 
     return handleTensorAddApproximationTuples_CPU(ApproxChoices[0].second,
-                                                  input_ptr, bias_ptr);
+                                                   input_ptr, bias_ptr);
   } else {
     ERROR("Unsupported Configuration");
     abort();
@@ -1008,7 +991,7 @@ void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr,
                             int horizontal_pad, int vertical_stride,
                             int horizontal_stride) {
 
-  INFO("*** TensorPooling Operation \n");
+  INFO("*** TensorPooling \n");
 
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
@@ -1084,6 +1067,9 @@ void *wrapper_tensorGroupConvolution(const char *hpvm_node_id, void *input,
                                      int horizontal_pad, int vertical_stride,
                                      int horizontal_stride, int conv_mode,
                                      int conv_groups) {
+
+  INFO("*** TensorGroupConv \n");
+
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
 
   if (NodeConf->isGPUNodeConfiguration()) {
@@ -1137,8 +1123,9 @@ void *wrapper_tensorGroupConvolution(const char *hpvm_node_id, void *input,
 }
 
 void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr) {
-  // return tensorSoftmax(input_ptr);
 
+  INFO("*** TensorSoftmax \n ");
+
   NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id);
 
   if (NodeConf->isGPUNodeConfiguration()) {