From 2cdc4519ab21ec7982ffc5559c836313f10d655d Mon Sep 17 00:00:00 2001
From: Hashim Sharif <hsharif3@tyler.cs.illinois.edu>
Date: Sat, 25 Jul 2020 23:34:53 -0500
Subject: [PATCH] Removing unused code from tensor_runtime.cu

---
 .../dnn_sources/src/unit_tests.cc        |  31 --
 .../tensor_runtime/src/tensor_runtime.cu | 311 +-----------------
 2 files changed, 12 insertions(+), 330 deletions(-)

diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
index 954e54a503..3b08755172 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc
@@ -80,37 +80,6 @@ public:
 
 
 
-void testTensorGemm(){
-
-  printf("***** TensorSgemm ***** \n\n");
-  void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
-  struct Tensor* lhs = (struct Tensor*) lhs_ptr;
-  fillTensorWithOnes(lhs);
-
-  float* data_arr = (float*) lhs->host_data;
-  for(int i = 0; i < lhs->num_elems; i++){
-    data_arr[i] = (i / 4) + 1;
-  }
-
-  void* rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
-  fillTensorWithOnes(rhs);
-
-  void* output = tensorGemmCPU(lhs, rhs);
-  printTensorValues(output);
-
-  void* bias_ptr = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 3, 1, 1);
-  struct Tensor* bias = (struct Tensor*) bias_ptr;
-  fillTensorWithOnes(bias);
-
-  float* bias_arr = (float*) bias->host_data;
-  for(int i = 0; i < bias->num_elems; i++){
-    bias_arr[i] = i + 1;
-  }
-
-  void* output2 = tensorAdd(output, bias);
-  printTensorValues(output2);
-}
-
 
 void testTensorHgemm(UnitTestResults& unitTestResults){
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index a80b2f726f..59d3b56183 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -45,9 +45,6 @@
 #include "half_precision_api.h"
 #include "approx_simulation.h"
 
-//** Potential Improvements:
-// 1) Add support for dataypes beyond floats and half
-// 2) Support for more CUDNN operations
 
 
@@ -62,12 +59,17 @@ void llvm_hpvm_initTensorRt(int gpuid){
     checkCudaErrors(cublasCreate(&cublasHandle));
     checkCUDNN(cudnnCreate(&cudnnHandle));
 
+    printf("CREATED HANDLES %d \n", gpuid);
+
 #ifdef PROMISE_TUNER_ENABLED
     // readOpenTunerFlags("opentuner_flags");
+
     readOpenTunerFlags("promise_flags");
     initializeAutotuner();
-
+
+    printf("Read PROMISE FLAGS %d \n", gpuid);
+
 #endif
@@ -78,7 +80,9 @@ void llvm_hpvm_initTensorRt(int gpuid){
 
     runtime_initialized = true;
   }
-
+
+  printf("DONE INITIALIZING GPU %d \n", gpuid);
+
 }
@@ -673,77 +677,9 @@ void* tensorPooling(void* input_ptr,
 
 
 
-void* tensorGemmCPU(void* lhs_ptr, void* rhs_ptr){
-
-  INFO("*** TensorGemmCPU \n");
-
-  Tensor* lhs = (Tensor*) lhs_ptr;
-  Tensor* rhs = (Tensor*) rhs_ptr;
-
-  // The operation is done on the CPU
-  deviceToHostCopy(lhs);
-  deviceToHostCopy(rhs);
-
-  if(lhs->data_type != CUDNN_DATA_FLOAT){
-    ERROR("Currently only Floating point is supported ");
-  }
-
-  profileEvent("tensorGemmCPU");
-
-  INFO("rhs->dims.num_dims = %d \n", rhs->dims.num_dims);
-  INFO("lhs->dims.num_dims = %d \n", lhs->dims.num_dims);
-
-  // FIXIT: Need to be more aware of the implications of alpha and beta
-  //float alpha = 1.0f;
-  // float beta = 0.0f;
-  // 'm' holds the batch dimension - assuming NCHW format Tensors
-  int m = lhs->dims.dim_sizes[0];
-  // The rhs must be a 2D tensor
-  int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons
-  int k = 1;
-  // Flattening the dimensions after the batch dimension
-  // NOTE: Allowing any number of dimensions > 2 for lhs
-  for (int j = 1 ; j < lhs->dims.num_dims; j++){
-    k = k * lhs->dims.dim_sizes[j]; // input neurons
-  }
-
-  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2];
-  // Dimension-note: Check if k is same across the two tensors
-  INFO("m = %d, n = %d, k = %d \n", m, n, k);
-  if(rhs_k != k){
-    ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k);
-  }
-
-  // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines
-  Tensor* output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1);
-  // Changing output tensor placement from host to device
-  changeTensorPlacement(output, HOST);
-
-  float* lhs_arr = (float*) lhs->host_data;
-  float* rhs_arr = (float*) rhs->host_data;
-  float* output_arr = (float*) output->host_data;
-
-  for(int i = 0; i < m; i++){
-    for(int j = 0; j < n; j++){
-      float sum = 0.0;
-      for(int l = 0; l < k; l++){
-        float mul = lhs_arr[i*k+l] * rhs_arr[l*n+j];
-        sum = sum + mul;
-      }
-      output_arr[i*n+j] = sum;
-    }
-  }
-
-
-  profileEvent("tensorGemmCPU_end", true);
-
-  return output;
-}
-
-
-// Reference: https://gist.github.com/peterwittek/6303527
-void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ //, void* result_tensor){
+/* Reference Implementation based on: https://gist.github.com/peterwittek/6303527 */
+void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){
 
   INFO("*** TensorGemmGPU \n");
   profileEvent("Mul");
@@ -780,16 +716,7 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ //, void* result_tensor){
 
     DEBUG("Creating new TENSOR * \n");
     output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1);
-
-  /* else{
-     DEBUG("Reusing TENSOR *\n");
-     // FIXIT: Add Assertion to check for null pointer and dimension matching
-     output = (Tensor*) result_tensor;
-     // FIXIT: output value is trashing - Is this deallocated?
- INFO("output->num_elems = %lu \n", output->data_type); - } - */ - + DEBUG("Changing placement *\n"); // Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); @@ -846,107 +773,6 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ //, void* result_tensor){ - -void* tensorGemm(void* lhs_ptr, void* rhs_ptr){ - - INFO("*** TensorGemm \n"); - profileEvent("tensorGemm"); - - Tensor* lhs = (Tensor*) lhs_ptr; - Tensor* rhs = (Tensor*) rhs_ptr; - - INFO("rhs->dims.num_dims = %d \n", rhs->dims.num_dims); - INFO("lhs->dims.num_dims = %d \n", lhs->dims.num_dims); - - // FIXIT: Need to be more aware of the implications of alpha and beta - float alpha = 1.0f, beta = 0.0f; - // 'm' holds the batch dimension - assuming NCHW format Tensors - int m = lhs->dims.dim_sizes[0]; - // The rhs last dimension must contain the neurons - int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons - int k = 1; - // Flattening the dimensions after the batch dimension - // NOTE: Allowing any number of dimensions > 2 for lhs - for (int j = 1 ; j < lhs->dims.num_dims; j++){ - k = k * lhs->dims.dim_sizes[j]; // input neurons - } - - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2]; - // Dimension-note: Check if k is same across the two tensors - INFO("m = %d, n = %d, k = %d \n", m, n, k); - if(rhs_k != k){ - ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k); - } - - // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines - Tensor* output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1); - // Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); - - hostToDeviceCopy(lhs); - hostToDeviceCopy(rhs); - - // NOTE: cuBlas uses column-major format - // NOTE: The leading dimension is the FIRST Dimension - // NOTE: The output is N * M in column-major format, M*N in row-major - what cuDNN expects - checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, - n, m, k, - &alpha, - (float*) rhs->gpu_data, k, - (float*) lhs->gpu_data, k, - &beta, - (float*) output->gpu_data, n)); - - profileEvent("tensorGemm_end", true); - - return output; -} - - - - -// FIXIT: Add dimension check assertions throughout the code -void* tensorGemmBias(void* input_ptr, void* bias_ptr){ - - INFO("*** TensorGemmBias \n"); - profileEvent("tensorGemmBias"); - - Tensor* input = (Tensor*) input_ptr; - Tensor* bias = (Tensor*) bias_ptr; - - // NOTE: beta is set to 1 to append to input - // C = A * B + Beta * C - float alpha = 1.0f, beta = 1.0f; - // 'm' holds the batch dimension - assuming NCHW format Tensors - int m = input->dims.dim_sizes[0]; - // The bias must be a 2D tensor - int n = bias->dims.dim_sizes[bias->dims.num_dims - 1]; // output neurons - - INFO("m = %d, n = %d \n", m, n); - - hostToDeviceCopy(input); - hostToDeviceCopy(bias); - - struct Tensor* onevec = (Tensor*) create2DTensor(CUDNN_DATA_FLOAT, m, 1); - fillOnes(onevec); - hostToDeviceCopy(onevec); - - // NOTE: cuBlas uses column-major format - // NOTE: The leading dimension is just the FIRST Dimension - checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n, m, 1, - &alpha, - (float*) bias->gpu_data, n, - (float*) onevec->gpu_data, 1, - &beta, - (float*) input->gpu_data, n)); - - profileEvent("tensorGemmBias_end", true); - - return input; -} - - void* tensorRelu(void* input_ptr){ INFO("*** TensorRelu \n"); @@ -1019,17 +845,6 @@ void* tensorSoftmax(void* input_ptr){ -__global__ void clipValues(float* A, float min, 
float max, int n){ - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - if(id < n){ - A[id] = fmaxf(min, A[id]); - A[id] = fminf(max, A[id]); - } -} - - void* tensorRelu2(void* input_ptr, float min, float max){ @@ -1184,105 +999,3 @@ void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr, - -/************* GPU Layer API *************/ - -void* ConvLayer_GPU(void* input, - void* filter, - void* bias, - int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max){ // NOTE: min_val, max_val apply to 'ClippedRelu' - - void* conv_out = tensorConvolution(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 0); - void* conv_add; - if(bias != NULL){ - conv_add = tensorAdd(conv_out, bias); - } - else{ - conv_add = conv_out; - } - - void* activation_out; - switch(activation_id){ - case -1: - activation_out = conv_add; - INFO("NO Activation Function \n"); - break; - case 0: - activation_out = tensorTanh(conv_add); - break; - case 1: - activation_out = tensorRelu(conv_add); - break; - case 2: - activation_out = tensorRelu2(conv_add, out_min, out_max); - break; - default: - ERROR("Activation id %d NOT supported \n", activation_out); - break; - } - - - void* pool_out = activation_out; - // NOTE: Skip pooling on negative pool sizes - if(pool_size > 0){ - //FIXME: Currently only using MaxPooling - pool_out = tensorPooling(activation_out, 0, pool_size, pool_size, 0, 0, pool_size, pool_size); - } - else{ - pool_out = activation_out; - } - - return pool_out; -} - - -void* FCLayer_GPU(void* input, - void* weights, - void* bias, - int activation_id, - float out_min, float out_max){ // NOTE: min_val, max_val apply to 'ClippedRelu' - - void* gemm_out = tensorGemmGPU(input, weights); - - void* gemmbias_out; - if(bias != NULL){ - gemmbias_out = tensorAdd(gemm_out, bias); - } - else{ - gemmbias_out = gemm_out; - } - - void* activation_out; - switch(activation_id){ - - case -1: - activation_out = gemmbias_out; - INFO("No Activation Function \n"); - break; - case 0: - activation_out = tensorTanh(gemmbias_out); - break; - case 1: - activation_out = tensorRelu(gemmbias_out); - break; - case 2: - activation_out = tensorRelu2(gemmbias_out, out_min, out_max); - break; - default: - ERROR("Activation id %d NOT supported \n", activation_out); - break; - } - - return activation_out; -} - - - - - -- GitLab
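
A note on the retained GEMM path: the comments in the removed tensorGemm describe the layout trick that row-major callers of cuBLAS rely on. cuBLAS works in column-major order, so a row-major product C = A x B (A is m x k, B is k x n) can be obtained by asking cuBLAS for C^T = B^T x A^T; the resulting column-major n x m buffer is bit-identical to the row-major m x n tensor the cuDNN-facing code expects. The standalone sketch below demonstrates that swap using the textbook CUBLAS_OP_N/CUBLAS_OP_N form; it is not the exact call inside tensorGemmGPU (which this diff does not show), and every name in it (M, N, K, d_a, d_b, d_c) is illustrative rather than part of the tensor runtime.

// gemm_sketch.cu -- minimal sketch of computing a row-major GEMM with
// column-major cuBLAS by requesting C^T = B^T * A^T. Illustrative only;
// error checking is omitted for brevity.
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  const int M = 2, K = 3, N = 4;                 // A is M x K, B is K x N, row-major
  std::vector<float> a = {1, 2, 3,
                          4, 5, 6};              // M x K
  std::vector<float> b(K * N, 1.0f);             // K x N, all ones
  std::vector<float> c(M * N, 0.0f);             // M x N result

  float *d_a, *d_b, *d_c;
  cudaMalloc((void**)&d_a, a.size() * sizeof(float));
  cudaMalloc((void**)&d_b, b.size() * sizeof(float));
  cudaMalloc((void**)&d_c, c.size() * sizeof(float));
  cudaMemcpy(d_a, a.data(), a.size() * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b.data(), b.size() * sizeof(float), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);

  // A row-major X is the same buffer as column-major X^T, so passing
  // (B, A) with swapped dimensions makes cuBLAS compute C^T = B^T * A^T
  // in column-major, which is exactly C = A * B in row-major.
  float alpha = 1.0f, beta = 0.0f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              N, M, K,
              &alpha,
              d_b, N,      // B (K x N row-major) reads as B^T (N x K col-major)
              d_a, K,      // A (M x K row-major) reads as A^T (K x M col-major)
              &beta,
              d_c, N);     // C^T (N x M col-major) == C (M x N row-major)

  cudaMemcpy(c.data(), d_c, c.size() * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < M; i++) {                  // expect row 0 = 6.0, row 1 = 15.0
    for (int j = 0; j < N; j++) printf("%.1f ", c[i * N + j]);
    printf("\n");
  }

  cublasDestroy(handle);
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
  return 0;
}

Built with something like `nvcc gemm_sketch.cu -lcublas`, it should print a row of 6.0s followed by a row of 15.0s (the row sums of A against an all-ones B), confirming the result lands in row-major order with no explicit transpose kernel.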