Commit 2cdc4519 authored by Hashim Sharif

Removing unused code from tensor_runtime.cu

parent d7c6508a
@@ -80,37 +80,6 @@ public:

void testTensorGemm(){

  printf("***** TensorSgemm ***** \n\n");

  void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1);
  struct Tensor* lhs = (struct Tensor*) lhs_ptr;
  fillTensorWithOnes(lhs);

  float* data_arr = (float*) lhs->host_data;
  for(int i = 0; i < lhs->num_elems; i++){
    data_arr[i] = (i / 4) + 1;
  }

  void* rhs = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 1, 4, 3);
  fillTensorWithOnes(rhs);

  void* output = tensorGemmCPU(lhs, rhs);
  printTensorValues(output);

  void* bias_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
  struct Tensor* bias = (struct Tensor*) bias_ptr;
  fillTensorWithOnes(bias);

  float* bias_arr = (float*) bias->host_data;
  for(int i = 0; i < bias->num_elems; i++){
    bias_arr[i] = i + 1;
  }

  void* output2 = tensorAdd(output, bias);
  printTensorValues(output2);
}
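For orientation, the values this removed test would print can be worked out from the fills above (simple arithmetic, not a recorded run):

// lhs row r is filled with (r + 1) and rhs is all ones (4 x 3), so before the bias add
// every entry of output row r equals 4 * (r + 1):
//   output  = [[4,4,4], [8,8,8], [12,12,12], [16,16,16], [20,20,20]]   (5 x 3)
// If tensorAdd broadcasts the (1,3,1,1) bias across rows (as the test assumes),
//   output2 row r = [4*(r+1) + 1, 4*(r+1) + 2, 4*(r+1) + 3]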
void testTensorHgemm(UnitTestResults& unitTestResults){
......
@@ -45,9 +45,6 @@

#include "half_precision_api.h"
#include "approx_simulation.h"

//** Potential Improvements:
// 1) Add support for datatypes beyond float and half
// 2) Support for more cuDNN operations
@@ -62,12 +59,17 @@ void llvm_hpvm_initTensorRt(int gpuid){

    checkCudaErrors(cublasCreate(&cublasHandle));
    checkCUDNN(cudnnCreate(&cudnnHandle));

    printf("CREATED HANDLES %d \n", gpuid);

#ifdef PROMISE_TUNER_ENABLED
    // readOpenTunerFlags("opentuner_flags");
    readOpenTunerFlags("promise_flags");
    initializeAutotuner();

    printf("Read PROMISE FLAGS %d \n", gpuid);
#endif
@@ -78,7 +80,9 @@ void llvm_hpvm_initTensorRt(int gpuid){

    runtime_initialized = true;
  }

  printf("DONE INITIALIZING GPU %d \n", gpuid);
}
@@ -673,77 +677,9 @@ void* tensorPooling(void* input_ptr,

void* tensorGemmCPU(void* lhs_ptr, void* rhs_ptr){

  INFO("*** TensorGemmCPU \n");

  Tensor* lhs = (Tensor*) lhs_ptr;
  Tensor* rhs = (Tensor*) rhs_ptr;

  // The operation is done on the CPU
  deviceToHostCopy(lhs);
  deviceToHostCopy(rhs);

  if(lhs->data_type != CUDNN_DATA_FLOAT){
    ERROR("Only single-precision float tensors are currently supported ");
  }

  profileEvent("tensorGemmCPU");

  INFO("rhs->dims.num_dims = %d \n", rhs->dims.num_dims);
  INFO("lhs->dims.num_dims = %d \n", lhs->dims.num_dims);

  // FIXIT: Need to be more aware of the implications of alpha and beta
  //float alpha = 1.0f;
  //float beta = 0.0f;

  // 'm' holds the batch dimension - assuming NCHW format Tensors
  int m = lhs->dims.dim_sizes[0];
  // The rhs must be an effectively 2D tensor
  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
  int k = 1;

  // Flattening the dimensions after the batch dimension
  // NOTE: Allowing any number of dimensions > 2 for lhs
  for(int j = 1; j < lhs->dims.num_dims; j++){
    k = k * lhs->dims.dim_sizes[j]; // input neurons
  }

  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
  // Dimension check: k must match across the two tensors
  INFO("m = %d, n = %d, k = %d \n", m, n, k);

  if(rhs_k != k){
    ERROR("rhs rows (%d) and flattened lhs columns (%d) don't match", rhs_k, k);
  }

  // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines
  Tensor* output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1);
  // Marking the output tensor as host-resident (the result is computed on the CPU)
  changeTensorPlacement(output, HOST);

  float* lhs_arr = (float*) lhs->host_data;
  float* rhs_arr = (float*) rhs->host_data;
  float* output_arr = (float*) output->host_data;

  for(int i = 0; i < m; i++){
    for(int j = 0; j < n; j++){
      float sum = 0.0;
      for(int l = 0; l < k; l++){
        float mul = lhs_arr[i*k + l] * rhs_arr[l*n + j];
        sum = sum + mul;
      }
      output_arr[i*n + j] = sum;
    }
  }

  profileEvent("tensorGemmCPU_end", true);

  return output;
}
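As a concrete illustration of the flattening above (a sketch using the shapes from testTensorGemm earlier in this commit):

// lhs shape (5, 4, 1, 1)  ->  m = 5, k = 4 * 1 * 1 = 4
// rhs shape (1, 1, 4, 3)  ->  rhs_k = 4, n = 3
// rhs_k == k, so the plain triple loop produces a (5, 3) result stored as a (5, 3, 1, 1) tensor.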
// Reference: https://gist.github.com/peterwittek/6303527
void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr){ //, void* result_tensor){

/* Reference Implementation based on: https://gist.github.com/peterwittek/6303527 */
void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr){

  INFO("*** TensorGemmGPU \n");
  profileEvent("Mul");
@@ -780,16 +716,7 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ //, void* result_tensor){

    DEBUG("Creating new TENSOR * \n");
    output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1);

  /* else{
       DEBUG("Reusing TENSOR *\n");
       // FIXIT: Add Assertion to check for null pointer and dimension matching
       output = (Tensor*) result_tensor;
       // FIXIT: output value is trashing - Is this deallocated?
       INFO("output->num_elems = %lu \n", output->data_type);
     }
  */

  DEBUG("Changing placement *\n");
  // Changing output tensor placement from host to device
  changeTensorPlacement(output, DEVICE);
@@ -846,107 +773,6 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ //, void* result_tensor){

void* tensorGemm(void* lhs_ptr, void* rhs_ptr){

  INFO("*** TensorGemm \n");
  profileEvent("tensorGemm");

  Tensor* lhs = (Tensor*) lhs_ptr;
  Tensor* rhs = (Tensor*) rhs_ptr;

  INFO("rhs->dims.num_dims = %d \n", rhs->dims.num_dims);
  INFO("lhs->dims.num_dims = %d \n", lhs->dims.num_dims);

  // FIXIT: Need to be more aware of the implications of alpha and beta
  float alpha = 1.0f, beta = 0.0f;

  // 'm' holds the batch dimension - assuming NCHW format Tensors
  int m = lhs->dims.dim_sizes[0];
  // The last dimension of rhs holds the output neurons
  int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons
  int k = 1;

  // Flattening the dimensions after the batch dimension
  // NOTE: Allowing any number of dimensions > 2 for lhs
  for(int j = 1; j < lhs->dims.num_dims; j++){
    k = k * lhs->dims.dim_sizes[j]; // input neurons
  }

  int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2];
  // Dimension check: k must match across the two tensors
  INFO("m = %d, n = %d, k = %d \n", m, n, k);

  if(rhs_k != k){
    ERROR("rhs rows (%d) and flattened lhs columns (%d) don't match", rhs_k, k);
  }

  // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines
  Tensor* output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1);
  // Changing output tensor placement from host to device
  changeTensorPlacement(output, DEVICE);

  hostToDeviceCopy(lhs);
  hostToDeviceCopy(rhs);

  // NOTE: cuBLAS uses column-major format
  // NOTE: The leading dimension is the FIRST dimension
  // NOTE: The output is N x M in column-major format, i.e. M x N in row-major - what cuDNN expects
  checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N,
                              n, m, k,
                              &alpha,
                              (float*) rhs->gpu_data, k,
                              (float*) lhs->gpu_data, k,
                              &beta,
                              (float*) output->gpu_data, n));

  profileEvent("tensorGemm_end", true);

  return output;
}
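For context on the column-major notes above, the identity they rely on is a layout fact, independent of this particular call:

// A dense M x N row-major buffer and the N x M column-major buffer of its transpose
// are byte-for-byte identical:
//   row-major    C[i][j]     -> offset i*N + j
//   column-major C_T(j, i)   -> offset j + i*N   (same offset)
// So asking column-major cuBLAS for an N x M result fills exactly the M x N
// row-major buffer that the surrounding cuDNN-oriented code expects.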
// FIXIT: Add dimension check assertions throughout the code
void* tensorGemmBias(void* input_ptr, void* bias_ptr){

  INFO("*** TensorGemmBias \n");
  profileEvent("tensorGemmBias");

  Tensor* input = (Tensor*) input_ptr;
  Tensor* bias = (Tensor*) bias_ptr;

  // NOTE: beta is set to 1 to append to input
  // C = A * B + Beta * C
  float alpha = 1.0f, beta = 1.0f;

  // 'm' holds the batch dimension - assuming NCHW format Tensors
  int m = input->dims.dim_sizes[0];
  // The last dimension of the bias holds the output neurons
  int n = bias->dims.dim_sizes[bias->dims.num_dims - 1]; // output neurons

  INFO("m = %d, n = %d \n", m, n);

  hostToDeviceCopy(input);
  hostToDeviceCopy(bias);

  struct Tensor* onevec = (Tensor*) create2DTensor(CUDNN_DATA_FLOAT, m, 1);
  fillOnes(onevec);
  hostToDeviceCopy(onevec);

  // NOTE: cuBLAS uses column-major format
  // NOTE: The leading dimension is just the FIRST dimension
  checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                              n, m, 1,
                              &alpha,
                              (float*) bias->gpu_data, n,
                              (float*) onevec->gpu_data, 1,
                              &beta,
                              (float*) input->gpu_data, n));

  profileEvent("tensorGemmBias_end", true);

  return input;
}
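The ones-vector GEMM above is a standard way to broadcast a bias over the batch; spelled out (a restatement of the call, not new behaviour):

// With beta = 1 the call computes  input (m x n) += onevec (m x 1) * bias (1 x n),
// i.e. for every batch row i and output neuron j:
//   input[i][j] = input[i][j] + 1.0f * bias[j]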
void* tensorRelu(void* input_ptr){

  INFO("*** TensorRelu \n");

@@ -1019,17 +845,6 @@ void* tensorSoftmax(void* input_ptr){

__global__ void clipValues(float* A, float min, float max, int n){

  int id = blockIdx.x * blockDim.x + threadIdx.x;

  if(id < n){
    A[id] = fmaxf(min, A[id]);
    A[id] = fminf(max, A[id]);
  }
}
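A minimal sketch of how such a kernel would typically be launched (the grid sizing and the helper name are illustrative, not taken from the removed code):

// Illustrative only: clamp n device-resident floats into [min_val, max_val].
void launchClipValues(float* d_data, float min_val, float max_val, int n){
  int threads = 128;
  int blocks = (n + threads - 1) / threads;  // round up so every element gets a thread
  clipValues<<<blocks, threads>>>(d_data, min_val, max_val, n);
}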
void* tensorRelu2(void* input_ptr, float min, float max){
@@ -1184,105 +999,3 @@ void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,

/************* GPU Layer API *************/

void* ConvLayer_GPU(void* input,
                    void* filter,
                    void* bias,
                    int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w,
                    int pool_id, int pool_size,
                    int activation_id, // Relu, Tanh, ClipRelu
                    float out_min, float out_max){ // NOTE: min_val, max_val apply to 'ClippedRelu'

  void* conv_out = tensorConvolution(input, filter,
                                     conv_pad_h, conv_pad_w,
                                     conv_stride_h, conv_stride_w,
                                     1, 0);
  void* conv_add;
  if(bias != NULL){
    conv_add = tensorAdd(conv_out, bias);
  }
  else{
    conv_add = conv_out;
  }

  void* activation_out;
  switch(activation_id){
  case -1:
    activation_out = conv_add;
    INFO("NO Activation Function \n");
    break;
  case 0:
    activation_out = tensorTanh(conv_add);
    break;
  case 1:
    activation_out = tensorRelu(conv_add);
    break;
  case 2:
    activation_out = tensorRelu2(conv_add, out_min, out_max);
    break;
  default:
    ERROR("Activation id %d NOT supported \n", activation_id);
    break;
  }

  void* pool_out = activation_out;
  // NOTE: Pooling is skipped when pool_size <= 0
  if(pool_size > 0){
    // FIXME: Currently only using MaxPooling
    pool_out = tensorPooling(activation_out, 0, pool_size, pool_size, 0, 0, pool_size, pool_size);
  }
  else{
    pool_out = activation_out;
  }

  return pool_out;
}
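For reference, a hypothetical call into this removed layer API (the tensor handles are placeholders; the argument meanings follow the signature above):

// Hypothetical: 3x3 convolution, padding 1, stride 1, ReLU, then 2x2 pooling.
void* conv1_out = ConvLayer_GPU(input, conv1_filter, conv1_bias,
                                1, 1,        // conv_pad_h, conv_pad_w
                                1, 1,        // conv_stride_h, conv_stride_w
                                0, 2,        // pool_id, pool_size
                                1,           // activation_id: 1 -> tensorRelu
                                0.0f, 0.0f); // out_min/out_max (only used by ClippedRelu)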
void* FCLayer_GPU(void* input,
                  void* weights,
                  void* bias,
                  int activation_id,
                  float out_min, float out_max){ // NOTE: min_val, max_val apply to 'ClippedRelu'

  void* gemm_out = tensorGemmGPU(input, weights);

  void* gemmbias_out;
  if(bias != NULL){
    gemmbias_out = tensorAdd(gemm_out, bias);
  }
  else{
    gemmbias_out = gemm_out;
  }

  void* activation_out;
  switch(activation_id){
  case -1:
    activation_out = gemmbias_out;
    INFO("No Activation Function \n");
    break;
  case 0:
    activation_out = tensorTanh(gemmbias_out);
    break;
  case 1:
    activation_out = tensorRelu(gemmbias_out);
    break;
  case 2:
    activation_out = tensorRelu2(gemmbias_out, out_min, out_max);
    break;
  default:
    ERROR("Activation id %d NOT supported \n", activation_id);
    break;
  }

  return activation_out;
}
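And a matching hypothetical call for the fully connected variant (again, the tensor names are placeholders):

// Hypothetical: fully connected layer followed by tanh.
void* fc1_out = FCLayer_GPU(conv1_out, fc1_weights, fc1_bias,
                            0,           // activation_id: 0 -> tensorTanh
                            0.0f, 0.0f); // out_min/out_max ignored for tanh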