Skip to content
Snippets Groups Projects
Commit 73c2b68a authored by Hashim Sharif's avatar Hashim Sharif
Browse files

Adding BatchNorm to Tensor Runtime

parent 1f4e4528
No related branches found
No related tags found
No related merge requests found
......@@ -238,6 +238,11 @@ void testTensorConv3(){
}
void testLRN(){
void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20);
......@@ -317,10 +322,10 @@ void testTensorError(){
void testTensorConv(){
// NOTE: The input channel count value (param2 to Tensor and Filter) must be the same
void* x3 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 1, 4, 4);
void* x3 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
// NOTE: Filter descriptors do NOT have batch size
// NOTE: First two dims are output channels (configurable), input channels (MUST match input channels)
void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, 1, 2, 2);
void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2);
fillTensorWithOnes(x3);
fillTensorWithOnes(filter);
......@@ -331,11 +336,31 @@ void testTensorConv(){
1, 1, conv_mode, compute_precision);
printTensorValues(conv1);
// NOTE: For cudnnTensorAdd, the only dimension that MUST match is channels
void* bias3 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
fillTensorWithOnes(bias3);
tensorAdd(conv1, bias3);
}
void testTensorGroupedConv(){

  // Grouped (depthwise-style) convolution test: with conv_groups = 2, the
  // input channel count (dim 2 of the input) must equal
  // groups * filter-input-channels (dim 2 of the filter).
  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4);
  // NOTE: Filter descriptors do NOT have a batch dimension; dims are
  // (output channels, input channels per group, kernel H, kernel W)
  void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2);

  fillTensorWithOnes(input);
  fillTensorWithOnes(filter);

  int conv_mode = 1;   // 1 => CROSS_CORRELATION
  int conv_groups = 2; // two groups: each output channel sees one input channel

  // Zero padding, unit strides
  void* conv_out = tensorConvolution(input, filter, 0, 0,
				     1, 1, conv_mode, conv_groups);
  printTensorValues(conv_out);
}
......@@ -347,6 +372,28 @@ void testTensorPool(){
}
void testTensorBatchNorm(){

  // 1x3x2x2 input, every element set to 3
  void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2);
  fillTensorWithVal(input, 3);

  // Per-channel (1xCx1x1) batch-norm parameters:
  // identity scale (gamma = 1), zero shift (beta = 0),
  // running mean = 1, running variance = 1.
  void* scale = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
  fillTensorWithVal(scale, 1);

  void* shift = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
  fillTensorWithVal(shift, 0);

  void* running_mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
  fillTensorWithVal(running_mean, 1);

  void* running_var = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1);
  fillTensorWithVal(running_var, 1);

  void* result = tensorBatchNorm(input, scale, shift, running_mean, running_var, 0.01);
  printTensorValues(result);
}
void testTensorRelu(){
......@@ -438,8 +485,13 @@ int main(){
//testTensorConv();
//testTensorError();
testQuantization();
//testQuantization();
//testTensorConv();
//testTensorGroupedConv();
testTensorBatchNorm();
//testTensorGemm();
//testTensorGemmGPU();
//testTensorGemmBias();
......
......@@ -100,6 +100,11 @@ extern "C"{
// NOTE: In-place operation
void* tensorSoftmax(void* input);
// NOTE: In-place operation
void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
void* mean_ptr, void* variance_ptr, double epsilon);
/* Error injection API - used for accuracy tuning */
void* tensorAddError(void* x_ptr, int error_scale);
......
......@@ -34,6 +34,7 @@ void dummyFunction(){
void* tensorTanhPtr = (void*) &tensorTanh;
void* tensorHalfTanhPtr = (void*) &tensorHalfTanh;
void* tensorSoftmaxPtr = (void*) &tensorSoftmax;
void* tensorBatchNormPtr = (void*) &tensorBatchNorm;
void* tensorAddErrorPtr = (void*) &tensorAddError;
void* ConvLayer = (void*) &ConvLayer_PROMISE;
void* FCLayer = (void*) &FCLayer_PROMISE;
......
......@@ -1090,6 +1090,58 @@ void* tensorTanh(void* input_ptr){
// In-place batch normalization (inference mode) via cuDNN.
//
// input_ptr:    NCHW input tensor; normalized in place and returned.
// gamma_ptr:    per-channel scale (expected 1xCx1x1 -- TODO confirm caller contract).
// beta_ptr:     per-channel shift.
// mean_ptr:     per-channel running mean.
// variance_ptr: per-channel running variance.
// epsilon:      variance stabilizer; clamped up to cuDNN's documented minimum.
//
// Returns input_ptr (the operation overwrites the input buffer).
void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr,
		      void* mean_ptr, void* variance_ptr, double epsilon){

  INFO("*** TensorBatchNorm \n");
  profileEvent("tensorBatchNorm");

  Tensor* input = (Tensor*) input_ptr;
  Tensor* gamma = (Tensor*) gamma_ptr;
  Tensor* beta = (Tensor*) beta_ptr;
  Tensor* mean = (Tensor*) mean_ptr;
  Tensor* variance = (Tensor*) variance_ptr;

  // Blend factors: out = 1 * result + 0 * out (plain overwrite)
  float alpha_val = 1.0f, beta_val = 0.0f;

  hostToDeviceCopy(input);
  hostToDeviceCopy(gamma);
  hostToDeviceCopy(beta);
  hostToDeviceCopy(mean);
  hostToDeviceCopy(variance);

  // cuDNN rejects epsilon values below CUDNN_BN_MIN_EPSILON (1e-5) with
  // CUDNN_STATUS_BAD_PARAM, which would abort inside checkCUDNN. Clamp
  // defensively so small caller-supplied epsilons still work.
  const double min_epsilon = 1e-5; // == CUDNN_BN_MIN_EPSILON
  if(epsilon < min_epsilon){
    INFO("tensorBatchNorm: epsilon below cuDNN minimum - clamping to 1e-5 \n");
    epsilon = min_epsilon;
  }

  // CUDNN_BATCHNORM_SPATIAL: one mean/variance/scale/shift value per channel.
  // Passing the same descriptor/buffer for x and y makes this in-place.
  checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle, CUDNN_BATCHNORM_SPATIAL,
						     &alpha_val, &beta_val,
						     input->tensor_desc, input->gpu_data,
						     input->tensor_desc, input->gpu_data,
						     gamma->tensor_desc, gamma->gpu_data,
						     beta->gpu_data, mean->gpu_data, variance->gpu_data,
						     epsilon));

  profileEvent("tensorBatchNorm_end", true);

#ifdef ERROR_INJECTION_ENABLED
  // Guard against walking past the per-op accuracy table.
  if(op_counter >= total_ops){
    ERROR("No accuracy flag found \n");
  }

  int op_acc = op_accuracies[op_counter];

  void* error_norms = tensorAddError(input, op_acc);
  add_norms(error_norms, "tensorBatchNorm", op_acc);
  add_relu_overheads(input, op_acc);

  op_counter++;
#endif

  return input;
}
/************* GPU Layer API *************/
void* ConvLayer_GPU(void* input,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment