From 85e4b3b92594878456a448a5e88581f7fd8aa169 Mon Sep 17 00:00:00 2001
From: Hashim Sharif <hsharif3@tyler.cs.illinois.edu>
Date: Thu, 28 Feb 2019 16:47:04 -0600
Subject: [PATCH] Batchifying VGG execution

Run the VGG16 CIFAR-10 test set in mini-batches of 1000 inputs instead of a
single pass: add readInputBatch()/readLabelsBatch() helpers that read a
[start, end) slice of the input and label files, and add startMemTracking()
and freeBatchMemory() runtime calls to release per-batch allocations between
iterations. The profiling globals move from profiling.h into global_data.h,
and average pooling now excludes padding.
---
 .../dnn_sources/include/utils.h               | 71 ++++++++++++++++++-
 .../dnn_sources/src/vgg16_cifar10.cc          | 17 +++--
 .../tensor_runtime/include/global_data.h      |  8 ++-
 .../tensor_runtime/include/profiling.h        |  5 +-
 .../tensor_runtime/include/tensor_runtime.h   |  2 +
 .../tensor_runtime/src/tensor_runtime.cu      | 36 +++++++---
 6 files changed, 119 insertions(+), 20 deletions(-)
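
The new helpers slice the binary input and label files by byte offset:
readInputBatch() seeks past start * dim2 * dim3 * dim4 * 4 bytes (float32 is
assumed) and reads end - start images, while readLabelsBatch() does the same
with one byte per label. Below is a minimal sketch of the per-batch driver
this enables, assuming the CIFAR-10 shapes (3x32x32) and file names used in
vgg16_cifar10.cc; runNetwork() is a hypothetical stand-in for the VGG16 layer
sequence:

    #include "utils.h"           // readInputBatch, readLabelsBatch, computeAccuracy2
    #include "tensor_runtime.h"  // startMemTracking, freeBatchMemory

    void* runNetwork(void* input);  // hypothetical: the VGG16 layer calls

    void evalInBatches(){
      int test_input_size = 10000;
      int batch_size = 1000;

      startMemTracking();
      for(int i = 0; i < test_input_size / batch_size; i++){
        int start = i * batch_size;     // e.g. 1000 at the second iteration
        int end = start + batch_size;

        // Skips start * 3 * 32 * 32 * 4 bytes, then reads 1000 images
        void* input = readInputBatch("input.bin", 0, start, end, 3, 32, 32);
        uint8_t* labels = readLabelsBatch("labels.bin", start, end);

        void* output = runNetwork(input);
        computeAccuracy2(labels, batch_size, output);

        freeBatchMemory();  // release this batch's tensors and reset counters
      }
    }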

diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h b/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
index ec28fccb78..8004c4a423 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
@@ -254,6 +254,45 @@ struct Tensor* readTrainedWeights(const char* file_name, int data_type,
 }
 
 
+
+
+struct Tensor* readInputBatch(const char* file_name, int data_type,
+			      int start, int end,
+			      int dim2_size, int dim3_size, int dim4_size){
+
+  int dim1_size = end - start;
+  // FIXIT: Don't assume floating point types
+  int type_size = 4; // NOTE: Assuming floating point tensors
+  long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
+  
+  FILE* file = fopen(file_name, "rb");
+  if(file == NULL){
+    printf("Data file %s is not found. Aborting... \n", file_name);
+    abort();
+  }
+    
+  fseek(file, file_header_size, SEEK_SET); // Seek to this batch's starting byte offset
+  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
+
+  printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
+
+  fclose(file);
+  
+  
+  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
+					                   dim3_size, dim4_size);
+  
+  initTensorData(weights, tensor_data, size_in_bytes);
+  free(tensor_data);
+
+  return weights;
+}
+
+
+
 uint8_t* readLabels(const char* labels_file, int num_labels){
 
   uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
@@ -265,9 +304,33 @@ uint8_t* readLabels(const char* labels_file, int num_labels){
 
   size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
-  for(unsigned int i = 0 ; i < 20; i++){
+  fclose(file);
+  
+  printf("--labels bytes_read = %lu \n", bytes_read);
+  return labels;
+}
+
+
+uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
+
+  int num_labels = end - start;
+  int file_header_size = sizeof(uint8_t) * start;
+  
+  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
+  FILE* file = fopen(labels_file, "rb");
+  if(file == NULL){
+    printf("Data file %s is not found. Aborting...\n", labels_file);
+    abort();
+  }
+  
+  fseek(file, file_header_size, SEEK_SET); // Seek to this batch's starting byte offset
+    
+  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
+
+  /*for(unsigned int i = 0 ; i < 20; i++){
     printf("labels[%d] = %u \n", i, labels[i]);
   }
+  */
 
   fclose(file);
   
@@ -331,7 +394,9 @@ float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsign
 
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
   
-  for(int i = 0; i < batch_dim; i++){
+  // Iterate over num_labels entries rather than the result tensor's batch_dim
+  for(int i = 0; i < num_labels; i++){
+  
     int chosen = 0;
     for (int id = 1; id < num_classes; ++id){
       if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
@@ -342,6 +407,8 @@ float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsign
       
     if(chosen != labels[i])
       num_errors++;
+
+    //printf("chosen = %d, label = %d \n", chosen, labels[i]);
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/vgg16_cifar10.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/vgg16_cifar10.cc
index f03b103938..d12060a648 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/vgg16_cifar10.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/vgg16_cifar10.cc
@@ -15,9 +15,7 @@ int main(){
 
   std::string dir_prefix = std::string("../model_params/vgg16_cifar10_2/"); 
   std::string input_path =  dir_prefix + std::string("input.bin"); 
-  void* input = readTrainedWeights(input_path.c_str(), 0,1000,3,32,32); 
   std::string labels_path =  dir_prefix + std::string("labels.bin"); 
-  uint8_t* labels = readLabels(labels_path.c_str(),10000); 
   std::string conv2d_1_w_path =  dir_prefix + std::string("conv2d_1_w.bin"); 
   void* conv2d_1_w =  readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); 
   std::string conv2d_1_b_path =  dir_prefix + std::string("conv2d_1_b.bin"); 
@@ -82,8 +80,17 @@ int main(){
 
   startMemTracking();
 
-  for(int i = 0; i < 20; i++){
+  int test_input_size = 10000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+
+  for(int i = 0; i < batch_count; i++){
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
     
+    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); 
+ 
     void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); 
     void* var_1 = tensorAdd(var_0, conv2d_1_b); 
     void* var_2 = tensorRelu(var_1); 
@@ -135,7 +142,9 @@ int main(){
     void* var_59 = tensorAdd(var_58, dense_2_b); 
     void* var_60 = tensorSoftmax(var_59); 
 
-    computeAccuracy2(labels,10000,var_60); 
+    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); 
+
+    computeAccuracy2(labels,batch_size,var_60); 
 
     freeBatchMemory();
   }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
index a31c7f4081..c31a982857 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
@@ -15,7 +15,8 @@
 #include <cudnn.h>
 #include <cublas_api.h>
 #include "tensor.h"
-
+#include <string>
+#include <unordered_map>
 
 #define ERROR_INJECTION_ENABLED 0
 #define PROMISE_MODE 1
@@ -44,4 +45,9 @@ std::vector<void*> host_ptr;
 std::vector<void*> obj_ptr;
 
 
+// Profiling Data
+std::unordered_map<std::string, int> func_counters;
+std::string profile_data = "";
+
+
 #endif
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h
index ba40ac53b5..7075d51cd6 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h
@@ -16,12 +16,11 @@
 #include <unordered_map>
 #include <cuda_runtime.h>
 
+#include "global_data.h"
 
 
 /***** Profiling routines ***/
 
-std::unordered_map<std::string, int> func_counters;
-std::string profile_data = "";
 
 std::chrono::time_point<std::chrono::high_resolution_clock> start_time;
 // previous_time maintains time for the latest timed operation
@@ -46,7 +45,7 @@ extern "C"{
   }
 
 
-  void profileEvent(char* event_name, bool compare_previous = false){
+  void profileEvent(const char* event_name, bool compare_previous = false){
 
     checkCudaErrors(cudaDeviceSynchronize());
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
index ba49976406..bac93a6f01 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
@@ -143,7 +143,9 @@ void dumpAccuracyNorms();
 void readOpenTunerFlags(const char* file_name);
 void clearOpCounter();
 void clearTensorMap();
+void startMemTracking();
 void freeOutputTensors();
+void freeBatchMemory();
 void* quantizeTensorPromise(void* input_ptr, float min, float max);
 void* addPromiseError(void* x_ptr, int error_scale);
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 5b9177f34a..8d3fe1d48c 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -114,6 +114,14 @@ void clearTensorMap(){
 }
 
 
+void startMemTracking(){
+
+  tensors_ptr.clear();
+  host_ptr.clear();
+  obj_ptr.clear();
+}
+
+
 void freeOutputTensors(){
 
   for(int i = 0; i < tensors_ptr.size(); i++){
@@ -141,6 +149,15 @@ void clearOpCounter(){
 
 
 
+void freeBatchMemory(){
+  // Free allocated memory for the current mini-batch
+  freeOutputTensors();
+  // Reset the OpenTuner flags counter for the next mini-batch
+  op_counter = 0;
+  // Clear the per-batch profiling counters
+  func_counters.clear();
+}
+
 
 
 
@@ -372,8 +389,8 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
 			int vertical_stride, int horizontal_stride,
 			int conv_mode, int compute_precision){
 
+  
   llvm_hpvm_initTensorRt(0);
-
   
   INFO("*** TensorConvolution \n");
   profileEvent("tensorConv");
@@ -404,7 +421,10 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
   // TODO: Support other cases;  
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
-      
+
+
+  INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
+  
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
   // FIXIT: Think if upscaling values need to be configurable?
   // IMP-FIXIT:  CUDNN Cross correlation is only used in the Lenet context
@@ -442,7 +462,7 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
   changeTensorPlacement(output, DEVICE); 
   // NOTE: Necessary to insert the above call for every output tensor
     
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
 	output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
 	output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
 
@@ -520,11 +540,7 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
 
 
 
-
-
-
-// FIXIT: Currently this only computes MAX pooling
-// FIXIT: Add support for Average Pooling
+// NOTE: Supports Max and Avg Pooling
 void* tensorPooling(void* input_ptr,
 		    int poolFunction,
 		    int window_height, int window_width,
@@ -567,14 +583,14 @@ void* tensorPooling(void* input_ptr,
 					h, w));
 
 
-
   cudnnPoolingMode_t pool_mode;
   if(poolFunction == 0)
     pool_mode = CUDNN_POOLING_MAX;
   else if(poolFunction == 1)
-    pool_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+    pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
 
   
+  
   // FIXIT: Make the pool function (max, min, avg) configurable
   checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc,
 					 //CUDNN_POOLING_MAX,
-- 
GitLab