Commit 85e4b3b9 authored by Hashim Sharif

Batchifying VGG execution

parent 71acbe40
@@ -254,6 +254,45 @@ struct Tensor* readTrainedWeights(const char* file_name, int data_type,
}
struct Tensor* readInputBatch(const char* file_name, int data_type,
int start, int end,
int dim2_size, int dim3_size, int dim4_size){
int dim1_size = end - start;
// FIXIT: Don't assume floating point types
int type_size = 4; // NOTE: Assuming floating point tensors
long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
FILE* file = fopen(file_name, "rb");
if(file == NULL){
printf("Data file %s is not found. Aborting... \n", file_name);
abort();
}
fseek(file, file_header_size, SEEK_SET); // Seek past the first 'start' images (a batch offset; the file has no real header)
size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
fclose(file);
struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
dim3_size, dim4_size);
initTensorData(weights, tensor_data, size_in_bytes);
free(tensor_data);
return weights;
}
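The seek above is a computed batch offset rather than a real file header: input.bin is assumed to be a flat array of float32 NCHW images, so the image at index start begins at 4 * start * dim2 * dim3 * dim4 bytes. A minimal standalone sketch of that arithmetic; the dimensions and batch size are illustrative, matching the CIFAR-10 shapes used later in this commit, and none of this code is part of the commit itself:

#include <cstdio>

// Sketch: byte offset where a batch starts inside a flat float32 NCHW file.
// Mirrors the arithmetic in readInputBatch above.
long int batchByteOffset(int start, int chans, int height, int width){
  const int type_size = 4; // float32, as readInputBatch assumes
  return (long int) type_size * start * chans * height * width;
}

int main(){
  // Batch 2 of CIFAR-10 (3x32x32 floats) with batch_size = 1000 => start = 2000
  printf("offset = %ld bytes\n", batchByteOffset(2000, 3, 32, 32)); // 24576000
  return 0;
}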
uint8_t* readLabels(const char* labels_file, int num_labels){
uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
@@ -265,9 +304,33 @@ uint8_t* readLabels(const char* labels_file, int num_labels){
size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
-for(unsigned int i = 0 ; i < 20; i++){
-printf("labels[%d] = %u \n", i, labels[i]);
-}
fclose(file);
+printf("--labels bytes_read = %lu \n", bytes_read);
return labels;
}
uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
int num_labels = end - start;
int file_header_size = sizeof(uint8_t) * start;
uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
FILE* file = fopen(labels_file, "rb");
if(file == NULL){
printf("Data file %s is not found. Aborting...\n", labels_file);
abort();
}
fseek(file, file_header_size, SEEK_SET); // Seek past the first 'start' labels (a batch offset; the file has no real header)
size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
/*for(unsigned int i = 0 ; i < 20; i++){
printf("labels[%d] = %u \n", i, labels[i]);
}
*/
fclose(file);
@@ -331,7 +394,9 @@ float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsign
printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-for(int i = 0; i < batch_dim; i++){
+//for(int i = 0; i < batch_dim; i++){
+for(int i = 0; i < num_labels; i++){
int chosen = 0;
for (int id = 1; id < num_classes; ++id){
if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
@@ -342,6 +407,8 @@ float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsign
if(chosen != labels[i])
num_errors++;
//printf("chosen = %d, label = %d \n", chosen, labels[i]);
}
float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
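For context, the loop above picks the highest-scoring class per image (an argmax over channels scores) and counts mismatches against the labels. A self-contained sketch of the same pattern on toy data; the names and values here are illustrative, not from the runtime:

#include <cstdint>
#include <cstdio>

// Sketch of the argmax-based accuracy check used in computeAccuracy2.
float accuracyOf(const float* data, const uint8_t* labels,
                 int num_images, int num_classes){
  int num_errors = 0;
  for(int i = 0; i < num_images; i++){
    int chosen = 0;
    for(int id = 1; id < num_classes; ++id)
      if(data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
    if(chosen != labels[i]) num_errors++;
  }
  return (num_images - num_errors) * 100.0f / num_images;
}

int main(){
  float scores[2 * 3] = { 0.1f, 0.7f, 0.2f,   // argmax = 1
                          0.5f, 0.3f, 0.2f }; // argmax = 0
  uint8_t labels[2] = { 1, 2 };               // second image is misclassified
  printf("accuracy = %.1f%%\n", accuracyOf(scores, labels, 2, 3)); // 50.0%
  return 0;
}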
@@ -15,9 +15,7 @@ int main(){
std::string dir_prefix = std::string("../model_params/vgg16_cifar10_2/");
std::string input_path = dir_prefix + std::string("input.bin");
-void* input = readTrainedWeights(input_path.c_str(), 0,1000,3,32,32);
std::string labels_path = dir_prefix + std::string("labels.bin");
-uint8_t* labels = readLabels(labels_path.c_str(),10000);
std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3);
std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
@@ -82,8 +80,17 @@ int main(){
startMemTracking();
-for(int i = 0; i < 20; i++){
+int test_input_size = 10000;
+int batch_size = 1000;
+int batch_count = test_input_size / batch_size;
+for(int i = 0; i < batch_count; i++){
+int start = i * batch_size;
+int end = (i + 1) * batch_size;
+void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
void* var_1 = tensorAdd(var_0, conv2d_1_b);
void* var_2 = tensorRelu(var_1);
@@ -135,7 +142,9 @@ int main(){
void* var_59 = tensorAdd(var_58, dense_2_b);
void* var_60 = tensorSoftmax(var_59);
-computeAccuracy2(labels,10000,var_60);
+uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end);
+computeAccuracy2(labels,batch_size,var_60);
+freeBatchMemory();
}
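Taken together, the main() changes swap the fixed 20-iteration loop for a driver that derives batch_count from the test-set size and reads inputs and labels per mini-batch. A condensed sketch of that lifecycle; runVgg16 is a hypothetical stand-in for the tensorConvolution/.../tensorSoftmax chain elided from the hunk above:

// Condensed sketch of the batched driver introduced by this commit.
int test_input_size = 10000;
int batch_size = 1000;
int batch_count = test_input_size / batch_size;

startMemTracking(); // begin tracking tensors allocated from here on

for(int i = 0; i < batch_count; i++){
  int start = i * batch_size;
  int end = (i + 1) * batch_size;

  void* input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
  void* probs = runVgg16(input); // hypothetical: the ~60 tensor ops shown above
  uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end);

  computeAccuracy2(labels, batch_size, probs);
  freeBatchMemory(); // release this mini-batch's tensors before the next one
}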
@@ -15,7 +15,8 @@
#include <cudnn.h>
#include <cublas_api.h>
#include "tensor.h"
#include <string>
#include <unordered_map>
#define ERROR_INJECTION_ENABLED 0
#define PROMISE_MODE 1
@@ -44,4 +45,9 @@ std::vector<void*> host_ptr;
std::vector<void*> obj_ptr;
// Profiling Data
std::unordered_map<std::string, int> func_counters;
std::string profile_data = "";
#endif
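One thing to watch: global_data.h now appears to define func_counters and profile_data at namespace scope, and the next hunk drops the corresponding definitions from the profiling source file. If this header is ever included from more than one translation unit, those definitions would collide at link time. The usual idiom, sketched here as an alternative rather than what the commit does, is an extern declaration in the header with a single definition in one .cc file:

// global_data.h -- declaration only (sketch of the extern idiom)
#include <string>
#include <unordered_map>
extern std::unordered_map<std::string, int> func_counters;
extern std::string profile_data;

// profiling.cc -- the one definition
std::unordered_map<std::string, int> func_counters;
std::string profile_data = "";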
@@ -16,12 +16,11 @@
#include <unordered_map>
#include <cuda_runtime.h>
#include "global_data.h"
/***** Profiling routines ***/
-std::unordered_map<std::string, int> func_counters;
-std::string profile_data = "";
std::chrono::time_point<std::chrono::high_resolution_clock> start_time;
// previous_time maintains time for the latest timed operation
@@ -46,7 +45,7 @@ extern "C"{
}
-void profileEvent(char* event_name, bool compare_previous = false){
+void profileEvent(const char* event_name, bool compare_previous = false){
checkCudaErrors(cudaDeviceSynchronize());
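The signature change above matters once profileEvent is called with string literals: in C++11 and later a literal is a const char array and no longer converts to a mutable char*. A minimal sketch (the printf body is illustrative, not the runtime's implementation):

#include <cstdio>

// Sketch: why the parameter must be const char* when called with literals.
void profileEvent(const char* event_name, bool compare_previous = false){
  printf("event: %s\n", event_name);
}

int main(){
  // "tensorConv" has type const char[11] and decays to const char*;
  // against the old char* signature this call is ill-formed in C++11+.
  profileEvent("tensorConv");
  return 0;
}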
@@ -143,7 +143,9 @@ void dumpAccuracyNorms();
void readOpenTunerFlags(const char* file_name);
void clearOpCounter();
void clearTensorMap();
void startMemTracking();
void freeOutputTensors();
void freeBatchMemory();
void* quantizeTensorPromise(void* input_ptr, float min, float max);
void* addPromiseError(void* x_ptr, int error_scale);
@@ -114,6 +114,14 @@ void clearTensorMap(){
}
void startMemTracking(){
tensors_ptr.clear();
host_ptr.clear();
obj_ptr.clear();
}
void freeOutputTensors(){
for(int i = 0; i < tensors_ptr.size(); i++){
@@ -141,6 +149,15 @@ void clearOpCounter(){
void freeBatchMemory(){
// Free allocated memory for the current mini-batch
freeOutputTensors();
// Reinitialize counter for OpenTuner flags for the next mini-batch of execution
op_counter = 0;
// Clearing profiling data map
func_counters.clear();
}
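The memory-tracking pieces fit together as: startMemTracking() clears the pointer lists, each tensor op pushes its allocations onto them, and freeBatchMemory() releases everything and resets the per-batch counters. A sketch of that track-then-free pattern; the real runtime keeps these vectors in global_data.h, and the exact deallocators are assumptions since the actual free calls sit in hunks not shown here:

#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>

std::vector<void*> tensors_ptr, host_ptr;

void startMemTracking(){
  tensors_ptr.clear();
  host_ptr.clear();
}

void freeOutputTensors(){
  for(size_t i = 0; i < tensors_ptr.size(); i++)
    cudaFree(tensors_ptr[i]);   // device buffers (assumed)
  for(size_t i = 0; i < host_ptr.size(); i++)
    free(host_ptr[i]);          // host buffers (assumed)
  startMemTracking();           // reset the lists for the next mini-batch
}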
@@ -372,8 +389,8 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
int vertical_stride, int horizontal_stride,
int conv_mode, int compute_precision){
llvm_hpvm_initTensorRt(0);
INFO("*** TensorConvolution \n");
profileEvent("tensorConv");
@@ -404,7 +421,10 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
// TODO: Support other cases;
hostToDeviceCopy(input);
hostToDeviceCopy(filter);
INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
// FIXIT: Think if upscaling values need to be configurable?
// IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context
@@ -442,7 +462,7 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
changeTensorPlacement(output, DEVICE);
// NOTE: Necessary to insert the above call for every output tensor
DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
@@ -520,11 +540,7 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
-// FIXIT: Currently this only computes MAX pooling
-// FIXIT: Add support for Average Pooling
+// NOTE: Supports Max and Avg Pooling
void* tensorPooling(void* input_ptr,
int poolFunction,
int window_height, int window_width,
@@ -567,14 +583,14 @@ void* tensorPooling(void* input_ptr,
h, w));
cudnnPoolingMode_t pool_mode;
if(poolFunction == 0)
pool_mode = CUDNN_POOLING_MAX;
else if(poolFunction == 1)
-pool_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
// FIXIT: Make the pool function (max, min, avg) configurable
checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc,
//CUDNN_POOLING_MAX,
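The INCLUDE_PADDING to EXCLUDE_PADDING switch changes how average pooling treats window cells that fall in the zero padding: with EXCLUDE, the divisor counts only cells inside the input, so border averages are not dragged toward zero. A minimal sketch of the mode selection, assuming the 0 = max / 1 = average convention used by tensorPooling above:

#include <cudnn.h>

// Sketch: map tensorPooling's integer poolFunction to a cuDNN pooling mode.
// EXCLUDE_PADDING divides border-window sums by the number of valid
// (non-padding) cells rather than by the full window size.
cudnnPoolingMode_t selectPoolMode(int poolFunction){
  if(poolFunction == 0)
    return CUDNN_POOLING_MAX;
  // poolFunction == 1: average pooling that ignores padded cells
  return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
}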