From 85e4b3b92594878456a448a5e88581f7fd8aa169 Mon Sep 17 00:00:00 2001
From: Hashim Sharif <hsharif3@tyler.cs.illinois.edu>
Date: Thu, 28 Feb 2019 16:47:04 -0600
Subject: [PATCH] Batchifying VGG execution

---
 .../dnn_sources/include/utils.h             | 71 ++++++++++++++++++-
 .../dnn_sources/src/vgg16_cifar10.cc        | 17 +++--
 .../tensor_runtime/include/global_data.h    |  8 ++-
 .../tensor_runtime/include/profiling.h      |  5 +-
 .../tensor_runtime/include/tensor_runtime.h |  2 +
 .../tensor_runtime/src/tensor_runtime.cu    | 36 +++++++---
 6 files changed, 119 insertions(+), 20 deletions(-)

diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h b/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
index ec28fccb78..8004c4a423 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h
@@ -254,6 +254,45 @@ struct Tensor* readTrainedWeights(const char* file_name, int data_type,
 }
 
+
+
+struct Tensor* readInputBatch(const char* file_name, int data_type,
+                              int start, int end,
+                              int dim2_size, int dim3_size, int dim4_size){
+
+  int dim1_size = end - start;
+  // FIXIT: Don't assume floating point types
+  int type_size = 4; // NOTE: Assuming floating point tensors
+  long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
+  long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
+  float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
+  int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
+
+  FILE* file = fopen(file_name, "rb");
+  if(file == NULL){
+    printf("Data file %s is not found. Aborting... \n", file_name);
+    abort();
+  }
+
+  fseek(file, file_header_size, SEEK_SET); // Skip to the start of this batch
+  size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
+
+  printf("size in bytes = %ld, bytes read = %lu \n", size_in_bytes, bytes_read);
+
+  fclose(file);
+
+
+  struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
+                                                           dim3_size, dim4_size);
+
+  initTensorData(weights, tensor_data, size_in_bytes);
+  free(tensor_data);
+
+  return weights;
+}
+
+
+
 uint8_t* readLabels(const char* labels_file, int num_labels){
 
   uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
@@ -265,9 +304,33 @@ uint8_t* readLabels(const char* labels_file, int num_labels){
 
   size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
 
-  for(unsigned int i = 0 ; i < 20; i++){
+  fclose(file);
+
+  printf("--labels bytes_read = %lu \n", bytes_read);
+  return labels;
+}
+
+
+uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
+
+  int num_labels = end - start;
+  int file_header_size = sizeof(uint8_t) * start;
+
+  uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
+  FILE* file = fopen(labels_file, "rb");
+  if(file == NULL){
+    printf("Data file %s is not found. Aborting...\n", labels_file);
+    abort();
+  }
+
+  fseek(file, file_header_size, SEEK_SET); // Skip to the start of this batch
+
+  size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
+
+  /*for(unsigned int i = 0 ; i < 20; i++){
     printf("labels[%d] = %u \n", i, labels[i]);
   }
+  */
 
   fclose(file);
 
@@ -331,7 +394,9 @@ float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsign
 
   printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
 
-  for(int i = 0; i < batch_dim; i++){
+  //for(int i = 0; i < batch_dim; i++){
+  for(int i = 0; i < num_labels; i++){
+
     int chosen = 0;
     for (int id = 1; id < num_classes; ++id){
       if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
@@ -342,6 +407,8 @@ float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsign
 
     if(chosen != labels[i])
       num_errors++;
+
+    //printf("chosen = %d, label = %d \n", chosen, labels[i]);
   }
 
   float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/vgg16_cifar10.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/vgg16_cifar10.cc
index f03b103938..d12060a648 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/vgg16_cifar10.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/vgg16_cifar10.cc
@@ -15,9 +15,7 @@ int main(){
 
   std::string dir_prefix = std::string("../model_params/vgg16_cifar10_2/");
   std::string input_path = dir_prefix + std::string("input.bin");
-  void* input = readTrainedWeights(input_path.c_str(), 0,1000,3,32,32);
   std::string labels_path = dir_prefix + std::string("labels.bin");
-  uint8_t* labels = readLabels(labels_path.c_str(),10000);
   std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
   void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3);
   std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
@@ -82,8 +80,17 @@ int main(){
 
   startMemTracking();
 
-  for(int i = 0; i < 20; i++){
+  int test_input_size = 10000;
+  int batch_size = 1000;
+  int batch_count = test_input_size / batch_size;
+
+  for(int i = 0; i < batch_count; i++){
+
+    int start = i * batch_size;
+    int end = (i + 1) * batch_size;
+    void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
+
     void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
     void* var_1 = tensorAdd(var_0, conv2d_1_b);
     void* var_2 = tensorRelu(var_1);
@@ -135,7 +142,9 @@ int main(){
     void* var_59 = tensorAdd(var_58, dense_2_b);
     void* var_60 = tensorSoftmax(var_59);
 
-    computeAccuracy2(labels,10000,var_60);
+    uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end);
+
+    computeAccuracy2(labels,batch_size,var_60);
 
     freeBatchMemory();
   }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
index a31c7f4081..c31a982857 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/global_data.h
@@ -15,7 +15,8 @@
 #include <cudnn.h>
 #include <cublas_api.h>
 #include "tensor.h"
-
+#include <string>
+#include <unordered_map>
 
 #define ERROR_INJECTION_ENABLED 0
 #define PROMISE_MODE 1
@@ -44,4 +45,9 @@ std::vector<void*> host_ptr;
 std::vector<void*> obj_ptr;
 
 
+// Profiling Data
+std::unordered_map<std::string, int> func_counters;
+std::string profile_data = "";
+
+
 #endif
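The arithmetic behind the new readInputBatch/readLabelsBatch pair is simple but worth spelling out: images are stored contiguously as NCHW float32, so batch [start, end) begins at byte offset 4 * start * C * H * W (for start = 1000 and 3x32x32 CIFAR-10 images, that is 12,288,000 bytes), while labels are one byte each, so the label offset is just start. The following is a minimal, self-contained sketch of that offset math; it is illustrative only and not part of the patch.

// Standalone sanity check of the batch-offset arithmetic used by
// readInputBatch() and readLabelsBatch(). Shapes mirror the CIFAR-10
// setup in vgg16_cifar10.cc (3x32x32 float32 images, uint8_t labels).
#include <cstdio>

int main(){
  const long type_size = 4;            // float32, as readInputBatch assumes
  const long C = 3, H = 32, W = 32;    // CIFAR-10 image dimensions
  const long batch_size = 1000;

  for(int i = 0; i < 2; i++){          // first two mini-batches
    long start = i * batch_size;
    long end   = (i + 1) * batch_size;

    long input_offset = type_size * start * C * H * W;         // fseek target
    long input_bytes  = type_size * (end - start) * C * H * W; // fread size
    long label_offset = start;                                 // 1 byte per label

    printf("batch %d: seek input to %ld, read %ld bytes; seek labels to %ld\n",
           i, input_offset, input_bytes, label_offset);
  }
  return 0;
}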
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h
index ba40ac53b5..7075d51cd6 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/profiling.h
@@ -16,12 +16,11 @@
 #include <unordered_map>
 
 #include <cuda_runtime.h>
+#include "global_data.h"
 
 
 /***** Profiling routines ***/
 
-std::unordered_map<std::string, int> func_counters;
-std::string profile_data = "";
 std::chrono::time_point<std::chrono::high_resolution_clock> start_time;
 // previous_time maintains time for the latest timed operation
 
@@ -46,7 +45,7 @@ extern "C"{
   }
 
 
-  void profileEvent(char* event_name, bool compare_previous = false){
+  void profileEvent(const char* event_name, bool compare_previous = false){
 
     checkCudaErrors(cudaDeviceSynchronize());
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
index ba49976406..bac93a6f01 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h
@@ -143,7 +143,9 @@ void dumpAccuracyNorms();
 void readOpenTunerFlags(const char* file_name);
 void clearOpCounter();
 void clearTensorMap();
+void startMemTracking();
 void freeOutputTensors();
+void freeBatchMemory();
 
 void* quantizeTensorPromise(void* input_ptr, float min, float max);
 void* addPromiseError(void* x_ptr, int error_scale);
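startMemTracking() and freeBatchMemory(), declared above and defined in tensor_runtime.cu below, bracket each mini-batch: the former clears the runtime's global pointer-tracking vectors, the latter frees every output tensor recorded during the batch and resets the OpenTuner op counter and profiling counters, so peak memory stays at roughly one batch's worth. Below is a minimal model of that lifecycle; the trackedAlloc helper is hypothetical, and plain heap allocations stand in for the runtime's tracked GPU tensors so the sketch is self-contained.

// Minimal model of the per-batch memory-tracking pattern this patch adds.
// The real runtime records GPU tensors in global vectors (global_data.h);
// std::malloc stands in here, and trackedAlloc() is an invented helper.
#include <cstdlib>
#include <vector>

static std::vector<void*> tensors_ptr;  // stand-in for the runtime's globals
static int op_counter = 0;

void startMemTracking(){
  tensors_ptr.clear();                  // forget pointers from earlier runs
}

void* trackedAlloc(std::size_t bytes){
  void* p = std::malloc(bytes);
  tensors_ptr.push_back(p);             // record every per-batch allocation
  return p;
}

void freeBatchMemory(){
  for(void* p : tensors_ptr)            // release this batch's outputs
    std::free(p);
  tensors_ptr.clear();
  op_counter = 0;                       // restart OpenTuner flag indexing
}

int main(){
  startMemTracking();
  for(int batch = 0; batch < 10; batch++){
    trackedAlloc(1 << 20);              // stands in for per-layer outputs
    freeBatchMemory();                  // peak memory stays at one batch
  }
  return 0;
}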
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
index 5b9177f34a..8d3fe1d48c 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu
@@ -114,6 +114,14 @@ void clearTensorMap(){
 }
 
 
+void startMemTracking(){
+
+  tensors_ptr.clear();
+  host_ptr.clear();
+  obj_ptr.clear();
+}
+
+
 void freeOutputTensors(){
 
   for(int i = 0; i < tensors_ptr.size(); i++){
@@ -141,6 +149,15 @@ void clearOpCounter(){
 
 
 
+void freeBatchMemory(){
+  // Free allocated memory for the current mini-batch
+  freeOutputTensors();
+  // Reinitialize counter for OpenTuner flags - next mini-batch of execution
+  op_counter = 0;
+  // Clearing profiling data map
+  func_counters.clear();
+}
+
 
 
@@ -372,8 +389,8 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
                         int vertical_stride, int horizontal_stride,
                         int conv_mode, int compute_precision){
 
+  llvm_hpvm_initTensorRt(0);
 
-
   INFO("*** TensorConvolution \n");
   profileEvent("tensorConv");
@@ -404,7 +421,10 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
   // TODO: Support other cases;
   hostToDeviceCopy(input);
   hostToDeviceCopy(filter);
-
+
+
+  INFO("vertical_stride = %d, horizontal_stride = %d \n", vertical_stride, horizontal_stride);
+
   checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
 
   // FIXIT: Think if upscaling values need to be configurable?
   // IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context
@@ -442,7 +462,7 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
   changeTensorPlacement(output, DEVICE);
   // NOTE: Necessary to insert the above call for every output tensor
 
-  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
+  DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
        output->data_type, output->data_format,
        output->dims.dim_sizes[0], output->dims.dim_sizes[1],
        output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
 
@@ -520,11 +540,7 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
 
 
 
-
-
-
-// FIXIT: Currently this only computes MAX pooling
-// FIXIT: Add support for Average Pooling
+// NOTE: Supports Max and Avg Pooling
 void* tensorPooling(void* input_ptr,
                     int poolFunction,
                     int window_height, int window_width,
@@ -567,14 +583,14 @@ void* tensorPooling(void* input_ptr,
                                         h, w));
 
 
-
   cudnnPoolingMode_t pool_mode;
   if(poolFunction == 0)
     pool_mode = CUDNN_POOLING_MAX;
   else if(poolFunction == 1)
-    pool_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+    pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
 
+  // FIXIT: Make the pool function (max, min, avg) configurable
   checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc,
                                          //CUDNN_POOLING_MAX,
-- 
GitLab
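A note on the last hunk: CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING divides each window's sum by the number of real (non-padded) elements it covers, whereas the INCLUDE_PADDING variant always divides by the full window size. For a window covering real values {1, 2, 3} plus one padded zero, EXCLUDE yields (1+2+3)/3 = 2.0 while INCLUDE yields (1+2+3+0)/4 = 1.5. The sketch below replays the EXCLUDE divisor rule on the CPU; it illustrates the semantics only and is not cuDNN code.

// CPU illustration of average pooling with COUNT_EXCLUDE_PADDING semantics,
// matching the mode tensorPooling() now selects for poolFunction == 1.
#include <cstdio>

int main(){
  const int H = 3, W = 3;
  float in[H][W] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  const int win = 2, stride = 2, pad = 1;   // 2x2 window, padded border

  for(int oy = 0; oy < 2; oy++){
    for(int ox = 0; ox < 2; ox++){
      float sum = 0; int count = 0;
      for(int ky = 0; ky < win; ky++){
        for(int kx = 0; kx < win; kx++){
          int y = oy * stride - pad + ky;
          int x = ox * stride - pad + kx;
          if(y >= 0 && y < H && x >= 0 && x < W){  // padded cells are skipped
            sum += in[y][x];
            count++;                  // EXCLUDE: divisor counts real cells only
          }
        }
      }
      printf("out[%d][%d] = %.3f\n", oy, ox, sum / count);
    }
  }
  return 0;
}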