Commit 85e4b3b9 authored by Hashim Sharif

Batchifying VGG execution

parent 71acbe40
@@ -254,6 +254,45 @@ struct Tensor* readTrainedWeights(const char* file_name, int data_type,
}
struct Tensor* readInputBatch(const char* file_name, int data_type,
int start, int end,
int dim2_size, int dim3_size, int dim4_size){
int dim1_size = end - start;
// FIXIT: Don't assume floating point types
int type_size = 4; // NOTE: Assuming floating point tensors
long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size;
long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size;
float* tensor_data = (float*) malloc(sizeof(float) * num_elems);
int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size;
FILE* file = fopen(file_name, "rb");
if(file == NULL){
printf("Data file %s is not found. Aborting... \n", file_name);
abort();
}
fseek(file, file_header_size, SEEK_SET); // Seek past the first 'start' images (a batch offset; the file has no real header)
size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file);
printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read);
fclose(file);
struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size,
dim3_size, dim4_size);
initTensorData(weights, tensor_data, size_in_bytes);
free(tensor_data);
return weights;
}
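The seek above is a computed batch offset rather than a real file header: input.bin is assumed to be a flat array of float32 NCHW images, so the image at index start begins at 4 * start * dim2 * dim3 * dim4 bytes. A minimal standalone sketch of that arithmetic; the dimensions and batch size are illustrative, matching the CIFAR-10 shapes used later in this commit, and none of this code is part of the commit itself:

#include <cstdio>

// Sketch: byte offset where a batch starts inside a flat float32 NCHW file.
// Mirrors the arithmetic in readInputBatch above.
long int batchByteOffset(int start, int chans, int height, int width){
  const int type_size = 4; // float32, as readInputBatch assumes
  return (long int) type_size * start * chans * height * width;
}

int main(){
  // Batch 2 of CIFAR-10 (3x32x32 floats) with batch_size = 1000 => start = 2000
  printf("offset = %ld bytes\n", batchByteOffset(2000, 3, 32, 32)); // 24576000
  return 0;
}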
uint8_t* readLabels(const char* labels_file, int num_labels){
uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
@@ -265,9 +304,33 @@ uint8_t* readLabels(const char* labels_file, int num_labels){
size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
-for(unsigned int i = 0 ; i < 20; i++){
-printf("labels[%d] = %u \n", i, labels[i]);
-}
fclose(file);
+printf("--labels bytes_read = %lu \n", bytes_read);
return labels;
}
uint8_t* readLabelsBatch(const char* labels_file, int start, int end){
int num_labels = end - start;
int file_header_size = sizeof(uint8_t) * start;
uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels);
FILE* file = fopen(labels_file, "rb");
if(file == NULL){
printf("Data file %s is not found. Aborting...\n", labels_file);
abort();
}
fseek(file, file_header_size, SEEK_SET); // Seek past the first 'start' labels (a batch offset; the file has no real header)
size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file);
/*for(unsigned int i = 0 ; i < 20; i++){
printf("labels[%d] = %u \n", i, labels[i]);
}
*/
fclose(file);
@@ -331,7 +394,9 @@ float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsign
printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels);
-for(int i = 0; i < batch_dim; i++){
+//for(int i = 0; i < batch_dim; i++){
+for(int i = 0; i < num_labels; i++){
int chosen = 0;
for (int id = 1; id < num_classes; ++id){
if (data[i * channels + chosen] < data[i * channels + id]) chosen = id;
@@ -342,6 +407,8 @@ float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsign
if(chosen != labels[i])
num_errors++;
//printf("chosen = %d, label = %d \n", chosen, labels[i]);
}
float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0;
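For context, the loop above picks the highest-scoring class per image (an argmax over channels scores) and counts mismatches against the labels. A self-contained sketch of the same pattern on toy data; the names and values here are illustrative, not from the runtime:

#include <cstdint>
#include <cstdio>

// Sketch of the argmax-based accuracy check used in computeAccuracy2.
float accuracyOf(const float* data, const uint8_t* labels,
                 int num_images, int num_classes){
  int num_errors = 0;
  for(int i = 0; i < num_images; i++){
    int chosen = 0;
    for(int id = 1; id < num_classes; ++id)
      if(data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id;
    if(chosen != labels[i]) num_errors++;
  }
  return (num_images - num_errors) * 100.0f / num_images;
}

int main(){
  float scores[2 * 3] = { 0.1f, 0.7f, 0.2f,   // argmax = 1
                          0.5f, 0.3f, 0.2f }; // argmax = 0
  uint8_t labels[2] = { 1, 2 };               // second image is misclassified
  printf("accuracy = %.1f%%\n", accuracyOf(scores, labels, 2, 3)); // 50.0%
  return 0;
}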
@@ -15,9 +15,7 @@ int main(){
std::string dir_prefix = std::string("../model_params/vgg16_cifar10_2/");
std::string input_path = dir_prefix + std::string("input.bin");
-void* input = readTrainedWeights(input_path.c_str(), 0,1000,3,32,32);
std::string labels_path = dir_prefix + std::string("labels.bin");
-uint8_t* labels = readLabels(labels_path.c_str(),10000);
std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3);
std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
@@ -82,8 +80,17 @@ int main(){
startMemTracking();
-for(int i = 0; i < 20; i++){
+int test_input_size = 10000;
+int batch_size = 1000;
+int batch_count = test_input_size / batch_size;
+for(int i = 0; i < batch_count; i++){
+int start = i * batch_size;
+int end = (i + 1) * batch_size;
+void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32);
void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0);
void* var_1 = tensorAdd(var_0, conv2d_1_b);
void* var_2 = tensorRelu(var_1);
@@ -135,7 +142,9 @@ int main(){
void* var_59 = tensorAdd(var_58, dense_2_b);
void* var_60 = tensorSoftmax(var_59);
-computeAccuracy2(labels,10000,var_60);
+uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end);
+computeAccuracy2(labels,batch_size,var_60);
+freeBatchMemory();
}
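Taken together, the main() changes swap the fixed 20-iteration loop for a driver that derives batch_count from the test-set size and reads inputs and labels per mini-batch. A condensed sketch of that lifecycle; runVgg16 is a hypothetical stand-in for the tensorConvolution/.../tensorSoftmax chain elided from the hunk above:

// Condensed sketch of the batched driver introduced by this commit.
int test_input_size = 10000;
int batch_size = 1000;
int batch_count = test_input_size / batch_size;

startMemTracking(); // begin tracking tensors allocated from here on

for(int i = 0; i < batch_count; i++){
  int start = i * batch_size;
  int end = (i + 1) * batch_size;

  void* input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32);
  void* probs = runVgg16(input); // hypothetical: the ~60 tensor ops shown above
  uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end);

  computeAccuracy2(labels, batch_size, probs);
  freeBatchMemory(); // release this mini-batch's tensors before the next one
}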
@@ -15,7 +15,8 @@
#include <cudnn.h>
#include <cublas_api.h>
#include "tensor.h"
#include <string>
#include <unordered_map>
#define ERROR_INJECTION_ENABLED 0
#define PROMISE_MODE 1
@@ -44,4 +45,9 @@ std::vector<void*> host_ptr;
std::vector<void*> obj_ptr;
// Profiling Data
std::unordered_map<std::string, int> func_counters;
std::string profile_data = "";
#endif
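One thing to watch: global_data.h now appears to define func_counters and profile_data at namespace scope, and the next hunk drops the corresponding definitions from the profiling source file. If this header is ever included from more than one translation unit, those definitions would collide at link time. The usual idiom, sketched here as an alternative rather than what the commit does, is an extern declaration in the header with a single definition in one .cc file:

// global_data.h -- declaration only (sketch of the extern idiom)
#include <string>
#include <unordered_map>
extern std::unordered_map<std::string, int> func_counters;
extern std::string profile_data;

// profiling.cc -- the one definition
std::unordered_map<std::string, int> func_counters;
std::string profile_data = "";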
@@ -16,12 +16,11 @@
#include <unordered_map>
#include <cuda_runtime.h>
#include "global_data.h"
/***** Profiling routines ***/
-std::unordered_map<std::string, int> func_counters;
-std::string profile_data = "";
std::chrono::time_point<std::chrono::high_resolution_clock> start_time;
// previous_time maintains time for the latest timed operation
@@ -46,7 +45,7 @@ extern "C"{
}
-void profileEvent(char* event_name, bool compare_previous = false){
+void profileEvent(const char* event_name, bool compare_previous = false){
checkCudaErrors(cudaDeviceSynchronize());
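The signature change above matters once profileEvent is called with string literals: in C++11 and later a literal is a const char array and no longer converts to a mutable char*. A minimal sketch (the printf body is illustrative, not the runtime's implementation):

#include <cstdio>

// Sketch: why the parameter must be const char* when called with literals.
void profileEvent(const char* event_name, bool compare_previous = false){
  printf("event: %s\n", event_name);
}

int main(){
  // "tensorConv" has type const char[11] and decays to const char*;
  // against the old char* signature this call is ill-formed in C++11+.
  profileEvent("tensorConv");
  return 0;
}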
@@ -143,7 +143,9 @@ void dumpAccuracyNorms();
void readOpenTunerFlags(const char* file_name);
void clearOpCounter();
void clearTensorMap();
void startMemTracking();
void freeOutputTensors();
void freeBatchMemory();
void* quantizeTensorPromise(void* input_ptr, float min, float max);
void* addPromiseError(void* x_ptr, int error_scale);
@@ -114,6 +114,14 @@ void clearTensorMap(){
}
void startMemTracking(){
tensors_ptr.clear();
host_ptr.clear();
obj_ptr.clear();
}
void freeOutputTensors(){
for(int i = 0; i < tensors_ptr.size(); i++){
@@ -141,6 +149,15 @@ void clearOpCounter(){
void freeBatchMemory(){
// Free allocated memory for the current mini-batch
freeOutputTensors();
// Reinitialize counter for OpenTuner flags for the next mini-batch of execution
op_counter = 0;
// Clearing profiling data map
func_counters.clear();
}
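The memory-tracking pieces fit together as: startMemTracking() clears the pointer lists, each tensor op pushes its allocations onto them, and freeBatchMemory() releases everything and resets the per-batch counters. A sketch of that track-then-free pattern; the real runtime keeps these vectors in global_data.h, and the exact deallocators are assumptions since the actual free calls sit in hunks not shown here:

#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>

std::vector<void*> tensors_ptr, host_ptr;

void startMemTracking(){
  tensors_ptr.clear();
  host_ptr.clear();
}

void freeOutputTensors(){
  for(size_t i = 0; i < tensors_ptr.size(); i++)
    cudaFree(tensors_ptr[i]);   // device buffers (assumed)
  for(size_t i = 0; i < host_ptr.size(); i++)
    free(host_ptr[i]);          // host buffers (assumed)
  startMemTracking();           // reset the lists for the next mini-batch
}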
@@ -372,8 +389,8 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
int vertical_stride, int horizontal_stride,
int conv_mode, int compute_precision){
llvm_hpvm_initTensorRt(0);
INFO("*** TensorConvolution \n");
profileEvent("tensorConv");
@@ -404,7 +421,10 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
// TODO: Support other cases;
hostToDeviceCopy(input);
hostToDeviceCopy(filter);
INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
// FIXIT: Think if upscaling values need to be configurable?
// IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context
@@ -442,7 +462,7 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
changeTensorPlacement(output, DEVICE);
// NOTE: Necessary to insert the above call for every output tensor
DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n",
DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1],
output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
@@ -520,11 +540,7 @@ void* tensorConvolution(void* input_ptr, void* filter_ptr,
-// FIXIT: Currently this only computes MAX pooling
-// FIXIT: Add support for Average Pooling
+// NOTE: Supports Max and Avg Pooling
void* tensorPooling(void* input_ptr,
int poolFunction,
int window_height, int window_width,
@@ -567,14 +583,14 @@ void* tensorPooling(void* input_ptr,
h, w));
cudnnPoolingMode_t pool_mode;
if(poolFunction == 0)
pool_mode = CUDNN_POOLING_MAX;
else if(poolFunction == 1)
-pool_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
// FIXIT: Make the pool function (max, min, avg) configurable
checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc,
//CUDNN_POOLING_MAX,
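The INCLUDE_PADDING to EXCLUDE_PADDING switch changes how average pooling treats window cells that fall in the zero padding: with EXCLUDE, the divisor counts only cells inside the input, so border averages are not dragged toward zero. A minimal sketch of the mode selection, assuming the 0 = max / 1 = average convention used by tensorPooling above:

#include <cudnn.h>

// Sketch: map tensorPooling's integer poolFunction to a cuDNN pooling mode.
// EXCLUDE_PADDING divides border-window sums by the number of valid
// (non-padding) cells rather than by the full window size.
cudnnPoolingMode_t selectPoolMode(int poolFunction){
  if(poolFunction == 0)
    return CUDNN_POOLING_MAX;
  // poolFunction == 1: average pooling that ignores padded cells
  return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
}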