diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h index 4eaf88e6d613c51a5a75ef8ce73b55a3410f1dbd..8a97fbf3d31917391c5269d185fd7f72116076bd 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/op_overheads.h @@ -3,22 +3,18 @@ #ifndef OP_OVERHEADS_HEADER #define OP_OVERHEADS_HEADER - #include <sstream> #include "../../tensor_runtime/include/tensor.h" #include "types.h" - float scale_down_factor = 10000.0; float error_factor = 0.1; std::string result_str = ""; - // TODO: Every routine needs testing - // private function -static float getScaledComps(double total_comps, int error_scale){ +static float getScaledComps(double total_comps, int error_scale) { total_comps = total_comps / scale_down_factor; float comp_scale = 1.0 + (error_factor * error_scale); @@ -27,122 +23,107 @@ static float getScaledComps(double total_comps, int error_scale){ return total_comps; } - -static void addNormToResult(float comps){ +static void addNormToResult(float comps) { std::ostringstream ss; ss << std::fixed << comps; - - result_str.append( std::string(ss.str()) ); + + result_str.append(std::string(ss.str())); result_str.append("\t"); } - - -static void addCompsToResult(float comps){ +static void addCompsToResult(float comps) { std::ostringstream ss; ss << std::fixed << comps; - - result_str.append( std::string(ss.str()) ); + + result_str.append(std::string(ss.str())); result_str.append("\n"); } +void add_conv_overheads(void *input_ptr, void *filter_ptr, int strideA, + int strideB, int error_scale) { -void add_conv_overheads(void* input_ptr, void* filter_ptr, - int strideA, int strideB, int error_scale){ - - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; - + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; } +void add_gemm_overheads(void *lhs_ptr, void *rhs_ptr, int error_scale) { -void add_gemm_overheads(void* lhs_ptr, void* rhs_ptr, int error_scale){ + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; - Tensor* lhs = (Tensor*) lhs_ptr; - Tensor* rhs = (Tensor*) rhs_ptr; - int m = lhs->dims.dim_sizes[0]; // The rhs last dimension must contain the neurons - int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons int k = 1; - + // Flattening the dimensions after the batch dimension - for (int j = 1 ; j < lhs->dims.num_dims; j++){ + for (int j = 1; j < lhs->dims.num_dims; j++) { k = k * lhs->dims.dim_sizes[j]; // input neurons } - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2]; + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; // Dimension-note: Check if k is same across the two tensors printf("m = %d, n = %d, k = %d \n", m, n, k); - - if(rhs_k != k){ + + if (rhs_k != k) { printf("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k); abort(); } - + double total_comps = m * n * rhs_k * 1.0; float scaled_comps = getScaledComps(total_comps, error_scale); - + printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n", - error_scale, total_comps, scaled_comps); + error_scale, total_comps, scaled_comps); addCompsToResult(scaled_comps); - } +void add_bias_overheads(void *input_ptr, int error_scale) { -void add_bias_overheads(void* input_ptr, int error_scale){ + Tensor *input = (Tensor *)input_ptr; - Tensor* input = (Tensor*) input_ptr; - double total_comps = 
input->num_elems; float scaled_comps = getScaledComps(total_comps, error_scale); printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n", - error_scale, total_comps, scaled_comps); + error_scale, total_comps, scaled_comps); addCompsToResult(scaled_comps); - } +void add_relu_overheads(void *input_ptr, int error_scale) { + + Tensor *input = (Tensor *)input_ptr; -void add_relu_overheads(void* input_ptr, int error_scale){ - - Tensor* input = (Tensor*) input_ptr; - double total_comps = input->num_elems; float scaled_comps = getScaledComps(total_comps, error_scale); printf("error_scale = %d, total_comps = %f, scaled_comps = %f \n", - error_scale, total_comps, scaled_comps); + error_scale, total_comps, scaled_comps); addCompsToResult(scaled_comps); - -} - -float add_pool_overheads(void* input_ptr, int kernel_size, - int stride_size, int error_scale){ - } +float add_pool_overheads(void *input_ptr, int kernel_size, int stride_size, + int error_scale) {} -void add_norms(void* norms_ptr){ +void add_norms(void *norms_ptr) { - Norm_t* norms = (Norm_t*) norms_ptr; + Norm_t *norms = (Norm_t *)norms_ptr; addNormToResult(norms->l1_norm); addNormToResult(norms->l2_norm); addNormToResult(norms->inf_norm); - } -void dump_result(char* file_name){ +void dump_result(char *file_name) { - FILE* fp = fopen(file_name, "w+"); + FILE *fp = fopen(file_name, "w+"); fwrite(result_str.c_str(), 1, result_str.length(), fp); - fclose(fp); + fclose(fp); } #endif diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h index 3e4f64610da64fb04b6270035da8557e940eb7e2..cafd37f703f4424b778f9d44afdb4b16f2ed1e80 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/types.h @@ -32,7 +32,7 @@ enum Tensor_type_t{ // NOTE: Currently only NCHW is supported due to limited cuDNN support enum Tensor_format_t{ nchw, - nhwc + nhwc }; */ diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h index 5d1e0e66ad1a3402981682ed97e664ddcc173787..178454153bfc475e9bbee99738af0acd679e61ae 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils.h @@ -4,9 +4,9 @@ #define UTILS_HEADER #include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> #include <sstream> #include <vector> #include <bits/stdc++.h> @@ -15,17 +15,14 @@ #include <cmath> #include <string.h> - std::vector<float> run_accuracies; std::string model_params_path = "../../../build/model_params/"; +void printTensorInfo(void *tensor_ptr) { -void printTensorInfo(void* tensor_ptr){ - - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; + struct Tensor *tensor = (struct Tensor *)tensor_ptr; - if(tensor->gpu_data != NULL){ + if (tensor->gpu_data != NULL) { printf("Successful cudaMalloc \n"); } @@ -35,388 +32,363 @@ void printTensorInfo(void* tensor_ptr){ printf("num_elems = %lu \n", tensor->num_elems); } - // FIXIT: Move this to debug.h and include in all files -void dumpWeightsToFile(const char* file_name, void* weights_ptr){ +void dumpWeightsToFile(const char *file_name, void *weights_ptr) { - struct Tensor* weights = (Tensor*) weights_ptr; + struct Tensor *weights = (Tensor *)weights_ptr; // Move data back to host hpvm_request_tensor(weights, 0); - - FILE* fp = fopen(file_name, "wb"); - if(fp == NULL){ - 
printf("File %s could not be created. Check if directory exists \n", file_name); + + FILE *fp = fopen(file_name, "wb"); + if (fp == NULL) { + printf("File %s could not be created. Check if directory exists \n", + file_name); abort(); } - //printf("size_in_bytes = %lu \n", weights->size_in_bytes); - size_t bytes_written = fwrite(weights->host_data, 1, weights->size_in_bytes, fp); - //printf("bytes_written = %lu \n", bytes_written); + // printf("size_in_bytes = %lu \n", weights->size_in_bytes); + size_t bytes_written = + fwrite(weights->host_data, 1, weights->size_in_bytes, fp); + // printf("bytes_written = %lu \n", bytes_written); fclose(fp); } +void fillTensorWithOnes(void *tensor_ptr) { + struct Tensor *tensor = (struct Tensor *)tensor_ptr; -void fillTensorWithOnes(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - hpvm_request_tensor(tensor, 0); - + // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - data_arr[i] = 1.0; + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; + for (unsigned int i = 0; i < tensor->num_elems; i++) { + data_arr[i] = 1.0; } } } +void fillWithOnesAndTwos(void *tensor_ptr) { -void fillWithOnesAndTwos(void* tensor_ptr){ + struct Tensor *tensor = (struct Tensor *)tensor_ptr; - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - hpvm_request_tensor(tensor, 0); - + // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ + for (unsigned int i = 0; i < tensor->num_elems; i++) { if (i % 2 == 0) data_arr[i] = 1.0; else - data_arr[i] = 2.0; + data_arr[i] = 2.0; } /*for(unsigned int i = 0; i < tensor->num_elems/2; i++){ - data_arr[i] = 1.0; + data_arr[i] = 1.0; } for(unsigned int i = tensor->num_elems/2; i < tensor->num_elems; i++){ - data_arr[i] = 2.0; + data_arr[i] = 2.0; }*/ - } } +void fillTensorWithVal(void *tensor_ptr, float target_value) { -void fillTensorWithVal(void* tensor_ptr, float target_value){ + struct Tensor *tensor = (struct Tensor *)tensor_ptr; - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - hpvm_request_tensor(tensor, 0); - + // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - data_arr[i] = target_value; + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; + for (unsigned int i = 0; i < tensor->num_elems; i++) { + data_arr[i] = target_value; } } } +void fillTensorWithNegOnes(void *tensor_ptr) { -void fillTensorWithNegOnes(void* tensor_ptr){ + struct Tensor *tensor = (struct Tensor *)tensor_ptr; - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - hpvm_request_tensor(tensor, 0); - + // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - data_arr[i] = -1.0; + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; + for (unsigned int i = 0; i < tensor->num_elems; i++) { + data_arr[i] = -1.0; } } } +void 
fillTensorVals(void *tensor_ptr) { -void fillTensorVals(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; + struct Tensor *tensor = (struct Tensor *)tensor_ptr; // initialization is specific to the floating point type - if(tensor->data_type == CUDNN_DATA_FLOAT){ - float* data_arr = (float*) tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - data_arr[i] = i + 1; + if (tensor->data_type == CUDNN_DATA_FLOAT) { + float *data_arr = (float *)tensor->host_data; + for (unsigned int i = 0; i < tensor->num_elems; i++) { + data_arr[i] = i + 1; } } } +void printTensorValues(void *tensor_ptr) { -void printTensorValues(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; + struct Tensor *tensor = (struct Tensor *)tensor_ptr; hpvm_request_tensor(tensor, 0); - + // printing is specific to the floating point type - if(tensor->data_type != CUDNN_DATA_FLOAT){ - //printf("\n WARNING: The tensor is non-float type tensor \n\n"); - } + if (tensor->data_type != CUDNN_DATA_FLOAT) { + // printf("\n WARNING: The tensor is non-float type tensor \n\n"); + } - float* data_arr = (float*) tensor->host_data; + float *data_arr = (float *)tensor->host_data; - for(unsigned int i = 0; i < tensor->num_elems; i++){ - printf("%f,", data_arr[i]); + for (unsigned int i = 0; i < tensor->num_elems; i++) { + printf("%f,", data_arr[i]); } - printf("\n"); } +void printTensorDims(void *tensor_ptr) { -void printTensorDims(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; + struct Tensor *tensor = (struct Tensor *)tensor_ptr; printf("Num_elems = %lu \n", tensor->num_elems); - for (int i = 0; i < tensor->dims.num_dims; i++){ + for (int i = 0; i < tensor->dims.num_dims; i++) { printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]); } } +void compareTensors(void *tensor1_ptr, void *tensor2_ptr) { - -void compareTensors(void* tensor1_ptr, void* tensor2_ptr){ - - struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr; - struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr; + struct Tensor *tensor1 = (struct Tensor *)tensor1_ptr; + struct Tensor *tensor2 = (struct Tensor *)tensor2_ptr; hpvm_request_tensor(tensor1, 0); hpvm_request_tensor(tensor2, 0); - float* tensor_data1 = (float*) tensor1->host_data; - float* tensor_data2 = (float*) tensor2->host_data; - - for(unsigned int i = 0; i < tensor1->num_elems; i++){ - if(tensor_data1[i] != tensor_data2[i]){ + float *tensor_data1 = (float *)tensor1->host_data; + float *tensor_data2 = (float *)tensor2->host_data; + + for (unsigned int i = 0; i < tensor1->num_elems; i++) { + if (tensor_data1[i] != tensor_data2[i]) { printf("Tensor data mismatch at index %d \n", i); abort(); } } } +void compareValues(void *tensor_ptr, float *data, size_t num_elems) { + struct Tensor *tensor = (struct Tensor *)tensor_ptr; -void compareValues(void* tensor_ptr, float* data, size_t num_elems){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - hpvm_request_tensor(tensor, 0); - - float* tensor_data = (float*) tensor->host_data; - for(unsigned int i = 0; i < num_elems; i++){ - if(tensor_data[i] != data[i]){ + + float *tensor_data = (float *)tensor->host_data; + for (unsigned int i = 0; i < num_elems; i++) { + if (tensor_data[i] != data[i]) { printf("Tensor data mismatch"); abort(); } } } - -void* readInputTensor(const char* file_name, int data_type, int dim1_size, int dim2_size, - int dim3_size, int dim4_size){ +void *readInputTensor(const char *file_name, int data_type, int dim1_size, + int dim2_size, int dim3_size, 
int dim4_size) { int type_size = 4; // NOTE: Assuming floating point tensors int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - uint8_t* file_data = (uint8_t*) malloc(sizeof(char) * num_elems); - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); + uint8_t *file_data = (uint8_t *)malloc(sizeof(char) * num_elems); + float *tensor_data = (float *)malloc(sizeof(float) * num_elems); int file_header_size = 16; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ + + FILE *file = fopen(file_name, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting... \n", file_name); abort(); } - fseek(file, file_header_size, SEEK_CUR); // Skipping the file header size_t bytes_read = fread(file_data, 1, sizeof(uint8_t) * num_elems, file); fclose(file); - - for (size_t i = 0; i < num_elems; ++i){ - tensor_data[i] = (float) file_data[i] / 255.0f; + + for (size_t i = 0; i < num_elems; ++i) { + tensor_data[i] = (float)file_data[i] / 255.0f; } // NOTE: Using NCHW format - struct Tensor* input = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - + struct Tensor *input = (struct Tensor *)create4DTensor( + data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size); + initTensorData(input, tensor_data, size_in_bytes); // compareValues(input, tensor_data, num_elems); - - return input; -} + return input; +} //*** FIXIT: Move this to CPU-only -struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type, - int dim1_size, int dim2_size, - int dim3_size, int dim4_size){ +struct Tensor *readTrainedWeightsCPU(const char *file_name, int data_type, + int dim1_size, int dim2_size, + int dim3_size, int dim4_size) { // FIXIT: Don't assume floating point types int type_size = 4; // NOTE: Assuming floating point tensors long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); + long int size_in_bytes = + type_size * dim1_size * dim2_size * dim3_size * dim4_size; + float *tensor_data = (float *)malloc(sizeof(float) * num_elems); int file_header_size = 0; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ + + FILE *file = fopen(file_name, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting... 
\n", file_name); abort(); } - + fseek(file, file_header_size, SEEK_CUR); // Skipping the file header size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); - //printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read); + // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, + // bytes_read); fclose(file); - - - struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - + + struct Tensor *weights = (struct Tensor *)create4DTensor( + data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size); + initTensorData(weights, tensor_data, size_in_bytes); - //compareValues(weights, tensor_data, num_elems); + // compareValues(weights, tensor_data, num_elems); free(tensor_data); return weights; } - -struct Tensor* readTrainedWeights(const char* file_name, int data_type, - long int dim1_size, long int dim2_size, - long int dim3_size, long int dim4_size){ +struct Tensor *readTrainedWeights(const char *file_name, int data_type, + long int dim1_size, long int dim2_size, + long int dim3_size, long int dim4_size) { // FIXIT: Don't assume floating point types int type_size = 4; // NOTE: Assuming floating point tensors long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); - //printf("size_in_bytes = %lu \n", size_in_bytes); - + long int size_in_bytes = + type_size * dim1_size * dim2_size * dim3_size * dim4_size; + float *tensor_data = (float *)malloc(sizeof(float) * num_elems); + // printf("size_in_bytes = %lu \n", size_in_bytes); + int file_header_size = 0; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ + + FILE *file = fopen(file_name, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting... 
\n", file_name); abort(); } - + fseek(file, file_header_size, SEEK_CUR); // Skipping the file header size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); - // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read); + // printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, + // bytes_read); fclose(file); - - - struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - + + struct Tensor *weights = (struct Tensor *)create4DTensor( + data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size); + initTensorData(weights, tensor_data, size_in_bytes); - //compareValues(weights, tensor_data, num_elems); + // compareValues(weights, tensor_data, num_elems); free(tensor_data); return weights; } - - - -struct Tensor* readInputBatch(const char* file_name, int data_type, - long int start, long int end, - long int dim2_size, long int dim3_size, long int dim4_size){ +struct Tensor *readInputBatch(const char *file_name, int data_type, + long int start, long int end, long int dim2_size, + long int dim3_size, long int dim4_size) { long int dim1_size = end - start; // FIXIT: Don't assume floating point types long int type_size = 4; // NOTE: Assuming floating point tensors long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); - long int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ + long int size_in_bytes = + type_size * dim1_size * dim2_size * dim3_size * dim4_size; + float *tensor_data = (float *)malloc(sizeof(float) * num_elems); + long int file_header_size = + type_size * start * dim2_size * dim3_size * dim4_size; + + FILE *file = fopen(file_name, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting... 
\n", file_name); abort(); } - + fseek(file, file_header_size, SEEK_SET); // Skipping the file header size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); - fclose(file); - - - struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - + + struct Tensor *weights = (struct Tensor *)create4DTensor( + data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size); + initTensorData(weights, tensor_data, size_in_bytes); free(tensor_data); return weights; } +void *copyInputBatch(const char *file_name, int start, int end, + long int dim2_size, long int dim3_size, long int dim4_size, + void *inputTensor_ptr) { + struct Tensor *inputTensor = (struct Tensor *)inputTensor_ptr; -void* copyInputBatch(const char* file_name, - int start, int end, - long int dim2_size, long int dim3_size, long int dim4_size, - void* inputTensor_ptr){ - - struct Tensor* inputTensor = (struct Tensor*) inputTensor_ptr; - long int dim1_size = end - start; // FIXIT: Don't assume floating point types int type_size = 4; // NOTE: Assuming floating point tensors long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); + long int size_in_bytes = + type_size * dim1_size * dim2_size * dim3_size * dim4_size; + float *tensor_data = (float *)malloc(sizeof(float) * num_elems); int file_header_size = type_size * start * dim2_size * dim3_size * dim4_size; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ + + FILE *file = fopen(file_name, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting... \n", file_name); abort(); } - + fseek(file, file_header_size, SEEK_SET); // Skipping the file header size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); fclose(file); - - + initTensorData(inputTensor, tensor_data, size_in_bytes); free(tensor_data); printf("******NOTE: tensor Dims = %d \n", inputTensor->dims.num_dims); - if(inputTensor->host_data == NULL || inputTensor->gpu_data == NULL) + if (inputTensor->host_data == NULL || inputTensor->gpu_data == NULL) printf("ERROR: NULL data pointers \n"); - - // Chaning Tensor Placement to HOST + // Chaning Tensor Placement to HOST changeTensorPlacement(inputTensor, HOST); - return inputTensor; } +uint8_t *readLabels(const char *labels_file, int num_labels) { - -uint8_t* readLabels(const char* labels_file, int num_labels){ - - uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ + uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels); + FILE *file = fopen(labels_file, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting...\n", labels_file); abort(); } @@ -424,17 +396,15 @@ uint8_t* readLabels(const char* labels_file, int num_labels){ size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); fclose(file); - + return labels; } +uint32_t *readLabels3(const char *labels_file, int num_labels) { - -uint32_t* readLabels3(const char* labels_file, int num_labels){ - - uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ + uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels); + FILE *file = fopen(labels_file, "rb"); + if (file == NULL) { printf("Data file %s is not found. 
Aborting...\n", labels_file); abort(); } @@ -442,264 +412,248 @@ uint32_t* readLabels3(const char* labels_file, int num_labels){ size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file); fclose(file); - + return labels; } - -uint8_t* readLabelsBatch(const char* labels_file, int start, int end){ +uint8_t *readLabelsBatch(const char *labels_file, int start, int end) { int num_labels = end - start; int file_header_size = sizeof(uint8_t) * start; - - uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ + + uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels); + FILE *file = fopen(labels_file, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting...\n", labels_file); abort(); } - + fseek(file, file_header_size, SEEK_SET); // Skipping the file header - - size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); + size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); fclose(file); - + // printf("--labels bytes_read = %lu \n", bytes_read); return labels; } - -uint32_t* readLabelsBatch3(const char* labels_file, int start, int end){ +uint32_t *readLabelsBatch3(const char *labels_file, int start, int end) { int num_labels = end - start; int file_header_size = sizeof(uint32_t) * start; - - uint32_t* labels = (uint32_t*) malloc(sizeof(uint32_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ + + uint32_t *labels = (uint32_t *)malloc(sizeof(uint32_t) * num_labels); + FILE *file = fopen(labels_file, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting...\n", labels_file); abort(); } - + fseek(file, file_header_size, SEEK_SET); // Skipping the file header - - size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file); + size_t bytes_read = fread(labels, 1, sizeof(uint32_t) * num_labels, file); fclose(file); - + return labels; } +void computeAccuracy(const char *labels_file, int num_labels, + void *result_ptr) { + struct Tensor *result = (struct Tensor *)result_ptr; -void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){ - - struct Tensor* result = (struct Tensor*) result_ptr; - - uint8_t* labels = readLabels(labels_file, num_labels); + uint8_t *labels = readLabels(labels_file, num_labels); size_t batch_dim = result->dims.dim_sizes[0]; size_t channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + float *data = (float *)result->host_data; int num_errors = 0; - - for(int i = 0; i < batch_dim; i++){ + + for (int i = 0; i < batch_dim; i++) { int chosen = 0; - for (int id = 1; id < 10; ++id){ - if (data[i * channels + chosen] < data[i * channels + id]) chosen = id; + for (int id = 1; id < 10; ++id) { + if (data[i * channels + chosen] < data[i * channels + id]) + chosen = id; } - - //printf("chosen = %d, label = %d \n", chosen, labels[i]); - if(chosen != labels[i]) + + // printf("chosen = %d, label = %d \n", chosen, labels[i]); + if (chosen != labels[i]) num_errors++; } float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << accuracy; std::string print_str = ss.str(); - + fwrite(print_str.c_str(), 1, print_str.length(), fp); fclose(fp); } - } +// NOTE: batch_size and num_classes are Unused arguments 
+float computeAccuracy2(uint8_t *labels, int batch_size, void *result_ptr, + size_t num_classes = 10) { + struct Tensor *result = (struct Tensor *)result_ptr; - -// NOTE: batch_size and num_classes are Unused arguments -float computeAccuracy2(uint8_t* labels, int batch_size, - void* result_ptr, size_t num_classes = 10){ - - struct Tensor* result = (struct Tensor*) result_ptr; - size_t batch_dim = result->dims.dim_sizes[0]; num_classes = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + float *data = (float *)result->host_data; int num_errors = 0; printf("batch_dim = %lu, channels = %lu \n", batch_dim, num_classes); - - for(unsigned int i = 0; i < batch_dim; i++){ - + + for (unsigned int i = 0; i < batch_dim; i++) { + int chosen = 0; - for (int id = 1; id < num_classes; ++id){ - if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id; + for (int id = 1; id < num_classes; ++id) { + if (data[i * num_classes + chosen] < data[i * num_classes + id]) + chosen = id; } - - if(chosen != labels[i]) - num_errors++; + if (chosen != labels[i]) + num_errors++; } float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << accuracy; std::string print_str = ss.str(); - + fwrite(print_str.c_str(), 1, print_str.length(), fp); } fclose(fp); - return accuracy; + return accuracy; } +float computeAccuracy3(uint32_t *labels, void *result_ptr) { + struct Tensor *result = (struct Tensor *)result_ptr; -float computeAccuracy3(uint32_t* labels, void* result_ptr){ - - struct Tensor* result = (struct Tensor*) result_ptr; - size_t batch_dim = result->dims.dim_sizes[0]; size_t num_classes = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + float *data = (float *)result->host_data; int num_errors = 0; printf("batch_dim = %lu, num_classes = %lu \n", batch_dim, num_classes); - - for(int i = 0; i < batch_dim; i++){ - + + for (int i = 0; i < batch_dim; i++) { + int chosen = 0; - for (int id = 1; id < num_classes; ++id){ - if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id; + for (int id = 1; id < num_classes; ++id) { + if (data[i * num_classes + chosen] < data[i * num_classes + id]) + chosen = id; } - - if(chosen != labels[i]) + + if (chosen != labels[i]) num_errors++; } float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << accuracy; std::string print_str = ss.str(); - + fwrite(print_str.c_str(), 1, print_str.length(), fp); } fclose(fp); - return accuracy; + return accuracy; } - - -struct ClassProb{ +struct ClassProb { float prob; int index; }; - -bool descendFloatComp(ClassProb obj1, ClassProb obj2){ +bool descendFloatComp(ClassProb obj1, ClassProb obj2) { return obj1.prob > obj2.prob; } +float computeTop5Accuracy(uint8_t *labels, int num_labels, void *result_ptr, + unsigned num_classes = 10) { + + struct Tensor *result = (struct Tensor *)result_ptr; -float computeTop5Accuracy(uint8_t* labels, int num_labels, - void* result_ptr, unsigned num_classes = 10){ - - struct Tensor* result = (struct Tensor*) result_ptr; - size_t batch_dim = result->dims.dim_sizes[0]; size_t 
channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + float *data = (float *)result->host_data; int num_errors = 0; printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels); - - for(int i = 0; i < num_labels; i++){ + + for (int i = 0; i < num_labels; i++) { std::vector<ClassProb> elem_probs; - for (int id = 0; id < num_classes; ++id){ + for (int id = 0; id < num_classes; ++id) { ClassProb cProb; cProb.prob = data[i * channels + id]; cProb.index = id; - elem_probs.push_back(cProb); + elem_probs.push_back(cProb); } - std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp); + std: + sort(elem_probs.begin(), elem_probs.end(), descendFloatComp); // Check if any of top-5 predictions matches bool matched = false; - for(int j = 0; j < 5; j++){ + for (int j = 0; j < 5; j++) { ClassProb cProb = elem_probs[j]; - if(cProb.index == labels[i]) + if (cProb.index == labels[i]) matched = true; } - if(!matched) - num_errors +=1; + if (!matched) + num_errors += 1; } float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << accuracy; std::string print_str = ss.str(); - + fwrite(print_str.c_str(), 1, print_str.length(), fp); } fclose(fp); - return accuracy; + return accuracy; } - - - -void dumpFinalAccuracy(float accuracy){ +void dumpFinalAccuracy(float accuracy) { printf("\n\n **** Final Accuracy = %f \n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << accuracy; std::string print_str = ss.str(); - + fwrite(print_str.c_str(), 1, print_str.length(), fp); } @@ -708,44 +662,37 @@ void dumpFinalAccuracy(float accuracy){ run_accuracies.push_back(accuracy); } +void dumpAvgPSNR(float avg_psnr) { - -void dumpAvgPSNR(float avg_psnr){ - - FILE* fp = fopen("avg_psnr", "w+"); - if(fp != NULL){ + FILE *fp = fopen("avg_psnr", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << avg_psnr; - std::string print_str = ss.str(); + std::string print_str = ss.str(); fwrite(print_str.c_str(), 1, print_str.length(), fp); } fclose(fp); } +void dumpPSNRStd(float psnr_std) { -void dumpPSNRStd(float psnr_std){ - - FILE* fp = fopen("psnr_std.txt", "w+"); - if(fp != NULL){ + FILE *fp = fopen("psnr_std.txt", "w+"); + if (fp != NULL) { std::ostringstream ss; ss << std::fixed << psnr_std; - std::string print_str = ss.str(); + std::string print_str = ss.str(); fwrite(print_str.c_str(), 1, print_str.length(), fp); } fclose(fp); } +void dumpExecutionAccuracies() { - - - -void dumpExecutionAccuracies(){ - - FILE* fp = fopen("run_accuracies.txt", "w+"); - if(fp != NULL){ - for (int i = 0; i < run_accuracies.size(); i++){ + FILE *fp = fopen("run_accuracies.txt", "w+"); + if (fp != NULL) { + for (int i = 0; i < run_accuracies.size(); i++) { float accuracy = run_accuracies[i]; std::ostringstream ss; ss << std::fixed << accuracy; @@ -753,63 +700,60 @@ void dumpExecutionAccuracies(){ fwrite(print_str.c_str(), 1, print_str.length(), fp); fwrite("\n", 1, 1, fp); } - } fclose(fp); } - -float readPSNRFromFile(const char* file_name){ +float readPSNRFromFile(const char *file_name) { float psnr; - FILE* pFile = fopen(file_name, "r"); - if(pFile == NULL){ + FILE *pFile = fopen(file_name, "r"); + if (pFile == NULL) { printf("ERROR: 
psnr.txt not found! \n"); abort(); } - + fscanf(pFile, "%f", &psnr); printf("**** PSNR read = %f \n\n", psnr); - return psnr; + return psnr; } +float computePSNRViolation(void *gold_ptr, void *approx_ptr, + float PSNR_threshold) { -float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){ - - PSNR_threshold = readPSNRFromFile("psnr.txt"); std::vector<float> psnr_list; - - struct Tensor* gold_tensor = (struct Tensor*) gold_ptr; - struct Tensor* approx_tensor = (struct Tensor*) approx_ptr; - size_t* dim_sizes = gold_tensor->dims.dim_sizes; + struct Tensor *gold_tensor = (struct Tensor *)gold_ptr; + struct Tensor *approx_tensor = (struct Tensor *)approx_ptr; + + size_t *dim_sizes = gold_tensor->dims.dim_sizes; size_t batch_dim = dim_sizes[0]; size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3]; - + printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size); - - float* gold_data = (float*) gold_tensor->host_data; - float* approx_data = (float*) approx_tensor->host_data; - FILE* fp = fopen("img_psnr.txt", "w+"); + float *gold_data = (float *)gold_tensor->host_data; + float *approx_data = (float *)approx_tensor->host_data; + + FILE *fp = fopen("img_psnr.txt", "w+"); float sum_psnr = 0.0; - int num_errors = 0; - for(size_t i = 0; i < batch_dim; i++){ + int num_errors = 0; + for (size_t i = 0; i < batch_dim; i++) { float mse_sum = 0.0; - float max_val = -999999; + float max_val = -999999; size_t offset = i * image_size; - - for(size_t j = 0; j < image_size; j++){ + + for (size_t j = 0; j < image_size; j++) { float diff = gold_data[offset + j] - approx_data[offset + j]; float diff_square = diff * diff; mse_sum += diff_square; - if(max_val < gold_data[offset + j]){ - max_val = gold_data[offset + j]; - } + if (max_val < gold_data[offset + j]) { + max_val = gold_data[offset + j]; + } } mse_sum = mse_sum / image_size; @@ -817,7 +761,7 @@ float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshol sum_psnr += psnr; if (psnr < PSNR_threshold) - num_errors += 1; + num_errors += 1; printf("PSNR value = %f \n", psnr); psnr_list.push_back(psnr); @@ -835,126 +779,104 @@ float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshol float avg_psnr = sum_psnr / batch_dim; printf("*** avg_psnr = %f \n\n", avg_psnr); dumpAvgPSNR(avg_psnr); - + float success_rate = 100.0 - violation_rate; dumpFinalAccuracy(success_rate); fclose(fp); - float var = 0.0; - for(size_t i = 0; i < batch_dim; i++){ - var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); + for (size_t i = 0; i < batch_dim; i++) { + var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); } var /= batch_dim; float std = sqrt(var); dumpPSNRStd(std); - - return violation_rate; -} + return violation_rate; +} -void dumpOutput(void* output_ptr, const char* file_name){ +void dumpOutput(void *output_ptr, const char *file_name) { - struct Tensor* out_tensor = (struct Tensor*) output_ptr; + struct Tensor *out_tensor = (struct Tensor *)output_ptr; size_t size_in_bytes = out_tensor->size_in_bytes; - printf ("** Output size = %lu \n", size_in_bytes); - - float* host_data = (float*) out_tensor->host_data; - FILE* fd = fopen(file_name, "w+"); + printf("** Output size = %lu \n", size_in_bytes); + + float *host_data = (float *)out_tensor->host_data; + FILE *fd = fopen(file_name, "w+"); fwrite(host_data, 1, size_in_bytes, fd); fclose(fd); } +void copyClassConfsAndLabels(void *result_ptr, float *classConfs, + int *predictedLabels, int start, int end) { + struct 
Tensor *result = (struct Tensor *)result_ptr; - - -void copyClassConfsAndLabels(void* result_ptr, - float* classConfs, - int* predictedLabels, - int start, int end){ - - - struct Tensor* result = (struct Tensor*) result_ptr; - size_t num_classes = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + float *data = (float *)result->host_data; + + int it_count = end - start; + for (int i = 0; i < it_count; i++) { - - int it_count = end - start; - for(int i = 0; i < it_count; i++){ - int chosen = 0; - for (int id = 1; id < num_classes; ++id){ - if (data[i * num_classes + chosen] < data[i * num_classes + id]) chosen = id; + for (int id = 1; id < num_classes; ++id) { + if (data[i * num_classes + chosen] < data[i * num_classes + id]) + chosen = id; } predictedLabels[start + i] = chosen; classConfs[start + i] = data[i * num_classes + chosen]; } - - } +void dumpClassConfsAndLabels(float *classConfs, int *predictedLabels, + uint32_t *goldLabels, int test_input_size) { -void dumpClassConfsAndLabels(float* classConfs, - int* predictedLabels, - uint32_t* goldLabels, - int test_input_size){ + FILE *labels_fp = fopen("predicted_confs_labels.txt", "w+"); - FILE* labels_fp = fopen("predicted_confs_labels.txt", "w+"); - - for (int i = 0; i < test_input_size; i++){ + for (int i = 0; i < test_input_size; i++) { int label = predictedLabels[i]; - int gold_label = (int) goldLabels[i]; + int gold_label = (int)goldLabels[i]; float conf = classConfs[i]; - + std::ostringstream ss; ss << std::fixed << conf; - std::string print_str = ss.str(); + std::string print_str = ss.str(); fwrite(print_str.c_str(), 1, print_str.length(), labels_fp); fwrite(" ", 1, 1, labels_fp); - std::ostringstream label_ss; label_ss << label; - std::string label_str = label_ss.str(); + std::string label_str = label_ss.str(); fwrite(label_str.c_str(), 1, label_str.length(), labels_fp); fwrite(" ", 1, 1, labels_fp); - std::ostringstream gold_ss; gold_ss << gold_label; - std::string gold_str = gold_ss.str(); + std::string gold_str = gold_ss.str(); fwrite(gold_str.c_str(), 1, gold_str.length(), labels_fp); fwrite("\n", 1, 1, labels_fp); - - } fclose(labels_fp); } - - - - /**** Routines for Handling Piped Execution ***/ -void stallOnOpenTunerSignal(){ +void stallOnOpenTunerSignal() { - const char* myfifo = "/tmp/opentuner_fifo"; + const char *myfifo = "/tmp/opentuner_fifo"; int fd = open(myfifo, O_RDONLY); - if (fd == -1){ + if (fd == -1) { printf("OpenTuner pipe could not be opened \n"); abort(); } - + int ret_val = fcntl(fd, F_GETFD); - if(ret_val == -1){ + if (ret_val == -1) { printf("Invalid descriptor \n"); abort(); } @@ -963,32 +885,26 @@ void stallOnOpenTunerSignal(){ read(fd, str, 100); readOpenTunerFlags("promise_flags"); - - if(strcmp(str, "stop_run") == 0){ + if (strcmp(str, "stop_run") == 0) { abort(); } close(fd); } +void signalPipeToOpenTuner() { - -void signalPipeToOpenTuner(){ - - const char* myfifo = "/tmp/opentuner_fifo"; + const char *myfifo = "/tmp/opentuner_fifo"; int fd_out = open(myfifo, O_WRONLY); int ret_val = fcntl(fd_out, F_GETFD); - if(ret_val == -1){ + if (ret_val == -1) { printf("Invalid descriptor \n"); abort(); } - - const char* str = "completed***!\n\0"; + + const char *str = "completed***!\n\0"; write(fd_out, str, 80); close(fd_out); } - - - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h index 45ef7211a4c04f15d1763fde729b4ca550851008..ef4d1afda71dba0a851af796be524099e3ae524e 100644 --- 
a/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h @@ -3,7 +3,6 @@ #ifndef UTILS_HEADER #define UTILS_HEADER - #include <sstream> #include <vector> #include <bits/stdc++.h> @@ -13,15 +12,13 @@ #include <cmath> #include <stdint.h> - std::vector<float> run_accuracies; +void printTensorInfo(void *tensor_ptr) { -void printTensorInfo(void* tensor_ptr){ + struct Tensor *tensor = (struct Tensor *)tensor_ptr; - struct Tensor* tensor = (struct Tensor*) tensor_ptr; - - if(tensor->gpu_data != NULL){ + if (tensor->gpu_data != NULL) { printf("Successful cudaMalloc \n"); } @@ -31,59 +28,54 @@ void printTensorInfo(void* tensor_ptr){ printf("num_elems = %lu \n", tensor->num_elems); } +void printTensorDims(void *tensor_ptr) { - -void printTensorDims(void* tensor_ptr){ - - struct Tensor* tensor = (struct Tensor*) tensor_ptr; + struct Tensor *tensor = (struct Tensor *)tensor_ptr; printf("Num_elems = %lu \n", tensor->num_elems); - for (int i = 0; i < tensor->dims.num_dims; i++){ + for (int i = 0; i < tensor->dims.num_dims; i++) { printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]); } } +void compareTensors(void *tensor1_ptr, void *tensor2_ptr) { + struct Tensor *tensor1 = (struct Tensor *)tensor1_ptr; + struct Tensor *tensor2 = (struct Tensor *)tensor2_ptr; -void compareTensors(void* tensor1_ptr, void* tensor2_ptr){ - - struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr; - struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr; + // hpvm_request_tensor(tensor1, 0); + // hpvm_request_tensor(tensor2, 0); - //hpvm_request_tensor(tensor1, 0); - //hpvm_request_tensor(tensor2, 0); + float *tensor_data1 = (float *)tensor1->host_data; + float *tensor_data2 = (float *)tensor2->host_data; - float* tensor_data1 = (float*) tensor1->host_data; - float* tensor_data2 = (float*) tensor2->host_data; - - for(unsigned int i = 0; i < tensor1->num_elems; i++){ - if(tensor_data1[i] != tensor_data2[i]){ + for (unsigned int i = 0; i < tensor1->num_elems; i++) { + if (tensor_data1[i] != tensor_data2[i]) { printf("Tensor data mismatch at index %d \n", i); abort(); } } } - - //*** FIXIT: Move this to CPU-only -struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type, - int dim1_size, int dim2_size, - int dim3_size, int dim4_size){ +struct Tensor *readTrainedWeightsCPU(const char *file_name, int data_type, + int dim1_size, int dim2_size, + int dim3_size, int dim4_size) { // FIXIT: Don't assume floating point types int type_size = 4; // NOTE: Assuming floating point tensors long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; - float* tensor_data = (float*) malloc(sizeof(float) * num_elems); + long int size_in_bytes = + type_size * dim1_size * dim2_size * dim3_size * dim4_size; + float *tensor_data = (float *)malloc(sizeof(float) * num_elems); int file_header_size = 0; - - FILE* file = fopen(file_name, "rb"); - if(file == NULL){ + + FILE *file = fopen(file_name, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting... 
\n", file_name); abort(); } - + fseek(file, file_header_size, SEEK_CUR); // Skipping the file header size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); @@ -91,32 +83,29 @@ struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type, fclose(file); - - struct Tensor* weights = (struct Tensor*) create4DTensor(data_type, nchw, dim1_size, dim2_size, - dim3_size, dim4_size); - + struct Tensor *weights = (struct Tensor *)create4DTensor( + data_type, nchw, dim1_size, dim2_size, dim3_size, dim4_size); + initTensorData(weights, tensor_data, size_in_bytes); - //compareValues(weights, tensor_data, num_elems); + // compareValues(weights, tensor_data, num_elems); free(tensor_data); return weights; } +struct Tensor *readTrainedWeights(const char *file_name, int data_type, + int dim1_size, int dim2_size, int dim3_size, + int dim4_size) { -struct Tensor* readTrainedWeights(const char* file_name, int data_type, - int dim1_size, int dim2_size, - int dim3_size, int dim4_size){ - - return readTrainedWeightsCPU(file_name, data_type, dim1_size, dim2_size, dim3_size, dim4_size); + return readTrainedWeightsCPU(file_name, data_type, dim1_size, dim2_size, + dim3_size, dim4_size); } +uint8_t *readLabels(const char *labels_file, int num_labels) { - -uint8_t* readLabels(const char* labels_file, int num_labels){ - - uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ + uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels); + FILE *file = fopen(labels_file, "rb"); + if (file == NULL) { printf("Data file %s is not found. Aborting...\n", labels_file); abort(); } @@ -124,176 +113,168 @@ uint8_t* readLabels(const char* labels_file, int num_labels){ size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); fclose(file); - + return labels; } - -uint8_t* readLabelsBatch(const char* labels_file, int start, int end){ +uint8_t *readLabelsBatch(const char *labels_file, int start, int end) { int num_labels = end - start; int file_header_size = sizeof(uint8_t) * start; - - uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); - FILE* file = fopen(labels_file, "rb"); - if(file == NULL){ + + uint8_t *labels = (uint8_t *)malloc(sizeof(uint8_t) * num_labels); + FILE *file = fopen(labels_file, "rb"); + if (file == NULL) { printf("Data file %s is not found. 
Aborting...\n", labels_file); abort(); } - + fseek(file, file_header_size, SEEK_SET); // Skipping the file header - - size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); + size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); fclose(file); - + return labels; } +void computeAccuracy(const char *labels_file, int num_labels, + void *result_ptr) { + struct Tensor *result = (struct Tensor *)result_ptr; -void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){ - - struct Tensor* result = (struct Tensor*) result_ptr; - - uint8_t* labels = readLabels(labels_file, num_labels); + uint8_t *labels = readLabels(labels_file, num_labels); size_t batch_dim = result->dims.dim_sizes[0]; size_t channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + float *data = (float *)result->host_data; int num_errors = 0; - - for(int i = 0; i < batch_dim; i++){ + + for (int i = 0; i < batch_dim; i++) { int chosen = 0; - for (int id = 1; id < 10; ++id){ - if (data[i * channels + chosen] < data[i * channels + id]) chosen = id; + for (int id = 1; id < 10; ++id) { + if (data[i * channels + chosen] < data[i * channels + id]) + chosen = id; } - - if(chosen != labels[i]) + + if (chosen != labels[i]) num_errors++; } float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { fprintf(fp, "%f", accuracy); fclose(fp); } - } +float computeAccuracy2(uint8_t *labels, int num_labels, void *result_ptr, + unsigned num_classes = 10) { + unsigned num_zeros = 0; -float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){ + struct Tensor *result = (struct Tensor *)result_ptr; - unsigned num_zeros = 0; - - struct Tensor* result = (struct Tensor*) result_ptr; - size_t batch_dim = result->dims.dim_sizes[0]; size_t channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + float *data = (float *)result->host_data; int num_errors = 0; printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels); - - for(int i = 0; i < num_labels; i++){ + + for (int i = 0; i < num_labels; i++) { int chosen = 0; - for (int id = 1; id < num_classes; ++id){ - if (data[i * channels + chosen] < data[i * channels + id]) chosen = id; + for (int id = 1; id < num_classes; ++id) { + if (data[i * channels + chosen] < data[i * channels + id]) + chosen = id; } - - if(labels[i] == 0) + + if (labels[i] == 0) num_zeros++; - - if(chosen != labels[i]) + + if (chosen != labels[i]) num_errors++; } float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { fprintf(fp, "%f", accuracy); } fclose(fp); - return accuracy; + return accuracy; } - -struct ClassProb{ +struct ClassProb { float prob; int index; }; - -bool descendFloatComp(ClassProb obj1, ClassProb obj2){ +bool descendFloatComp(ClassProb obj1, ClassProb obj2) { return obj1.prob > obj2.prob; } +float computeTop5Accuracy(uint8_t *labels, int num_labels, void *result_ptr, + unsigned num_classes = 10) { + + struct Tensor *result = (struct Tensor *)result_ptr; -float computeTop5Accuracy(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){ - - struct Tensor* result = 
(struct Tensor*) result_ptr; - size_t batch_dim = result->dims.dim_sizes[0]; size_t channels = result->dims.dim_sizes[1]; - float* data = (float*) result->host_data; + float *data = (float *)result->host_data; int num_errors = 0; printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels); - - for(int i = 0; i < num_labels; i++){ + + for (int i = 0; i < num_labels; i++) { std::vector<ClassProb> elem_probs; - for (int id = 0; id < num_classes; ++id){ + for (int id = 0; id < num_classes; ++id) { ClassProb cProb; cProb.prob = data[i * channels + id]; cProb.index = id; - elem_probs.push_back(cProb); + elem_probs.push_back(cProb); } - std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp); + std: + sort(elem_probs.begin(), elem_probs.end(), descendFloatComp); // Check if any of top-5 predictions matches bool matched = false; - for(int j = 0; j < 5; j++){ + for (int j = 0; j < 5; j++) { ClassProb cProb = elem_probs[j]; - if(cProb.index == labels[i]) + if (cProb.index == labels[i]) matched = true; } - if(!matched) - num_errors +=1; + if (!matched) + num_errors += 1; } float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { fprintf(fp, "%f", accuracy); } fclose(fp); - return accuracy; + return accuracy; } - - - -void dumpFinalAccuracy(float accuracy){ +void dumpFinalAccuracy(float accuracy) { printf("\n\n **** Final Accuracy = %f \n", accuracy); - - FILE* fp = fopen("final_accuracy", "w+"); - if(fp != NULL){ + + FILE *fp = fopen("final_accuracy", "w+"); + if (fp != NULL) { fprintf(fp, "%f", accuracy); } @@ -302,15 +283,13 @@ void dumpFinalAccuracy(float accuracy){ run_accuracies.push_back(accuracy); } - - /*void dumpAvgPSNR(float avg_psnr){ FILE* fp = fopen("avg_psnr", "w+"); if(fp != NULL){ std::ostringstream ss; ss << std::fixed << avg_psnr; - std::string print_str = ss.str(); + std::string print_str = ss.str(); fwrite(print_str.c_str(), 1, print_str.length(), fp); } @@ -324,21 +303,18 @@ void dumpFinalAccuracy(float accuracy){ if(fp != NULL){ std::ostringstream ss; ss << std::fixed << psnr_std; - std::string print_str = ss.str(); + std::string print_str = ss.str(); fwrite(print_str.c_str(), 1, print_str.length(), fp); } fclose(fp); }*/ - - - /* void dumpExecutionAccuracies(){ FILE* fp = fopen("run_accuracies.txt", "w+"); - if(fp != NULL){ + if(fp != NULL){ for (int i = 0; i < run_accuracies.size(); i++){ float accuracy = run_accuracies[i]; std::ostringstream ss; @@ -354,56 +330,56 @@ void dumpExecutionAccuracies(){ } */ -float readPSNRFromFile(const char* file_name){ +float readPSNRFromFile(const char *file_name) { float psnr; - FILE* pFile = fopen(file_name, "r"); - if(pFile == NULL){ + FILE *pFile = fopen(file_name, "r"); + if (pFile == NULL) { printf("ERROR: psnr.txt not found! 
\n"); abort(); } - + fscanf(pFile, "%f", &psnr); printf("**** PSNR read = %f \n\n", psnr); - return psnr; + return psnr; } +/*float computePSNRViolation(void* gold_ptr, void* approx_ptr, float +PSNR_threshold){ -/*float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){ - PSNR_threshold = readPSNRFromFile("psnr.txt"); std::vector<float> psnr_list; - + struct Tensor* gold_tensor = (struct Tensor*) gold_ptr; struct Tensor* approx_tensor = (struct Tensor*) approx_ptr; size_t* dim_sizes = gold_tensor->dims.dim_sizes; size_t batch_dim = dim_sizes[0]; size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3]; - + printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size); - + float* gold_data = (float*) gold_tensor->host_data; float* approx_data = (float*) approx_tensor->host_data; FILE* fp = fopen("img_psnr.txt", "w+"); float sum_psnr = 0.0; - int num_errors = 0; + int num_errors = 0; for(size_t i = 0; i < batch_dim; i++){ float mse_sum = 0.0; - float max_val = -999999; + float max_val = -999999; size_t offset = i * image_size; - + for(size_t j = 0; j < image_size; j++){ float diff = gold_data[offset + j] - approx_data[offset + j]; float diff_square = diff * diff; mse_sum += diff_square; if(max_val < gold_data[offset + j]){ - max_val = gold_data[offset + j]; - } + max_val = gold_data[offset + j]; + } } mse_sum = mse_sum / image_size; @@ -411,7 +387,7 @@ float readPSNRFromFile(const char* file_name){ sum_psnr += psnr; if (psnr < PSNR_threshold) - num_errors += 1; + num_errors += 1; printf("PSNR value = %f \n", psnr); psnr_list.push_back(psnr); @@ -429,7 +405,7 @@ float readPSNRFromFile(const char* file_name){ float avg_psnr = sum_psnr / batch_dim; printf("*** avg_psnr = %f \n\n", avg_psnr); dumpAvgPSNR(avg_psnr); - + float success_rate = 100.0 - violation_rate; dumpFinalAccuracy(success_rate); @@ -438,30 +414,27 @@ float readPSNRFromFile(const char* file_name){ float var = 0.0; for(size_t i = 0; i < batch_dim; i++){ - var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); + var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); } var /= batch_dim; float std = sqrt(var); //dumpPSNRStd(std); - - return violation_rate; -}*/ + return violation_rate; +}*/ -void dumpOutput(void* output_ptr, const char* file_name){ +void dumpOutput(void *output_ptr, const char *file_name) { - struct Tensor* out_tensor = (struct Tensor*) output_ptr; + struct Tensor *out_tensor = (struct Tensor *)output_ptr; size_t size_in_bytes = out_tensor->size_in_bytes; - printf ("** Output size = %lu \n", size_in_bytes); - - float* host_data = (float*) out_tensor->host_data; - FILE* fd = fopen(file_name, "w+"); + printf("** Output size = %lu \n", size_in_bytes); + + float *host_data = (float *)out_tensor->host_data; + FILE *fd = fopen(file_name, "w+"); fwrite(host_data, 1, size_in_bytes, fd); fclose(fd); } - - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc index d93110945b1d1a70ec29c7788d9133dc16551ee5..846500ad355a5bc61b6a6385afa5a7ee36ea22c0 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet2_cifar10_half.cc @@ -11,50 +11,62 @@ #include "../../../tensor_runtime/include/tensor_runtime.h" #include "../../include/utils.h" - - /* NOTE: Reference Architecture to use for profiling */ -void testCifarNet(){ +void testCifarNet() { printf("********* 
Alexnet2 CIFAR-10 DNN ********** \n"); - - std::string dir_prefix = model_params_path + std::string("/alexnet2_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,32,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,128,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - - int conv_mode = 1; // NOTE: using CROSS_CORRELATION - int conv_precision = 0; // NOTE: using Float as compute precision. 
FIXIT: use enum + std::string dir_prefix = + model_params_path + std::string("/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); + + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 32, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 128, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = + 0; // NOTE: using Float as compute precision. 
FIXIT: use enum startMemTracking(); @@ -65,61 +77,61 @@ void testCifarNet(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* conv1out = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv1out, conv2d_1_b); - void* conv1_tanh = tensorHalfTanh(conv1out); - + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *conv1out = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv1out, conv2d_1_b); + void *conv1_tanh = tensorHalfTanh(conv1out); + // 2nd Layer - void* conv2out = tensorHalfConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv2out, conv2d_2_b); - void* conv2_tanh = tensorHalfTanh(conv2out); - void* pool2out = tensorHalfPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv2out = tensorHalfConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv2out, conv2d_2_b); + void *conv2_tanh = tensorHalfTanh(conv2out); + void *pool2out = tensorHalfPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + // 3rd Layer - void* conv3out = tensorHalfConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv3out, conv2d_3_b); - void* conv3_tanh = tensorHalfTanh(conv3out); + void *conv3out = tensorHalfConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv3out, conv2d_3_b); + void *conv3_tanh = tensorHalfTanh(conv3out); // 4th Layer - void* conv4out = tensorHalfConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv4out, conv2d_4_b); - void* conv4_tanh = tensorHalfTanh(conv4out); - void* pool4out = tensorHalfPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv4out = tensorHalfConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv4out, conv2d_4_b); + void *conv4_tanh = tensorHalfTanh(conv4out); + void *pool4out = tensorHalfPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + // 5th Layer - void* conv5out = tensorHalfConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorHalfAdd(conv5out, conv2d_5_b); - void* conv5_tanh = tensorHalfTanh(conv5out); + void *conv5out = tensorHalfConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorHalfAdd(conv5out, conv2d_5_b); + void *conv5_tanh = tensorHalfTanh(conv5out); // 6th Layer - void* conv6out = tensorHalfConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1, - conv_mode, conv_precision); + void *conv6out = tensorHalfConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1, + conv_mode, conv_precision); tensorHalfAdd(conv6out, conv2d_6_b); - void* conv6_tanh = tensorHalfTanh(conv6out); - void* pool6out = tensorHalfPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv6_tanh = tensorHalfTanh(conv6out); + void *pool6out = tensorHalfPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + // final FC Layer - void* gemm1out = tensorHalfGemmGPU(pool6out, dense_1_w); - void* gemm1biasout = tensorHalfAdd(gemm1out, dense_1_b); - void* result = tensorSoftmax(gemm1biasout); + void *gemm1out = tensorHalfGemmGPU(pool6out, dense_1_w); + void *gemm1biasout = tensorHalfAdd(gemm1out, dense_1_b); + void *result = tensorSoftmax(gemm1biasout); - uint8_t* 
labels = readLabelsBatch(labels_path.c_str(), start, end); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); - float accuracy = computeAccuracy2(labels, batch_size, result); + float accuracy = computeAccuracy2(labels, batch_size, result); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -127,11 +139,9 @@ void testCifarNet(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - } - -int main(int argc, char* argv[]){ +int main(int argc, char *argv[]) { llvm_hpvm_initTensorRt(0); @@ -141,4 +151,3 @@ int main(int argc, char* argv[]){ return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc index b7695bbd7a24712e335f0cf8bbd25290f3261dea..2bde9d1eea174d6773a52cf2d007b524108c55dd 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/alexnet_cifar10_half.cc @@ -1,49 +1,58 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../../tensor_runtime/include/tensor_runtime.h" -#include "../../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - - std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); - - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> 
+#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); + + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 4096, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -54,40 +63,40 @@ int main(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); - void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); - void* var_2 = tensorHalfTanh(var_1); - void* var_3 = tensorHalfPooling(var_2,0,2,2,0,0,2,2); - void* var_5 = tensorHalfConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); - void* var_6 = tensorHalfAdd(var_5, conv2d_2_b); - void* var_7 = tensorHalfTanh(var_6); - void* var_8 = tensorHalfPooling(var_7,0,2,2,0,0,2,2); - void* var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorHalfAdd(var_10, conv2d_3_b); - void* var_12 = tensorHalfTanh(var_11); - void* var_13 = tensorHalfConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_14 = tensorHalfAdd(var_13, conv2d_4_b); - void* var_15 = 
tensorHalfTanh(var_14); - void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); - void* var_18 = tensorHalfTanh(var_17); - void* var_19 = tensorHalfPooling(var_18,0,2,2,0,0,2,2); - void* var_22 = tensorHalfGemmGPU(var_19, dense_1_w); - void* var_23 = tensorHalfAdd(var_22, dense_1_b); - void* var_24 = tensorSoftmax(var_23); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_24); + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); + void *var_1 = tensorHalfAdd(var_0, conv2d_1_b); + void *var_2 = tensorHalfTanh(var_1); + void *var_3 = tensorHalfPooling(var_2, 0, 2, 2, 0, 0, 2, 2); + void *var_5 = tensorHalfConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); + void *var_6 = tensorHalfAdd(var_5, conv2d_2_b); + void *var_7 = tensorHalfTanh(var_6); + void *var_8 = tensorHalfPooling(var_7, 0, 2, 2, 0, 0, 2, 2); + void *var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_11 = tensorHalfAdd(var_10, conv2d_3_b); + void *var_12 = tensorHalfTanh(var_11); + void *var_13 = tensorHalfConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_14 = tensorHalfAdd(var_13, conv2d_4_b); + void *var_15 = tensorHalfTanh(var_14); + void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorHalfAdd(var_16, conv2d_5_b); + void *var_18 = tensorHalfTanh(var_17); + void *var_19 = tensorHalfPooling(var_18, 0, 2, 2, 0, 0, 2, 2); + void *var_22 = tensorHalfGemmGPU(var_19, dense_1_w); + void *var_23 = tensorHalfAdd(var_22, dense_1_b); + void *var_24 = tensorSoftmax(var_23); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_24); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -96,9 +105,7 @@ int main(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); + llvm_hpvm_cleanupTensorRt(); - llvm_hpvm_cleanupTensorRt(); - - return 0; - + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc index 29f392c630a36a6044c5f804e5d3a7b252591831..037a3d7a3eda161bd341c55b8501d2ae608517d5 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/lenet_mnist_half.cc @@ -8,108 +8,102 @@ #include <sys/stat.h> #include <string.h> - #include "tensor_runtime.h" #include "utils.h" - /* NOTE: Reference Architecture to use for profiling */ -void testLenetTanh(){ +void testLenetTanh() { int total_runs = 1; printf("********* Lenet-2 Architecture ********** \n"); // FIXIT: Extend this to batch of images - currently 5 images int test_batch_size = 5000; - std::string dir_prefix = model_params_path + std::string("/lenet_mnist/"); + std::string dir_prefix = model_params_path + std::string("/lenet_mnist/"); + + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - // 
Loading Input Batch - void* input = readInputBatch(input_path.c_str(),0, 0,test_batch_size,1,28,28); - uint8_t* labels = readLabelsBatch(labels_path.c_str(), 0,test_batch_size); - - void* conv1_filter = readTrainedWeights("../model_params/lenet_mnist/conv1.bin", - float_type, 32, 1, 5, 5); - void* conv1_bias = readTrainedWeights("../model_params/lenet_mnist/conv1_bias.bin", - float_type, 1, 32, 1, 1); - void* conv2_filter = readTrainedWeights("../model_params/lenet_mnist/conv2.bin", - float_type, 64, 32, 5, 5); - void* conv2_bias = readTrainedWeights("../model_params/lenet_mnist/conv2_bias.bin", - float_type, 1, 64, 1, 1); - void* fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin", - float_type, 1, 1, 7*7*64, 1024); - void* fc1_bias = readTrainedWeights("../model_params/lenet_mnist/fc1_bias.bin", - float_type, 1, 1024, 1, 1); - void* fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin", - float_type, 1, 1, 1024, 10); - void* fc2_bias = readTrainedWeights("../model_params/lenet_mnist/fc2_bias.bin", - float_type, 1, 10, 1, 1); - - - + void *input = + readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), 0, test_batch_size); + + void *conv1_filter = readTrainedWeights( + "../model_params/lenet_mnist/conv1.bin", float_type, 32, 1, 5, 5); + void *conv1_bias = readTrainedWeights( + "../model_params/lenet_mnist/conv1_bias.bin", float_type, 1, 32, 1, 1); + void *conv2_filter = readTrainedWeights( + "../model_params/lenet_mnist/conv2.bin", float_type, 64, 32, 5, 5); + void *conv2_bias = readTrainedWeights( + "../model_params/lenet_mnist/conv2_bias.bin", float_type, 1, 64, 1, 1); + void *fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin", + float_type, 1, 1, 7 * 7 * 64, 1024); + void *fc1_bias = readTrainedWeights( + "../model_params/lenet_mnist/fc1_bias.bin", float_type, 1, 1024, 1, 1); + void *fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin", + float_type, 1, 1, 1024, 10); + void *fc2_bias = readTrainedWeights( + "../model_params/lenet_mnist/fc2_bias.bin", float_type, 1, 10, 1, 1); + clearTensorMap(); - - for(int i = 0; i < total_runs; i++){ + + for (int i = 0; i < total_runs; i++) { readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters - // Start power and performnce profiling + // Start power and performnce profiling startProfiling(); - + int conv_mode = 1; // NOTE: using CROSS_CORRELATION - int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + int conv_precision = + 0; // NOTE: using Float as compute precision. 
FIXIT: use enum // NOTE: 'SAME' convolution - void* conv1out = tensorHalfConvolution(input, conv1_filter, 2, 2, 1, 1, - conv_mode, conv_precision); + void *conv1out = tensorHalfConvolution(input, conv1_filter, 2, 2, 1, 1, + conv_mode, conv_precision); - // NOTE: For tensorAdd, the only dimension that MUST match is channels + // NOTE: For tensorAdd, the only dimension that MUST match is channels tensorHalfAdd(conv1out, conv1_bias); // NOTE: In place operation - void* pool1out = tensorHalfPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); + void *pool1out = tensorHalfPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); - void* conv1_tanh = tensorHalfTanh(pool1out); + void *conv1_tanh = tensorHalfTanh(pool1out); - // NOTE: input channels have to match between tensor op inputs and outputs - void* conv2out = tensorHalfConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1, - conv_mode, conv_precision); + // NOTE: input channels have to match between tensor op inputs and outputs + void *conv2out = tensorHalfConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1, + conv_mode, conv_precision); tensorHalfAdd(conv2out, conv2_bias); // NOTE: In place operation - void* pool2out = tensorHalfPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + void *pool2out = tensorHalfPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + + void *conv2_tanh = tensorHalfTanh(pool2out); + + void *gemm1out = tensorHalfGemm(conv2_tanh, fc1_weights); - void* conv2_tanh = tensorHalfTanh(pool2out); + void *gemm1biasout = tensorHalfAdd(gemm1out, fc1_bias); - void* gemm1out = tensorHalfGemm(conv2_tanh, fc1_weights); + void *tanh1out = tensorHalfTanh(gemm1biasout); - void* gemm1biasout = tensorHalfAdd(gemm1out, fc1_bias); + void *gemm2out = tensorHalfGemm(tanh1out, fc2_weights); - void* tanh1out = tensorHalfTanh(gemm1biasout); - - void* gemm2out = tensorHalfGemm(tanh1out, fc2_weights); - - void* gemm2_biasout = tensorHalfAdd(gemm2out, fc2_bias); + void *gemm2_biasout = tensorHalfAdd(gemm2out, fc2_bias); - void* tanh2out = tensorHalfTanh(gemm2_biasout); - - void* result = tensorSoftmax(tanh2out); + void *tanh2out = tensorHalfTanh(gemm2_biasout); + + void *result = tensorSoftmax(tanh2out); // End profiling and dump output to profile.txt stopProfiling(); - + computeAccuracy2(labels, test_batch_size, result); - + dumpAccuracyNorms(); freeOutputTensors(); } - - - } - -int main(int argc, char* argv[]){ +int main(int argc, char *argv[]) { llvm_hpvm_initTensorRt(0); testLenetTanh(); @@ -118,4 +112,3 @@ int main(int argc, char* argv[]){ return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc index d662dc1584c7810d8d3631d5ac16c427c3ff8b02..8940aeb3f1df087aac1e359ccae3f9b4391d2798 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/mobilenet_half.cc @@ -1,411 +1,731 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> #include "../../../tensor_runtime/include/tensor_runtime.h" #include "../../include/utils.h" -int main(){ +int main() { - llvm_hpvm_initTensorRt(0); + llvm_hpvm_initTensorRt(0); + std::string dir_prefix = model_params_path + std::string("/mobilenet/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = 
dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); + std::string batch_normalization_1_gamma_path = + dir_prefix + std::string("batch_normalization_1_gamma.bin"); + void *batch_normalization_1_gamma = readTrainedWeights( + batch_normalization_1_gamma_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_beta_path = + dir_prefix + std::string("batch_normalization_1_beta.bin"); + void *batch_normalization_1_beta = readTrainedWeights( + batch_normalization_1_beta_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_mean_path = + dir_prefix + std::string("batch_normalization_1_mean.bin"); + void *batch_normalization_1_mean = readTrainedWeights( + batch_normalization_1_mean_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_variance_path = + dir_prefix + std::string("batch_normalization_1_variance.bin"); + void *batch_normalization_1_variance = readTrainedWeights( + batch_normalization_1_variance_path.c_str(), 0, 1, 32, 1, 1); + std::string depthwise_conv2d_1_w_path = + dir_prefix + std::string("depthwise_conv2d_1_w.bin"); + void *depthwise_conv2d_1_w = + readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0, 32, 1, 3, 3); + std::string batch_normalization_2_gamma_path = + dir_prefix + std::string("batch_normalization_2_gamma.bin"); + void *batch_normalization_2_gamma = readTrainedWeights( + batch_normalization_2_gamma_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_beta_path = + dir_prefix + std::string("batch_normalization_2_beta.bin"); + void *batch_normalization_2_beta = readTrainedWeights( + batch_normalization_2_beta_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_mean_path = + dir_prefix + std::string("batch_normalization_2_mean.bin"); + void *batch_normalization_2_mean = readTrainedWeights( + batch_normalization_2_mean_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_variance_path = + dir_prefix + std::string("batch_normalization_2_variance.bin"); + void *batch_normalization_2_variance = readTrainedWeights( + batch_normalization_2_variance_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 32, 1, 1); + std::string batch_normalization_3_gamma_path = + dir_prefix + std::string("batch_normalization_3_gamma.bin"); + void *batch_normalization_3_gamma = readTrainedWeights( + batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_beta_path = + dir_prefix + std::string("batch_normalization_3_beta.bin"); + void *batch_normalization_3_beta = readTrainedWeights( + batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_mean_path = + dir_prefix + std::string("batch_normalization_3_mean.bin"); + void *batch_normalization_3_mean = readTrainedWeights( + batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_variance_path = + dir_prefix + std::string("batch_normalization_3_variance.bin"); + void *batch_normalization_3_variance = readTrainedWeights( + batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string depthwise_conv2d_2_w_path = + dir_prefix + std::string("depthwise_conv2d_2_w.bin"); + void *depthwise_conv2d_2_w = + readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0, 64, 1, 3, 3); + 
std::string batch_normalization_4_gamma_path = + dir_prefix + std::string("batch_normalization_4_gamma.bin"); + void *batch_normalization_4_gamma = readTrainedWeights( + batch_normalization_4_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_beta_path = + dir_prefix + std::string("batch_normalization_4_beta.bin"); + void *batch_normalization_4_beta = readTrainedWeights( + batch_normalization_4_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_mean_path = + dir_prefix + std::string("batch_normalization_4_mean.bin"); + void *batch_normalization_4_mean = readTrainedWeights( + batch_normalization_4_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_variance_path = + dir_prefix + std::string("batch_normalization_4_variance.bin"); + void *batch_normalization_4_variance = readTrainedWeights( + batch_normalization_4_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 1, 1); + std::string batch_normalization_5_gamma_path = + dir_prefix + std::string("batch_normalization_5_gamma.bin"); + void *batch_normalization_5_gamma = readTrainedWeights( + batch_normalization_5_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_beta_path = + dir_prefix + std::string("batch_normalization_5_beta.bin"); + void *batch_normalization_5_beta = readTrainedWeights( + batch_normalization_5_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_mean_path = + dir_prefix + std::string("batch_normalization_5_mean.bin"); + void *batch_normalization_5_mean = readTrainedWeights( + batch_normalization_5_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_variance_path = + dir_prefix + std::string("batch_normalization_5_variance.bin"); + void *batch_normalization_5_variance = readTrainedWeights( + batch_normalization_5_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string depthwise_conv2d_3_w_path = + dir_prefix + std::string("depthwise_conv2d_3_w.bin"); + void *depthwise_conv2d_3_w = + readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0, 128, 1, 3, 3); + std::string batch_normalization_6_gamma_path = + dir_prefix + std::string("batch_normalization_6_gamma.bin"); + void *batch_normalization_6_gamma = readTrainedWeights( + batch_normalization_6_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_beta_path = + dir_prefix + std::string("batch_normalization_6_beta.bin"); + void *batch_normalization_6_beta = readTrainedWeights( + batch_normalization_6_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_mean_path = + dir_prefix + std::string("batch_normalization_6_mean.bin"); + void *batch_normalization_6_mean = readTrainedWeights( + batch_normalization_6_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_variance_path = + dir_prefix + std::string("batch_normalization_6_variance.bin"); + void *batch_normalization_6_variance = readTrainedWeights( + batch_normalization_6_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 1, 1); + std::string batch_normalization_7_gamma_path = + dir_prefix + std::string("batch_normalization_7_gamma.bin"); + void *batch_normalization_7_gamma = readTrainedWeights( + batch_normalization_7_gamma_path.c_str(), 0, 1, 128, 1, 1); + 
std::string batch_normalization_7_beta_path = + dir_prefix + std::string("batch_normalization_7_beta.bin"); + void *batch_normalization_7_beta = readTrainedWeights( + batch_normalization_7_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_mean_path = + dir_prefix + std::string("batch_normalization_7_mean.bin"); + void *batch_normalization_7_mean = readTrainedWeights( + batch_normalization_7_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_variance_path = + dir_prefix + std::string("batch_normalization_7_variance.bin"); + void *batch_normalization_7_variance = readTrainedWeights( + batch_normalization_7_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string depthwise_conv2d_4_w_path = + dir_prefix + std::string("depthwise_conv2d_4_w.bin"); + void *depthwise_conv2d_4_w = + readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0, 128, 1, 3, 3); + std::string batch_normalization_8_gamma_path = + dir_prefix + std::string("batch_normalization_8_gamma.bin"); + void *batch_normalization_8_gamma = readTrainedWeights( + batch_normalization_8_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_beta_path = + dir_prefix + std::string("batch_normalization_8_beta.bin"); + void *batch_normalization_8_beta = readTrainedWeights( + batch_normalization_8_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_mean_path = + dir_prefix + std::string("batch_normalization_8_mean.bin"); + void *batch_normalization_8_mean = readTrainedWeights( + batch_normalization_8_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_variance_path = + dir_prefix + std::string("batch_normalization_8_variance.bin"); + void *batch_normalization_8_variance = readTrainedWeights( + batch_normalization_8_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 1, 1); + std::string batch_normalization_9_gamma_path = + dir_prefix + std::string("batch_normalization_9_gamma.bin"); + void *batch_normalization_9_gamma = readTrainedWeights( + batch_normalization_9_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_beta_path = + dir_prefix + std::string("batch_normalization_9_beta.bin"); + void *batch_normalization_9_beta = readTrainedWeights( + batch_normalization_9_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_mean_path = + dir_prefix + std::string("batch_normalization_9_mean.bin"); + void *batch_normalization_9_mean = readTrainedWeights( + batch_normalization_9_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_variance_path = + dir_prefix + std::string("batch_normalization_9_variance.bin"); + void *batch_normalization_9_variance = readTrainedWeights( + batch_normalization_9_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string depthwise_conv2d_5_w_path = + dir_prefix + std::string("depthwise_conv2d_5_w.bin"); + void *depthwise_conv2d_5_w = + readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0, 256, 1, 3, 3); + std::string batch_normalization_10_gamma_path = + dir_prefix + std::string("batch_normalization_10_gamma.bin"); + void *batch_normalization_10_gamma = readTrainedWeights( + batch_normalization_10_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_beta_path = + dir_prefix + std::string("batch_normalization_10_beta.bin"); + void *batch_normalization_10_beta = readTrainedWeights( + 
batch_normalization_10_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_mean_path = + dir_prefix + std::string("batch_normalization_10_mean.bin"); + void *batch_normalization_10_mean = readTrainedWeights( + batch_normalization_10_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_variance_path = + dir_prefix + std::string("batch_normalization_10_variance.bin"); + void *batch_normalization_10_variance = readTrainedWeights( + batch_normalization_10_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 1, 1); + std::string batch_normalization_11_gamma_path = + dir_prefix + std::string("batch_normalization_11_gamma.bin"); + void *batch_normalization_11_gamma = readTrainedWeights( + batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_beta_path = + dir_prefix + std::string("batch_normalization_11_beta.bin"); + void *batch_normalization_11_beta = readTrainedWeights( + batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_mean_path = + dir_prefix + std::string("batch_normalization_11_mean.bin"); + void *batch_normalization_11_mean = readTrainedWeights( + batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_variance_path = + dir_prefix + std::string("batch_normalization_11_variance.bin"); + void *batch_normalization_11_variance = readTrainedWeights( + batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string depthwise_conv2d_6_w_path = + dir_prefix + std::string("depthwise_conv2d_6_w.bin"); + void *depthwise_conv2d_6_w = + readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0, 256, 1, 3, 3); + std::string batch_normalization_12_gamma_path = + dir_prefix + std::string("batch_normalization_12_gamma.bin"); + void *batch_normalization_12_gamma = readTrainedWeights( + batch_normalization_12_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_beta_path = + dir_prefix + std::string("batch_normalization_12_beta.bin"); + void *batch_normalization_12_beta = readTrainedWeights( + batch_normalization_12_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_mean_path = + dir_prefix + std::string("batch_normalization_12_mean.bin"); + void *batch_normalization_12_mean = readTrainedWeights( + batch_normalization_12_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_variance_path = + dir_prefix + std::string("batch_normalization_12_variance.bin"); + void *batch_normalization_12_variance = readTrainedWeights( + batch_normalization_12_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 512, 256, 1, 1); + std::string batch_normalization_13_gamma_path = + dir_prefix + std::string("batch_normalization_13_gamma.bin"); + void *batch_normalization_13_gamma = readTrainedWeights( + batch_normalization_13_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_beta_path = + dir_prefix + std::string("batch_normalization_13_beta.bin"); + void *batch_normalization_13_beta = readTrainedWeights( + batch_normalization_13_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_mean_path = + dir_prefix + std::string("batch_normalization_13_mean.bin"); + void 
*batch_normalization_13_mean = readTrainedWeights( + batch_normalization_13_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_variance_path = + dir_prefix + std::string("batch_normalization_13_variance.bin"); + void *batch_normalization_13_variance = readTrainedWeights( + batch_normalization_13_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_7_w_path = + dir_prefix + std::string("depthwise_conv2d_7_w.bin"); + void *depthwise_conv2d_7_w = + readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_14_gamma_path = + dir_prefix + std::string("batch_normalization_14_gamma.bin"); + void *batch_normalization_14_gamma = readTrainedWeights( + batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_beta_path = + dir_prefix + std::string("batch_normalization_14_beta.bin"); + void *batch_normalization_14_beta = readTrainedWeights( + batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_mean_path = + dir_prefix + std::string("batch_normalization_14_mean.bin"); + void *batch_normalization_14_mean = readTrainedWeights( + batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_variance_path = + dir_prefix + std::string("batch_normalization_14_variance.bin"); + void *batch_normalization_14_variance = readTrainedWeights( + batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_15_gamma_path = + dir_prefix + std::string("batch_normalization_15_gamma.bin"); + void *batch_normalization_15_gamma = readTrainedWeights( + batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_beta_path = + dir_prefix + std::string("batch_normalization_15_beta.bin"); + void *batch_normalization_15_beta = readTrainedWeights( + batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_mean_path = + dir_prefix + std::string("batch_normalization_15_mean.bin"); + void *batch_normalization_15_mean = readTrainedWeights( + batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_variance_path = + dir_prefix + std::string("batch_normalization_15_variance.bin"); + void *batch_normalization_15_variance = readTrainedWeights( + batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_8_w_path = + dir_prefix + std::string("depthwise_conv2d_8_w.bin"); + void *depthwise_conv2d_8_w = + readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_16_gamma_path = + dir_prefix + std::string("batch_normalization_16_gamma.bin"); + void *batch_normalization_16_gamma = readTrainedWeights( + batch_normalization_16_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_16_beta_path = + dir_prefix + std::string("batch_normalization_16_beta.bin"); + void *batch_normalization_16_beta = readTrainedWeights( + batch_normalization_16_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_16_mean_path = + dir_prefix + std::string("batch_normalization_16_mean.bin"); + void *batch_normalization_16_mean = readTrainedWeights( + batch_normalization_16_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string 
batch_normalization_16_variance_path = + dir_prefix + std::string("batch_normalization_16_variance.bin"); + void *batch_normalization_16_variance = readTrainedWeights( + batch_normalization_16_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_17_gamma_path = + dir_prefix + std::string("batch_normalization_17_gamma.bin"); + void *batch_normalization_17_gamma = readTrainedWeights( + batch_normalization_17_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_beta_path = + dir_prefix + std::string("batch_normalization_17_beta.bin"); + void *batch_normalization_17_beta = readTrainedWeights( + batch_normalization_17_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_mean_path = + dir_prefix + std::string("batch_normalization_17_mean.bin"); + void *batch_normalization_17_mean = readTrainedWeights( + batch_normalization_17_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_variance_path = + dir_prefix + std::string("batch_normalization_17_variance.bin"); + void *batch_normalization_17_variance = readTrainedWeights( + batch_normalization_17_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_9_w_path = + dir_prefix + std::string("depthwise_conv2d_9_w.bin"); + void *depthwise_conv2d_9_w = + readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_18_gamma_path = + dir_prefix + std::string("batch_normalization_18_gamma.bin"); + void *batch_normalization_18_gamma = readTrainedWeights( + batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_beta_path = + dir_prefix + std::string("batch_normalization_18_beta.bin"); + void *batch_normalization_18_beta = readTrainedWeights( + batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_mean_path = + dir_prefix + std::string("batch_normalization_18_mean.bin"); + void *batch_normalization_18_mean = readTrainedWeights( + batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_variance_path = + dir_prefix + std::string("batch_normalization_18_variance.bin"); + void *batch_normalization_18_variance = readTrainedWeights( + batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_19_gamma_path = + dir_prefix + std::string("batch_normalization_19_gamma.bin"); + void *batch_normalization_19_gamma = readTrainedWeights( + batch_normalization_19_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_beta_path = + dir_prefix + std::string("batch_normalization_19_beta.bin"); + void *batch_normalization_19_beta = readTrainedWeights( + batch_normalization_19_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_mean_path = + dir_prefix + std::string("batch_normalization_19_mean.bin"); + void *batch_normalization_19_mean = readTrainedWeights( + batch_normalization_19_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_variance_path = + dir_prefix + std::string("batch_normalization_19_variance.bin"); + void *batch_normalization_19_variance = readTrainedWeights( + 
batch_normalization_19_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_10_w_path = + dir_prefix + std::string("depthwise_conv2d_10_w.bin"); + void *depthwise_conv2d_10_w = + readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_20_gamma_path = + dir_prefix + std::string("batch_normalization_20_gamma.bin"); + void *batch_normalization_20_gamma = readTrainedWeights( + batch_normalization_20_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_beta_path = + dir_prefix + std::string("batch_normalization_20_beta.bin"); + void *batch_normalization_20_beta = readTrainedWeights( + batch_normalization_20_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_mean_path = + dir_prefix + std::string("batch_normalization_20_mean.bin"); + void *batch_normalization_20_mean = readTrainedWeights( + batch_normalization_20_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_variance_path = + dir_prefix + std::string("batch_normalization_20_variance.bin"); + void *batch_normalization_20_variance = readTrainedWeights( + batch_normalization_20_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_21_gamma_path = + dir_prefix + std::string("batch_normalization_21_gamma.bin"); + void *batch_normalization_21_gamma = readTrainedWeights( + batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_beta_path = + dir_prefix + std::string("batch_normalization_21_beta.bin"); + void *batch_normalization_21_beta = readTrainedWeights( + batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_mean_path = + dir_prefix + std::string("batch_normalization_21_mean.bin"); + void *batch_normalization_21_mean = readTrainedWeights( + batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_variance_path = + dir_prefix + std::string("batch_normalization_21_variance.bin"); + void *batch_normalization_21_variance = readTrainedWeights( + batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_11_w_path = + dir_prefix + std::string("depthwise_conv2d_11_w.bin"); + void *depthwise_conv2d_11_w = + readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_22_gamma_path = + dir_prefix + std::string("batch_normalization_22_gamma.bin"); + void *batch_normalization_22_gamma = readTrainedWeights( + batch_normalization_22_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_beta_path = + dir_prefix + std::string("batch_normalization_22_beta.bin"); + void *batch_normalization_22_beta = readTrainedWeights( + batch_normalization_22_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_mean_path = + dir_prefix + std::string("batch_normalization_22_mean.bin"); + void *batch_normalization_22_mean = readTrainedWeights( + batch_normalization_22_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_variance_path = + dir_prefix + std::string("batch_normalization_22_variance.bin"); + void *batch_normalization_22_variance = readTrainedWeights( + batch_normalization_22_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + 
std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_23_gamma_path = + dir_prefix + std::string("batch_normalization_23_gamma.bin"); + void *batch_normalization_23_gamma = readTrainedWeights( + batch_normalization_23_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_beta_path = + dir_prefix + std::string("batch_normalization_23_beta.bin"); + void *batch_normalization_23_beta = readTrainedWeights( + batch_normalization_23_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_mean_path = + dir_prefix + std::string("batch_normalization_23_mean.bin"); + void *batch_normalization_23_mean = readTrainedWeights( + batch_normalization_23_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_variance_path = + dir_prefix + std::string("batch_normalization_23_variance.bin"); + void *batch_normalization_23_variance = readTrainedWeights( + batch_normalization_23_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_12_w_path = + dir_prefix + std::string("depthwise_conv2d_12_w.bin"); + void *depthwise_conv2d_12_w = + readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_24_gamma_path = + dir_prefix + std::string("batch_normalization_24_gamma.bin"); + void *batch_normalization_24_gamma = readTrainedWeights( + batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_beta_path = + dir_prefix + std::string("batch_normalization_24_beta.bin"); + void *batch_normalization_24_beta = readTrainedWeights( + batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_mean_path = + dir_prefix + std::string("batch_normalization_24_mean.bin"); + void *batch_normalization_24_mean = readTrainedWeights( + batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_variance_path = + dir_prefix + std::string("batch_normalization_24_variance.bin"); + void *batch_normalization_24_variance = readTrainedWeights( + batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 1024, 512, 1, 1); + std::string batch_normalization_25_gamma_path = + dir_prefix + std::string("batch_normalization_25_gamma.bin"); + void *batch_normalization_25_gamma = readTrainedWeights( + batch_normalization_25_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_beta_path = + dir_prefix + std::string("batch_normalization_25_beta.bin"); + void *batch_normalization_25_beta = readTrainedWeights( + batch_normalization_25_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_mean_path = + dir_prefix + std::string("batch_normalization_25_mean.bin"); + void *batch_normalization_25_mean = readTrainedWeights( + batch_normalization_25_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_variance_path = + dir_prefix + std::string("batch_normalization_25_variance.bin"); + void *batch_normalization_25_variance = readTrainedWeights( + batch_normalization_25_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string depthwise_conv2d_13_w_path = + dir_prefix + std::string("depthwise_conv2d_13_w.bin"); + void *depthwise_conv2d_13_w = + readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0, 1024, 1, 3, 3); + 
std::string batch_normalization_26_gamma_path = + dir_prefix + std::string("batch_normalization_26_gamma.bin"); + void *batch_normalization_26_gamma = readTrainedWeights( + batch_normalization_26_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_beta_path = + dir_prefix + std::string("batch_normalization_26_beta.bin"); + void *batch_normalization_26_beta = readTrainedWeights( + batch_normalization_26_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_mean_path = + dir_prefix + std::string("batch_normalization_26_mean.bin"); + void *batch_normalization_26_mean = readTrainedWeights( + batch_normalization_26_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_variance_path = + dir_prefix + std::string("batch_normalization_26_variance.bin"); + void *batch_normalization_26_variance = readTrainedWeights( + batch_normalization_26_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 1024, 1024, 1, 1); + std::string batch_normalization_27_gamma_path = + dir_prefix + std::string("batch_normalization_27_gamma.bin"); + void *batch_normalization_27_gamma = readTrainedWeights( + batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_beta_path = + dir_prefix + std::string("batch_normalization_27_beta.bin"); + void *batch_normalization_27_beta = readTrainedWeights( + batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_mean_path = + dir_prefix + std::string("batch_normalization_27_mean.bin"); + void *batch_normalization_27_mean = readTrainedWeights( + batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_variance_path = + dir_prefix + std::string("batch_normalization_27_variance.bin"); + void *batch_normalization_27_variance = readTrainedWeights( + batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); - std::string dir_prefix = model_params_path + std::string("/mobilenet/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); - std::string batch_normalization_1_gamma_path = dir_prefix + std::string("batch_normalization_1_gamma.bin"); - void* batch_normalization_1_gamma = readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_beta_path = dir_prefix + std::string("batch_normalization_1_beta.bin"); - void* batch_normalization_1_beta = readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_mean_path = dir_prefix + std::string("batch_normalization_1_mean.bin"); - void* batch_normalization_1_mean = readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_variance_path = dir_prefix + std::string("batch_normalization_1_variance.bin"); - void* 
batch_normalization_1_variance = readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); - std::string depthwise_conv2d_1_w_path = dir_prefix + std::string("depthwise_conv2d_1_w.bin"); - void* depthwise_conv2d_1_w = readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); - std::string batch_normalization_2_gamma_path = dir_prefix + std::string("batch_normalization_2_gamma.bin"); - void* batch_normalization_2_gamma = readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_beta_path = dir_prefix + std::string("batch_normalization_2_beta.bin"); - void* batch_normalization_2_beta = readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_mean_path = dir_prefix + std::string("batch_normalization_2_mean.bin"); - void* batch_normalization_2_mean = readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_variance_path = dir_prefix + std::string("batch_normalization_2_variance.bin"); - void* batch_normalization_2_variance = readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); - std::string batch_normalization_3_gamma_path = dir_prefix + std::string("batch_normalization_3_gamma.bin"); - void* batch_normalization_3_gamma = readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_beta_path = dir_prefix + std::string("batch_normalization_3_beta.bin"); - void* batch_normalization_3_beta = readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_mean_path = dir_prefix + std::string("batch_normalization_3_mean.bin"); - void* batch_normalization_3_mean = readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_variance_path = dir_prefix + std::string("batch_normalization_3_variance.bin"); - void* batch_normalization_3_variance = readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); - std::string depthwise_conv2d_2_w_path = dir_prefix + std::string("depthwise_conv2d_2_w.bin"); - void* depthwise_conv2d_2_w = readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); - std::string batch_normalization_4_gamma_path = dir_prefix + std::string("batch_normalization_4_gamma.bin"); - void* batch_normalization_4_gamma = readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_beta_path = dir_prefix + std::string("batch_normalization_4_beta.bin"); - void* batch_normalization_4_beta = readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_mean_path = dir_prefix + std::string("batch_normalization_4_mean.bin"); - void* batch_normalization_4_mean = readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_variance_path = dir_prefix + std::string("batch_normalization_4_variance.bin"); - void* batch_normalization_4_variance = readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); - std::string batch_normalization_5_gamma_path = dir_prefix + 
std::string("batch_normalization_5_gamma.bin"); - void* batch_normalization_5_gamma = readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_beta_path = dir_prefix + std::string("batch_normalization_5_beta.bin"); - void* batch_normalization_5_beta = readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_mean_path = dir_prefix + std::string("batch_normalization_5_mean.bin"); - void* batch_normalization_5_mean = readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_variance_path = dir_prefix + std::string("batch_normalization_5_variance.bin"); - void* batch_normalization_5_variance = readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); - std::string depthwise_conv2d_3_w_path = dir_prefix + std::string("depthwise_conv2d_3_w.bin"); - void* depthwise_conv2d_3_w = readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); - std::string batch_normalization_6_gamma_path = dir_prefix + std::string("batch_normalization_6_gamma.bin"); - void* batch_normalization_6_gamma = readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_beta_path = dir_prefix + std::string("batch_normalization_6_beta.bin"); - void* batch_normalization_6_beta = readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_mean_path = dir_prefix + std::string("batch_normalization_6_mean.bin"); - void* batch_normalization_6_mean = readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_variance_path = dir_prefix + std::string("batch_normalization_6_variance.bin"); - void* batch_normalization_6_variance = readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); - std::string batch_normalization_7_gamma_path = dir_prefix + std::string("batch_normalization_7_gamma.bin"); - void* batch_normalization_7_gamma = readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_beta_path = dir_prefix + std::string("batch_normalization_7_beta.bin"); - void* batch_normalization_7_beta = readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_mean_path = dir_prefix + std::string("batch_normalization_7_mean.bin"); - void* batch_normalization_7_mean = readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_variance_path = dir_prefix + std::string("batch_normalization_7_variance.bin"); - void* batch_normalization_7_variance = readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); - std::string depthwise_conv2d_4_w_path = dir_prefix + std::string("depthwise_conv2d_4_w.bin"); - void* depthwise_conv2d_4_w = readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); - std::string batch_normalization_8_gamma_path = dir_prefix + std::string("batch_normalization_8_gamma.bin"); - void* batch_normalization_8_gamma = readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_beta_path = dir_prefix + std::string("batch_normalization_8_beta.bin"); - void* batch_normalization_8_beta = 
readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_mean_path = dir_prefix + std::string("batch_normalization_8_mean.bin"); - void* batch_normalization_8_mean = readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_variance_path = dir_prefix + std::string("batch_normalization_8_variance.bin"); - void* batch_normalization_8_variance = readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); - std::string batch_normalization_9_gamma_path = dir_prefix + std::string("batch_normalization_9_gamma.bin"); - void* batch_normalization_9_gamma = readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_beta_path = dir_prefix + std::string("batch_normalization_9_beta.bin"); - void* batch_normalization_9_beta = readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_mean_path = dir_prefix + std::string("batch_normalization_9_mean.bin"); - void* batch_normalization_9_mean = readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_variance_path = dir_prefix + std::string("batch_normalization_9_variance.bin"); - void* batch_normalization_9_variance = readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); - std::string depthwise_conv2d_5_w_path = dir_prefix + std::string("depthwise_conv2d_5_w.bin"); - void* depthwise_conv2d_5_w = readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); - std::string batch_normalization_10_gamma_path = dir_prefix + std::string("batch_normalization_10_gamma.bin"); - void* batch_normalization_10_gamma = readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_beta_path = dir_prefix + std::string("batch_normalization_10_beta.bin"); - void* batch_normalization_10_beta = readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_mean_path = dir_prefix + std::string("batch_normalization_10_mean.bin"); - void* batch_normalization_10_mean = readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_variance_path = dir_prefix + std::string("batch_normalization_10_variance.bin"); - void* batch_normalization_10_variance = readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); - std::string batch_normalization_11_gamma_path = dir_prefix + std::string("batch_normalization_11_gamma.bin"); - void* batch_normalization_11_gamma = readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_beta_path = dir_prefix + std::string("batch_normalization_11_beta.bin"); - void* batch_normalization_11_beta = readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_mean_path = dir_prefix + std::string("batch_normalization_11_mean.bin"); - void* batch_normalization_11_mean = readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); - std::string 
batch_normalization_11_variance_path = dir_prefix + std::string("batch_normalization_11_variance.bin"); - void* batch_normalization_11_variance = readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); - std::string depthwise_conv2d_6_w_path = dir_prefix + std::string("depthwise_conv2d_6_w.bin"); - void* depthwise_conv2d_6_w = readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); - std::string batch_normalization_12_gamma_path = dir_prefix + std::string("batch_normalization_12_gamma.bin"); - void* batch_normalization_12_gamma = readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_beta_path = dir_prefix + std::string("batch_normalization_12_beta.bin"); - void* batch_normalization_12_beta = readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_mean_path = dir_prefix + std::string("batch_normalization_12_mean.bin"); - void* batch_normalization_12_mean = readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_variance_path = dir_prefix + std::string("batch_normalization_12_variance.bin"); - void* batch_normalization_12_variance = readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); - std::string batch_normalization_13_gamma_path = dir_prefix + std::string("batch_normalization_13_gamma.bin"); - void* batch_normalization_13_gamma = readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_beta_path = dir_prefix + std::string("batch_normalization_13_beta.bin"); - void* batch_normalization_13_beta = readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_mean_path = dir_prefix + std::string("batch_normalization_13_mean.bin"); - void* batch_normalization_13_mean = readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_variance_path = dir_prefix + std::string("batch_normalization_13_variance.bin"); - void* batch_normalization_13_variance = readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_7_w_path = dir_prefix + std::string("depthwise_conv2d_7_w.bin"); - void* depthwise_conv2d_7_w = readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_14_gamma_path = dir_prefix + std::string("batch_normalization_14_gamma.bin"); - void* batch_normalization_14_gamma = readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_beta_path = dir_prefix + std::string("batch_normalization_14_beta.bin"); - void* batch_normalization_14_beta = readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_mean_path = dir_prefix + std::string("batch_normalization_14_mean.bin"); - void* batch_normalization_14_mean = readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_variance_path = dir_prefix + std::string("batch_normalization_14_variance.bin"); - void* batch_normalization_14_variance = readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_8_w_path = dir_prefix + 
std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_15_gamma_path = dir_prefix + std::string("batch_normalization_15_gamma.bin"); - void* batch_normalization_15_gamma = readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_beta_path = dir_prefix + std::string("batch_normalization_15_beta.bin"); - void* batch_normalization_15_beta = readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_mean_path = dir_prefix + std::string("batch_normalization_15_mean.bin"); - void* batch_normalization_15_mean = readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_variance_path = dir_prefix + std::string("batch_normalization_15_variance.bin"); - void* batch_normalization_15_variance = readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_8_w_path = dir_prefix + std::string("depthwise_conv2d_8_w.bin"); - void* depthwise_conv2d_8_w = readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_16_gamma_path = dir_prefix + std::string("batch_normalization_16_gamma.bin"); - void* batch_normalization_16_gamma = readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_beta_path = dir_prefix + std::string("batch_normalization_16_beta.bin"); - void* batch_normalization_16_beta = readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_mean_path = dir_prefix + std::string("batch_normalization_16_mean.bin"); - void* batch_normalization_16_mean = readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_variance_path = dir_prefix + std::string("batch_normalization_16_variance.bin"); - void* batch_normalization_16_variance = readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_17_gamma_path = dir_prefix + std::string("batch_normalization_17_gamma.bin"); - void* batch_normalization_17_gamma = readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_beta_path = dir_prefix + std::string("batch_normalization_17_beta.bin"); - void* batch_normalization_17_beta = readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_mean_path = dir_prefix + std::string("batch_normalization_17_mean.bin"); - void* batch_normalization_17_mean = readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_variance_path = dir_prefix + std::string("batch_normalization_17_variance.bin"); - void* batch_normalization_17_variance = readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_9_w_path = dir_prefix + std::string("depthwise_conv2d_9_w.bin"); - void* depthwise_conv2d_9_w = readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_18_gamma_path = dir_prefix + std::string("batch_normalization_18_gamma.bin"); - void* batch_normalization_18_gamma = 
readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_beta_path = dir_prefix + std::string("batch_normalization_18_beta.bin"); - void* batch_normalization_18_beta = readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_mean_path = dir_prefix + std::string("batch_normalization_18_mean.bin"); - void* batch_normalization_18_mean = readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_variance_path = dir_prefix + std::string("batch_normalization_18_variance.bin"); - void* batch_normalization_18_variance = readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_19_gamma_path = dir_prefix + std::string("batch_normalization_19_gamma.bin"); - void* batch_normalization_19_gamma = readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_beta_path = dir_prefix + std::string("batch_normalization_19_beta.bin"); - void* batch_normalization_19_beta = readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_mean_path = dir_prefix + std::string("batch_normalization_19_mean.bin"); - void* batch_normalization_19_mean = readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_variance_path = dir_prefix + std::string("batch_normalization_19_variance.bin"); - void* batch_normalization_19_variance = readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_10_w_path = dir_prefix + std::string("depthwise_conv2d_10_w.bin"); - void* depthwise_conv2d_10_w = readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_20_gamma_path = dir_prefix + std::string("batch_normalization_20_gamma.bin"); - void* batch_normalization_20_gamma = readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_beta_path = dir_prefix + std::string("batch_normalization_20_beta.bin"); - void* batch_normalization_20_beta = readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_mean_path = dir_prefix + std::string("batch_normalization_20_mean.bin"); - void* batch_normalization_20_mean = readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_variance_path = dir_prefix + std::string("batch_normalization_20_variance.bin"); - void* batch_normalization_20_variance = readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_21_gamma_path = dir_prefix + std::string("batch_normalization_21_gamma.bin"); - void* batch_normalization_21_gamma = readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_beta_path = dir_prefix + std::string("batch_normalization_21_beta.bin"); - void* batch_normalization_21_beta = readTrainedWeights(batch_normalization_21_beta_path.c_str(), 
0,1,512,1,1); - std::string batch_normalization_21_mean_path = dir_prefix + std::string("batch_normalization_21_mean.bin"); - void* batch_normalization_21_mean = readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_variance_path = dir_prefix + std::string("batch_normalization_21_variance.bin"); - void* batch_normalization_21_variance = readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_11_w_path = dir_prefix + std::string("depthwise_conv2d_11_w.bin"); - void* depthwise_conv2d_11_w = readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_22_gamma_path = dir_prefix + std::string("batch_normalization_22_gamma.bin"); - void* batch_normalization_22_gamma = readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_beta_path = dir_prefix + std::string("batch_normalization_22_beta.bin"); - void* batch_normalization_22_beta = readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_mean_path = dir_prefix + std::string("batch_normalization_22_mean.bin"); - void* batch_normalization_22_mean = readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_variance_path = dir_prefix + std::string("batch_normalization_22_variance.bin"); - void* batch_normalization_22_variance = readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_23_gamma_path = dir_prefix + std::string("batch_normalization_23_gamma.bin"); - void* batch_normalization_23_gamma = readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_beta_path = dir_prefix + std::string("batch_normalization_23_beta.bin"); - void* batch_normalization_23_beta = readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_mean_path = dir_prefix + std::string("batch_normalization_23_mean.bin"); - void* batch_normalization_23_mean = readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_variance_path = dir_prefix + std::string("batch_normalization_23_variance.bin"); - void* batch_normalization_23_variance = readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_12_w_path = dir_prefix + std::string("depthwise_conv2d_12_w.bin"); - void* depthwise_conv2d_12_w = readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_24_gamma_path = dir_prefix + std::string("batch_normalization_24_gamma.bin"); - void* batch_normalization_24_gamma = readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_beta_path = dir_prefix + std::string("batch_normalization_24_beta.bin"); - void* batch_normalization_24_beta = readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_mean_path = dir_prefix + std::string("batch_normalization_24_mean.bin"); - void* batch_normalization_24_mean = readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); - std::string 
batch_normalization_24_variance_path = dir_prefix + std::string("batch_normalization_24_variance.bin"); - void* batch_normalization_24_variance = readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); - std::string batch_normalization_25_gamma_path = dir_prefix + std::string("batch_normalization_25_gamma.bin"); - void* batch_normalization_25_gamma = readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_beta_path = dir_prefix + std::string("batch_normalization_25_beta.bin"); - void* batch_normalization_25_beta = readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_mean_path = dir_prefix + std::string("batch_normalization_25_mean.bin"); - void* batch_normalization_25_mean = readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_variance_path = dir_prefix + std::string("batch_normalization_25_variance.bin"); - void* batch_normalization_25_variance = readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); - std::string depthwise_conv2d_13_w_path = dir_prefix + std::string("depthwise_conv2d_13_w.bin"); - void* depthwise_conv2d_13_w = readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); - std::string batch_normalization_26_gamma_path = dir_prefix + std::string("batch_normalization_26_gamma.bin"); - void* batch_normalization_26_gamma = readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_beta_path = dir_prefix + std::string("batch_normalization_26_beta.bin"); - void* batch_normalization_26_beta = readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_mean_path = dir_prefix + std::string("batch_normalization_26_mean.bin"); - void* batch_normalization_26_mean = readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_variance_path = dir_prefix + std::string("batch_normalization_26_variance.bin"); - void* batch_normalization_26_variance = readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); - std::string batch_normalization_27_gamma_path = dir_prefix + std::string("batch_normalization_27_gamma.bin"); - void* batch_normalization_27_gamma = readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_beta_path = dir_prefix + std::string("batch_normalization_27_beta.bin"); - void* batch_normalization_27_beta = readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_mean_path = dir_prefix + std::string("batch_normalization_27_mean.bin"); - void* batch_normalization_27_mean = readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_variance_path = dir_prefix + std::string("batch_normalization_27_variance.bin"); - void* batch_normalization_27_variance = readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); - std::string dense_1_w_path = dir_prefix + 
std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + startMemTracking(); + int test_input_size = 2000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; - startMemTracking(); + for (int i = 0; i < batch_count; i++) { - int test_input_size = 2000; - int batch_size = 1000; - int batch_count = test_input_size / batch_size; + int start = i * batch_size; + int end = (i + 1) * batch_size; - float final_accuracy = 0.0; + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); - for(int i = 0; i < batch_count; i++){ + void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); + void *var_1 = tensorHalfBatchNorm( + var_0, batch_normalization_1_gamma, batch_normalization_1_beta, + batch_normalization_1_mean, batch_normalization_1_variance, 0.001); + void *var_2 = tensorHalfRelu(var_1); + void *var_4 = + tensorHalfConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); + void *var_5 = tensorHalfBatchNorm( + var_4, batch_normalization_2_gamma, batch_normalization_2_beta, + batch_normalization_2_mean, batch_normalization_2_variance, 0.001); + void *var_6 = tensorHalfRelu(var_5); + void *var_7 = tensorHalfConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); + void *var_8 = tensorHalfBatchNorm( + var_7, batch_normalization_3_gamma, batch_normalization_3_beta, + batch_normalization_3_mean, batch_normalization_3_variance, 0.001); + void *var_9 = tensorHalfRelu(var_8); + void *var_11 = + tensorHalfConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); + void *var_12 = tensorHalfBatchNorm( + var_11, batch_normalization_4_gamma, batch_normalization_4_beta, + batch_normalization_4_mean, batch_normalization_4_variance, 0.001); + void *var_13 = tensorHalfRelu(var_12); + void *var_14 = tensorHalfConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); + void *var_15 = tensorHalfBatchNorm( + var_14, batch_normalization_5_gamma, batch_normalization_5_beta, + batch_normalization_5_mean, batch_normalization_5_variance, 0.001); + void *var_16 = tensorHalfRelu(var_15); + void *var_18 = + tensorHalfConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); + void *var_19 = tensorHalfBatchNorm( + var_18, batch_normalization_6_gamma, batch_normalization_6_beta, + batch_normalization_6_mean, batch_normalization_6_variance, 0.001); + void *var_20 = tensorHalfRelu(var_19); + void *var_21 = tensorHalfConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); + void *var_22 = tensorHalfBatchNorm( + var_21, batch_normalization_7_gamma, batch_normalization_7_beta, + batch_normalization_7_mean, batch_normalization_7_variance, 0.001); + void *var_23 = tensorHalfRelu(var_22); + void *var_26 = + tensorHalfConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); + void *var_27 = tensorHalfBatchNorm( + var_26, batch_normalization_8_gamma, batch_normalization_8_beta, + batch_normalization_8_mean, batch_normalization_8_variance, 0.001); + void *var_28 = tensorHalfRelu(var_27); + void *var_29 = tensorHalfConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); + void *var_30 = tensorHalfBatchNorm( + var_29, batch_normalization_9_gamma, batch_normalization_9_beta, + batch_normalization_9_mean, batch_normalization_9_variance, 0.001); + void *var_31 = tensorHalfRelu(var_30); + void *var_33 = + tensorHalfConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 
1, 1, 1, 256); + void *var_34 = tensorHalfBatchNorm( + var_33, batch_normalization_10_gamma, batch_normalization_10_beta, + batch_normalization_10_mean, batch_normalization_10_variance, 0.001); + void *var_35 = tensorHalfRelu(var_34); + void *var_36 = tensorHalfConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); + void *var_37 = tensorHalfBatchNorm( + var_36, batch_normalization_11_gamma, batch_normalization_11_beta, + batch_normalization_11_mean, batch_normalization_11_variance, 0.001); + void *var_38 = tensorHalfRelu(var_37); + void *var_41 = + tensorHalfConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); + void *var_42 = tensorHalfBatchNorm( + var_41, batch_normalization_12_gamma, batch_normalization_12_beta, + batch_normalization_12_mean, batch_normalization_12_variance, 0.001); + void *var_43 = tensorHalfRelu(var_42); + void *var_44 = tensorHalfConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); + void *var_45 = tensorHalfBatchNorm( + var_44, batch_normalization_13_gamma, batch_normalization_13_beta, + batch_normalization_13_mean, batch_normalization_13_variance, 0.001); + void *var_46 = tensorHalfRelu(var_45); + void *var_48 = + tensorHalfConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); + void *var_49 = tensorHalfBatchNorm( + var_48, batch_normalization_14_gamma, batch_normalization_14_beta, + batch_normalization_14_mean, batch_normalization_14_variance, 0.001); + void *var_50 = tensorHalfRelu(var_49); + void *var_51 = tensorHalfConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); + void *var_52 = tensorHalfBatchNorm( + var_51, batch_normalization_15_gamma, batch_normalization_15_beta, + batch_normalization_15_mean, batch_normalization_15_variance, 0.001); + void *var_53 = tensorHalfRelu(var_52); + void *var_55 = + tensorHalfConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); + void *var_56 = tensorHalfBatchNorm( + var_55, batch_normalization_16_gamma, batch_normalization_16_beta, + batch_normalization_16_mean, batch_normalization_16_variance, 0.001); + void *var_57 = tensorHalfRelu(var_56); + void *var_58 = tensorHalfConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); + void *var_59 = tensorHalfBatchNorm( + var_58, batch_normalization_17_gamma, batch_normalization_17_beta, + batch_normalization_17_mean, batch_normalization_17_variance, 0.001); + void *var_60 = tensorHalfRelu(var_59); + void *var_63 = + tensorHalfConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); + void *var_64 = tensorHalfBatchNorm( + var_63, batch_normalization_18_gamma, batch_normalization_18_beta, + batch_normalization_18_mean, batch_normalization_18_variance, 0.001); + void *var_65 = tensorHalfRelu(var_64); + void *var_66 = tensorHalfConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); + void *var_67 = tensorHalfBatchNorm( + var_66, batch_normalization_19_gamma, batch_normalization_19_beta, + batch_normalization_19_mean, batch_normalization_19_variance, 0.001); + void *var_68 = tensorHalfRelu(var_67); + void *var_70 = tensorHalfConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, + 1, 1, 512); + void *var_71 = tensorHalfBatchNorm( + var_70, batch_normalization_20_gamma, batch_normalization_20_beta, + batch_normalization_20_mean, batch_normalization_20_variance, 0.001); + void *var_72 = tensorHalfRelu(var_71); + void *var_73 = tensorHalfConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); + void *var_74 = tensorHalfBatchNorm( + var_73, batch_normalization_21_gamma, batch_normalization_21_beta, + batch_normalization_21_mean, batch_normalization_21_variance, 0.001); + void 
*var_75 = tensorHalfRelu(var_74); + void *var_77 = tensorHalfConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, + 1, 1, 512); + void *var_78 = tensorHalfBatchNorm( + var_77, batch_normalization_22_gamma, batch_normalization_22_beta, + batch_normalization_22_mean, batch_normalization_22_variance, 0.001); + void *var_79 = tensorHalfRelu(var_78); + void *var_80 = tensorHalfConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); + void *var_81 = tensorHalfBatchNorm( + var_80, batch_normalization_23_gamma, batch_normalization_23_beta, + batch_normalization_23_mean, batch_normalization_23_variance, 0.001); + void *var_82 = tensorHalfRelu(var_81); + void *var_85 = tensorHalfConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, + 2, 1, 512); + void *var_86 = tensorHalfBatchNorm( + var_85, batch_normalization_24_gamma, batch_normalization_24_beta, + batch_normalization_24_mean, batch_normalization_24_variance, 0.001); + void *var_87 = tensorHalfRelu(var_86); + void *var_88 = tensorHalfConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); + void *var_89 = tensorHalfBatchNorm( + var_88, batch_normalization_25_gamma, batch_normalization_25_beta, + batch_normalization_25_mean, batch_normalization_25_variance, 0.001); + void *var_90 = tensorHalfRelu(var_89); + void *var_92 = tensorHalfConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, + 1, 1, 1024); + void *var_93 = tensorHalfBatchNorm( + var_92, batch_normalization_26_gamma, batch_normalization_26_beta, + batch_normalization_26_mean, batch_normalization_26_variance, 0.001); + void *var_94 = tensorHalfRelu(var_93); + void *var_95 = tensorHalfConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); + void *var_96 = tensorHalfBatchNorm( + var_95, batch_normalization_27_gamma, batch_normalization_27_beta, + batch_normalization_27_mean, batch_normalization_27_variance, 0.001); + void *var_97 = tensorHalfRelu(var_96); + void *var_99 = tensorHalfPooling(var_97, 1, 2, 2, 0, 0, 2, 2); + void *var_101 = tensorHalfGemmGPU(var_99, dense_1_w); + void *var_102 = tensorHalfAdd(var_101, dense_1_b); + void *var_103 = tensorSoftmax(var_102); - int start = i * batch_size; - int end = (i + 1) * batch_size; + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); - void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); + float accuracy = computeAccuracy2(labels, batch_size, var_103); + final_accuracy += accuracy; + freeBatchMemory(); + } + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); - void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); - void* var_1 = tensorHalfBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); - void* var_2 = tensorHalfRelu(var_1); - void* var_4 = tensorHalfConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); - void* var_5 = tensorHalfBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); - void* var_6 = tensorHalfRelu(var_5); - void* var_7 = tensorHalfConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); - void* var_8 = tensorHalfBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); - void* var_9 = tensorHalfRelu(var_8); - void* var_11 = tensorHalfConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); - void* var_12 = tensorHalfBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, 
batch_normalization_4_mean, batch_normalization_4_variance, 0.001); - void* var_13 = tensorHalfRelu(var_12); - void* var_14 = tensorHalfConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); - void* var_15 = tensorHalfBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); - void* var_16 = tensorHalfRelu(var_15); - void* var_18 = tensorHalfConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); - void* var_19 = tensorHalfBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); - void* var_20 = tensorHalfRelu(var_19); - void* var_21 = tensorHalfConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); - void* var_22 = tensorHalfBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); - void* var_23 = tensorHalfRelu(var_22); - void* var_26 = tensorHalfConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); - void* var_27 = tensorHalfBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); - void* var_28 = tensorHalfRelu(var_27); - void* var_29 = tensorHalfConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); - void* var_30 = tensorHalfBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); - void* var_31 = tensorHalfRelu(var_30); - void* var_33 = tensorHalfConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); - void* var_34 = tensorHalfBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); - void* var_35 = tensorHalfRelu(var_34); - void* var_36 = tensorHalfConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); - void* var_37 = tensorHalfBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); - void* var_38 = tensorHalfRelu(var_37); - void* var_41 = tensorHalfConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); - void* var_42 = tensorHalfBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); - void* var_43 = tensorHalfRelu(var_42); - void* var_44 = tensorHalfConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); - void* var_45 = tensorHalfBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); - void* var_46 = tensorHalfRelu(var_45); - void* var_48 = tensorHalfConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); - void* var_49 = tensorHalfBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); - void* var_50 = tensorHalfRelu(var_49); - void* var_51 = tensorHalfConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); - void* var_52 = tensorHalfBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); - void* var_53 = tensorHalfRelu(var_52); - void* var_55 = tensorHalfConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); - void* var_56 = tensorHalfBatchNorm(var_55, batch_normalization_16_gamma, 
batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); - void* var_57 = tensorHalfRelu(var_56); - void* var_58 = tensorHalfConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); - void* var_59 = tensorHalfBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); - void* var_60 = tensorHalfRelu(var_59); - void* var_63 = tensorHalfConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); - void* var_64 = tensorHalfBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); - void* var_65 = tensorHalfRelu(var_64); - void* var_66 = tensorHalfConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); - void* var_67 = tensorHalfBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); - void* var_68 = tensorHalfRelu(var_67); - void* var_70 = tensorHalfConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); - void* var_71 = tensorHalfBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); - void* var_72 = tensorHalfRelu(var_71); - void* var_73 = tensorHalfConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); - void* var_74 = tensorHalfBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); - void* var_75 = tensorHalfRelu(var_74); - void* var_77 = tensorHalfConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); - void* var_78 = tensorHalfBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); - void* var_79 = tensorHalfRelu(var_78); - void* var_80 = tensorHalfConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); - void* var_81 = tensorHalfBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); - void* var_82 = tensorHalfRelu(var_81); - void* var_85 = tensorHalfConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); - void* var_86 = tensorHalfBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); - void* var_87 = tensorHalfRelu(var_86); - void* var_88 = tensorHalfConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); - void* var_89 = tensorHalfBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); - void* var_90 = tensorHalfRelu(var_89); - void* var_92 = tensorHalfConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); - void* var_93 = tensorHalfBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); - void* var_94 = tensorHalfRelu(var_93); - void* var_95 = tensorHalfConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); - void* var_96 = tensorHalfBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); - void* var_97 = tensorHalfRelu(var_96); - void* var_99 = tensorHalfPooling(var_97,1,2,2,0,0,2,2); - void* var_101 = tensorHalfGemmGPU(var_99, dense_1_w); - void* 
var_102 = tensorHalfAdd(var_101, dense_1_b); - void* var_103 = tensorSoftmax(var_102); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy2(labels, batch_size, var_103); - final_accuracy += accuracy; - freeBatchMemory(); - } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc index 741c4a443cc9a56c443ec5858aaed5a7d5705268..d674591027a9451d51d7a8a4243e7180e8abd24d 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/resnet18_cifar10_half.cc @@ -1,112 +1,155 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../../tensor_runtime/include/tensor_runtime.h" -#include "../../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/resnet18_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32); - std::string labels_path = dir_prefix + std::string("labels.bin"); - //uint8_t* labels = readLabels(labels_path.c_str(), batch_size); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 
0,16,16,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); - void* conv2d_14_b = readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); - void* conv2d_15_w = readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); - std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); - void* conv2d_15_b = readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); - void* conv2d_17_w = readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); - std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); - void* conv2d_17_b = readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); - void* conv2d_16_w = readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); - void* conv2d_16_b = readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); - void* conv2d_18_w = readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_18_b_path = dir_prefix + 
std::string("conv2d_18_b.bin"); - void* conv2d_18_b = readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); - void* conv2d_19_w = readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); - void* conv2d_19_b = readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); - void* conv2d_20_w = readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); - void* conv2d_20_b = readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); - void* conv2d_21_w = readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); - void* conv2d_21_b = readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = + model_params_path + std::string("/resnet18_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + // void* input = readTrainedWeights(input_path.c_str(), 0, + // batch_size,3,32,32); + std::string labels_path = dir_prefix + std::string("labels.bin"); + // uint8_t* labels = readLabels(labels_path.c_str(), batch_size); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + 
void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 32, 16, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 32, 16, 1, 1); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); + void *conv2d_14_b = + readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); + void *conv2d_15_w = + readTrainedWeights(conv2d_15_w_path.c_str(), 0, 64, 32, 3, 3); + std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); + void *conv2d_15_b = + readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); + void *conv2d_17_w = + readTrainedWeights(conv2d_17_w_path.c_str(), 0, 64, 32, 1, 1); + std::string conv2d_17_b_path = dir_prefix + 
std::string("conv2d_17_b.bin"); + void *conv2d_17_b = + readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); + void *conv2d_16_w = + readTrainedWeights(conv2d_16_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); + void *conv2d_16_b = + readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); + void *conv2d_18_w = + readTrainedWeights(conv2d_18_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); + void *conv2d_18_b = + readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); + void *conv2d_19_w = + readTrainedWeights(conv2d_19_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); + void *conv2d_19_b = + readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); + void *conv2d_20_w = + readTrainedWeights(conv2d_20_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); + void *conv2d_20_b = + readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); + void *conv2d_21_w = + readTrainedWeights(conv2d_21_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); + void *conv2d_21_b = + readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 64, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 64, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -117,94 +160,94 @@ int main(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_2 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_3 = tensorHalfAdd(var_2, conv2d_1_b); - void* var_4 = tensorHalfRelu(var_3); - void* var_6 = tensorHalfConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_7 = tensorHalfAdd(var_6, conv2d_2_b); - void* var_8 = tensorHalfRelu(var_7); - void* var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorHalfAdd(var_10, conv2d_3_b); - void* var_12 = tensorHalfAdd(var_4, var_11); - void* var_13 = tensorHalfRelu(var_12); - void* var_15 = tensorHalfConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_16 = tensorHalfAdd(var_15, conv2d_4_b); - void* var_17 = tensorHalfRelu(var_16); - void* var_19 = tensorHalfConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_20 = tensorHalfAdd(var_19, conv2d_5_b); - void* var_21 = tensorHalfAdd(var_13, var_20); - void* var_22 = tensorHalfRelu(var_21); - void* var_24 = tensorHalfConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorHalfAdd(var_24, conv2d_6_b); - void* var_26 = tensorHalfRelu(var_25); - void* var_28 = tensorHalfConvolution(var_26, 
conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorHalfAdd(var_28, conv2d_7_b); - void* var_30 = tensorHalfAdd(var_22, var_29); - void* var_31 = tensorHalfRelu(var_30); - void* var_33 = tensorHalfConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); - void* var_34 = tensorHalfAdd(var_33, conv2d_8_b); - void* var_35 = tensorHalfRelu(var_34); - void* var_37 = tensorHalfConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_38 = tensorHalfAdd(var_37, conv2d_9_b); - void* var_40 = tensorHalfConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); - void* var_41 = tensorHalfAdd(var_40, conv2d_10_b); - void* var_42 = tensorHalfAdd(var_41, var_38); - void* var_43 = tensorHalfRelu(var_42); - void* var_45 = tensorHalfConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_46 = tensorHalfAdd(var_45, conv2d_11_b); - void* var_47 = tensorHalfRelu(var_46); - void* var_49 = tensorHalfConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_50 = tensorHalfAdd(var_49, conv2d_12_b); - void* var_51 = tensorHalfAdd(var_43, var_50); - void* var_52 = tensorHalfRelu(var_51); - void* var_54 = tensorHalfConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_55 = tensorHalfAdd(var_54, conv2d_13_b); - void* var_56 = tensorHalfRelu(var_55); - void* var_58 = tensorHalfConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); - void* var_59 = tensorHalfAdd(var_58, conv2d_14_b); - void* var_60 = tensorHalfAdd(var_52, var_59); - void* var_61 = tensorHalfRelu(var_60); - void* var_63 = tensorHalfConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); - void* var_64 = tensorHalfAdd(var_63, conv2d_15_b); - void* var_65 = tensorHalfRelu(var_64); - void* var_67 = tensorHalfConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); - void* var_68 = tensorHalfAdd(var_67, conv2d_16_b); - void* var_70 = tensorHalfConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); - void* var_71 = tensorHalfAdd(var_70, conv2d_17_b); - void* var_72 = tensorHalfAdd(var_71, var_68); - void* var_73 = tensorHalfRelu(var_72); - void* var_75 = tensorHalfConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); - void* var_76 = tensorHalfAdd(var_75, conv2d_18_b); - void* var_77 = tensorHalfRelu(var_76); - void* var_79 = tensorHalfConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); - void* var_80 = tensorHalfAdd(var_79, conv2d_19_b); - void* var_81 = tensorHalfAdd(var_73, var_80); - void* var_82 = tensorHalfRelu(var_81); - void* var_84 = tensorHalfConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); - void* var_85 = tensorHalfAdd(var_84, conv2d_20_b); - void* var_86 = tensorHalfRelu(var_85); - void* var_88 = tensorHalfConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); - void* var_89 = tensorHalfAdd(var_88, conv2d_21_b); - void* var_90 = tensorHalfAdd(var_82, var_89); - void* var_91 = tensorHalfRelu(var_90); - void* var_92 = tensorHalfPooling(var_91,1,8,8,0,0,8,8); - void* var_94 = tensorHalfGemmGPU(var_92, dense_1_w); - void* var_95 = tensorHalfAdd(var_94, dense_1_b); - void* var_96 = tensorSoftmax(var_95); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_96); + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_2 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_3 = tensorHalfAdd(var_2, conv2d_1_b); + void *var_4 = tensorHalfRelu(var_3); + void *var_6 = tensorHalfConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_7 = tensorHalfAdd(var_6, conv2d_2_b); + void *var_8 = tensorHalfRelu(var_7); + 
void *var_10 = tensorHalfConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_11 = tensorHalfAdd(var_10, conv2d_3_b); + void *var_12 = tensorHalfAdd(var_4, var_11); + void *var_13 = tensorHalfRelu(var_12); + void *var_15 = tensorHalfConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_16 = tensorHalfAdd(var_15, conv2d_4_b); + void *var_17 = tensorHalfRelu(var_16); + void *var_19 = tensorHalfConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_20 = tensorHalfAdd(var_19, conv2d_5_b); + void *var_21 = tensorHalfAdd(var_13, var_20); + void *var_22 = tensorHalfRelu(var_21); + void *var_24 = tensorHalfConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorHalfAdd(var_24, conv2d_6_b); + void *var_26 = tensorHalfRelu(var_25); + void *var_28 = tensorHalfConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorHalfAdd(var_28, conv2d_7_b); + void *var_30 = tensorHalfAdd(var_22, var_29); + void *var_31 = tensorHalfRelu(var_30); + void *var_33 = tensorHalfConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); + void *var_34 = tensorHalfAdd(var_33, conv2d_8_b); + void *var_35 = tensorHalfRelu(var_34); + void *var_37 = tensorHalfConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_38 = tensorHalfAdd(var_37, conv2d_9_b); + void *var_40 = tensorHalfConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); + void *var_41 = tensorHalfAdd(var_40, conv2d_10_b); + void *var_42 = tensorHalfAdd(var_41, var_38); + void *var_43 = tensorHalfRelu(var_42); + void *var_45 = tensorHalfConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_46 = tensorHalfAdd(var_45, conv2d_11_b); + void *var_47 = tensorHalfRelu(var_46); + void *var_49 = tensorHalfConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_50 = tensorHalfAdd(var_49, conv2d_12_b); + void *var_51 = tensorHalfAdd(var_43, var_50); + void *var_52 = tensorHalfRelu(var_51); + void *var_54 = tensorHalfConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_55 = tensorHalfAdd(var_54, conv2d_13_b); + void *var_56 = tensorHalfRelu(var_55); + void *var_58 = tensorHalfConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); + void *var_59 = tensorHalfAdd(var_58, conv2d_14_b); + void *var_60 = tensorHalfAdd(var_52, var_59); + void *var_61 = tensorHalfRelu(var_60); + void *var_63 = tensorHalfConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); + void *var_64 = tensorHalfAdd(var_63, conv2d_15_b); + void *var_65 = tensorHalfRelu(var_64); + void *var_67 = tensorHalfConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); + void *var_68 = tensorHalfAdd(var_67, conv2d_16_b); + void *var_70 = tensorHalfConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); + void *var_71 = tensorHalfAdd(var_70, conv2d_17_b); + void *var_72 = tensorHalfAdd(var_71, var_68); + void *var_73 = tensorHalfRelu(var_72); + void *var_75 = tensorHalfConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); + void *var_76 = tensorHalfAdd(var_75, conv2d_18_b); + void *var_77 = tensorHalfRelu(var_76); + void *var_79 = tensorHalfConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); + void *var_80 = tensorHalfAdd(var_79, conv2d_19_b); + void *var_81 = tensorHalfAdd(var_73, var_80); + void *var_82 = tensorHalfRelu(var_81); + void *var_84 = tensorHalfConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); + void *var_85 = tensorHalfAdd(var_84, conv2d_20_b); + void *var_86 = tensorHalfRelu(var_85); + void *var_88 = tensorHalfConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); + void *var_89 = tensorHalfAdd(var_88, conv2d_21_b); + void *var_90 = 
tensorHalfAdd(var_82, var_89); + void *var_91 = tensorHalfRelu(var_90); + void *var_92 = tensorHalfPooling(var_91, 1, 8, 8, 0, 0, 8, 8); + void *var_94 = tensorHalfGemmGPU(var_92, dense_1_w); + void *var_95 = tensorHalfAdd(var_94, dense_1_b); + void *var_96 = tensorSoftmax(var_95); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_96); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -213,9 +256,7 @@ int main(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc index 9ac1deea68c693f8baf2df2d9f2b626b3597ad7f..fff901a330c334ec31b7da0709ae9acd5b39f634 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar100_half.cc @@ -1,160 +1,186 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> #include "../../../tensor_runtime/include/tensor_runtime.h" #include "../../include/utils.h" -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); 
- void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); - - - startMemTracking(); - - int test_input_size = 2000; - int batch_size = 1000; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; - - for(int i = 0; i < batch_count; i++){ - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); - - void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); - void* var_2 = tensorHalfRelu(var_1); - void* var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_5 = tensorHalfAdd(var_4, conv2d_2_b); - void* var_6 
= tensorHalfRelu(var_5); - void* var_7 = tensorHalfPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_9 = tensorHalfAdd(var_8, conv2d_3_b); - void* var_10 = tensorHalfRelu(var_9); - void* var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_13 = tensorHalfAdd(var_12, conv2d_4_b); - void* var_14 = tensorHalfRelu(var_13); - void* var_15 = tensorHalfPooling(var_14,0,2,2,0,0,2,2); - void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); - void* var_18 = tensorHalfRelu(var_17); - void* var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_21 = tensorHalfAdd(var_20, conv2d_6_b); - void* var_22 = tensorHalfRelu(var_21); - void* var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorHalfAdd(var_24, conv2d_7_b); - void* var_26 = tensorHalfRelu(var_25); - void* var_27 = tensorHalfPooling(var_26,0,2,2,0,0,2,2); - void* var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorHalfAdd(var_28, conv2d_8_b); - void* var_30 = tensorHalfRelu(var_29); - void* var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_33 = tensorHalfAdd(var_32, conv2d_9_b); - void* var_34 = tensorHalfRelu(var_33); - void* var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); - void* var_37 = tensorHalfAdd(var_36, conv2d_10_b); - void* var_38 = tensorHalfRelu(var_37); - void* var_39 = tensorHalfPooling(var_38,0,2,2,0,0,2,2); - void* var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_41 = tensorHalfAdd(var_40, conv2d_11_b); - void* var_42 = tensorHalfRelu(var_41); - void* var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_45 = tensorHalfAdd(var_44, conv2d_12_b); - void* var_46 = tensorHalfRelu(var_45); - void* var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_49 = tensorHalfAdd(var_48, conv2d_13_b); - void* var_50 = tensorHalfRelu(var_49); - void* var_51 = tensorHalfPooling(var_50,0,2,2,0,0,2,2); - void* var_54 = tensorHalfGemmGPU(var_51, dense_1_w); - void* var_55 = tensorHalfAdd(var_54, dense_1_b); - void* var_56 = tensorHalfRelu(var_55); - void* var_58 = tensorHalfGemmGPU(var_56, dense_2_w); - void* var_59 = tensorHalfAdd(var_58, dense_2_b); - void* var_60 = tensorSoftmax(var_59); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); - final_accuracy += accuracy; - freeBatchMemory(); - - } - - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - - return 0; +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 
64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + 
readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1); + + startMemTracking(); + + int test_input_size = 2000; + int batch_size = 1000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_1 = tensorHalfAdd(var_0, conv2d_1_b); + void *var_2 = tensorHalfRelu(var_1); + void *var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_5 = tensorHalfAdd(var_4, conv2d_2_b); + void *var_6 = tensorHalfRelu(var_5); + void *var_7 = tensorHalfPooling(var_6, 0, 2, 2, 0, 0, 2, 2); + void *var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_9 = tensorHalfAdd(var_8, conv2d_3_b); + void *var_10 = tensorHalfRelu(var_9); + void *var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_13 = tensorHalfAdd(var_12, conv2d_4_b); + void *var_14 = tensorHalfRelu(var_13); + void *var_15 = tensorHalfPooling(var_14, 0, 2, 2, 0, 0, 2, 2); + void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorHalfAdd(var_16, conv2d_5_b); + void *var_18 = tensorHalfRelu(var_17); + void *var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_21 = tensorHalfAdd(var_20, conv2d_6_b); + void *var_22 = tensorHalfRelu(var_21); + void *var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorHalfAdd(var_24, conv2d_7_b); + void *var_26 = tensorHalfRelu(var_25); + void *var_27 = tensorHalfPooling(var_26, 0, 2, 2, 0, 0, 2, 2); + void *var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorHalfAdd(var_28, conv2d_8_b); + void *var_30 = tensorHalfRelu(var_29); + void *var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_33 = tensorHalfAdd(var_32, conv2d_9_b); + void *var_34 = tensorHalfRelu(var_33); + void *var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void *var_37 = tensorHalfAdd(var_36, conv2d_10_b); + void *var_38 = tensorHalfRelu(var_37); + void *var_39 = tensorHalfPooling(var_38, 0, 2, 2, 0, 0, 2, 2); + void *var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_41 = tensorHalfAdd(var_40, conv2d_11_b); + void *var_42 = tensorHalfRelu(var_41); + void *var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_45 = tensorHalfAdd(var_44, conv2d_12_b); + void *var_46 = tensorHalfRelu(var_45); + void *var_48 = 
tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_49 = tensorHalfAdd(var_48, conv2d_13_b); + void *var_50 = tensorHalfRelu(var_49); + void *var_51 = tensorHalfPooling(var_50, 0, 2, 2, 0, 0, 2, 2); + void *var_54 = tensorHalfGemmGPU(var_51, dense_1_w); + void *var_55 = tensorHalfAdd(var_54, dense_1_b); + void *var_56 = tensorHalfRelu(var_55); + void *var_58 = tensorHalfGemmGPU(var_56, dense_2_w); + void *var_59 = tensorHalfAdd(var_58, dense_2_b); + void *var_60 = tensorSoftmax(var_59); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); + final_accuracy += accuracy; + freeBatchMemory(); + } + + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); + + llvm_hpvm_cleanupTensorRt(); + + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc index f92bac10e27162fe0bc59c07aa4f9ede542ccd6e..3d6f0f3566914598279e2b0e070a7af287c388e5 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp16/vgg16_cifar10_half.cc @@ -1,82 +1,109 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> #include <string.h> #include "../../../tensor_runtime/include/tensor_runtime.h" -#include "../../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* 
conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); - +#include "../../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + 
std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string 
conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -85,83 +112,82 @@ int main(){ int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; - // Start power and performance profiling + // Start power and performance profiling startProfiling(); - for(int i = 0; i < batch_count; i++){ + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_1 = tensorHalfAdd(var_0, conv2d_1_b); - void* var_2 = tensorHalfRelu(var_1); - void* var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_5 = tensorHalfAdd(var_4, conv2d_2_b); - void* var_6 = tensorHalfRelu(var_5); - void* var_7 = tensorHalfPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_9 = tensorHalfAdd(var_8, conv2d_3_b); - void* var_10 = tensorHalfRelu(var_9); - void* var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_13 = tensorHalfAdd(var_12, conv2d_4_b); - void* var_14 = tensorHalfRelu(var_13); - void* var_15 = tensorHalfPooling(var_14,0,2,2,0,0,2,2); - void* var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorHalfAdd(var_16, conv2d_5_b); - void* var_18 = tensorHalfRelu(var_17); - void* var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_21 = tensorHalfAdd(var_20, conv2d_6_b); - void* var_22 = tensorHalfRelu(var_21); - void* var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorHalfAdd(var_24, conv2d_7_b); - void* var_26 = tensorHalfRelu(var_25); - void* var_27 = tensorHalfPooling(var_26,0,2,2,0,0,2,2); - void* var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorHalfAdd(var_28, conv2d_8_b); - void* var_30 = tensorHalfRelu(var_29); - void* var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_33 = tensorHalfAdd(var_32, conv2d_9_b); - void* var_34 = tensorHalfRelu(var_33); - void* var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); - void* var_37 = tensorHalfAdd(var_36, conv2d_10_b); - void* var_38 = tensorHalfRelu(var_37); - void* var_39 = tensorHalfPooling(var_38,0,2,2,0,0,2,2); - void* var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 
1, 1, 0); - void* var_41 = tensorHalfAdd(var_40, conv2d_11_b); - void* var_42 = tensorHalfRelu(var_41); - void* var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_45 = tensorHalfAdd(var_44, conv2d_12_b); - void* var_46 = tensorHalfRelu(var_45); - void* var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_49 = tensorHalfAdd(var_48, conv2d_13_b); - void* var_50 = tensorHalfRelu(var_49); - void* var_51 = tensorHalfPooling(var_50,0,2,2,0,0,2,2); - void* var_54 = tensorHalfGemmGPU(var_51, dense_1_w); - void* var_55 = tensorHalfAdd(var_54, dense_1_b); - void* var_56 = tensorHalfRelu(var_55); - void* var_58 = tensorHalfGemmGPU(var_56, dense_2_w); - void* var_59 = tensorHalfAdd(var_58, dense_2_b); - void* var_60 = tensorSoftmax(var_59); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_60); + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorHalfConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_1 = tensorHalfAdd(var_0, conv2d_1_b); + void *var_2 = tensorHalfRelu(var_1); + void *var_4 = tensorHalfConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_5 = tensorHalfAdd(var_4, conv2d_2_b); + void *var_6 = tensorHalfRelu(var_5); + void *var_7 = tensorHalfPooling(var_6, 0, 2, 2, 0, 0, 2, 2); + void *var_8 = tensorHalfConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_9 = tensorHalfAdd(var_8, conv2d_3_b); + void *var_10 = tensorHalfRelu(var_9); + void *var_12 = tensorHalfConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_13 = tensorHalfAdd(var_12, conv2d_4_b); + void *var_14 = tensorHalfRelu(var_13); + void *var_15 = tensorHalfPooling(var_14, 0, 2, 2, 0, 0, 2, 2); + void *var_16 = tensorHalfConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorHalfAdd(var_16, conv2d_5_b); + void *var_18 = tensorHalfRelu(var_17); + void *var_20 = tensorHalfConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_21 = tensorHalfAdd(var_20, conv2d_6_b); + void *var_22 = tensorHalfRelu(var_21); + void *var_24 = tensorHalfConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorHalfAdd(var_24, conv2d_7_b); + void *var_26 = tensorHalfRelu(var_25); + void *var_27 = tensorHalfPooling(var_26, 0, 2, 2, 0, 0, 2, 2); + void *var_28 = tensorHalfConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorHalfAdd(var_28, conv2d_8_b); + void *var_30 = tensorHalfRelu(var_29); + void *var_32 = tensorHalfConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_33 = tensorHalfAdd(var_32, conv2d_9_b); + void *var_34 = tensorHalfRelu(var_33); + void *var_36 = tensorHalfConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void *var_37 = tensorHalfAdd(var_36, conv2d_10_b); + void *var_38 = tensorHalfRelu(var_37); + void *var_39 = tensorHalfPooling(var_38, 0, 2, 2, 0, 0, 2, 2); + void *var_40 = tensorHalfConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_41 = tensorHalfAdd(var_40, conv2d_11_b); + void *var_42 = tensorHalfRelu(var_41); + void *var_44 = tensorHalfConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_45 = tensorHalfAdd(var_44, conv2d_12_b); + void *var_46 = tensorHalfRelu(var_45); + void *var_48 = tensorHalfConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_49 = tensorHalfAdd(var_48, conv2d_13_b); + void *var_50 = tensorHalfRelu(var_49); + void *var_51 = tensorHalfPooling(var_50, 0, 2, 2, 0, 0, 2, 
2); + void *var_54 = tensorHalfGemmGPU(var_51, dense_1_w); + void *var_55 = tensorHalfAdd(var_54, dense_1_b); + void *var_56 = tensorHalfRelu(var_55); + void *var_58 = tensorHalfGemmGPU(var_56, dense_2_w); + void *var_59 = tensorHalfAdd(var_58, dense_2_b); + void *var_60 = tensorSoftmax(var_59); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_60); final_accuracy += accuracy; - + freeBatchMemory(); } - // Start power and performance profiling + // Start power and performance profiling stopProfiling(); final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc index 50d9747f990d486c4543607d16d4a4ccb88b0517..20484a3a0bd67eeba88e44aeeffcae563512c349 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet2_cifar10.cc @@ -11,52 +11,62 @@ #include "../../tensor_runtime/include/tensor_runtime.h" #include "../include/utils.h" - - /* NOTE: Reference Architecture to use for profiling */ -void testCifarNet(){ +void testCifarNet() { printf("********* Alexnet2 CIFAR-10 DNN ********** \n"); - - - std::string dir_prefix = model_params_path + std::string("/alexnet2_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,32,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = 
readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,128,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - - - int conv_mode = 1; // NOTE: using CROSS_CORRELATION - int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + std::string dir_prefix = + model_params_path + std::string("/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); + + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 32, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 128, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = + 0; // NOTE: using Float as compute precision. 
FIXIT: use enum startMemTracking(); @@ -67,62 +77,61 @@ void testCifarNet(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* conv1out = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv1out, conv2d_1_b); - void* conv1_tanh = tensorTanh(conv1out); - + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *conv1out = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, conv_mode, + conv_precision); + tensorAdd(conv1out, conv2d_1_b); + void *conv1_tanh = tensorTanh(conv1out); + // 2nd Layer - void* conv2out = tensorConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv2out, conv2d_2_b); - void* conv2_tanh = tensorTanh(conv2out); - void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv2out = tensorConvolution(conv1_tanh, conv2d_2_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv2out, conv2d_2_b); + void *conv2_tanh = tensorTanh(conv2out); + void *pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + // 3rd Layer - void* conv3out = tensorConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv3out, conv2d_3_b); - void* conv3_tanh = tensorTanh(conv3out); + void *conv3out = tensorConvolution(pool2out, conv2d_3_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv3out, conv2d_3_b); + void *conv3_tanh = tensorTanh(conv3out); // 4th Layer - void* conv4out = tensorConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv4out, conv2d_4_b); - void* conv4_tanh = tensorTanh(conv4out); - void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv4out = tensorConvolution(conv3_tanh, conv2d_4_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv4out, conv2d_4_b); + void *conv4_tanh = tensorTanh(conv4out); + void *pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + // 5th Layer - void* conv5out = tensorConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv5out, conv2d_5_b); - void* conv5_tanh = tensorTanh(conv5out); + void *conv5out = tensorConvolution(pool4out, conv2d_5_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv5out, conv2d_5_b); + void *conv5_tanh = tensorTanh(conv5out); // 6th Layer - void* conv6out = tensorConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1, - conv_mode, conv_precision); - tensorAdd(conv6out, conv2d_6_b); - void* conv6_tanh = tensorTanh(conv6out); - void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); - + void *conv6out = tensorConvolution(conv5_tanh, conv2d_6_w, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv6out, conv2d_6_b); + void *conv6_tanh = tensorTanh(conv6out); + void *pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + // final FC Layer - void* gemm1out = tensorGemmGPU(pool6out, dense_1_w); - void* gemm1biasout = tensorAdd(gemm1out, dense_1_b); - void* result = tensorSoftmax(gemm1biasout); + void *gemm1out = tensorGemmGPU(pool6out, dense_1_w); + void *gemm1biasout = tensorAdd(gemm1out, dense_1_b); + void *result = tensorSoftmax(gemm1biasout); - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); - float accuracy = 
computeAccuracy2(labels, batch_size, result); + float accuracy = computeAccuracy2(labels, batch_size, result); final_accuracy += accuracy; - freeBatchMemory(); } @@ -130,11 +139,9 @@ void testCifarNet(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - } - -int main(int argc, char* argv[]){ +int main(int argc, char *argv[]) { llvm_hpvm_initTensorRt(0); @@ -144,4 +151,3 @@ int main(int argc, char* argv[]){ return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc index 1a76f1ae8ba6059124117b82cd72e8ccd6cdeba6..70d582d11cb47b8c51a52a72b1f9ca003cb0a305 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_cifar10.cc @@ -1,50 +1,59 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); - - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - - - +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../tensor_runtime/include/tensor_runtime.h" +#include 
"../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/alexnet_cifar10/"); + + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 4096, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); + startMemTracking(); int test_input_size = 5000; @@ -54,40 +63,40 @@ int main(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); - void* var_1 = tensorAdd(var_0, conv2d_1_b); - void* var_2 = tensorTanh(var_1); - void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); - void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); - void* var_6 = tensorAdd(var_5, conv2d_2_b); - void* var_7 = tensorTanh(var_6); - void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); - void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorAdd(var_10, conv2d_3_b); - void* var_12 = tensorTanh(var_11); - void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_14 = tensorAdd(var_13, conv2d_4_b); - void* var_15 = tensorTanh(var_14); - void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorAdd(var_16, conv2d_5_b); - void* var_18 = tensorTanh(var_17); - void* var_19 = 
tensorPooling(var_18,0,2,2,0,0,2,2); - void* var_22 = tensorGemmGPU(var_19, dense_1_w); - void* var_23 = tensorAdd(var_22, dense_1_b); - void* var_24 = tensorSoftmax(var_23); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_24); + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); + void *var_1 = tensorAdd(var_0, conv2d_1_b); + void *var_2 = tensorTanh(var_1); + void *var_3 = tensorPooling(var_2, 0, 2, 2, 0, 0, 2, 2); + void *var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); + void *var_6 = tensorAdd(var_5, conv2d_2_b); + void *var_7 = tensorTanh(var_6); + void *var_8 = tensorPooling(var_7, 0, 2, 2, 0, 0, 2, 2); + void *var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_11 = tensorAdd(var_10, conv2d_3_b); + void *var_12 = tensorTanh(var_11); + void *var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_14 = tensorAdd(var_13, conv2d_4_b); + void *var_15 = tensorTanh(var_14); + void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorAdd(var_16, conv2d_5_b); + void *var_18 = tensorTanh(var_17); + void *var_19 = tensorPooling(var_18, 0, 2, 2, 0, 0, 2, 2); + void *var_22 = tensorGemmGPU(var_19, dense_1_w); + void *var_23 = tensorAdd(var_22, dense_1_b); + void *var_24 = tensorSoftmax(var_23); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_24); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -96,9 +105,7 @@ int main(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); + llvm_hpvm_cleanupTensorRt(); - llvm_hpvm_cleanupTensorRt(); - - return 0; - + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc index aa518d77a1993ce5f0f47b4a29276aae6e6de0e5..9d7e8fe2a27e1ef0eff9ab78c225659b6f2ab67f 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/alexnet_imagenet.cc @@ -1,116 +1,126 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "tensor_runtime.h" -#include "utils.h" - - -int main(){ - - llvm_hpvm_initTensorRt(0); - - - std::string dir_prefix = std::string("/home/nvidia/sd_card/alexnet_imagenet_tune/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 
0,384,192,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,9216,4096); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); - std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin"); - void* dense_3_w = readTrainedWeights(dense_3_w_path.c_str(), 0,1,1,4096,1000); - std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); - void* dense_3_b = readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); - - - - startMemTracking(); - - int test_input_size = 1000; - int batch_size = 100; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; - - for(int i = 0; i < batch_count; i++){ - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); - - void* var_2 = tensorConvolution(input, conv2d_1_w, 2, 2, 4, 4, 1, 1); - void* var_3 = tensorAdd(var_2, conv2d_1_b); - void* var_4 = tensorRelu(var_3); - void* var_5 = tensorPooling(var_4,0,3,3,0,0,2,2); - void* var_7 = tensorConvolution(var_5, conv2d_2_w, 2, 2, 1, 1, 1, 1); - void* var_8 = tensorAdd(var_7, conv2d_2_b); - void* var_9 = tensorRelu(var_8); - void* var_10 = tensorPooling(var_9,0,3,3,0,0,2,2); - void* var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); - void* var_12 = tensorAdd(var_11, conv2d_3_b); - void* var_13 = tensorRelu(var_12); - void* var_14 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 1); - void* var_15 = tensorAdd(var_14, conv2d_4_b); - void* var_16 = tensorRelu(var_15); - void* var_17 = tensorConvolution(var_16, conv2d_5_w, 1, 1, 1, 1, 1, 1); - void* var_18 = tensorAdd(var_17, conv2d_5_b); - void* var_19 = tensorRelu(var_18); - void* var_20 = tensorPooling(var_19,0,3,3,0,0,2,2); - void* var_23 = tensorGemmGPU(var_20, dense_1_w); - void* var_24 = tensorAdd(var_23, dense_1_b); - void* var_25 = tensorRelu(var_24); - void* var_27 = tensorGemmGPU(var_25, dense_2_w); - void* var_28 = tensorAdd(var_27, dense_2_b); - void* var_29 = tensorRelu(var_28); - void* var_30 = tensorGemmGPU(var_29, dense_3_w); - void* var_31 = tensorAdd(var_30, dense_3_b); - void* var_32 = tensorSoftmax(var_31); - - uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy3(labels, var_32); - 
final_accuracy += accuracy; - freeBatchMemory(); - +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "tensor_runtime.h" +#include "utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = + std::string("/home/nvidia/sd_card/alexnet_imagenet_tune/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 11, 11); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 192, 64, 5, 5); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 192, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 384, 192, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 384, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 384, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 9216, 4096); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = + readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 4096, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 4096, 4096); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = + readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 4096, 1, 1); + std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin"); + void *dense_3_w = + readTrainedWeights(dense_3_w_path.c_str(), 0, 1, 1, 4096, 1000); + std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); + void *dense_3_b = + readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1); + + startMemTracking(); + + int test_input_size = 1000; + int batch_size = 100; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); + + void *var_2 = tensorConvolution(input, conv2d_1_w, 2, 2, 4, 4, 1, 1); + void *var_3 = 
tensorAdd(var_2, conv2d_1_b); + void *var_4 = tensorRelu(var_3); + void *var_5 = tensorPooling(var_4, 0, 3, 3, 0, 0, 2, 2); + void *var_7 = tensorConvolution(var_5, conv2d_2_w, 2, 2, 1, 1, 1, 1); + void *var_8 = tensorAdd(var_7, conv2d_2_b); + void *var_9 = tensorRelu(var_8); + void *var_10 = tensorPooling(var_9, 0, 3, 3, 0, 0, 2, 2); + void *var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); + void *var_12 = tensorAdd(var_11, conv2d_3_b); + void *var_13 = tensorRelu(var_12); + void *var_14 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 1); + void *var_15 = tensorAdd(var_14, conv2d_4_b); + void *var_16 = tensorRelu(var_15); + void *var_17 = tensorConvolution(var_16, conv2d_5_w, 1, 1, 1, 1, 1, 1); + void *var_18 = tensorAdd(var_17, conv2d_5_b); + void *var_19 = tensorRelu(var_18); + void *var_20 = tensorPooling(var_19, 0, 3, 3, 0, 0, 2, 2); + void *var_23 = tensorGemmGPU(var_20, dense_1_w); + void *var_24 = tensorAdd(var_23, dense_1_b); + void *var_25 = tensorRelu(var_24); + void *var_27 = tensorGemmGPU(var_25, dense_2_w); + void *var_28 = tensorAdd(var_27, dense_2_b); + void *var_29 = tensorRelu(var_28); + void *var_30 = tensorGemmGPU(var_29, dense_3_w); + void *var_31 = tensorAdd(var_30, dense_3_b); + void *var_32 = tensorSoftmax(var_31); + + uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy3(labels, var_32); + final_accuracy += accuracy; + freeBatchMemory(); } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); + llvm_hpvm_cleanupTensorRt(); - llvm_hpvm_cleanupTensorRt(); - - - return 0; - + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc index 7508f3119eeb469a164fad9741000308e3e8c031..c32efad92feb55b3900d5d316110759504f0692c 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/lenet_mnist.cc @@ -8,117 +8,109 @@ #include <sys/stat.h> #include <string.h> - #include "tensor_runtime.h" #include "utils.h" int total_runs = 1; - /* NOTE: Reference Architecture to use for profiling */ -void testLenetTanh(){ +void testLenetTanh() { printf("********* Lenet-2 Architecture ********** \n"); // FIXIT: Extend this to batch of images - currently 5 images int test_batch_size = 5000; - std::string dir_prefix = model_params_path + std::string("/lenet_mnist/"); + std::string dir_prefix = model_params_path + std::string("/lenet_mnist/"); + + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string labels32_path = dir_prefix + std::string("labels32.bin"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string labels32_path = dir_prefix + std::string("labels32.bin"); - // Loading Input Batch - void* input = readInputBatch(input_path.c_str(),0, 0,test_batch_size,1,28,28); - uint8_t* labels = readLabelsBatch(labels_path.c_str(), 0,test_batch_size); - - - void* conv1_filter = readTrainedWeights("../model_params/lenet_mnist/conv1.bin", - float_type, 32, 1, 5, 5); - void* conv1_bias = readTrainedWeights("../model_params/lenet_mnist/conv1_bias.bin", - float_type, 1, 32, 1, 1); - void* conv2_filter = 
readTrainedWeights("../model_params/lenet_mnist/conv2.bin", - float_type, 64, 32, 5, 5); - void* conv2_bias = readTrainedWeights("../model_params/lenet_mnist/conv2_bias.bin", - float_type, 1, 64, 1, 1); - void* fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin", - float_type, 1, 1, 7*7*64, 1024); - void* fc1_bias = readTrainedWeights("../model_params/lenet_mnist/fc1_bias.bin", - float_type, 1, 1024, 1, 1); - void* fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin", - float_type, 1, 1, 1024, 10); - void* fc2_bias = readTrainedWeights("../model_params/lenet_mnist/fc2_bias.bin", - float_type, 1, 10, 1, 1); - - - + void *input = + readInputBatch(input_path.c_str(), 0, 0, test_batch_size, 1, 28, 28); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), 0, test_batch_size); + + void *conv1_filter = readTrainedWeights( + "../model_params/lenet_mnist/conv1.bin", float_type, 32, 1, 5, 5); + void *conv1_bias = readTrainedWeights( + "../model_params/lenet_mnist/conv1_bias.bin", float_type, 1, 32, 1, 1); + void *conv2_filter = readTrainedWeights( + "../model_params/lenet_mnist/conv2.bin", float_type, 64, 32, 5, 5); + void *conv2_bias = readTrainedWeights( + "../model_params/lenet_mnist/conv2_bias.bin", float_type, 1, 64, 1, 1); + void *fc1_weights = readTrainedWeights("../model_params/lenet_mnist/fc1.bin", + float_type, 1, 1, 7 * 7 * 64, 1024); + void *fc1_bias = readTrainedWeights( + "../model_params/lenet_mnist/fc1_bias.bin", float_type, 1, 1024, 1, 1); + void *fc2_weights = readTrainedWeights("../model_params/lenet_mnist/fc2.bin", + float_type, 1, 1, 1024, 10); + void *fc2_bias = readTrainedWeights( + "../model_params/lenet_mnist/fc2_bias.bin", float_type, 1, 10, 1, 1); + clearTensorMap(); - - for(int i = 0; i < total_runs; i++){ + + for (int i = 0; i < total_runs; i++) { readOpenTunerFlags("opentuner_flags"); // Resets the OpenTuner counters - // Start power and performnce profiling + // Start power and performnce profiling startProfiling(); - + int conv_mode = 1; // NOTE: using CROSS_CORRELATION - int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + int conv_precision = + 0; // NOTE: using Float as compute precision. 
FIXIT: use enum // NOTE: 'SAME' convolution - void* conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1, - conv_mode, conv_precision); + void *conv1out = tensorConvolution(input, conv1_filter, 2, 2, 1, 1, + conv_mode, conv_precision); - // NOTE: For tensorAdd, the only dimension that MUST match is channels + // NOTE: For tensorAdd, the only dimension that MUST match is channels tensorAdd(conv1out, conv1_bias); // NOTE: In place operation - void* pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); + void *pool1out = tensorPooling(conv1out, 0, 2, 2, 0, 0, 2, 2); - void* conv1_tanh = tensorTanh(pool1out); + void *conv1_tanh = tensorTanh(pool1out); - // NOTE: input channels have to match between tensor op inputs and outputs - void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1, - conv_mode, conv_precision); + // NOTE: input channels have to match between tensor op inputs and outputs + void *conv2out = tensorConvolution(conv1_tanh, conv2_filter, 2, 2, 1, 1, + conv_mode, conv_precision); tensorAdd(conv2out, conv2_bias); // NOTE: In place operation - void* pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + void *pool2out = tensorPooling(conv2out, 0, 2, 2, 0, 0, 2, 2); + + void *conv2_tanh = tensorTanh(pool2out); - void* conv2_tanh = tensorTanh(pool2out); + void *gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights); - void* gemm1out = tensorGemmGPU(conv2_tanh, fc1_weights); + void *gemm1biasout = tensorAdd(gemm1out, fc1_bias); - void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + void *tanh1out = tensorTanh(gemm1biasout); - void* tanh1out = tensorTanh(gemm1biasout); - - void* gemm2out = tensorGemmGPU(tanh1out, fc2_weights); - - void* gemm2_biasout = tensorAdd(gemm2out, fc2_bias); + void *gemm2out = tensorGemmGPU(tanh1out, fc2_weights); - void* tanh2out = tensorTanh(gemm2_biasout); - - void* result = tensorSoftmax(tanh2out); + void *gemm2_biasout = tensorAdd(gemm2out, fc2_bias); + + void *tanh2out = tensorTanh(gemm2_biasout); + + void *result = tensorSoftmax(tanh2out); // End profiling and dump output to profile.txt stopProfiling(); - + float accuracy = computeAccuracy2(labels, test_batch_size, result); - dumpFinalAccuracy(accuracy); + dumpFinalAccuracy(accuracy); - - //FIXME: remove the comment below to use piped autotuner - //dumpAccuracyNorms(); - freeOutputTensors(); + // FIXME: remove the comment below to use piped autotuner + // dumpAccuracyNorms(); + freeOutputTensors(); } dumpExecutionAccuracies(); - - } +int main(int argc, char *argv[]) { - -int main(int argc, char* argv[]){ - - if (argc > 1){ + if (argc > 1) { total_runs = atoi(argv[1]); } @@ -130,4 +122,3 @@ int main(int argc, char* argv[]){ return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc index 7c311a568647caa107112bed4982fb57254dc7b3..0820d4467a123644a1a1660adbefc5101c2c6206 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/mobilenet.cc @@ -1,414 +1,732 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> #include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" +#include "../include/utils.h" -int main(){ +int main() { - 
llvm_hpvm_initTensorRt(0); + llvm_hpvm_initTensorRt(0); + std::string dir_prefix = model_params_path + std::string("/mobilenet/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 32, 3, 3, 3); + std::string batch_normalization_1_gamma_path = + dir_prefix + std::string("batch_normalization_1_gamma.bin"); + void *batch_normalization_1_gamma = readTrainedWeights( + batch_normalization_1_gamma_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_beta_path = + dir_prefix + std::string("batch_normalization_1_beta.bin"); + void *batch_normalization_1_beta = readTrainedWeights( + batch_normalization_1_beta_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_mean_path = + dir_prefix + std::string("batch_normalization_1_mean.bin"); + void *batch_normalization_1_mean = readTrainedWeights( + batch_normalization_1_mean_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_1_variance_path = + dir_prefix + std::string("batch_normalization_1_variance.bin"); + void *batch_normalization_1_variance = readTrainedWeights( + batch_normalization_1_variance_path.c_str(), 0, 1, 32, 1, 1); + std::string depthwise_conv2d_1_w_path = + dir_prefix + std::string("depthwise_conv2d_1_w.bin"); + void *depthwise_conv2d_1_w = + readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0, 32, 1, 3, 3); + std::string batch_normalization_2_gamma_path = + dir_prefix + std::string("batch_normalization_2_gamma.bin"); + void *batch_normalization_2_gamma = readTrainedWeights( + batch_normalization_2_gamma_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_beta_path = + dir_prefix + std::string("batch_normalization_2_beta.bin"); + void *batch_normalization_2_beta = readTrainedWeights( + batch_normalization_2_beta_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_mean_path = + dir_prefix + std::string("batch_normalization_2_mean.bin"); + void *batch_normalization_2_mean = readTrainedWeights( + batch_normalization_2_mean_path.c_str(), 0, 1, 32, 1, 1); + std::string batch_normalization_2_variance_path = + dir_prefix + std::string("batch_normalization_2_variance.bin"); + void *batch_normalization_2_variance = readTrainedWeights( + batch_normalization_2_variance_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 32, 1, 1); + std::string batch_normalization_3_gamma_path = + dir_prefix + std::string("batch_normalization_3_gamma.bin"); + void *batch_normalization_3_gamma = readTrainedWeights( + batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_beta_path = + dir_prefix + std::string("batch_normalization_3_beta.bin"); + void *batch_normalization_3_beta = readTrainedWeights( + batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_mean_path = + dir_prefix + std::string("batch_normalization_3_mean.bin"); + void *batch_normalization_3_mean = readTrainedWeights( + batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_variance_path = + dir_prefix + std::string("batch_normalization_3_variance.bin"); + void *batch_normalization_3_variance = readTrainedWeights( + batch_normalization_3_variance_path.c_str(), 
0, 1, 64, 1, 1); + std::string depthwise_conv2d_2_w_path = + dir_prefix + std::string("depthwise_conv2d_2_w.bin"); + void *depthwise_conv2d_2_w = + readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0, 64, 1, 3, 3); + std::string batch_normalization_4_gamma_path = + dir_prefix + std::string("batch_normalization_4_gamma.bin"); + void *batch_normalization_4_gamma = readTrainedWeights( + batch_normalization_4_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_beta_path = + dir_prefix + std::string("batch_normalization_4_beta.bin"); + void *batch_normalization_4_beta = readTrainedWeights( + batch_normalization_4_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_mean_path = + dir_prefix + std::string("batch_normalization_4_mean.bin"); + void *batch_normalization_4_mean = readTrainedWeights( + batch_normalization_4_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_4_variance_path = + dir_prefix + std::string("batch_normalization_4_variance.bin"); + void *batch_normalization_4_variance = readTrainedWeights( + batch_normalization_4_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 1, 1); + std::string batch_normalization_5_gamma_path = + dir_prefix + std::string("batch_normalization_5_gamma.bin"); + void *batch_normalization_5_gamma = readTrainedWeights( + batch_normalization_5_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_beta_path = + dir_prefix + std::string("batch_normalization_5_beta.bin"); + void *batch_normalization_5_beta = readTrainedWeights( + batch_normalization_5_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_mean_path = + dir_prefix + std::string("batch_normalization_5_mean.bin"); + void *batch_normalization_5_mean = readTrainedWeights( + batch_normalization_5_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_5_variance_path = + dir_prefix + std::string("batch_normalization_5_variance.bin"); + void *batch_normalization_5_variance = readTrainedWeights( + batch_normalization_5_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string depthwise_conv2d_3_w_path = + dir_prefix + std::string("depthwise_conv2d_3_w.bin"); + void *depthwise_conv2d_3_w = + readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0, 128, 1, 3, 3); + std::string batch_normalization_6_gamma_path = + dir_prefix + std::string("batch_normalization_6_gamma.bin"); + void *batch_normalization_6_gamma = readTrainedWeights( + batch_normalization_6_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_beta_path = + dir_prefix + std::string("batch_normalization_6_beta.bin"); + void *batch_normalization_6_beta = readTrainedWeights( + batch_normalization_6_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_mean_path = + dir_prefix + std::string("batch_normalization_6_mean.bin"); + void *batch_normalization_6_mean = readTrainedWeights( + batch_normalization_6_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_6_variance_path = + dir_prefix + std::string("batch_normalization_6_variance.bin"); + void *batch_normalization_6_variance = readTrainedWeights( + batch_normalization_6_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 1, 1); + std::string 
batch_normalization_7_gamma_path = + dir_prefix + std::string("batch_normalization_7_gamma.bin"); + void *batch_normalization_7_gamma = readTrainedWeights( + batch_normalization_7_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_beta_path = + dir_prefix + std::string("batch_normalization_7_beta.bin"); + void *batch_normalization_7_beta = readTrainedWeights( + batch_normalization_7_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_mean_path = + dir_prefix + std::string("batch_normalization_7_mean.bin"); + void *batch_normalization_7_mean = readTrainedWeights( + batch_normalization_7_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_7_variance_path = + dir_prefix + std::string("batch_normalization_7_variance.bin"); + void *batch_normalization_7_variance = readTrainedWeights( + batch_normalization_7_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string depthwise_conv2d_4_w_path = + dir_prefix + std::string("depthwise_conv2d_4_w.bin"); + void *depthwise_conv2d_4_w = + readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0, 128, 1, 3, 3); + std::string batch_normalization_8_gamma_path = + dir_prefix + std::string("batch_normalization_8_gamma.bin"); + void *batch_normalization_8_gamma = readTrainedWeights( + batch_normalization_8_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_beta_path = + dir_prefix + std::string("batch_normalization_8_beta.bin"); + void *batch_normalization_8_beta = readTrainedWeights( + batch_normalization_8_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_mean_path = + dir_prefix + std::string("batch_normalization_8_mean.bin"); + void *batch_normalization_8_mean = readTrainedWeights( + batch_normalization_8_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_8_variance_path = + dir_prefix + std::string("batch_normalization_8_variance.bin"); + void *batch_normalization_8_variance = readTrainedWeights( + batch_normalization_8_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 1, 1); + std::string batch_normalization_9_gamma_path = + dir_prefix + std::string("batch_normalization_9_gamma.bin"); + void *batch_normalization_9_gamma = readTrainedWeights( + batch_normalization_9_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_beta_path = + dir_prefix + std::string("batch_normalization_9_beta.bin"); + void *batch_normalization_9_beta = readTrainedWeights( + batch_normalization_9_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_mean_path = + dir_prefix + std::string("batch_normalization_9_mean.bin"); + void *batch_normalization_9_mean = readTrainedWeights( + batch_normalization_9_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_9_variance_path = + dir_prefix + std::string("batch_normalization_9_variance.bin"); + void *batch_normalization_9_variance = readTrainedWeights( + batch_normalization_9_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string depthwise_conv2d_5_w_path = + dir_prefix + std::string("depthwise_conv2d_5_w.bin"); + void *depthwise_conv2d_5_w = + readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0, 256, 1, 3, 3); + std::string batch_normalization_10_gamma_path = + dir_prefix + std::string("batch_normalization_10_gamma.bin"); + void *batch_normalization_10_gamma = readTrainedWeights( + 
batch_normalization_10_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_beta_path = + dir_prefix + std::string("batch_normalization_10_beta.bin"); + void *batch_normalization_10_beta = readTrainedWeights( + batch_normalization_10_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_mean_path = + dir_prefix + std::string("batch_normalization_10_mean.bin"); + void *batch_normalization_10_mean = readTrainedWeights( + batch_normalization_10_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_10_variance_path = + dir_prefix + std::string("batch_normalization_10_variance.bin"); + void *batch_normalization_10_variance = readTrainedWeights( + batch_normalization_10_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 1, 1); + std::string batch_normalization_11_gamma_path = + dir_prefix + std::string("batch_normalization_11_gamma.bin"); + void *batch_normalization_11_gamma = readTrainedWeights( + batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_beta_path = + dir_prefix + std::string("batch_normalization_11_beta.bin"); + void *batch_normalization_11_beta = readTrainedWeights( + batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_mean_path = + dir_prefix + std::string("batch_normalization_11_mean.bin"); + void *batch_normalization_11_mean = readTrainedWeights( + batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_variance_path = + dir_prefix + std::string("batch_normalization_11_variance.bin"); + void *batch_normalization_11_variance = readTrainedWeights( + batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string depthwise_conv2d_6_w_path = + dir_prefix + std::string("depthwise_conv2d_6_w.bin"); + void *depthwise_conv2d_6_w = + readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0, 256, 1, 3, 3); + std::string batch_normalization_12_gamma_path = + dir_prefix + std::string("batch_normalization_12_gamma.bin"); + void *batch_normalization_12_gamma = readTrainedWeights( + batch_normalization_12_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_beta_path = + dir_prefix + std::string("batch_normalization_12_beta.bin"); + void *batch_normalization_12_beta = readTrainedWeights( + batch_normalization_12_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_mean_path = + dir_prefix + std::string("batch_normalization_12_mean.bin"); + void *batch_normalization_12_mean = readTrainedWeights( + batch_normalization_12_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_12_variance_path = + dir_prefix + std::string("batch_normalization_12_variance.bin"); + void *batch_normalization_12_variance = readTrainedWeights( + batch_normalization_12_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 512, 256, 1, 1); + std::string batch_normalization_13_gamma_path = + dir_prefix + std::string("batch_normalization_13_gamma.bin"); + void *batch_normalization_13_gamma = readTrainedWeights( + batch_normalization_13_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_beta_path = + dir_prefix + std::string("batch_normalization_13_beta.bin"); + void 
*batch_normalization_13_beta = readTrainedWeights( + batch_normalization_13_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_mean_path = + dir_prefix + std::string("batch_normalization_13_mean.bin"); + void *batch_normalization_13_mean = readTrainedWeights( + batch_normalization_13_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_13_variance_path = + dir_prefix + std::string("batch_normalization_13_variance.bin"); + void *batch_normalization_13_variance = readTrainedWeights( + batch_normalization_13_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_7_w_path = + dir_prefix + std::string("depthwise_conv2d_7_w.bin"); + void *depthwise_conv2d_7_w = + readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_14_gamma_path = + dir_prefix + std::string("batch_normalization_14_gamma.bin"); + void *batch_normalization_14_gamma = readTrainedWeights( + batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_beta_path = + dir_prefix + std::string("batch_normalization_14_beta.bin"); + void *batch_normalization_14_beta = readTrainedWeights( + batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_mean_path = + dir_prefix + std::string("batch_normalization_14_mean.bin"); + void *batch_normalization_14_mean = readTrainedWeights( + batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_variance_path = + dir_prefix + std::string("batch_normalization_14_variance.bin"); + void *batch_normalization_14_variance = readTrainedWeights( + batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_15_gamma_path = + dir_prefix + std::string("batch_normalization_15_gamma.bin"); + void *batch_normalization_15_gamma = readTrainedWeights( + batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_beta_path = + dir_prefix + std::string("batch_normalization_15_beta.bin"); + void *batch_normalization_15_beta = readTrainedWeights( + batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_mean_path = + dir_prefix + std::string("batch_normalization_15_mean.bin"); + void *batch_normalization_15_mean = readTrainedWeights( + batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_variance_path = + dir_prefix + std::string("batch_normalization_15_variance.bin"); + void *batch_normalization_15_variance = readTrainedWeights( + batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_8_w_path = + dir_prefix + std::string("depthwise_conv2d_8_w.bin"); + void *depthwise_conv2d_8_w = + readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_16_gamma_path = + dir_prefix + std::string("batch_normalization_16_gamma.bin"); + void *batch_normalization_16_gamma = readTrainedWeights( + batch_normalization_16_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_16_beta_path = + dir_prefix + std::string("batch_normalization_16_beta.bin"); + void *batch_normalization_16_beta = readTrainedWeights( + batch_normalization_16_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string 
batch_normalization_16_mean_path = + dir_prefix + std::string("batch_normalization_16_mean.bin"); + void *batch_normalization_16_mean = readTrainedWeights( + batch_normalization_16_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_16_variance_path = + dir_prefix + std::string("batch_normalization_16_variance.bin"); + void *batch_normalization_16_variance = readTrainedWeights( + batch_normalization_16_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_17_gamma_path = + dir_prefix + std::string("batch_normalization_17_gamma.bin"); + void *batch_normalization_17_gamma = readTrainedWeights( + batch_normalization_17_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_beta_path = + dir_prefix + std::string("batch_normalization_17_beta.bin"); + void *batch_normalization_17_beta = readTrainedWeights( + batch_normalization_17_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_mean_path = + dir_prefix + std::string("batch_normalization_17_mean.bin"); + void *batch_normalization_17_mean = readTrainedWeights( + batch_normalization_17_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_17_variance_path = + dir_prefix + std::string("batch_normalization_17_variance.bin"); + void *batch_normalization_17_variance = readTrainedWeights( + batch_normalization_17_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_9_w_path = + dir_prefix + std::string("depthwise_conv2d_9_w.bin"); + void *depthwise_conv2d_9_w = + readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_18_gamma_path = + dir_prefix + std::string("batch_normalization_18_gamma.bin"); + void *batch_normalization_18_gamma = readTrainedWeights( + batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_beta_path = + dir_prefix + std::string("batch_normalization_18_beta.bin"); + void *batch_normalization_18_beta = readTrainedWeights( + batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_mean_path = + dir_prefix + std::string("batch_normalization_18_mean.bin"); + void *batch_normalization_18_mean = readTrainedWeights( + batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_variance_path = + dir_prefix + std::string("batch_normalization_18_variance.bin"); + void *batch_normalization_18_variance = readTrainedWeights( + batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_19_gamma_path = + dir_prefix + std::string("batch_normalization_19_gamma.bin"); + void *batch_normalization_19_gamma = readTrainedWeights( + batch_normalization_19_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_beta_path = + dir_prefix + std::string("batch_normalization_19_beta.bin"); + void *batch_normalization_19_beta = readTrainedWeights( + batch_normalization_19_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_mean_path = + dir_prefix + std::string("batch_normalization_19_mean.bin"); + void *batch_normalization_19_mean = readTrainedWeights( + 
batch_normalization_19_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_19_variance_path = + dir_prefix + std::string("batch_normalization_19_variance.bin"); + void *batch_normalization_19_variance = readTrainedWeights( + batch_normalization_19_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_10_w_path = + dir_prefix + std::string("depthwise_conv2d_10_w.bin"); + void *depthwise_conv2d_10_w = + readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_20_gamma_path = + dir_prefix + std::string("batch_normalization_20_gamma.bin"); + void *batch_normalization_20_gamma = readTrainedWeights( + batch_normalization_20_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_beta_path = + dir_prefix + std::string("batch_normalization_20_beta.bin"); + void *batch_normalization_20_beta = readTrainedWeights( + batch_normalization_20_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_mean_path = + dir_prefix + std::string("batch_normalization_20_mean.bin"); + void *batch_normalization_20_mean = readTrainedWeights( + batch_normalization_20_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_20_variance_path = + dir_prefix + std::string("batch_normalization_20_variance.bin"); + void *batch_normalization_20_variance = readTrainedWeights( + batch_normalization_20_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_21_gamma_path = + dir_prefix + std::string("batch_normalization_21_gamma.bin"); + void *batch_normalization_21_gamma = readTrainedWeights( + batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_beta_path = + dir_prefix + std::string("batch_normalization_21_beta.bin"); + void *batch_normalization_21_beta = readTrainedWeights( + batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_mean_path = + dir_prefix + std::string("batch_normalization_21_mean.bin"); + void *batch_normalization_21_mean = readTrainedWeights( + batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_variance_path = + dir_prefix + std::string("batch_normalization_21_variance.bin"); + void *batch_normalization_21_variance = readTrainedWeights( + batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_11_w_path = + dir_prefix + std::string("depthwise_conv2d_11_w.bin"); + void *depthwise_conv2d_11_w = + readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_22_gamma_path = + dir_prefix + std::string("batch_normalization_22_gamma.bin"); + void *batch_normalization_22_gamma = readTrainedWeights( + batch_normalization_22_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_beta_path = + dir_prefix + std::string("batch_normalization_22_beta.bin"); + void *batch_normalization_22_beta = readTrainedWeights( + batch_normalization_22_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_mean_path = + dir_prefix + std::string("batch_normalization_22_mean.bin"); + void *batch_normalization_22_mean = readTrainedWeights( + batch_normalization_22_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_22_variance_path = + dir_prefix + 
std::string("batch_normalization_22_variance.bin"); + void *batch_normalization_22_variance = readTrainedWeights( + batch_normalization_22_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 1, 1); + std::string batch_normalization_23_gamma_path = + dir_prefix + std::string("batch_normalization_23_gamma.bin"); + void *batch_normalization_23_gamma = readTrainedWeights( + batch_normalization_23_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_beta_path = + dir_prefix + std::string("batch_normalization_23_beta.bin"); + void *batch_normalization_23_beta = readTrainedWeights( + batch_normalization_23_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_mean_path = + dir_prefix + std::string("batch_normalization_23_mean.bin"); + void *batch_normalization_23_mean = readTrainedWeights( + batch_normalization_23_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_23_variance_path = + dir_prefix + std::string("batch_normalization_23_variance.bin"); + void *batch_normalization_23_variance = readTrainedWeights( + batch_normalization_23_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string depthwise_conv2d_12_w_path = + dir_prefix + std::string("depthwise_conv2d_12_w.bin"); + void *depthwise_conv2d_12_w = + readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0, 512, 1, 3, 3); + std::string batch_normalization_24_gamma_path = + dir_prefix + std::string("batch_normalization_24_gamma.bin"); + void *batch_normalization_24_gamma = readTrainedWeights( + batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_beta_path = + dir_prefix + std::string("batch_normalization_24_beta.bin"); + void *batch_normalization_24_beta = readTrainedWeights( + batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_mean_path = + dir_prefix + std::string("batch_normalization_24_mean.bin"); + void *batch_normalization_24_mean = readTrainedWeights( + batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_variance_path = + dir_prefix + std::string("batch_normalization_24_variance.bin"); + void *batch_normalization_24_variance = readTrainedWeights( + batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 1024, 512, 1, 1); + std::string batch_normalization_25_gamma_path = + dir_prefix + std::string("batch_normalization_25_gamma.bin"); + void *batch_normalization_25_gamma = readTrainedWeights( + batch_normalization_25_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_beta_path = + dir_prefix + std::string("batch_normalization_25_beta.bin"); + void *batch_normalization_25_beta = readTrainedWeights( + batch_normalization_25_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_mean_path = + dir_prefix + std::string("batch_normalization_25_mean.bin"); + void *batch_normalization_25_mean = readTrainedWeights( + batch_normalization_25_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_25_variance_path = + dir_prefix + std::string("batch_normalization_25_variance.bin"); + void *batch_normalization_25_variance = readTrainedWeights( + batch_normalization_25_variance_path.c_str(), 0, 
1, 1024, 1, 1); + std::string depthwise_conv2d_13_w_path = + dir_prefix + std::string("depthwise_conv2d_13_w.bin"); + void *depthwise_conv2d_13_w = + readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0, 1024, 1, 3, 3); + std::string batch_normalization_26_gamma_path = + dir_prefix + std::string("batch_normalization_26_gamma.bin"); + void *batch_normalization_26_gamma = readTrainedWeights( + batch_normalization_26_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_beta_path = + dir_prefix + std::string("batch_normalization_26_beta.bin"); + void *batch_normalization_26_beta = readTrainedWeights( + batch_normalization_26_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_mean_path = + dir_prefix + std::string("batch_normalization_26_mean.bin"); + void *batch_normalization_26_mean = readTrainedWeights( + batch_normalization_26_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_26_variance_path = + dir_prefix + std::string("batch_normalization_26_variance.bin"); + void *batch_normalization_26_variance = readTrainedWeights( + batch_normalization_26_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 1024, 1024, 1, 1); + std::string batch_normalization_27_gamma_path = + dir_prefix + std::string("batch_normalization_27_gamma.bin"); + void *batch_normalization_27_gamma = readTrainedWeights( + batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_beta_path = + dir_prefix + std::string("batch_normalization_27_beta.bin"); + void *batch_normalization_27_beta = readTrainedWeights( + batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_mean_path = + dir_prefix + std::string("batch_normalization_27_mean.bin"); + void *batch_normalization_27_mean = readTrainedWeights( + batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_variance_path = + dir_prefix + std::string("batch_normalization_27_variance.bin"); + void *batch_normalization_27_variance = readTrainedWeights( + batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 1024, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); - std::string dir_prefix = model_params_path + std::string("/mobilenet/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,32,3,3,3); - std::string batch_normalization_1_gamma_path = dir_prefix + std::string("batch_normalization_1_gamma.bin"); - void* batch_normalization_1_gamma = readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_beta_path = dir_prefix + std::string("batch_normalization_1_beta.bin"); - void* batch_normalization_1_beta = readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_mean_path = dir_prefix + std::string("batch_normalization_1_mean.bin"); - void* 
batch_normalization_1_mean = readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_1_variance_path = dir_prefix + std::string("batch_normalization_1_variance.bin"); - void* batch_normalization_1_variance = readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,32,1,1); - std::string depthwise_conv2d_1_w_path = dir_prefix + std::string("depthwise_conv2d_1_w.bin"); - void* depthwise_conv2d_1_w = readTrainedWeights(depthwise_conv2d_1_w_path.c_str(), 0,32,1,3,3); - std::string batch_normalization_2_gamma_path = dir_prefix + std::string("batch_normalization_2_gamma.bin"); - void* batch_normalization_2_gamma = readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_beta_path = dir_prefix + std::string("batch_normalization_2_beta.bin"); - void* batch_normalization_2_beta = readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_mean_path = dir_prefix + std::string("batch_normalization_2_mean.bin"); - void* batch_normalization_2_mean = readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,32,1,1); - std::string batch_normalization_2_variance_path = dir_prefix + std::string("batch_normalization_2_variance.bin"); - void* batch_normalization_2_variance = readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,32,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,32,1,1); - std::string batch_normalization_3_gamma_path = dir_prefix + std::string("batch_normalization_3_gamma.bin"); - void* batch_normalization_3_gamma = readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_beta_path = dir_prefix + std::string("batch_normalization_3_beta.bin"); - void* batch_normalization_3_beta = readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_mean_path = dir_prefix + std::string("batch_normalization_3_mean.bin"); - void* batch_normalization_3_mean = readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_variance_path = dir_prefix + std::string("batch_normalization_3_variance.bin"); - void* batch_normalization_3_variance = readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); - std::string depthwise_conv2d_2_w_path = dir_prefix + std::string("depthwise_conv2d_2_w.bin"); - void* depthwise_conv2d_2_w = readTrainedWeights(depthwise_conv2d_2_w_path.c_str(), 0,64,1,3,3); - std::string batch_normalization_4_gamma_path = dir_prefix + std::string("batch_normalization_4_gamma.bin"); - void* batch_normalization_4_gamma = readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_beta_path = dir_prefix + std::string("batch_normalization_4_beta.bin"); - void* batch_normalization_4_beta = readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_mean_path = dir_prefix + std::string("batch_normalization_4_mean.bin"); - void* batch_normalization_4_mean = readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_4_variance_path = dir_prefix + std::string("batch_normalization_4_variance.bin"); - void* batch_normalization_4_variance = readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,64,1,1); 
- std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,1,1); - std::string batch_normalization_5_gamma_path = dir_prefix + std::string("batch_normalization_5_gamma.bin"); - void* batch_normalization_5_gamma = readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_beta_path = dir_prefix + std::string("batch_normalization_5_beta.bin"); - void* batch_normalization_5_beta = readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_mean_path = dir_prefix + std::string("batch_normalization_5_mean.bin"); - void* batch_normalization_5_mean = readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_5_variance_path = dir_prefix + std::string("batch_normalization_5_variance.bin"); - void* batch_normalization_5_variance = readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,128,1,1); - std::string depthwise_conv2d_3_w_path = dir_prefix + std::string("depthwise_conv2d_3_w.bin"); - void* depthwise_conv2d_3_w = readTrainedWeights(depthwise_conv2d_3_w_path.c_str(), 0,128,1,3,3); - std::string batch_normalization_6_gamma_path = dir_prefix + std::string("batch_normalization_6_gamma.bin"); - void* batch_normalization_6_gamma = readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_beta_path = dir_prefix + std::string("batch_normalization_6_beta.bin"); - void* batch_normalization_6_beta = readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_mean_path = dir_prefix + std::string("batch_normalization_6_mean.bin"); - void* batch_normalization_6_mean = readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_6_variance_path = dir_prefix + std::string("batch_normalization_6_variance.bin"); - void* batch_normalization_6_variance = readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,1,1); - std::string batch_normalization_7_gamma_path = dir_prefix + std::string("batch_normalization_7_gamma.bin"); - void* batch_normalization_7_gamma = readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_beta_path = dir_prefix + std::string("batch_normalization_7_beta.bin"); - void* batch_normalization_7_beta = readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_mean_path = dir_prefix + std::string("batch_normalization_7_mean.bin"); - void* batch_normalization_7_mean = readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_7_variance_path = dir_prefix + std::string("batch_normalization_7_variance.bin"); - void* batch_normalization_7_variance = readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,128,1,1); - std::string depthwise_conv2d_4_w_path = dir_prefix + std::string("depthwise_conv2d_4_w.bin"); - void* depthwise_conv2d_4_w = readTrainedWeights(depthwise_conv2d_4_w_path.c_str(), 0,128,1,3,3); - std::string batch_normalization_8_gamma_path = dir_prefix + std::string("batch_normalization_8_gamma.bin"); - void* batch_normalization_8_gamma = 
readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_beta_path = dir_prefix + std::string("batch_normalization_8_beta.bin"); - void* batch_normalization_8_beta = readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_mean_path = dir_prefix + std::string("batch_normalization_8_mean.bin"); - void* batch_normalization_8_mean = readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_8_variance_path = dir_prefix + std::string("batch_normalization_8_variance.bin"); - void* batch_normalization_8_variance = readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,1,1); - std::string batch_normalization_9_gamma_path = dir_prefix + std::string("batch_normalization_9_gamma.bin"); - void* batch_normalization_9_gamma = readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_beta_path = dir_prefix + std::string("batch_normalization_9_beta.bin"); - void* batch_normalization_9_beta = readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_mean_path = dir_prefix + std::string("batch_normalization_9_mean.bin"); - void* batch_normalization_9_mean = readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_9_variance_path = dir_prefix + std::string("batch_normalization_9_variance.bin"); - void* batch_normalization_9_variance = readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,256,1,1); - std::string depthwise_conv2d_5_w_path = dir_prefix + std::string("depthwise_conv2d_5_w.bin"); - void* depthwise_conv2d_5_w = readTrainedWeights(depthwise_conv2d_5_w_path.c_str(), 0,256,1,3,3); - std::string batch_normalization_10_gamma_path = dir_prefix + std::string("batch_normalization_10_gamma.bin"); - void* batch_normalization_10_gamma = readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_beta_path = dir_prefix + std::string("batch_normalization_10_beta.bin"); - void* batch_normalization_10_beta = readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_mean_path = dir_prefix + std::string("batch_normalization_10_mean.bin"); - void* batch_normalization_10_mean = readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_10_variance_path = dir_prefix + std::string("batch_normalization_10_variance.bin"); - void* batch_normalization_10_variance = readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,1,1); - std::string batch_normalization_11_gamma_path = dir_prefix + std::string("batch_normalization_11_gamma.bin"); - void* batch_normalization_11_gamma = readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_beta_path = dir_prefix + std::string("batch_normalization_11_beta.bin"); - void* batch_normalization_11_beta = readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); - std::string 
batch_normalization_11_mean_path = dir_prefix + std::string("batch_normalization_11_mean.bin"); - void* batch_normalization_11_mean = readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_variance_path = dir_prefix + std::string("batch_normalization_11_variance.bin"); - void* batch_normalization_11_variance = readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); - std::string depthwise_conv2d_6_w_path = dir_prefix + std::string("depthwise_conv2d_6_w.bin"); - void* depthwise_conv2d_6_w = readTrainedWeights(depthwise_conv2d_6_w_path.c_str(), 0,256,1,3,3); - std::string batch_normalization_12_gamma_path = dir_prefix + std::string("batch_normalization_12_gamma.bin"); - void* batch_normalization_12_gamma = readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_beta_path = dir_prefix + std::string("batch_normalization_12_beta.bin"); - void* batch_normalization_12_beta = readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_mean_path = dir_prefix + std::string("batch_normalization_12_mean.bin"); - void* batch_normalization_12_mean = readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_12_variance_path = dir_prefix + std::string("batch_normalization_12_variance.bin"); - void* batch_normalization_12_variance = readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,512,256,1,1); - std::string batch_normalization_13_gamma_path = dir_prefix + std::string("batch_normalization_13_gamma.bin"); - void* batch_normalization_13_gamma = readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_beta_path = dir_prefix + std::string("batch_normalization_13_beta.bin"); - void* batch_normalization_13_beta = readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_mean_path = dir_prefix + std::string("batch_normalization_13_mean.bin"); - void* batch_normalization_13_mean = readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_13_variance_path = dir_prefix + std::string("batch_normalization_13_variance.bin"); - void* batch_normalization_13_variance = readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_7_w_path = dir_prefix + std::string("depthwise_conv2d_7_w.bin"); - void* depthwise_conv2d_7_w = readTrainedWeights(depthwise_conv2d_7_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_14_gamma_path = dir_prefix + std::string("batch_normalization_14_gamma.bin"); - void* batch_normalization_14_gamma = readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_beta_path = dir_prefix + std::string("batch_normalization_14_beta.bin"); - void* batch_normalization_14_beta = readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_mean_path = dir_prefix + std::string("batch_normalization_14_mean.bin"); - void* batch_normalization_14_mean = readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_variance_path = 
dir_prefix + std::string("batch_normalization_14_variance.bin"); - void* batch_normalization_14_variance = readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_15_gamma_path = dir_prefix + std::string("batch_normalization_15_gamma.bin"); - void* batch_normalization_15_gamma = readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_beta_path = dir_prefix + std::string("batch_normalization_15_beta.bin"); - void* batch_normalization_15_beta = readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_mean_path = dir_prefix + std::string("batch_normalization_15_mean.bin"); - void* batch_normalization_15_mean = readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_variance_path = dir_prefix + std::string("batch_normalization_15_variance.bin"); - void* batch_normalization_15_variance = readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_8_w_path = dir_prefix + std::string("depthwise_conv2d_8_w.bin"); - void* depthwise_conv2d_8_w = readTrainedWeights(depthwise_conv2d_8_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_16_gamma_path = dir_prefix + std::string("batch_normalization_16_gamma.bin"); - void* batch_normalization_16_gamma = readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_beta_path = dir_prefix + std::string("batch_normalization_16_beta.bin"); - void* batch_normalization_16_beta = readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_mean_path = dir_prefix + std::string("batch_normalization_16_mean.bin"); - void* batch_normalization_16_mean = readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_16_variance_path = dir_prefix + std::string("batch_normalization_16_variance.bin"); - void* batch_normalization_16_variance = readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_17_gamma_path = dir_prefix + std::string("batch_normalization_17_gamma.bin"); - void* batch_normalization_17_gamma = readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_beta_path = dir_prefix + std::string("batch_normalization_17_beta.bin"); - void* batch_normalization_17_beta = readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_mean_path = dir_prefix + std::string("batch_normalization_17_mean.bin"); - void* batch_normalization_17_mean = readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_17_variance_path = dir_prefix + std::string("batch_normalization_17_variance.bin"); - void* batch_normalization_17_variance = readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_9_w_path = dir_prefix + std::string("depthwise_conv2d_9_w.bin"); - void* 
depthwise_conv2d_9_w = readTrainedWeights(depthwise_conv2d_9_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_18_gamma_path = dir_prefix + std::string("batch_normalization_18_gamma.bin"); - void* batch_normalization_18_gamma = readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_beta_path = dir_prefix + std::string("batch_normalization_18_beta.bin"); - void* batch_normalization_18_beta = readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_mean_path = dir_prefix + std::string("batch_normalization_18_mean.bin"); - void* batch_normalization_18_mean = readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_variance_path = dir_prefix + std::string("batch_normalization_18_variance.bin"); - void* batch_normalization_18_variance = readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_19_gamma_path = dir_prefix + std::string("batch_normalization_19_gamma.bin"); - void* batch_normalization_19_gamma = readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_beta_path = dir_prefix + std::string("batch_normalization_19_beta.bin"); - void* batch_normalization_19_beta = readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_mean_path = dir_prefix + std::string("batch_normalization_19_mean.bin"); - void* batch_normalization_19_mean = readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_19_variance_path = dir_prefix + std::string("batch_normalization_19_variance.bin"); - void* batch_normalization_19_variance = readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_10_w_path = dir_prefix + std::string("depthwise_conv2d_10_w.bin"); - void* depthwise_conv2d_10_w = readTrainedWeights(depthwise_conv2d_10_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_20_gamma_path = dir_prefix + std::string("batch_normalization_20_gamma.bin"); - void* batch_normalization_20_gamma = readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_beta_path = dir_prefix + std::string("batch_normalization_20_beta.bin"); - void* batch_normalization_20_beta = readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_mean_path = dir_prefix + std::string("batch_normalization_20_mean.bin"); - void* batch_normalization_20_mean = readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_20_variance_path = dir_prefix + std::string("batch_normalization_20_variance.bin"); - void* batch_normalization_20_variance = readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_21_gamma_path = dir_prefix + std::string("batch_normalization_21_gamma.bin"); - void* batch_normalization_21_gamma = 
readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_beta_path = dir_prefix + std::string("batch_normalization_21_beta.bin"); - void* batch_normalization_21_beta = readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_mean_path = dir_prefix + std::string("batch_normalization_21_mean.bin"); - void* batch_normalization_21_mean = readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_variance_path = dir_prefix + std::string("batch_normalization_21_variance.bin"); - void* batch_normalization_21_variance = readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_11_w_path = dir_prefix + std::string("depthwise_conv2d_11_w.bin"); - void* depthwise_conv2d_11_w = readTrainedWeights(depthwise_conv2d_11_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_22_gamma_path = dir_prefix + std::string("batch_normalization_22_gamma.bin"); - void* batch_normalization_22_gamma = readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_beta_path = dir_prefix + std::string("batch_normalization_22_beta.bin"); - void* batch_normalization_22_beta = readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_mean_path = dir_prefix + std::string("batch_normalization_22_mean.bin"); - void* batch_normalization_22_mean = readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_22_variance_path = dir_prefix + std::string("batch_normalization_22_variance.bin"); - void* batch_normalization_22_variance = readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,1,1); - std::string batch_normalization_23_gamma_path = dir_prefix + std::string("batch_normalization_23_gamma.bin"); - void* batch_normalization_23_gamma = readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_beta_path = dir_prefix + std::string("batch_normalization_23_beta.bin"); - void* batch_normalization_23_beta = readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_mean_path = dir_prefix + std::string("batch_normalization_23_mean.bin"); - void* batch_normalization_23_mean = readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_23_variance_path = dir_prefix + std::string("batch_normalization_23_variance.bin"); - void* batch_normalization_23_variance = readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,512,1,1); - std::string depthwise_conv2d_12_w_path = dir_prefix + std::string("depthwise_conv2d_12_w.bin"); - void* depthwise_conv2d_12_w = readTrainedWeights(depthwise_conv2d_12_w_path.c_str(), 0,512,1,3,3); - std::string batch_normalization_24_gamma_path = dir_prefix + std::string("batch_normalization_24_gamma.bin"); - void* batch_normalization_24_gamma = readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_beta_path = dir_prefix + std::string("batch_normalization_24_beta.bin"); - void* batch_normalization_24_beta = 
readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_mean_path = dir_prefix + std::string("batch_normalization_24_mean.bin"); - void* batch_normalization_24_mean = readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_variance_path = dir_prefix + std::string("batch_normalization_24_variance.bin"); - void* batch_normalization_24_variance = readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,1024,512,1,1); - std::string batch_normalization_25_gamma_path = dir_prefix + std::string("batch_normalization_25_gamma.bin"); - void* batch_normalization_25_gamma = readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_beta_path = dir_prefix + std::string("batch_normalization_25_beta.bin"); - void* batch_normalization_25_beta = readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_mean_path = dir_prefix + std::string("batch_normalization_25_mean.bin"); - void* batch_normalization_25_mean = readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_25_variance_path = dir_prefix + std::string("batch_normalization_25_variance.bin"); - void* batch_normalization_25_variance = readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,1024,1,1); - std::string depthwise_conv2d_13_w_path = dir_prefix + std::string("depthwise_conv2d_13_w.bin"); - void* depthwise_conv2d_13_w = readTrainedWeights(depthwise_conv2d_13_w_path.c_str(), 0,1024,1,3,3); - std::string batch_normalization_26_gamma_path = dir_prefix + std::string("batch_normalization_26_gamma.bin"); - void* batch_normalization_26_gamma = readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_beta_path = dir_prefix + std::string("batch_normalization_26_beta.bin"); - void* batch_normalization_26_beta = readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_mean_path = dir_prefix + std::string("batch_normalization_26_mean.bin"); - void* batch_normalization_26_mean = readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_26_variance_path = dir_prefix + std::string("batch_normalization_26_variance.bin"); - void* batch_normalization_26_variance = readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,1024,1024,1,1); - std::string batch_normalization_27_gamma_path = dir_prefix + std::string("batch_normalization_27_gamma.bin"); - void* batch_normalization_27_gamma = readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_beta_path = dir_prefix + std::string("batch_normalization_27_beta.bin"); - void* batch_normalization_27_beta = readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_mean_path = dir_prefix + std::string("batch_normalization_27_mean.bin"); - void* batch_normalization_27_mean = readTrainedWeights(batch_normalization_27_mean_path.c_str(), 
0,1,1024,1,1); - std::string batch_normalization_27_variance_path = dir_prefix + std::string("batch_normalization_27_variance.bin"); - void* batch_normalization_27_variance = readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,1024,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); + startMemTracking(); + int test_input_size = 2000; + int batch_size = 2000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + for (int i = 0; i < batch_count; i++) { - startMemTracking(); + int start = i * batch_size; + int end = (i + 1) * batch_size; - int test_input_size = 2000; - int batch_size = 2000; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); - for(int i = 0; i < batch_count; i++){ + void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); + void *var_1 = tensorBatchNorm( + var_0, batch_normalization_1_gamma, batch_normalization_1_beta, + batch_normalization_1_mean, batch_normalization_1_variance, 0.001); + void *var_2 = tensorRelu(var_1); + void *var_4 = + tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); + void *var_5 = tensorBatchNorm( + var_4, batch_normalization_2_gamma, batch_normalization_2_beta, + batch_normalization_2_mean, batch_normalization_2_variance, 0.001); + void *var_6 = tensorRelu(var_5); + void *var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); + void *var_8 = tensorBatchNorm( + var_7, batch_normalization_3_gamma, batch_normalization_3_beta, + batch_normalization_3_mean, batch_normalization_3_variance, 0.001); + void *var_9 = tensorRelu(var_8); + void *var_11 = + tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); + void *var_12 = tensorBatchNorm( + var_11, batch_normalization_4_gamma, batch_normalization_4_beta, + batch_normalization_4_mean, batch_normalization_4_variance, 0.001); + void *var_13 = tensorRelu(var_12); + void *var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); + void *var_15 = tensorBatchNorm( + var_14, batch_normalization_5_gamma, batch_normalization_5_beta, + batch_normalization_5_mean, batch_normalization_5_variance, 0.001); + void *var_16 = tensorRelu(var_15); + void *var_18 = + tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); + void *var_19 = tensorBatchNorm( + var_18, batch_normalization_6_gamma, batch_normalization_6_beta, + batch_normalization_6_mean, batch_normalization_6_variance, 0.001); + void *var_20 = tensorRelu(var_19); + void *var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); + void *var_22 = tensorBatchNorm( + var_21, batch_normalization_7_gamma, batch_normalization_7_beta, + batch_normalization_7_mean, batch_normalization_7_variance, 0.001); + void *var_23 = tensorRelu(var_22); + void *var_26 = + tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); + void *var_27 = tensorBatchNorm( + var_26, batch_normalization_8_gamma, batch_normalization_8_beta, + batch_normalization_8_mean, batch_normalization_8_variance, 0.001); + void *var_28 = tensorRelu(var_27); + void *var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); + void *var_30 = tensorBatchNorm( + var_29, batch_normalization_9_gamma, 
batch_normalization_9_beta, + batch_normalization_9_mean, batch_normalization_9_variance, 0.001); + void *var_31 = tensorRelu(var_30); + void *var_33 = + tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); + void *var_34 = tensorBatchNorm( + var_33, batch_normalization_10_gamma, batch_normalization_10_beta, + batch_normalization_10_mean, batch_normalization_10_variance, 0.001); + void *var_35 = tensorRelu(var_34); + void *var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); + void *var_37 = tensorBatchNorm( + var_36, batch_normalization_11_gamma, batch_normalization_11_beta, + batch_normalization_11_mean, batch_normalization_11_variance, 0.001); + void *var_38 = tensorRelu(var_37); + void *var_41 = + tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); + void *var_42 = tensorBatchNorm( + var_41, batch_normalization_12_gamma, batch_normalization_12_beta, + batch_normalization_12_mean, batch_normalization_12_variance, 0.001); + void *var_43 = tensorRelu(var_42); + void *var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); + void *var_45 = tensorBatchNorm( + var_44, batch_normalization_13_gamma, batch_normalization_13_beta, + batch_normalization_13_mean, batch_normalization_13_variance, 0.001); + void *var_46 = tensorRelu(var_45); + void *var_48 = + tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); + void *var_49 = tensorBatchNorm( + var_48, batch_normalization_14_gamma, batch_normalization_14_beta, + batch_normalization_14_mean, batch_normalization_14_variance, 0.001); + void *var_50 = tensorRelu(var_49); + void *var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); + void *var_52 = tensorBatchNorm( + var_51, batch_normalization_15_gamma, batch_normalization_15_beta, + batch_normalization_15_mean, batch_normalization_15_variance, 0.001); + void *var_53 = tensorRelu(var_52); + void *var_55 = + tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); + void *var_56 = tensorBatchNorm( + var_55, batch_normalization_16_gamma, batch_normalization_16_beta, + batch_normalization_16_mean, batch_normalization_16_variance, 0.001); + void *var_57 = tensorRelu(var_56); + void *var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); + void *var_59 = tensorBatchNorm( + var_58, batch_normalization_17_gamma, batch_normalization_17_beta, + batch_normalization_17_mean, batch_normalization_17_variance, 0.001); + void *var_60 = tensorRelu(var_59); + void *var_63 = + tensorConvCutlass(var_60, depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); + void *var_64 = tensorBatchNorm( + var_63, batch_normalization_18_gamma, batch_normalization_18_beta, + batch_normalization_18_mean, batch_normalization_18_variance, 0.001); + void *var_65 = tensorRelu(var_64); + void *var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); + void *var_67 = tensorBatchNorm( + var_66, batch_normalization_19_gamma, batch_normalization_19_beta, + batch_normalization_19_mean, batch_normalization_19_variance, 0.001); + void *var_68 = tensorRelu(var_67); + void *var_70 = + tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); + void *var_71 = tensorBatchNorm( + var_70, batch_normalization_20_gamma, batch_normalization_20_beta, + batch_normalization_20_mean, batch_normalization_20_variance, 0.001); + void *var_72 = tensorRelu(var_71); + void *var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); + void *var_74 = tensorBatchNorm( + var_73, batch_normalization_21_gamma, batch_normalization_21_beta, + 
batch_normalization_21_mean, batch_normalization_21_variance, 0.001); + void *var_75 = tensorRelu(var_74); + void *var_77 = + tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); + void *var_78 = tensorBatchNorm( + var_77, batch_normalization_22_gamma, batch_normalization_22_beta, + batch_normalization_22_mean, batch_normalization_22_variance, 0.001); + void *var_79 = tensorRelu(var_78); + void *var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); + void *var_81 = tensorBatchNorm( + var_80, batch_normalization_23_gamma, batch_normalization_23_beta, + batch_normalization_23_mean, batch_normalization_23_variance, 0.001); + void *var_82 = tensorRelu(var_81); + void *var_85 = + tensorConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); + void *var_86 = tensorBatchNorm( + var_85, batch_normalization_24_gamma, batch_normalization_24_beta, + batch_normalization_24_mean, batch_normalization_24_variance, 0.001); + void *var_87 = tensorRelu(var_86); + void *var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); + void *var_89 = tensorBatchNorm( + var_88, batch_normalization_25_gamma, batch_normalization_25_beta, + batch_normalization_25_mean, batch_normalization_25_variance, 0.001); + void *var_90 = tensorRelu(var_89); + void *var_92 = + tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); + void *var_93 = tensorBatchNorm( + var_92, batch_normalization_26_gamma, batch_normalization_26_beta, + batch_normalization_26_mean, batch_normalization_26_variance, 0.001); + void *var_94 = tensorRelu(var_93); + void *var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); + void *var_96 = tensorBatchNorm( + var_95, batch_normalization_27_gamma, batch_normalization_27_beta, + batch_normalization_27_mean, batch_normalization_27_variance, 0.001); + void *var_97 = tensorRelu(var_96); + void *var_99 = tensorPooling(var_97, 1, 2, 2, 0, 0, 2, 2); + void *var_101 = tensorGemmGPU(var_99, dense_1_w); + void *var_102 = tensorAdd(var_101, dense_1_b); + void *var_103 = tensorSoftmax(var_102); - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); - void* var_1 = tensorBatchNorm(var_0, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); - void* var_2 = tensorRelu(var_1); - void* var_4 = tensorConvCutlass(var_2, depthwise_conv2d_1_w, 1, 1, 1, 1, 1, 32); - void* var_5 = tensorBatchNorm(var_4, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); - void* var_6 = tensorRelu(var_5); - void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); - void* var_8 = tensorBatchNorm(var_7, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); - void* var_9 = tensorRelu(var_8); - void* var_11 = tensorConvCutlass(var_9, depthwise_conv2d_2_w, 1, 1, 2, 2, 1, 64); - void* var_12 = tensorBatchNorm(var_11, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); - void* var_13 = tensorRelu(var_12); - void* var_14 = tensorConvolution(var_13, conv2d_3_w, 0, 0, 1, 1, 1, 1); - void* var_15 = tensorBatchNorm(var_14, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, 
batch_normalization_5_variance, 0.001); - void* var_16 = tensorRelu(var_15); - void* var_18 = tensorConvCutlass(var_16, depthwise_conv2d_3_w, 1, 1, 1, 1, 1, 128); - void* var_19 = tensorBatchNorm(var_18, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); - void* var_20 = tensorRelu(var_19); - void* var_21 = tensorConvolution(var_20, conv2d_4_w, 0, 0, 1, 1, 1, 1); - void* var_22 = tensorBatchNorm(var_21, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); - void* var_23 = tensorRelu(var_22); - void* var_26 = tensorConvCutlass(var_23, depthwise_conv2d_4_w, 1, 1, 2, 2, 1, 128); - void* var_27 = tensorBatchNorm(var_26, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); - void* var_28 = tensorRelu(var_27); - void* var_29 = tensorConvolution(var_28, conv2d_5_w, 0, 0, 1, 1, 1, 1); - void* var_30 = tensorBatchNorm(var_29, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); - void* var_31 = tensorRelu(var_30); - void* var_33 = tensorConvCutlass(var_31, depthwise_conv2d_5_w, 1, 1, 1, 1, 1, 256); - void* var_34 = tensorBatchNorm(var_33, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); - void* var_35 = tensorRelu(var_34); - void* var_36 = tensorConvolution(var_35, conv2d_6_w, 0, 0, 1, 1, 1, 1); - void* var_37 = tensorBatchNorm(var_36, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); - void* var_38 = tensorRelu(var_37); - void* var_41 = tensorConvCutlass(var_38, depthwise_conv2d_6_w, 1, 1, 2, 2, 1, 256); - void* var_42 = tensorBatchNorm(var_41, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); - void* var_43 = tensorRelu(var_42); - void* var_44 = tensorConvolution(var_43, conv2d_7_w, 0, 0, 1, 1, 1, 1); - void* var_45 = tensorBatchNorm(var_44, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); - void* var_46 = tensorRelu(var_45); - void* var_48 = tensorConvCutlass(var_46, depthwise_conv2d_7_w, 1, 1, 1, 1, 1, 512); - void* var_49 = tensorBatchNorm(var_48, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); - void* var_50 = tensorRelu(var_49); - void* var_51 = tensorConvolution(var_50, conv2d_8_w, 0, 0, 1, 1, 1, 1); - void* var_52 = tensorBatchNorm(var_51, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); - void* var_53 = tensorRelu(var_52); - void* var_55 = tensorConvCutlass(var_53, depthwise_conv2d_8_w, 1, 1, 1, 1, 1, 512); - void* var_56 = tensorBatchNorm(var_55, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); - void* var_57 = tensorRelu(var_56); - void* var_58 = tensorConvolution(var_57, conv2d_9_w, 0, 0, 1, 1, 1, 1); - void* var_59 = tensorBatchNorm(var_58, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); - void* var_60 = tensorRelu(var_59); - void* var_63 = tensorConvCutlass(var_60, 
depthwise_conv2d_9_w, 1, 1, 1, 1, 1, 512); - void* var_64 = tensorBatchNorm(var_63, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); - void* var_65 = tensorRelu(var_64); - void* var_66 = tensorConvolution(var_65, conv2d_10_w, 0, 0, 1, 1, 1, 1); - void* var_67 = tensorBatchNorm(var_66, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); - void* var_68 = tensorRelu(var_67); - void* var_70 = tensorConvCutlass(var_68, depthwise_conv2d_10_w, 1, 1, 1, 1, 1, 512); - void* var_71 = tensorBatchNorm(var_70, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); - void* var_72 = tensorRelu(var_71); - void* var_73 = tensorConvolution(var_72, conv2d_11_w, 0, 0, 1, 1, 1, 1); - void* var_74 = tensorBatchNorm(var_73, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); - void* var_75 = tensorRelu(var_74); - void* var_77 = tensorConvCutlass(var_75, depthwise_conv2d_11_w, 1, 1, 1, 1, 1, 512); - void* var_78 = tensorBatchNorm(var_77, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); - void* var_79 = tensorRelu(var_78); - void* var_80 = tensorConvolution(var_79, conv2d_12_w, 0, 0, 1, 1, 1, 1); - void* var_81 = tensorBatchNorm(var_80, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); - void* var_82 = tensorRelu(var_81); - void* var_85 = tensorConvCutlass(var_82, depthwise_conv2d_12_w, 1, 1, 2, 2, 1, 512); - void* var_86 = tensorBatchNorm(var_85, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); - void* var_87 = tensorRelu(var_86); - void* var_88 = tensorConvolution(var_87, conv2d_13_w, 0, 0, 1, 1, 1, 1); - void* var_89 = tensorBatchNorm(var_88, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); - void* var_90 = tensorRelu(var_89); - void* var_92 = tensorConvCutlass(var_90, depthwise_conv2d_13_w, 1, 1, 1, 1, 1, 1024); - void* var_93 = tensorBatchNorm(var_92, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); - void* var_94 = tensorRelu(var_93); - void* var_95 = tensorConvolution(var_94, conv2d_14_w, 0, 0, 1, 1, 1, 1); - void* var_96 = tensorBatchNorm(var_95, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); - void* var_97 = tensorRelu(var_96); - void* var_99 = tensorPooling(var_97,1,2,2,0,0,2,2); - void* var_101 = tensorGemmGPU(var_99, dense_1_w); - void* var_102 = tensorAdd(var_101, dense_1_b); - void* var_103 = tensorSoftmax(var_102); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy2(labels, batch_size, var_103); - final_accuracy += accuracy; - freeBatchMemory(); + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + float accuracy = computeAccuracy2(labels, batch_size, var_103); + final_accuracy += accuracy; + freeBatchMemory(); } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); - - - llvm_hpvm_cleanupTensorRt(); + final_accuracy = 
final_accuracy / batch_count;
+  dumpFinalAccuracy(final_accuracy);
 
-  return 0;
+  llvm_hpvm_cleanupTensorRt();
+  return 0;
 }
diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
index 87b8cd4156ed8d7f882ff7642420c995cd7c3a0f..dc462f3943546fa6d924ed92ab16ba517320bf17 100644
--- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
+++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet18_cifar10.cc
@@ -1,112 +1,155 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <string.h>
-#include "../../tensor_runtime/include/tensor_runtime.h"
-#include "../include/utils.h"
-
-int main(){
-
-  llvm_hpvm_initTensorRt(1);
-
-  std::string dir_prefix = model_params_path + std::string("/resnet18_cifar10/");
-  std::string input_path = dir_prefix + std::string("input.bin");
-  //void* input = readTrainedWeights(input_path.c_str(), 0, batch_size,3,32,32);
-  std::string labels_path = dir_prefix + std::string("labels.bin");
-  //uint8_t* labels = readLabels(labels_path.c_str(), batch_size);
-  std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin");
-  void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,16,3,3,3);
-  std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin");
-  void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,16,1,1);
-  std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin");
-  void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,16,16,3,3);
-  std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin");
-  void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,16,1,1);
-  std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin");
-  void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,16,16,3,3);
-  std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin");
-  void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,16,1,1);
-  std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin");
-  void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,16,16,3,3);
-  std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin");
-  void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,16,1,1);
-  std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin");
-  void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,16,16,3,3);
-  std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin");
-  void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,16,1,1);
-  std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin");
-  void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,16,16,3,3);
-  std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin");
-  void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,16,1,1);
-  std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin");
-  void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,16,16,3,3);
-  std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin");
-  void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,16,1,1);
-  std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin");
-  void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,32,16,3,3);
-  std::string conv2d_8_b_path = 
dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,32,16,1,1); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,32,32,3,3); - std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); - void* conv2d_14_b = readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,32,1,1); - std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); - void* conv2d_15_w = readTrainedWeights(conv2d_15_w_path.c_str(), 0,64,32,3,3); - std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); - void* conv2d_15_b = readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); - void* conv2d_17_w = readTrainedWeights(conv2d_17_w_path.c_str(), 0,64,32,1,1); - std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); - void* conv2d_17_b = readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); - void* conv2d_16_w = readTrainedWeights(conv2d_16_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); - void* conv2d_16_b = readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); - void* conv2d_18_w = readTrainedWeights(conv2d_18_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); - void* conv2d_18_b = readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); - void* conv2d_19_w = readTrainedWeights(conv2d_19_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); - void* conv2d_19_b = 
readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); - void* conv2d_20_w = readTrainedWeights(conv2d_20_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); - void* conv2d_20_b = readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); - void* conv2d_21_w = readTrainedWeights(conv2d_21_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); - void* conv2d_21_b = readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,64,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,64,10); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" +int main() { + + llvm_hpvm_initTensorRt(1); + + std::string dir_prefix = + model_params_path + std::string("/resnet18_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + // void* input = readTrainedWeights(input_path.c_str(), 0, + // batch_size,3,32,32); + std::string labels_path = dir_prefix + std::string("labels.bin"); + // uint8_t* labels = readLabels(labels_path.c_str(), batch_size); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 16, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + 
readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 16, 16, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 16, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 32, 16, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 32, 16, 1, 1); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 32, 32, 3, 3); + std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); + void *conv2d_14_b = + readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 32, 1, 1); + std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); + void *conv2d_15_w = + readTrainedWeights(conv2d_15_w_path.c_str(), 0, 64, 32, 3, 3); + std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); + void *conv2d_15_b = + readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); + void *conv2d_17_w = + readTrainedWeights(conv2d_17_w_path.c_str(), 0, 64, 32, 1, 1); + std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); + void *conv2d_17_b = + readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); + void *conv2d_16_w = + readTrainedWeights(conv2d_16_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_16_b_path = dir_prefix + 
std::string("conv2d_16_b.bin"); + void *conv2d_16_b = + readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); + void *conv2d_18_w = + readTrainedWeights(conv2d_18_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); + void *conv2d_18_b = + readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); + void *conv2d_19_w = + readTrainedWeights(conv2d_19_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); + void *conv2d_19_b = + readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); + void *conv2d_20_w = + readTrainedWeights(conv2d_20_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); + void *conv2d_20_b = + readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); + void *conv2d_21_w = + readTrainedWeights(conv2d_21_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); + void *conv2d_21_b = + readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 64, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 64, 10); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -117,94 +160,94 @@ int main(){ // NOTE: Starting time profiling startProfiling(); - - for(int i = 0; i < batch_count; i++){ + + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_3 = tensorAdd(var_2, conv2d_1_b); - void* var_4 = tensorRelu(var_3); - void* var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_7 = tensorAdd(var_6, conv2d_2_b); - void* var_8 = tensorRelu(var_7); - void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorAdd(var_10, conv2d_3_b); - void* var_12 = tensorAdd(var_4, var_11); - void* var_13 = tensorRelu(var_12); - void* var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_16 = tensorAdd(var_15, conv2d_4_b); - void* var_17 = tensorRelu(var_16); - void* var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_20 = tensorAdd(var_19, conv2d_5_b); - void* var_21 = tensorAdd(var_13, var_20); - void* var_22 = tensorRelu(var_21); - void* var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorAdd(var_24, conv2d_6_b); - void* var_26 = tensorRelu(var_25); - void* var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorAdd(var_28, conv2d_7_b); - void* var_30 = tensorAdd(var_22, var_29); - void* var_31 = tensorRelu(var_30); - void* var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); - void* var_34 = tensorAdd(var_33, conv2d_8_b); - void* var_35 = tensorRelu(var_34); - void* var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_38 = 
tensorAdd(var_37, conv2d_9_b); - void* var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); - void* var_41 = tensorAdd(var_40, conv2d_10_b); - void* var_42 = tensorAdd(var_41, var_38); - void* var_43 = tensorRelu(var_42); - void* var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_46 = tensorAdd(var_45, conv2d_11_b); - void* var_47 = tensorRelu(var_46); - void* var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_50 = tensorAdd(var_49, conv2d_12_b); - void* var_51 = tensorAdd(var_43, var_50); - void* var_52 = tensorRelu(var_51); - void* var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_55 = tensorAdd(var_54, conv2d_13_b); - void* var_56 = tensorRelu(var_55); - void* var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); - void* var_59 = tensorAdd(var_58, conv2d_14_b); - void* var_60 = tensorAdd(var_52, var_59); - void* var_61 = tensorRelu(var_60); - void* var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); - void* var_64 = tensorAdd(var_63, conv2d_15_b); - void* var_65 = tensorRelu(var_64); - void* var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); - void* var_68 = tensorAdd(var_67, conv2d_16_b); - void* var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); - void* var_71 = tensorAdd(var_70, conv2d_17_b); - void* var_72 = tensorAdd(var_71, var_68); - void* var_73 = tensorRelu(var_72); - void* var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); - void* var_76 = tensorAdd(var_75, conv2d_18_b); - void* var_77 = tensorRelu(var_76); - void* var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); - void* var_80 = tensorAdd(var_79, conv2d_19_b); - void* var_81 = tensorAdd(var_73, var_80); - void* var_82 = tensorRelu(var_81); - void* var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); - void* var_85 = tensorAdd(var_84, conv2d_20_b); - void* var_86 = tensorRelu(var_85); - void* var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); - void* var_89 = tensorAdd(var_88, conv2d_21_b); - void* var_90 = tensorAdd(var_82, var_89); - void* var_91 = tensorRelu(var_90); - void* var_92 = tensorPooling(var_91,1,8,8,0,0,8,8); - void* var_94 = tensorGemmGPU(var_92, dense_1_w); - void* var_95 = tensorAdd(var_94, dense_1_b); - void* var_96 = tensorSoftmax(var_95); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_96); + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_2 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_3 = tensorAdd(var_2, conv2d_1_b); + void *var_4 = tensorRelu(var_3); + void *var_6 = tensorConvolution(var_4, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_7 = tensorAdd(var_6, conv2d_2_b); + void *var_8 = tensorRelu(var_7); + void *var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_11 = tensorAdd(var_10, conv2d_3_b); + void *var_12 = tensorAdd(var_4, var_11); + void *var_13 = tensorRelu(var_12); + void *var_15 = tensorConvolution(var_13, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_16 = tensorAdd(var_15, conv2d_4_b); + void *var_17 = tensorRelu(var_16); + void *var_19 = tensorConvolution(var_17, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_20 = tensorAdd(var_19, conv2d_5_b); + void *var_21 = tensorAdd(var_13, var_20); + void *var_22 = tensorRelu(var_21); + void *var_24 = tensorConvolution(var_22, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void 
*var_25 = tensorAdd(var_24, conv2d_6_b); + void *var_26 = tensorRelu(var_25); + void *var_28 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorAdd(var_28, conv2d_7_b); + void *var_30 = tensorAdd(var_22, var_29); + void *var_31 = tensorRelu(var_30); + void *var_33 = tensorConvolution(var_31, conv2d_8_w, 1, 1, 2, 2, 1, 0); + void *var_34 = tensorAdd(var_33, conv2d_8_b); + void *var_35 = tensorRelu(var_34); + void *var_37 = tensorConvolution(var_35, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_38 = tensorAdd(var_37, conv2d_9_b); + void *var_40 = tensorConvolution(var_31, conv2d_10_w, 0, 0, 2, 2, 1, 0); + void *var_41 = tensorAdd(var_40, conv2d_10_b); + void *var_42 = tensorAdd(var_41, var_38); + void *var_43 = tensorRelu(var_42); + void *var_45 = tensorConvolution(var_43, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_46 = tensorAdd(var_45, conv2d_11_b); + void *var_47 = tensorRelu(var_46); + void *var_49 = tensorConvolution(var_47, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_50 = tensorAdd(var_49, conv2d_12_b); + void *var_51 = tensorAdd(var_43, var_50); + void *var_52 = tensorRelu(var_51); + void *var_54 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_55 = tensorAdd(var_54, conv2d_13_b); + void *var_56 = tensorRelu(var_55); + void *var_58 = tensorConvolution(var_56, conv2d_14_w, 1, 1, 1, 1, 1, 0); + void *var_59 = tensorAdd(var_58, conv2d_14_b); + void *var_60 = tensorAdd(var_52, var_59); + void *var_61 = tensorRelu(var_60); + void *var_63 = tensorConvolution(var_61, conv2d_15_w, 1, 1, 2, 2, 1, 0); + void *var_64 = tensorAdd(var_63, conv2d_15_b); + void *var_65 = tensorRelu(var_64); + void *var_67 = tensorConvolution(var_65, conv2d_16_w, 1, 1, 1, 1, 1, 0); + void *var_68 = tensorAdd(var_67, conv2d_16_b); + void *var_70 = tensorConvolution(var_61, conv2d_17_w, 0, 0, 2, 2, 1, 0); + void *var_71 = tensorAdd(var_70, conv2d_17_b); + void *var_72 = tensorAdd(var_71, var_68); + void *var_73 = tensorRelu(var_72); + void *var_75 = tensorConvolution(var_73, conv2d_18_w, 1, 1, 1, 1, 1, 0); + void *var_76 = tensorAdd(var_75, conv2d_18_b); + void *var_77 = tensorRelu(var_76); + void *var_79 = tensorConvolution(var_77, conv2d_19_w, 1, 1, 1, 1, 1, 0); + void *var_80 = tensorAdd(var_79, conv2d_19_b); + void *var_81 = tensorAdd(var_73, var_80); + void *var_82 = tensorRelu(var_81); + void *var_84 = tensorConvolution(var_82, conv2d_20_w, 1, 1, 1, 1, 1, 0); + void *var_85 = tensorAdd(var_84, conv2d_20_b); + void *var_86 = tensorRelu(var_85); + void *var_88 = tensorConvolution(var_86, conv2d_21_w, 1, 1, 1, 1, 1, 0); + void *var_89 = tensorAdd(var_88, conv2d_21_b); + void *var_90 = tensorAdd(var_82, var_89); + void *var_91 = tensorRelu(var_90); + void *var_92 = tensorPooling(var_91, 1, 8, 8, 0, 0, 8, 8); + void *var_94 = tensorGemmGPU(var_92, dense_1_w); + void *var_95 = tensorAdd(var_94, dense_1_b); + void *var_96 = tensorSoftmax(var_95); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_96); final_accuracy += accuracy; - + freeBatchMemory(); } @@ -213,9 +256,7 @@ int main(){ final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc index 
0914b3f70c353ee7e56c39ccf52f21914618301e..1329c0b9b880021f65d7307f37977cb76afad5fa 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/resnet50_imagenet.cc @@ -1,924 +1,1557 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" +int main() { -int main(){ + llvm_hpvm_initTensorRt(0); - llvm_hpvm_initTensorRt(0); + std::string dir_prefix = + model_params_path + std::string("/shared/hsharif3/resnet50_imagenet/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 7, 7); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_1_gamma_path = + dir_prefix + std::string("batch_normalization_1_gamma.bin"); + void *batch_normalization_1_gamma = readTrainedWeights( + batch_normalization_1_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_1_beta_path = + dir_prefix + std::string("batch_normalization_1_beta.bin"); + void *batch_normalization_1_beta = readTrainedWeights( + batch_normalization_1_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_1_mean_path = + dir_prefix + std::string("batch_normalization_1_mean.bin"); + void *batch_normalization_1_mean = readTrainedWeights( + batch_normalization_1_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_1_variance_path = + dir_prefix + std::string("batch_normalization_1_variance.bin"); + void *batch_normalization_1_variance = readTrainedWeights( + batch_normalization_1_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 1, 1); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_2_gamma_path = + dir_prefix + std::string("batch_normalization_2_gamma.bin"); + void *batch_normalization_2_gamma = readTrainedWeights( + batch_normalization_2_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_2_beta_path = + dir_prefix + std::string("batch_normalization_2_beta.bin"); + void *batch_normalization_2_beta = readTrainedWeights( + batch_normalization_2_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_2_mean_path = + dir_prefix + std::string("batch_normalization_2_mean.bin"); + void *batch_normalization_2_mean = readTrainedWeights( + batch_normalization_2_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_2_variance_path = + dir_prefix + std::string("batch_normalization_2_variance.bin"); + void *batch_normalization_2_variance = readTrainedWeights( + batch_normalization_2_variance_path.c_str(), 0, 1, 64, 
1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_gamma_path = + dir_prefix + std::string("batch_normalization_3_gamma.bin"); + void *batch_normalization_3_gamma = readTrainedWeights( + batch_normalization_3_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_beta_path = + dir_prefix + std::string("batch_normalization_3_beta.bin"); + void *batch_normalization_3_beta = readTrainedWeights( + batch_normalization_3_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_mean_path = + dir_prefix + std::string("batch_normalization_3_mean.bin"); + void *batch_normalization_3_mean = readTrainedWeights( + batch_normalization_3_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_3_variance_path = + dir_prefix + std::string("batch_normalization_3_variance.bin"); + void *batch_normalization_3_variance = readTrainedWeights( + batch_normalization_3_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 256, 64, 1, 1); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 64, 1, 1); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_4_gamma_path = + dir_prefix + std::string("batch_normalization_4_gamma.bin"); + void *batch_normalization_4_gamma = readTrainedWeights( + batch_normalization_4_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_4_beta_path = + dir_prefix + std::string("batch_normalization_4_beta.bin"); + void *batch_normalization_4_beta = readTrainedWeights( + batch_normalization_4_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_4_mean_path = + dir_prefix + std::string("batch_normalization_4_mean.bin"); + void *batch_normalization_4_mean = readTrainedWeights( + batch_normalization_4_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_4_variance_path = + dir_prefix + std::string("batch_normalization_4_variance.bin"); + void *batch_normalization_4_variance = readTrainedWeights( + batch_normalization_4_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_5_gamma_path = + dir_prefix + std::string("batch_normalization_5_gamma.bin"); + void *batch_normalization_5_gamma = readTrainedWeights( + batch_normalization_5_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_5_beta_path = + dir_prefix + std::string("batch_normalization_5_beta.bin"); + void *batch_normalization_5_beta = readTrainedWeights( + batch_normalization_5_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_5_mean_path = + dir_prefix + std::string("batch_normalization_5_mean.bin"); + void *batch_normalization_5_mean = readTrainedWeights( + batch_normalization_5_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string 
batch_normalization_5_variance_path = + dir_prefix + std::string("batch_normalization_5_variance.bin"); + void *batch_normalization_5_variance = readTrainedWeights( + batch_normalization_5_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 64, 256, 1, 1); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_6_gamma_path = + dir_prefix + std::string("batch_normalization_6_gamma.bin"); + void *batch_normalization_6_gamma = readTrainedWeights( + batch_normalization_6_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_6_beta_path = + dir_prefix + std::string("batch_normalization_6_beta.bin"); + void *batch_normalization_6_beta = readTrainedWeights( + batch_normalization_6_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_6_mean_path = + dir_prefix + std::string("batch_normalization_6_mean.bin"); + void *batch_normalization_6_mean = readTrainedWeights( + batch_normalization_6_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_6_variance_path = + dir_prefix + std::string("batch_normalization_6_variance.bin"); + void *batch_normalization_6_variance = readTrainedWeights( + batch_normalization_6_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_7_gamma_path = + dir_prefix + std::string("batch_normalization_7_gamma.bin"); + void *batch_normalization_7_gamma = readTrainedWeights( + batch_normalization_7_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_7_beta_path = + dir_prefix + std::string("batch_normalization_7_beta.bin"); + void *batch_normalization_7_beta = readTrainedWeights( + batch_normalization_7_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_7_mean_path = + dir_prefix + std::string("batch_normalization_7_mean.bin"); + void *batch_normalization_7_mean = readTrainedWeights( + batch_normalization_7_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_7_variance_path = + dir_prefix + std::string("batch_normalization_7_variance.bin"); + void *batch_normalization_7_variance = readTrainedWeights( + batch_normalization_7_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 256, 64, 1, 1); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_8_gamma_path = + dir_prefix + std::string("batch_normalization_8_gamma.bin"); + void *batch_normalization_8_gamma = readTrainedWeights( + batch_normalization_8_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_8_beta_path = + dir_prefix + std::string("batch_normalization_8_beta.bin"); + void *batch_normalization_8_beta = readTrainedWeights( + batch_normalization_8_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string 
batch_normalization_8_mean_path = + dir_prefix + std::string("batch_normalization_8_mean.bin"); + void *batch_normalization_8_mean = readTrainedWeights( + batch_normalization_8_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_8_variance_path = + dir_prefix + std::string("batch_normalization_8_variance.bin"); + void *batch_normalization_8_variance = readTrainedWeights( + batch_normalization_8_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 64, 256, 1, 1); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_9_gamma_path = + dir_prefix + std::string("batch_normalization_9_gamma.bin"); + void *batch_normalization_9_gamma = readTrainedWeights( + batch_normalization_9_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_9_beta_path = + dir_prefix + std::string("batch_normalization_9_beta.bin"); + void *batch_normalization_9_beta = readTrainedWeights( + batch_normalization_9_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_9_mean_path = + dir_prefix + std::string("batch_normalization_9_mean.bin"); + void *batch_normalization_9_mean = readTrainedWeights( + batch_normalization_9_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_9_variance_path = + dir_prefix + std::string("batch_normalization_9_variance.bin"); + void *batch_normalization_9_variance = readTrainedWeights( + batch_normalization_9_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_10_gamma_path = + dir_prefix + std::string("batch_normalization_10_gamma.bin"); + void *batch_normalization_10_gamma = readTrainedWeights( + batch_normalization_10_gamma_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_10_beta_path = + dir_prefix + std::string("batch_normalization_10_beta.bin"); + void *batch_normalization_10_beta = readTrainedWeights( + batch_normalization_10_beta_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_10_mean_path = + dir_prefix + std::string("batch_normalization_10_mean.bin"); + void *batch_normalization_10_mean = readTrainedWeights( + batch_normalization_10_mean_path.c_str(), 0, 1, 64, 1, 1); + std::string batch_normalization_10_variance_path = + dir_prefix + std::string("batch_normalization_10_variance.bin"); + void *batch_normalization_10_variance = readTrainedWeights( + batch_normalization_10_variance_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 256, 64, 1, 1); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_gamma_path = + dir_prefix + std::string("batch_normalization_11_gamma.bin"); + void *batch_normalization_11_gamma = readTrainedWeights( + batch_normalization_11_gamma_path.c_str(), 0, 1, 256, 1, 1); + 
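// A minimal sketch (not from the original sources) of how the repeated loads
// above could be collapsed: every parameter uses the same three-line
// readTrainedWeights() pattern, and each batch_normalization_N layer pulls in
// four 1 x C x 1 x 1 tensors (gamma, beta, mean, variance). The sketch assumes
// only the readTrainedWeights() call signature and the dir_prefix convention
// already used in these sources (declared via the headers included at the top
// of this file); loadParam, BNParams, and loadBNParams are illustrative names,
// not existing runtime APIs.
#include <string>

static void *loadParam(const std::string &dir_prefix, const std::string &name,
                       int dim1, int dim2, int dim3, int dim4) {
  std::string path = dir_prefix + name;
  return readTrainedWeights(path.c_str(), 0, dim1, dim2, dim3, dim4);
}

// One bundle per batch_normalization_N layer; each tensor is 1 x channels x 1 x 1.
struct BNParams {
  void *gamma;
  void *beta;
  void *mean;
  void *variance;
};

static BNParams loadBNParams(const std::string &dir_prefix,
                             const std::string &layer, int channels) {
  BNParams p;
  p.gamma = loadParam(dir_prefix, layer + "_gamma.bin", 1, channels, 1, 1);
  p.beta = loadParam(dir_prefix, layer + "_beta.bin", 1, channels, 1, 1);
  p.mean = loadParam(dir_prefix, layer + "_mean.bin", 1, channels, 1, 1);
  p.variance = loadParam(dir_prefix, layer + "_variance.bin", 1, channels, 1, 1);
  return p;
}

// Hypothetical usage, mirroring the conv2d_8 / batch_normalization_8 loads
// above (same dimensions):
//   void *conv2d_8_w = loadParam(dir_prefix, "conv2d_8_w.bin", 256, 64, 1, 1);
//   void *conv2d_8_b = loadParam(dir_prefix, "conv2d_8_b.bin", 1, 256, 1, 1);
//   BNParams bn8 = loadBNParams(dir_prefix, "batch_normalization_8", 256);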
std::string batch_normalization_11_beta_path = + dir_prefix + std::string("batch_normalization_11_beta.bin"); + void *batch_normalization_11_beta = readTrainedWeights( + batch_normalization_11_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_mean_path = + dir_prefix + std::string("batch_normalization_11_mean.bin"); + void *batch_normalization_11_mean = readTrainedWeights( + batch_normalization_11_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_11_variance_path = + dir_prefix + std::string("batch_normalization_11_variance.bin"); + void *batch_normalization_11_variance = readTrainedWeights( + batch_normalization_11_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 128, 256, 1, 1); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_12_gamma_path = + dir_prefix + std::string("batch_normalization_12_gamma.bin"); + void *batch_normalization_12_gamma = readTrainedWeights( + batch_normalization_12_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_12_beta_path = + dir_prefix + std::string("batch_normalization_12_beta.bin"); + void *batch_normalization_12_beta = readTrainedWeights( + batch_normalization_12_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_12_mean_path = + dir_prefix + std::string("batch_normalization_12_mean.bin"); + void *batch_normalization_12_mean = readTrainedWeights( + batch_normalization_12_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_12_variance_path = + dir_prefix + std::string("batch_normalization_12_variance.bin"); + void *batch_normalization_12_variance = readTrainedWeights( + batch_normalization_12_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_13_gamma_path = + dir_prefix + std::string("batch_normalization_13_gamma.bin"); + void *batch_normalization_13_gamma = readTrainedWeights( + batch_normalization_13_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_13_beta_path = + dir_prefix + std::string("batch_normalization_13_beta.bin"); + void *batch_normalization_13_beta = readTrainedWeights( + batch_normalization_13_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_13_mean_path = + dir_prefix + std::string("batch_normalization_13_mean.bin"); + void *batch_normalization_13_mean = readTrainedWeights( + batch_normalization_13_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_13_variance_path = + dir_prefix + std::string("batch_normalization_13_variance.bin"); + void *batch_normalization_13_variance = readTrainedWeights( + batch_normalization_13_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_14_w_path = dir_prefix + std::string("conv2d_14_w.bin"); + void *conv2d_14_w = + readTrainedWeights(conv2d_14_w_path.c_str(), 0, 512, 128, 1, 1); + std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); + void *conv2d_14_b = + 
readTrainedWeights(conv2d_14_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); + void *conv2d_15_w = + readTrainedWeights(conv2d_15_w_path.c_str(), 0, 512, 256, 1, 1); + std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); + void *conv2d_15_b = + readTrainedWeights(conv2d_15_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_gamma_path = + dir_prefix + std::string("batch_normalization_14_gamma.bin"); + void *batch_normalization_14_gamma = readTrainedWeights( + batch_normalization_14_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_beta_path = + dir_prefix + std::string("batch_normalization_14_beta.bin"); + void *batch_normalization_14_beta = readTrainedWeights( + batch_normalization_14_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_mean_path = + dir_prefix + std::string("batch_normalization_14_mean.bin"); + void *batch_normalization_14_mean = readTrainedWeights( + batch_normalization_14_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_14_variance_path = + dir_prefix + std::string("batch_normalization_14_variance.bin"); + void *batch_normalization_14_variance = readTrainedWeights( + batch_normalization_14_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_gamma_path = + dir_prefix + std::string("batch_normalization_15_gamma.bin"); + void *batch_normalization_15_gamma = readTrainedWeights( + batch_normalization_15_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_beta_path = + dir_prefix + std::string("batch_normalization_15_beta.bin"); + void *batch_normalization_15_beta = readTrainedWeights( + batch_normalization_15_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_mean_path = + dir_prefix + std::string("batch_normalization_15_mean.bin"); + void *batch_normalization_15_mean = readTrainedWeights( + batch_normalization_15_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_15_variance_path = + dir_prefix + std::string("batch_normalization_15_variance.bin"); + void *batch_normalization_15_variance = readTrainedWeights( + batch_normalization_15_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); + void *conv2d_16_w = + readTrainedWeights(conv2d_16_w_path.c_str(), 0, 128, 512, 1, 1); + std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); + void *conv2d_16_b = + readTrainedWeights(conv2d_16_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_16_gamma_path = + dir_prefix + std::string("batch_normalization_16_gamma.bin"); + void *batch_normalization_16_gamma = readTrainedWeights( + batch_normalization_16_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_16_beta_path = + dir_prefix + std::string("batch_normalization_16_beta.bin"); + void *batch_normalization_16_beta = readTrainedWeights( + batch_normalization_16_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_16_mean_path = + dir_prefix + std::string("batch_normalization_16_mean.bin"); + void *batch_normalization_16_mean = readTrainedWeights( + batch_normalization_16_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_16_variance_path = + dir_prefix + std::string("batch_normalization_16_variance.bin"); + void *batch_normalization_16_variance = readTrainedWeights( + batch_normalization_16_variance_path.c_str(), 
0, 1, 128, 1, 1); + std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); + void *conv2d_17_w = + readTrainedWeights(conv2d_17_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); + void *conv2d_17_b = + readTrainedWeights(conv2d_17_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_17_gamma_path = + dir_prefix + std::string("batch_normalization_17_gamma.bin"); + void *batch_normalization_17_gamma = readTrainedWeights( + batch_normalization_17_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_17_beta_path = + dir_prefix + std::string("batch_normalization_17_beta.bin"); + void *batch_normalization_17_beta = readTrainedWeights( + batch_normalization_17_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_17_mean_path = + dir_prefix + std::string("batch_normalization_17_mean.bin"); + void *batch_normalization_17_mean = readTrainedWeights( + batch_normalization_17_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_17_variance_path = + dir_prefix + std::string("batch_normalization_17_variance.bin"); + void *batch_normalization_17_variance = readTrainedWeights( + batch_normalization_17_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); + void *conv2d_18_w = + readTrainedWeights(conv2d_18_w_path.c_str(), 0, 512, 128, 1, 1); + std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); + void *conv2d_18_b = + readTrainedWeights(conv2d_18_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_gamma_path = + dir_prefix + std::string("batch_normalization_18_gamma.bin"); + void *batch_normalization_18_gamma = readTrainedWeights( + batch_normalization_18_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_beta_path = + dir_prefix + std::string("batch_normalization_18_beta.bin"); + void *batch_normalization_18_beta = readTrainedWeights( + batch_normalization_18_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_mean_path = + dir_prefix + std::string("batch_normalization_18_mean.bin"); + void *batch_normalization_18_mean = readTrainedWeights( + batch_normalization_18_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_18_variance_path = + dir_prefix + std::string("batch_normalization_18_variance.bin"); + void *batch_normalization_18_variance = readTrainedWeights( + batch_normalization_18_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); + void *conv2d_19_w = + readTrainedWeights(conv2d_19_w_path.c_str(), 0, 128, 512, 1, 1); + std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); + void *conv2d_19_b = + readTrainedWeights(conv2d_19_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_19_gamma_path = + dir_prefix + std::string("batch_normalization_19_gamma.bin"); + void *batch_normalization_19_gamma = readTrainedWeights( + batch_normalization_19_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_19_beta_path = + dir_prefix + std::string("batch_normalization_19_beta.bin"); + void *batch_normalization_19_beta = readTrainedWeights( + batch_normalization_19_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_19_mean_path = + dir_prefix + std::string("batch_normalization_19_mean.bin"); + void *batch_normalization_19_mean = readTrainedWeights( + 
batch_normalization_19_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_19_variance_path = + dir_prefix + std::string("batch_normalization_19_variance.bin"); + void *batch_normalization_19_variance = readTrainedWeights( + batch_normalization_19_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); + void *conv2d_20_w = + readTrainedWeights(conv2d_20_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); + void *conv2d_20_b = + readTrainedWeights(conv2d_20_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_20_gamma_path = + dir_prefix + std::string("batch_normalization_20_gamma.bin"); + void *batch_normalization_20_gamma = readTrainedWeights( + batch_normalization_20_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_20_beta_path = + dir_prefix + std::string("batch_normalization_20_beta.bin"); + void *batch_normalization_20_beta = readTrainedWeights( + batch_normalization_20_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_20_mean_path = + dir_prefix + std::string("batch_normalization_20_mean.bin"); + void *batch_normalization_20_mean = readTrainedWeights( + batch_normalization_20_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_20_variance_path = + dir_prefix + std::string("batch_normalization_20_variance.bin"); + void *batch_normalization_20_variance = readTrainedWeights( + batch_normalization_20_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); + void *conv2d_21_w = + readTrainedWeights(conv2d_21_w_path.c_str(), 0, 512, 128, 1, 1); + std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); + void *conv2d_21_b = + readTrainedWeights(conv2d_21_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_gamma_path = + dir_prefix + std::string("batch_normalization_21_gamma.bin"); + void *batch_normalization_21_gamma = readTrainedWeights( + batch_normalization_21_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_beta_path = + dir_prefix + std::string("batch_normalization_21_beta.bin"); + void *batch_normalization_21_beta = readTrainedWeights( + batch_normalization_21_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_mean_path = + dir_prefix + std::string("batch_normalization_21_mean.bin"); + void *batch_normalization_21_mean = readTrainedWeights( + batch_normalization_21_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_21_variance_path = + dir_prefix + std::string("batch_normalization_21_variance.bin"); + void *batch_normalization_21_variance = readTrainedWeights( + batch_normalization_21_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_22_w_path = dir_prefix + std::string("conv2d_22_w.bin"); + void *conv2d_22_w = + readTrainedWeights(conv2d_22_w_path.c_str(), 0, 128, 512, 1, 1); + std::string conv2d_22_b_path = dir_prefix + std::string("conv2d_22_b.bin"); + void *conv2d_22_b = + readTrainedWeights(conv2d_22_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_22_gamma_path = + dir_prefix + std::string("batch_normalization_22_gamma.bin"); + void *batch_normalization_22_gamma = readTrainedWeights( + batch_normalization_22_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_22_beta_path = + dir_prefix + std::string("batch_normalization_22_beta.bin"); + void 
*batch_normalization_22_beta = readTrainedWeights( + batch_normalization_22_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_22_mean_path = + dir_prefix + std::string("batch_normalization_22_mean.bin"); + void *batch_normalization_22_mean = readTrainedWeights( + batch_normalization_22_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_22_variance_path = + dir_prefix + std::string("batch_normalization_22_variance.bin"); + void *batch_normalization_22_variance = readTrainedWeights( + batch_normalization_22_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_23_w_path = dir_prefix + std::string("conv2d_23_w.bin"); + void *conv2d_23_w = + readTrainedWeights(conv2d_23_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_23_b_path = dir_prefix + std::string("conv2d_23_b.bin"); + void *conv2d_23_b = + readTrainedWeights(conv2d_23_b_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_23_gamma_path = + dir_prefix + std::string("batch_normalization_23_gamma.bin"); + void *batch_normalization_23_gamma = readTrainedWeights( + batch_normalization_23_gamma_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_23_beta_path = + dir_prefix + std::string("batch_normalization_23_beta.bin"); + void *batch_normalization_23_beta = readTrainedWeights( + batch_normalization_23_beta_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_23_mean_path = + dir_prefix + std::string("batch_normalization_23_mean.bin"); + void *batch_normalization_23_mean = readTrainedWeights( + batch_normalization_23_mean_path.c_str(), 0, 1, 128, 1, 1); + std::string batch_normalization_23_variance_path = + dir_prefix + std::string("batch_normalization_23_variance.bin"); + void *batch_normalization_23_variance = readTrainedWeights( + batch_normalization_23_variance_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_24_w_path = dir_prefix + std::string("conv2d_24_w.bin"); + void *conv2d_24_w = + readTrainedWeights(conv2d_24_w_path.c_str(), 0, 512, 128, 1, 1); + std::string conv2d_24_b_path = dir_prefix + std::string("conv2d_24_b.bin"); + void *conv2d_24_b = + readTrainedWeights(conv2d_24_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_gamma_path = + dir_prefix + std::string("batch_normalization_24_gamma.bin"); + void *batch_normalization_24_gamma = readTrainedWeights( + batch_normalization_24_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_beta_path = + dir_prefix + std::string("batch_normalization_24_beta.bin"); + void *batch_normalization_24_beta = readTrainedWeights( + batch_normalization_24_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_mean_path = + dir_prefix + std::string("batch_normalization_24_mean.bin"); + void *batch_normalization_24_mean = readTrainedWeights( + batch_normalization_24_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_24_variance_path = + dir_prefix + std::string("batch_normalization_24_variance.bin"); + void *batch_normalization_24_variance = readTrainedWeights( + batch_normalization_24_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_25_w_path = dir_prefix + std::string("conv2d_25_w.bin"); + void *conv2d_25_w = + readTrainedWeights(conv2d_25_w_path.c_str(), 0, 256, 512, 1, 1); + std::string conv2d_25_b_path = dir_prefix + std::string("conv2d_25_b.bin"); + void *conv2d_25_b = + readTrainedWeights(conv2d_25_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_25_gamma_path = + dir_prefix + 
std::string("batch_normalization_25_gamma.bin"); + void *batch_normalization_25_gamma = readTrainedWeights( + batch_normalization_25_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_25_beta_path = + dir_prefix + std::string("batch_normalization_25_beta.bin"); + void *batch_normalization_25_beta = readTrainedWeights( + batch_normalization_25_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_25_mean_path = + dir_prefix + std::string("batch_normalization_25_mean.bin"); + void *batch_normalization_25_mean = readTrainedWeights( + batch_normalization_25_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_25_variance_path = + dir_prefix + std::string("batch_normalization_25_variance.bin"); + void *batch_normalization_25_variance = readTrainedWeights( + batch_normalization_25_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_26_w_path = dir_prefix + std::string("conv2d_26_w.bin"); + void *conv2d_26_w = + readTrainedWeights(conv2d_26_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_26_b_path = dir_prefix + std::string("conv2d_26_b.bin"); + void *conv2d_26_b = + readTrainedWeights(conv2d_26_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_26_gamma_path = + dir_prefix + std::string("batch_normalization_26_gamma.bin"); + void *batch_normalization_26_gamma = readTrainedWeights( + batch_normalization_26_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_26_beta_path = + dir_prefix + std::string("batch_normalization_26_beta.bin"); + void *batch_normalization_26_beta = readTrainedWeights( + batch_normalization_26_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_26_mean_path = + dir_prefix + std::string("batch_normalization_26_mean.bin"); + void *batch_normalization_26_mean = readTrainedWeights( + batch_normalization_26_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_26_variance_path = + dir_prefix + std::string("batch_normalization_26_variance.bin"); + void *batch_normalization_26_variance = readTrainedWeights( + batch_normalization_26_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_27_w_path = dir_prefix + std::string("conv2d_27_w.bin"); + void *conv2d_27_w = + readTrainedWeights(conv2d_27_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_27_b_path = dir_prefix + std::string("conv2d_27_b.bin"); + void *conv2d_27_b = + readTrainedWeights(conv2d_27_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_28_w_path = dir_prefix + std::string("conv2d_28_w.bin"); + void *conv2d_28_w = + readTrainedWeights(conv2d_28_w_path.c_str(), 0, 1024, 512, 1, 1); + std::string conv2d_28_b_path = dir_prefix + std::string("conv2d_28_b.bin"); + void *conv2d_28_b = + readTrainedWeights(conv2d_28_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_gamma_path = + dir_prefix + std::string("batch_normalization_27_gamma.bin"); + void *batch_normalization_27_gamma = readTrainedWeights( + batch_normalization_27_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_beta_path = + dir_prefix + std::string("batch_normalization_27_beta.bin"); + void *batch_normalization_27_beta = readTrainedWeights( + batch_normalization_27_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_27_mean_path = + dir_prefix + std::string("batch_normalization_27_mean.bin"); + void *batch_normalization_27_mean = readTrainedWeights( + batch_normalization_27_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string 
batch_normalization_27_variance_path = + dir_prefix + std::string("batch_normalization_27_variance.bin"); + void *batch_normalization_27_variance = readTrainedWeights( + batch_normalization_27_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_28_gamma_path = + dir_prefix + std::string("batch_normalization_28_gamma.bin"); + void *batch_normalization_28_gamma = readTrainedWeights( + batch_normalization_28_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_28_beta_path = + dir_prefix + std::string("batch_normalization_28_beta.bin"); + void *batch_normalization_28_beta = readTrainedWeights( + batch_normalization_28_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_28_mean_path = + dir_prefix + std::string("batch_normalization_28_mean.bin"); + void *batch_normalization_28_mean = readTrainedWeights( + batch_normalization_28_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_28_variance_path = + dir_prefix + std::string("batch_normalization_28_variance.bin"); + void *batch_normalization_28_variance = readTrainedWeights( + batch_normalization_28_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_29_w_path = dir_prefix + std::string("conv2d_29_w.bin"); + void *conv2d_29_w = + readTrainedWeights(conv2d_29_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_29_b_path = dir_prefix + std::string("conv2d_29_b.bin"); + void *conv2d_29_b = + readTrainedWeights(conv2d_29_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_29_gamma_path = + dir_prefix + std::string("batch_normalization_29_gamma.bin"); + void *batch_normalization_29_gamma = readTrainedWeights( + batch_normalization_29_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_29_beta_path = + dir_prefix + std::string("batch_normalization_29_beta.bin"); + void *batch_normalization_29_beta = readTrainedWeights( + batch_normalization_29_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_29_mean_path = + dir_prefix + std::string("batch_normalization_29_mean.bin"); + void *batch_normalization_29_mean = readTrainedWeights( + batch_normalization_29_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_29_variance_path = + dir_prefix + std::string("batch_normalization_29_variance.bin"); + void *batch_normalization_29_variance = readTrainedWeights( + batch_normalization_29_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_30_w_path = dir_prefix + std::string("conv2d_30_w.bin"); + void *conv2d_30_w = + readTrainedWeights(conv2d_30_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_30_b_path = dir_prefix + std::string("conv2d_30_b.bin"); + void *conv2d_30_b = + readTrainedWeights(conv2d_30_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_30_gamma_path = + dir_prefix + std::string("batch_normalization_30_gamma.bin"); + void *batch_normalization_30_gamma = readTrainedWeights( + batch_normalization_30_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_30_beta_path = + dir_prefix + std::string("batch_normalization_30_beta.bin"); + void *batch_normalization_30_beta = readTrainedWeights( + batch_normalization_30_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_30_mean_path = + dir_prefix + std::string("batch_normalization_30_mean.bin"); + void *batch_normalization_30_mean = readTrainedWeights( + batch_normalization_30_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_30_variance_path = + 
dir_prefix + std::string("batch_normalization_30_variance.bin"); + void *batch_normalization_30_variance = readTrainedWeights( + batch_normalization_30_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_31_w_path = dir_prefix + std::string("conv2d_31_w.bin"); + void *conv2d_31_w = + readTrainedWeights(conv2d_31_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_31_b_path = dir_prefix + std::string("conv2d_31_b.bin"); + void *conv2d_31_b = + readTrainedWeights(conv2d_31_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_31_gamma_path = + dir_prefix + std::string("batch_normalization_31_gamma.bin"); + void *batch_normalization_31_gamma = readTrainedWeights( + batch_normalization_31_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_31_beta_path = + dir_prefix + std::string("batch_normalization_31_beta.bin"); + void *batch_normalization_31_beta = readTrainedWeights( + batch_normalization_31_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_31_mean_path = + dir_prefix + std::string("batch_normalization_31_mean.bin"); + void *batch_normalization_31_mean = readTrainedWeights( + batch_normalization_31_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_31_variance_path = + dir_prefix + std::string("batch_normalization_31_variance.bin"); + void *batch_normalization_31_variance = readTrainedWeights( + batch_normalization_31_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_32_w_path = dir_prefix + std::string("conv2d_32_w.bin"); + void *conv2d_32_w = + readTrainedWeights(conv2d_32_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_32_b_path = dir_prefix + std::string("conv2d_32_b.bin"); + void *conv2d_32_b = + readTrainedWeights(conv2d_32_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_32_gamma_path = + dir_prefix + std::string("batch_normalization_32_gamma.bin"); + void *batch_normalization_32_gamma = readTrainedWeights( + batch_normalization_32_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_32_beta_path = + dir_prefix + std::string("batch_normalization_32_beta.bin"); + void *batch_normalization_32_beta = readTrainedWeights( + batch_normalization_32_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_32_mean_path = + dir_prefix + std::string("batch_normalization_32_mean.bin"); + void *batch_normalization_32_mean = readTrainedWeights( + batch_normalization_32_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_32_variance_path = + dir_prefix + std::string("batch_normalization_32_variance.bin"); + void *batch_normalization_32_variance = readTrainedWeights( + batch_normalization_32_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_33_w_path = dir_prefix + std::string("conv2d_33_w.bin"); + void *conv2d_33_w = + readTrainedWeights(conv2d_33_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_33_b_path = dir_prefix + std::string("conv2d_33_b.bin"); + void *conv2d_33_b = + readTrainedWeights(conv2d_33_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_33_gamma_path = + dir_prefix + std::string("batch_normalization_33_gamma.bin"); + void *batch_normalization_33_gamma = readTrainedWeights( + batch_normalization_33_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_33_beta_path = + dir_prefix + std::string("batch_normalization_33_beta.bin"); + void *batch_normalization_33_beta = readTrainedWeights( + batch_normalization_33_beta_path.c_str(), 0, 1, 256, 
1, 1); + std::string batch_normalization_33_mean_path = + dir_prefix + std::string("batch_normalization_33_mean.bin"); + void *batch_normalization_33_mean = readTrainedWeights( + batch_normalization_33_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_33_variance_path = + dir_prefix + std::string("batch_normalization_33_variance.bin"); + void *batch_normalization_33_variance = readTrainedWeights( + batch_normalization_33_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_34_w_path = dir_prefix + std::string("conv2d_34_w.bin"); + void *conv2d_34_w = + readTrainedWeights(conv2d_34_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_34_b_path = dir_prefix + std::string("conv2d_34_b.bin"); + void *conv2d_34_b = + readTrainedWeights(conv2d_34_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_34_gamma_path = + dir_prefix + std::string("batch_normalization_34_gamma.bin"); + void *batch_normalization_34_gamma = readTrainedWeights( + batch_normalization_34_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_34_beta_path = + dir_prefix + std::string("batch_normalization_34_beta.bin"); + void *batch_normalization_34_beta = readTrainedWeights( + batch_normalization_34_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_34_mean_path = + dir_prefix + std::string("batch_normalization_34_mean.bin"); + void *batch_normalization_34_mean = readTrainedWeights( + batch_normalization_34_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_34_variance_path = + dir_prefix + std::string("batch_normalization_34_variance.bin"); + void *batch_normalization_34_variance = readTrainedWeights( + batch_normalization_34_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_35_w_path = dir_prefix + std::string("conv2d_35_w.bin"); + void *conv2d_35_w = + readTrainedWeights(conv2d_35_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_35_b_path = dir_prefix + std::string("conv2d_35_b.bin"); + void *conv2d_35_b = + readTrainedWeights(conv2d_35_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_35_gamma_path = + dir_prefix + std::string("batch_normalization_35_gamma.bin"); + void *batch_normalization_35_gamma = readTrainedWeights( + batch_normalization_35_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_35_beta_path = + dir_prefix + std::string("batch_normalization_35_beta.bin"); + void *batch_normalization_35_beta = readTrainedWeights( + batch_normalization_35_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_35_mean_path = + dir_prefix + std::string("batch_normalization_35_mean.bin"); + void *batch_normalization_35_mean = readTrainedWeights( + batch_normalization_35_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_35_variance_path = + dir_prefix + std::string("batch_normalization_35_variance.bin"); + void *batch_normalization_35_variance = readTrainedWeights( + batch_normalization_35_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_36_w_path = dir_prefix + std::string("conv2d_36_w.bin"); + void *conv2d_36_w = + readTrainedWeights(conv2d_36_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_36_b_path = dir_prefix + std::string("conv2d_36_b.bin"); + void *conv2d_36_b = + readTrainedWeights(conv2d_36_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_36_gamma_path = + dir_prefix + std::string("batch_normalization_36_gamma.bin"); + void *batch_normalization_36_gamma = 
readTrainedWeights( + batch_normalization_36_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_36_beta_path = + dir_prefix + std::string("batch_normalization_36_beta.bin"); + void *batch_normalization_36_beta = readTrainedWeights( + batch_normalization_36_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_36_mean_path = + dir_prefix + std::string("batch_normalization_36_mean.bin"); + void *batch_normalization_36_mean = readTrainedWeights( + batch_normalization_36_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_36_variance_path = + dir_prefix + std::string("batch_normalization_36_variance.bin"); + void *batch_normalization_36_variance = readTrainedWeights( + batch_normalization_36_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_37_w_path = dir_prefix + std::string("conv2d_37_w.bin"); + void *conv2d_37_w = + readTrainedWeights(conv2d_37_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_37_b_path = dir_prefix + std::string("conv2d_37_b.bin"); + void *conv2d_37_b = + readTrainedWeights(conv2d_37_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_37_gamma_path = + dir_prefix + std::string("batch_normalization_37_gamma.bin"); + void *batch_normalization_37_gamma = readTrainedWeights( + batch_normalization_37_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_37_beta_path = + dir_prefix + std::string("batch_normalization_37_beta.bin"); + void *batch_normalization_37_beta = readTrainedWeights( + batch_normalization_37_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_37_mean_path = + dir_prefix + std::string("batch_normalization_37_mean.bin"); + void *batch_normalization_37_mean = readTrainedWeights( + batch_normalization_37_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_37_variance_path = + dir_prefix + std::string("batch_normalization_37_variance.bin"); + void *batch_normalization_37_variance = readTrainedWeights( + batch_normalization_37_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_38_w_path = dir_prefix + std::string("conv2d_38_w.bin"); + void *conv2d_38_w = + readTrainedWeights(conv2d_38_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_38_b_path = dir_prefix + std::string("conv2d_38_b.bin"); + void *conv2d_38_b = + readTrainedWeights(conv2d_38_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_38_gamma_path = + dir_prefix + std::string("batch_normalization_38_gamma.bin"); + void *batch_normalization_38_gamma = readTrainedWeights( + batch_normalization_38_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_38_beta_path = + dir_prefix + std::string("batch_normalization_38_beta.bin"); + void *batch_normalization_38_beta = readTrainedWeights( + batch_normalization_38_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_38_mean_path = + dir_prefix + std::string("batch_normalization_38_mean.bin"); + void *batch_normalization_38_mean = readTrainedWeights( + batch_normalization_38_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_38_variance_path = + dir_prefix + std::string("batch_normalization_38_variance.bin"); + void *batch_normalization_38_variance = readTrainedWeights( + batch_normalization_38_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_39_w_path = dir_prefix + std::string("conv2d_39_w.bin"); + void *conv2d_39_w = + readTrainedWeights(conv2d_39_w_path.c_str(), 0, 256, 256, 3, 3); + std::string 
conv2d_39_b_path = dir_prefix + std::string("conv2d_39_b.bin"); + void *conv2d_39_b = + readTrainedWeights(conv2d_39_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_39_gamma_path = + dir_prefix + std::string("batch_normalization_39_gamma.bin"); + void *batch_normalization_39_gamma = readTrainedWeights( + batch_normalization_39_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_39_beta_path = + dir_prefix + std::string("batch_normalization_39_beta.bin"); + void *batch_normalization_39_beta = readTrainedWeights( + batch_normalization_39_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_39_mean_path = + dir_prefix + std::string("batch_normalization_39_mean.bin"); + void *batch_normalization_39_mean = readTrainedWeights( + batch_normalization_39_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_39_variance_path = + dir_prefix + std::string("batch_normalization_39_variance.bin"); + void *batch_normalization_39_variance = readTrainedWeights( + batch_normalization_39_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_40_w_path = dir_prefix + std::string("conv2d_40_w.bin"); + void *conv2d_40_w = + readTrainedWeights(conv2d_40_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_40_b_path = dir_prefix + std::string("conv2d_40_b.bin"); + void *conv2d_40_b = + readTrainedWeights(conv2d_40_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_40_gamma_path = + dir_prefix + std::string("batch_normalization_40_gamma.bin"); + void *batch_normalization_40_gamma = readTrainedWeights( + batch_normalization_40_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_40_beta_path = + dir_prefix + std::string("batch_normalization_40_beta.bin"); + void *batch_normalization_40_beta = readTrainedWeights( + batch_normalization_40_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_40_mean_path = + dir_prefix + std::string("batch_normalization_40_mean.bin"); + void *batch_normalization_40_mean = readTrainedWeights( + batch_normalization_40_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_40_variance_path = + dir_prefix + std::string("batch_normalization_40_variance.bin"); + void *batch_normalization_40_variance = readTrainedWeights( + batch_normalization_40_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_41_w_path = dir_prefix + std::string("conv2d_41_w.bin"); + void *conv2d_41_w = + readTrainedWeights(conv2d_41_w_path.c_str(), 0, 256, 1024, 1, 1); + std::string conv2d_41_b_path = dir_prefix + std::string("conv2d_41_b.bin"); + void *conv2d_41_b = + readTrainedWeights(conv2d_41_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_41_gamma_path = + dir_prefix + std::string("batch_normalization_41_gamma.bin"); + void *batch_normalization_41_gamma = readTrainedWeights( + batch_normalization_41_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_41_beta_path = + dir_prefix + std::string("batch_normalization_41_beta.bin"); + void *batch_normalization_41_beta = readTrainedWeights( + batch_normalization_41_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_41_mean_path = + dir_prefix + std::string("batch_normalization_41_mean.bin"); + void *batch_normalization_41_mean = readTrainedWeights( + batch_normalization_41_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_41_variance_path = + dir_prefix + std::string("batch_normalization_41_variance.bin"); + void 
*batch_normalization_41_variance = readTrainedWeights( + batch_normalization_41_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_42_w_path = dir_prefix + std::string("conv2d_42_w.bin"); + void *conv2d_42_w = + readTrainedWeights(conv2d_42_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_42_b_path = dir_prefix + std::string("conv2d_42_b.bin"); + void *conv2d_42_b = + readTrainedWeights(conv2d_42_b_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_42_gamma_path = + dir_prefix + std::string("batch_normalization_42_gamma.bin"); + void *batch_normalization_42_gamma = readTrainedWeights( + batch_normalization_42_gamma_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_42_beta_path = + dir_prefix + std::string("batch_normalization_42_beta.bin"); + void *batch_normalization_42_beta = readTrainedWeights( + batch_normalization_42_beta_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_42_mean_path = + dir_prefix + std::string("batch_normalization_42_mean.bin"); + void *batch_normalization_42_mean = readTrainedWeights( + batch_normalization_42_mean_path.c_str(), 0, 1, 256, 1, 1); + std::string batch_normalization_42_variance_path = + dir_prefix + std::string("batch_normalization_42_variance.bin"); + void *batch_normalization_42_variance = readTrainedWeights( + batch_normalization_42_variance_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_43_w_path = dir_prefix + std::string("conv2d_43_w.bin"); + void *conv2d_43_w = + readTrainedWeights(conv2d_43_w_path.c_str(), 0, 1024, 256, 1, 1); + std::string conv2d_43_b_path = dir_prefix + std::string("conv2d_43_b.bin"); + void *conv2d_43_b = + readTrainedWeights(conv2d_43_b_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_43_gamma_path = + dir_prefix + std::string("batch_normalization_43_gamma.bin"); + void *batch_normalization_43_gamma = readTrainedWeights( + batch_normalization_43_gamma_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_43_beta_path = + dir_prefix + std::string("batch_normalization_43_beta.bin"); + void *batch_normalization_43_beta = readTrainedWeights( + batch_normalization_43_beta_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_43_mean_path = + dir_prefix + std::string("batch_normalization_43_mean.bin"); + void *batch_normalization_43_mean = readTrainedWeights( + batch_normalization_43_mean_path.c_str(), 0, 1, 1024, 1, 1); + std::string batch_normalization_43_variance_path = + dir_prefix + std::string("batch_normalization_43_variance.bin"); + void *batch_normalization_43_variance = readTrainedWeights( + batch_normalization_43_variance_path.c_str(), 0, 1, 1024, 1, 1); + std::string conv2d_44_w_path = dir_prefix + std::string("conv2d_44_w.bin"); + void *conv2d_44_w = + readTrainedWeights(conv2d_44_w_path.c_str(), 0, 512, 1024, 1, 1); + std::string conv2d_44_b_path = dir_prefix + std::string("conv2d_44_b.bin"); + void *conv2d_44_b = + readTrainedWeights(conv2d_44_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_44_gamma_path = + dir_prefix + std::string("batch_normalization_44_gamma.bin"); + void *batch_normalization_44_gamma = readTrainedWeights( + batch_normalization_44_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_44_beta_path = + dir_prefix + std::string("batch_normalization_44_beta.bin"); + void *batch_normalization_44_beta = readTrainedWeights( + batch_normalization_44_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_44_mean_path = + dir_prefix + 
std::string("batch_normalization_44_mean.bin"); + void *batch_normalization_44_mean = readTrainedWeights( + batch_normalization_44_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_44_variance_path = + dir_prefix + std::string("batch_normalization_44_variance.bin"); + void *batch_normalization_44_variance = readTrainedWeights( + batch_normalization_44_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_45_w_path = dir_prefix + std::string("conv2d_45_w.bin"); + void *conv2d_45_w = + readTrainedWeights(conv2d_45_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_45_b_path = dir_prefix + std::string("conv2d_45_b.bin"); + void *conv2d_45_b = + readTrainedWeights(conv2d_45_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_45_gamma_path = + dir_prefix + std::string("batch_normalization_45_gamma.bin"); + void *batch_normalization_45_gamma = readTrainedWeights( + batch_normalization_45_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_45_beta_path = + dir_prefix + std::string("batch_normalization_45_beta.bin"); + void *batch_normalization_45_beta = readTrainedWeights( + batch_normalization_45_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_45_mean_path = + dir_prefix + std::string("batch_normalization_45_mean.bin"); + void *batch_normalization_45_mean = readTrainedWeights( + batch_normalization_45_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_45_variance_path = + dir_prefix + std::string("batch_normalization_45_variance.bin"); + void *batch_normalization_45_variance = readTrainedWeights( + batch_normalization_45_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_46_w_path = dir_prefix + std::string("conv2d_46_w.bin"); + void *conv2d_46_w = + readTrainedWeights(conv2d_46_w_path.c_str(), 0, 2048, 512, 1, 1); + std::string conv2d_46_b_path = dir_prefix + std::string("conv2d_46_b.bin"); + void *conv2d_46_b = + readTrainedWeights(conv2d_46_b_path.c_str(), 0, 1, 2048, 1, 1); + std::string conv2d_47_w_path = dir_prefix + std::string("conv2d_47_w.bin"); + void *conv2d_47_w = + readTrainedWeights(conv2d_47_w_path.c_str(), 0, 2048, 1024, 1, 1); + std::string conv2d_47_b_path = dir_prefix + std::string("conv2d_47_b.bin"); + void *conv2d_47_b = + readTrainedWeights(conv2d_47_b_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_46_gamma_path = + dir_prefix + std::string("batch_normalization_46_gamma.bin"); + void *batch_normalization_46_gamma = readTrainedWeights( + batch_normalization_46_gamma_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_46_beta_path = + dir_prefix + std::string("batch_normalization_46_beta.bin"); + void *batch_normalization_46_beta = readTrainedWeights( + batch_normalization_46_beta_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_46_mean_path = + dir_prefix + std::string("batch_normalization_46_mean.bin"); + void *batch_normalization_46_mean = readTrainedWeights( + batch_normalization_46_mean_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_46_variance_path = + dir_prefix + std::string("batch_normalization_46_variance.bin"); + void *batch_normalization_46_variance = readTrainedWeights( + batch_normalization_46_variance_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_47_gamma_path = + dir_prefix + std::string("batch_normalization_47_gamma.bin"); + void *batch_normalization_47_gamma = readTrainedWeights( + batch_normalization_47_gamma_path.c_str(), 0, 1, 2048, 1, 1); 
+ std::string batch_normalization_47_beta_path = + dir_prefix + std::string("batch_normalization_47_beta.bin"); + void *batch_normalization_47_beta = readTrainedWeights( + batch_normalization_47_beta_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_47_mean_path = + dir_prefix + std::string("batch_normalization_47_mean.bin"); + void *batch_normalization_47_mean = readTrainedWeights( + batch_normalization_47_mean_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_47_variance_path = + dir_prefix + std::string("batch_normalization_47_variance.bin"); + void *batch_normalization_47_variance = readTrainedWeights( + batch_normalization_47_variance_path.c_str(), 0, 1, 2048, 1, 1); + std::string conv2d_48_w_path = dir_prefix + std::string("conv2d_48_w.bin"); + void *conv2d_48_w = + readTrainedWeights(conv2d_48_w_path.c_str(), 0, 512, 2048, 1, 1); + std::string conv2d_48_b_path = dir_prefix + std::string("conv2d_48_b.bin"); + void *conv2d_48_b = + readTrainedWeights(conv2d_48_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_48_gamma_path = + dir_prefix + std::string("batch_normalization_48_gamma.bin"); + void *batch_normalization_48_gamma = readTrainedWeights( + batch_normalization_48_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_48_beta_path = + dir_prefix + std::string("batch_normalization_48_beta.bin"); + void *batch_normalization_48_beta = readTrainedWeights( + batch_normalization_48_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_48_mean_path = + dir_prefix + std::string("batch_normalization_48_mean.bin"); + void *batch_normalization_48_mean = readTrainedWeights( + batch_normalization_48_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_48_variance_path = + dir_prefix + std::string("batch_normalization_48_variance.bin"); + void *batch_normalization_48_variance = readTrainedWeights( + batch_normalization_48_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_49_w_path = dir_prefix + std::string("conv2d_49_w.bin"); + void *conv2d_49_w = + readTrainedWeights(conv2d_49_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_49_b_path = dir_prefix + std::string("conv2d_49_b.bin"); + void *conv2d_49_b = + readTrainedWeights(conv2d_49_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_49_gamma_path = + dir_prefix + std::string("batch_normalization_49_gamma.bin"); + void *batch_normalization_49_gamma = readTrainedWeights( + batch_normalization_49_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_49_beta_path = + dir_prefix + std::string("batch_normalization_49_beta.bin"); + void *batch_normalization_49_beta = readTrainedWeights( + batch_normalization_49_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_49_mean_path = + dir_prefix + std::string("batch_normalization_49_mean.bin"); + void *batch_normalization_49_mean = readTrainedWeights( + batch_normalization_49_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_49_variance_path = + dir_prefix + std::string("batch_normalization_49_variance.bin"); + void *batch_normalization_49_variance = readTrainedWeights( + batch_normalization_49_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_50_w_path = dir_prefix + std::string("conv2d_50_w.bin"); + void *conv2d_50_w = + readTrainedWeights(conv2d_50_w_path.c_str(), 0, 2048, 512, 1, 1); + std::string conv2d_50_b_path = dir_prefix + std::string("conv2d_50_b.bin"); + void *conv2d_50_b = + 
readTrainedWeights(conv2d_50_b_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_50_gamma_path = + dir_prefix + std::string("batch_normalization_50_gamma.bin"); + void *batch_normalization_50_gamma = readTrainedWeights( + batch_normalization_50_gamma_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_50_beta_path = + dir_prefix + std::string("batch_normalization_50_beta.bin"); + void *batch_normalization_50_beta = readTrainedWeights( + batch_normalization_50_beta_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_50_mean_path = + dir_prefix + std::string("batch_normalization_50_mean.bin"); + void *batch_normalization_50_mean = readTrainedWeights( + batch_normalization_50_mean_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_50_variance_path = + dir_prefix + std::string("batch_normalization_50_variance.bin"); + void *batch_normalization_50_variance = readTrainedWeights( + batch_normalization_50_variance_path.c_str(), 0, 1, 2048, 1, 1); + std::string conv2d_51_w_path = dir_prefix + std::string("conv2d_51_w.bin"); + void *conv2d_51_w = + readTrainedWeights(conv2d_51_w_path.c_str(), 0, 512, 2048, 1, 1); + std::string conv2d_51_b_path = dir_prefix + std::string("conv2d_51_b.bin"); + void *conv2d_51_b = + readTrainedWeights(conv2d_51_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_51_gamma_path = + dir_prefix + std::string("batch_normalization_51_gamma.bin"); + void *batch_normalization_51_gamma = readTrainedWeights( + batch_normalization_51_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_51_beta_path = + dir_prefix + std::string("batch_normalization_51_beta.bin"); + void *batch_normalization_51_beta = readTrainedWeights( + batch_normalization_51_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_51_mean_path = + dir_prefix + std::string("batch_normalization_51_mean.bin"); + void *batch_normalization_51_mean = readTrainedWeights( + batch_normalization_51_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_51_variance_path = + dir_prefix + std::string("batch_normalization_51_variance.bin"); + void *batch_normalization_51_variance = readTrainedWeights( + batch_normalization_51_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_52_w_path = dir_prefix + std::string("conv2d_52_w.bin"); + void *conv2d_52_w = + readTrainedWeights(conv2d_52_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_52_b_path = dir_prefix + std::string("conv2d_52_b.bin"); + void *conv2d_52_b = + readTrainedWeights(conv2d_52_b_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_52_gamma_path = + dir_prefix + std::string("batch_normalization_52_gamma.bin"); + void *batch_normalization_52_gamma = readTrainedWeights( + batch_normalization_52_gamma_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_52_beta_path = + dir_prefix + std::string("batch_normalization_52_beta.bin"); + void *batch_normalization_52_beta = readTrainedWeights( + batch_normalization_52_beta_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_52_mean_path = + dir_prefix + std::string("batch_normalization_52_mean.bin"); + void *batch_normalization_52_mean = readTrainedWeights( + batch_normalization_52_mean_path.c_str(), 0, 1, 512, 1, 1); + std::string batch_normalization_52_variance_path = + dir_prefix + std::string("batch_normalization_52_variance.bin"); + void *batch_normalization_52_variance = readTrainedWeights( + 
batch_normalization_52_variance_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_53_w_path = dir_prefix + std::string("conv2d_53_w.bin"); + void *conv2d_53_w = + readTrainedWeights(conv2d_53_w_path.c_str(), 0, 2048, 512, 1, 1); + std::string conv2d_53_b_path = dir_prefix + std::string("conv2d_53_b.bin"); + void *conv2d_53_b = + readTrainedWeights(conv2d_53_b_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_53_gamma_path = + dir_prefix + std::string("batch_normalization_53_gamma.bin"); + void *batch_normalization_53_gamma = readTrainedWeights( + batch_normalization_53_gamma_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_53_beta_path = + dir_prefix + std::string("batch_normalization_53_beta.bin"); + void *batch_normalization_53_beta = readTrainedWeights( + batch_normalization_53_beta_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_53_mean_path = + dir_prefix + std::string("batch_normalization_53_mean.bin"); + void *batch_normalization_53_mean = readTrainedWeights( + batch_normalization_53_mean_path.c_str(), 0, 1, 2048, 1, 1); + std::string batch_normalization_53_variance_path = + dir_prefix + std::string("batch_normalization_53_variance.bin"); + void *batch_normalization_53_variance = readTrainedWeights( + batch_normalization_53_variance_path.c_str(), 0, 1, 2048, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 2048, 1000); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = + readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 1000, 1, 1); + startMemTracking(); - std::string dir_prefix = model_params_path + std::string("/shared/hsharif3/resnet50_imagenet/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,7,7); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_1_gamma_path = dir_prefix + std::string("batch_normalization_1_gamma.bin"); - void* batch_normalization_1_gamma = readTrainedWeights(batch_normalization_1_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_1_beta_path = dir_prefix + std::string("batch_normalization_1_beta.bin"); - void* batch_normalization_1_beta = readTrainedWeights(batch_normalization_1_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_1_mean_path = dir_prefix + std::string("batch_normalization_1_mean.bin"); - void* batch_normalization_1_mean = readTrainedWeights(batch_normalization_1_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_1_variance_path = dir_prefix + std::string("batch_normalization_1_variance.bin"); - void* batch_normalization_1_variance = readTrainedWeights(batch_normalization_1_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,1,1); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_2_gamma_path = dir_prefix + std::string("batch_normalization_2_gamma.bin"); - void* 
batch_normalization_2_gamma = readTrainedWeights(batch_normalization_2_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_2_beta_path = dir_prefix + std::string("batch_normalization_2_beta.bin"); - void* batch_normalization_2_beta = readTrainedWeights(batch_normalization_2_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_2_mean_path = dir_prefix + std::string("batch_normalization_2_mean.bin"); - void* batch_normalization_2_mean = readTrainedWeights(batch_normalization_2_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_2_variance_path = dir_prefix + std::string("batch_normalization_2_variance.bin"); - void* batch_normalization_2_variance = readTrainedWeights(batch_normalization_2_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_gamma_path = dir_prefix + std::string("batch_normalization_3_gamma.bin"); - void* batch_normalization_3_gamma = readTrainedWeights(batch_normalization_3_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_beta_path = dir_prefix + std::string("batch_normalization_3_beta.bin"); - void* batch_normalization_3_beta = readTrainedWeights(batch_normalization_3_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_mean_path = dir_prefix + std::string("batch_normalization_3_mean.bin"); - void* batch_normalization_3_mean = readTrainedWeights(batch_normalization_3_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_3_variance_path = dir_prefix + std::string("batch_normalization_3_variance.bin"); - void* batch_normalization_3_variance = readTrainedWeights(batch_normalization_3_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,64,1,1); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,64,1,1); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_4_gamma_path = dir_prefix + std::string("batch_normalization_4_gamma.bin"); - void* batch_normalization_4_gamma = readTrainedWeights(batch_normalization_4_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_4_beta_path = dir_prefix + std::string("batch_normalization_4_beta.bin"); - void* batch_normalization_4_beta = readTrainedWeights(batch_normalization_4_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_4_mean_path = dir_prefix + std::string("batch_normalization_4_mean.bin"); - void* batch_normalization_4_mean = readTrainedWeights(batch_normalization_4_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_4_variance_path = dir_prefix + std::string("batch_normalization_4_variance.bin"); - void* batch_normalization_4_variance = readTrainedWeights(batch_normalization_4_variance_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_5_gamma_path = dir_prefix + 
std::string("batch_normalization_5_gamma.bin"); - void* batch_normalization_5_gamma = readTrainedWeights(batch_normalization_5_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_5_beta_path = dir_prefix + std::string("batch_normalization_5_beta.bin"); - void* batch_normalization_5_beta = readTrainedWeights(batch_normalization_5_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_5_mean_path = dir_prefix + std::string("batch_normalization_5_mean.bin"); - void* batch_normalization_5_mean = readTrainedWeights(batch_normalization_5_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_5_variance_path = dir_prefix + std::string("batch_normalization_5_variance.bin"); - void* batch_normalization_5_variance = readTrainedWeights(batch_normalization_5_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,64,256,1,1); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_6_gamma_path = dir_prefix + std::string("batch_normalization_6_gamma.bin"); - void* batch_normalization_6_gamma = readTrainedWeights(batch_normalization_6_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_6_beta_path = dir_prefix + std::string("batch_normalization_6_beta.bin"); - void* batch_normalization_6_beta = readTrainedWeights(batch_normalization_6_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_6_mean_path = dir_prefix + std::string("batch_normalization_6_mean.bin"); - void* batch_normalization_6_mean = readTrainedWeights(batch_normalization_6_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_6_variance_path = dir_prefix + std::string("batch_normalization_6_variance.bin"); - void* batch_normalization_6_variance = readTrainedWeights(batch_normalization_6_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_7_gamma_path = dir_prefix + std::string("batch_normalization_7_gamma.bin"); - void* batch_normalization_7_gamma = readTrainedWeights(batch_normalization_7_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_7_beta_path = dir_prefix + std::string("batch_normalization_7_beta.bin"); - void* batch_normalization_7_beta = readTrainedWeights(batch_normalization_7_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_7_mean_path = dir_prefix + std::string("batch_normalization_7_mean.bin"); - void* batch_normalization_7_mean = readTrainedWeights(batch_normalization_7_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_7_variance_path = dir_prefix + std::string("batch_normalization_7_variance.bin"); - void* batch_normalization_7_variance = readTrainedWeights(batch_normalization_7_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,256,64,1,1); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,256,1,1); - 
std::string batch_normalization_8_gamma_path = dir_prefix + std::string("batch_normalization_8_gamma.bin"); - void* batch_normalization_8_gamma = readTrainedWeights(batch_normalization_8_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_8_beta_path = dir_prefix + std::string("batch_normalization_8_beta.bin"); - void* batch_normalization_8_beta = readTrainedWeights(batch_normalization_8_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_8_mean_path = dir_prefix + std::string("batch_normalization_8_mean.bin"); - void* batch_normalization_8_mean = readTrainedWeights(batch_normalization_8_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_8_variance_path = dir_prefix + std::string("batch_normalization_8_variance.bin"); - void* batch_normalization_8_variance = readTrainedWeights(batch_normalization_8_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,64,256,1,1); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_9_gamma_path = dir_prefix + std::string("batch_normalization_9_gamma.bin"); - void* batch_normalization_9_gamma = readTrainedWeights(batch_normalization_9_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_9_beta_path = dir_prefix + std::string("batch_normalization_9_beta.bin"); - void* batch_normalization_9_beta = readTrainedWeights(batch_normalization_9_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_9_mean_path = dir_prefix + std::string("batch_normalization_9_mean.bin"); - void* batch_normalization_9_mean = readTrainedWeights(batch_normalization_9_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_9_variance_path = dir_prefix + std::string("batch_normalization_9_variance.bin"); - void* batch_normalization_9_variance = readTrainedWeights(batch_normalization_9_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_10_gamma_path = dir_prefix + std::string("batch_normalization_10_gamma.bin"); - void* batch_normalization_10_gamma = readTrainedWeights(batch_normalization_10_gamma_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_10_beta_path = dir_prefix + std::string("batch_normalization_10_beta.bin"); - void* batch_normalization_10_beta = readTrainedWeights(batch_normalization_10_beta_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_10_mean_path = dir_prefix + std::string("batch_normalization_10_mean.bin"); - void* batch_normalization_10_mean = readTrainedWeights(batch_normalization_10_mean_path.c_str(), 0,1,64,1,1); - std::string batch_normalization_10_variance_path = dir_prefix + std::string("batch_normalization_10_variance.bin"); - void* batch_normalization_10_variance = readTrainedWeights(batch_normalization_10_variance_path.c_str(), 0,1,64,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,256,64,1,1); - std::string conv2d_11_b_path = dir_prefix + 
std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_gamma_path = dir_prefix + std::string("batch_normalization_11_gamma.bin"); - void* batch_normalization_11_gamma = readTrainedWeights(batch_normalization_11_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_beta_path = dir_prefix + std::string("batch_normalization_11_beta.bin"); - void* batch_normalization_11_beta = readTrainedWeights(batch_normalization_11_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_mean_path = dir_prefix + std::string("batch_normalization_11_mean.bin"); - void* batch_normalization_11_mean = readTrainedWeights(batch_normalization_11_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_11_variance_path = dir_prefix + std::string("batch_normalization_11_variance.bin"); - void* batch_normalization_11_variance = readTrainedWeights(batch_normalization_11_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,128,256,1,1); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_12_gamma_path = dir_prefix + std::string("batch_normalization_12_gamma.bin"); - void* batch_normalization_12_gamma = readTrainedWeights(batch_normalization_12_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_12_beta_path = dir_prefix + std::string("batch_normalization_12_beta.bin"); - void* batch_normalization_12_beta = readTrainedWeights(batch_normalization_12_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_12_mean_path = dir_prefix + std::string("batch_normalization_12_mean.bin"); - void* batch_normalization_12_mean = readTrainedWeights(batch_normalization_12_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_12_variance_path = dir_prefix + std::string("batch_normalization_12_variance.bin"); - void* batch_normalization_12_variance = readTrainedWeights(batch_normalization_12_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_13_gamma_path = dir_prefix + std::string("batch_normalization_13_gamma.bin"); - void* batch_normalization_13_gamma = readTrainedWeights(batch_normalization_13_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_13_beta_path = dir_prefix + std::string("batch_normalization_13_beta.bin"); - void* batch_normalization_13_beta = readTrainedWeights(batch_normalization_13_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_13_mean_path = dir_prefix + std::string("batch_normalization_13_mean.bin"); - void* batch_normalization_13_mean = readTrainedWeights(batch_normalization_13_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_13_variance_path = dir_prefix + std::string("batch_normalization_13_variance.bin"); - void* batch_normalization_13_variance = readTrainedWeights(batch_normalization_13_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_14_w_path = dir_prefix + 
std::string("conv2d_14_w.bin"); - void* conv2d_14_w = readTrainedWeights(conv2d_14_w_path.c_str(), 0,512,128,1,1); - std::string conv2d_14_b_path = dir_prefix + std::string("conv2d_14_b.bin"); - void* conv2d_14_b = readTrainedWeights(conv2d_14_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_15_w_path = dir_prefix + std::string("conv2d_15_w.bin"); - void* conv2d_15_w = readTrainedWeights(conv2d_15_w_path.c_str(), 0,512,256,1,1); - std::string conv2d_15_b_path = dir_prefix + std::string("conv2d_15_b.bin"); - void* conv2d_15_b = readTrainedWeights(conv2d_15_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_gamma_path = dir_prefix + std::string("batch_normalization_14_gamma.bin"); - void* batch_normalization_14_gamma = readTrainedWeights(batch_normalization_14_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_beta_path = dir_prefix + std::string("batch_normalization_14_beta.bin"); - void* batch_normalization_14_beta = readTrainedWeights(batch_normalization_14_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_mean_path = dir_prefix + std::string("batch_normalization_14_mean.bin"); - void* batch_normalization_14_mean = readTrainedWeights(batch_normalization_14_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_14_variance_path = dir_prefix + std::string("batch_normalization_14_variance.bin"); - void* batch_normalization_14_variance = readTrainedWeights(batch_normalization_14_variance_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_gamma_path = dir_prefix + std::string("batch_normalization_15_gamma.bin"); - void* batch_normalization_15_gamma = readTrainedWeights(batch_normalization_15_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_beta_path = dir_prefix + std::string("batch_normalization_15_beta.bin"); - void* batch_normalization_15_beta = readTrainedWeights(batch_normalization_15_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_mean_path = dir_prefix + std::string("batch_normalization_15_mean.bin"); - void* batch_normalization_15_mean = readTrainedWeights(batch_normalization_15_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_15_variance_path = dir_prefix + std::string("batch_normalization_15_variance.bin"); - void* batch_normalization_15_variance = readTrainedWeights(batch_normalization_15_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_16_w_path = dir_prefix + std::string("conv2d_16_w.bin"); - void* conv2d_16_w = readTrainedWeights(conv2d_16_w_path.c_str(), 0,128,512,1,1); - std::string conv2d_16_b_path = dir_prefix + std::string("conv2d_16_b.bin"); - void* conv2d_16_b = readTrainedWeights(conv2d_16_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_16_gamma_path = dir_prefix + std::string("batch_normalization_16_gamma.bin"); - void* batch_normalization_16_gamma = readTrainedWeights(batch_normalization_16_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_16_beta_path = dir_prefix + std::string("batch_normalization_16_beta.bin"); - void* batch_normalization_16_beta = readTrainedWeights(batch_normalization_16_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_16_mean_path = dir_prefix + std::string("batch_normalization_16_mean.bin"); - void* batch_normalization_16_mean = readTrainedWeights(batch_normalization_16_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_16_variance_path = dir_prefix + std::string("batch_normalization_16_variance.bin"); - void* 
batch_normalization_16_variance = readTrainedWeights(batch_normalization_16_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_17_w_path = dir_prefix + std::string("conv2d_17_w.bin"); - void* conv2d_17_w = readTrainedWeights(conv2d_17_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_17_b_path = dir_prefix + std::string("conv2d_17_b.bin"); - void* conv2d_17_b = readTrainedWeights(conv2d_17_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_17_gamma_path = dir_prefix + std::string("batch_normalization_17_gamma.bin"); - void* batch_normalization_17_gamma = readTrainedWeights(batch_normalization_17_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_17_beta_path = dir_prefix + std::string("batch_normalization_17_beta.bin"); - void* batch_normalization_17_beta = readTrainedWeights(batch_normalization_17_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_17_mean_path = dir_prefix + std::string("batch_normalization_17_mean.bin"); - void* batch_normalization_17_mean = readTrainedWeights(batch_normalization_17_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_17_variance_path = dir_prefix + std::string("batch_normalization_17_variance.bin"); - void* batch_normalization_17_variance = readTrainedWeights(batch_normalization_17_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_18_w_path = dir_prefix + std::string("conv2d_18_w.bin"); - void* conv2d_18_w = readTrainedWeights(conv2d_18_w_path.c_str(), 0,512,128,1,1); - std::string conv2d_18_b_path = dir_prefix + std::string("conv2d_18_b.bin"); - void* conv2d_18_b = readTrainedWeights(conv2d_18_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_gamma_path = dir_prefix + std::string("batch_normalization_18_gamma.bin"); - void* batch_normalization_18_gamma = readTrainedWeights(batch_normalization_18_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_beta_path = dir_prefix + std::string("batch_normalization_18_beta.bin"); - void* batch_normalization_18_beta = readTrainedWeights(batch_normalization_18_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_mean_path = dir_prefix + std::string("batch_normalization_18_mean.bin"); - void* batch_normalization_18_mean = readTrainedWeights(batch_normalization_18_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_18_variance_path = dir_prefix + std::string("batch_normalization_18_variance.bin"); - void* batch_normalization_18_variance = readTrainedWeights(batch_normalization_18_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_19_w_path = dir_prefix + std::string("conv2d_19_w.bin"); - void* conv2d_19_w = readTrainedWeights(conv2d_19_w_path.c_str(), 0,128,512,1,1); - std::string conv2d_19_b_path = dir_prefix + std::string("conv2d_19_b.bin"); - void* conv2d_19_b = readTrainedWeights(conv2d_19_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_19_gamma_path = dir_prefix + std::string("batch_normalization_19_gamma.bin"); - void* batch_normalization_19_gamma = readTrainedWeights(batch_normalization_19_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_19_beta_path = dir_prefix + std::string("batch_normalization_19_beta.bin"); - void* batch_normalization_19_beta = readTrainedWeights(batch_normalization_19_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_19_mean_path = dir_prefix + std::string("batch_normalization_19_mean.bin"); - void* batch_normalization_19_mean = 
readTrainedWeights(batch_normalization_19_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_19_variance_path = dir_prefix + std::string("batch_normalization_19_variance.bin"); - void* batch_normalization_19_variance = readTrainedWeights(batch_normalization_19_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_20_w_path = dir_prefix + std::string("conv2d_20_w.bin"); - void* conv2d_20_w = readTrainedWeights(conv2d_20_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_20_b_path = dir_prefix + std::string("conv2d_20_b.bin"); - void* conv2d_20_b = readTrainedWeights(conv2d_20_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_20_gamma_path = dir_prefix + std::string("batch_normalization_20_gamma.bin"); - void* batch_normalization_20_gamma = readTrainedWeights(batch_normalization_20_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_20_beta_path = dir_prefix + std::string("batch_normalization_20_beta.bin"); - void* batch_normalization_20_beta = readTrainedWeights(batch_normalization_20_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_20_mean_path = dir_prefix + std::string("batch_normalization_20_mean.bin"); - void* batch_normalization_20_mean = readTrainedWeights(batch_normalization_20_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_20_variance_path = dir_prefix + std::string("batch_normalization_20_variance.bin"); - void* batch_normalization_20_variance = readTrainedWeights(batch_normalization_20_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_21_w_path = dir_prefix + std::string("conv2d_21_w.bin"); - void* conv2d_21_w = readTrainedWeights(conv2d_21_w_path.c_str(), 0,512,128,1,1); - std::string conv2d_21_b_path = dir_prefix + std::string("conv2d_21_b.bin"); - void* conv2d_21_b = readTrainedWeights(conv2d_21_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_gamma_path = dir_prefix + std::string("batch_normalization_21_gamma.bin"); - void* batch_normalization_21_gamma = readTrainedWeights(batch_normalization_21_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_beta_path = dir_prefix + std::string("batch_normalization_21_beta.bin"); - void* batch_normalization_21_beta = readTrainedWeights(batch_normalization_21_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_mean_path = dir_prefix + std::string("batch_normalization_21_mean.bin"); - void* batch_normalization_21_mean = readTrainedWeights(batch_normalization_21_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_21_variance_path = dir_prefix + std::string("batch_normalization_21_variance.bin"); - void* batch_normalization_21_variance = readTrainedWeights(batch_normalization_21_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_22_w_path = dir_prefix + std::string("conv2d_22_w.bin"); - void* conv2d_22_w = readTrainedWeights(conv2d_22_w_path.c_str(), 0,128,512,1,1); - std::string conv2d_22_b_path = dir_prefix + std::string("conv2d_22_b.bin"); - void* conv2d_22_b = readTrainedWeights(conv2d_22_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_22_gamma_path = dir_prefix + std::string("batch_normalization_22_gamma.bin"); - void* batch_normalization_22_gamma = readTrainedWeights(batch_normalization_22_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_22_beta_path = dir_prefix + std::string("batch_normalization_22_beta.bin"); - void* batch_normalization_22_beta = readTrainedWeights(batch_normalization_22_beta_path.c_str(), 0,1,128,1,1); - 
std::string batch_normalization_22_mean_path = dir_prefix + std::string("batch_normalization_22_mean.bin"); - void* batch_normalization_22_mean = readTrainedWeights(batch_normalization_22_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_22_variance_path = dir_prefix + std::string("batch_normalization_22_variance.bin"); - void* batch_normalization_22_variance = readTrainedWeights(batch_normalization_22_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_23_w_path = dir_prefix + std::string("conv2d_23_w.bin"); - void* conv2d_23_w = readTrainedWeights(conv2d_23_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_23_b_path = dir_prefix + std::string("conv2d_23_b.bin"); - void* conv2d_23_b = readTrainedWeights(conv2d_23_b_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_23_gamma_path = dir_prefix + std::string("batch_normalization_23_gamma.bin"); - void* batch_normalization_23_gamma = readTrainedWeights(batch_normalization_23_gamma_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_23_beta_path = dir_prefix + std::string("batch_normalization_23_beta.bin"); - void* batch_normalization_23_beta = readTrainedWeights(batch_normalization_23_beta_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_23_mean_path = dir_prefix + std::string("batch_normalization_23_mean.bin"); - void* batch_normalization_23_mean = readTrainedWeights(batch_normalization_23_mean_path.c_str(), 0,1,128,1,1); - std::string batch_normalization_23_variance_path = dir_prefix + std::string("batch_normalization_23_variance.bin"); - void* batch_normalization_23_variance = readTrainedWeights(batch_normalization_23_variance_path.c_str(), 0,1,128,1,1); - std::string conv2d_24_w_path = dir_prefix + std::string("conv2d_24_w.bin"); - void* conv2d_24_w = readTrainedWeights(conv2d_24_w_path.c_str(), 0,512,128,1,1); - std::string conv2d_24_b_path = dir_prefix + std::string("conv2d_24_b.bin"); - void* conv2d_24_b = readTrainedWeights(conv2d_24_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_gamma_path = dir_prefix + std::string("batch_normalization_24_gamma.bin"); - void* batch_normalization_24_gamma = readTrainedWeights(batch_normalization_24_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_beta_path = dir_prefix + std::string("batch_normalization_24_beta.bin"); - void* batch_normalization_24_beta = readTrainedWeights(batch_normalization_24_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_mean_path = dir_prefix + std::string("batch_normalization_24_mean.bin"); - void* batch_normalization_24_mean = readTrainedWeights(batch_normalization_24_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_24_variance_path = dir_prefix + std::string("batch_normalization_24_variance.bin"); - void* batch_normalization_24_variance = readTrainedWeights(batch_normalization_24_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_25_w_path = dir_prefix + std::string("conv2d_25_w.bin"); - void* conv2d_25_w = readTrainedWeights(conv2d_25_w_path.c_str(), 0,256,512,1,1); - std::string conv2d_25_b_path = dir_prefix + std::string("conv2d_25_b.bin"); - void* conv2d_25_b = readTrainedWeights(conv2d_25_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_25_gamma_path = dir_prefix + std::string("batch_normalization_25_gamma.bin"); - void* batch_normalization_25_gamma = readTrainedWeights(batch_normalization_25_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_25_beta_path = dir_prefix + 
std::string("batch_normalization_25_beta.bin"); - void* batch_normalization_25_beta = readTrainedWeights(batch_normalization_25_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_25_mean_path = dir_prefix + std::string("batch_normalization_25_mean.bin"); - void* batch_normalization_25_mean = readTrainedWeights(batch_normalization_25_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_25_variance_path = dir_prefix + std::string("batch_normalization_25_variance.bin"); - void* batch_normalization_25_variance = readTrainedWeights(batch_normalization_25_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_26_w_path = dir_prefix + std::string("conv2d_26_w.bin"); - void* conv2d_26_w = readTrainedWeights(conv2d_26_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_26_b_path = dir_prefix + std::string("conv2d_26_b.bin"); - void* conv2d_26_b = readTrainedWeights(conv2d_26_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_26_gamma_path = dir_prefix + std::string("batch_normalization_26_gamma.bin"); - void* batch_normalization_26_gamma = readTrainedWeights(batch_normalization_26_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_26_beta_path = dir_prefix + std::string("batch_normalization_26_beta.bin"); - void* batch_normalization_26_beta = readTrainedWeights(batch_normalization_26_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_26_mean_path = dir_prefix + std::string("batch_normalization_26_mean.bin"); - void* batch_normalization_26_mean = readTrainedWeights(batch_normalization_26_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_26_variance_path = dir_prefix + std::string("batch_normalization_26_variance.bin"); - void* batch_normalization_26_variance = readTrainedWeights(batch_normalization_26_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_27_w_path = dir_prefix + std::string("conv2d_27_w.bin"); - void* conv2d_27_w = readTrainedWeights(conv2d_27_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_27_b_path = dir_prefix + std::string("conv2d_27_b.bin"); - void* conv2d_27_b = readTrainedWeights(conv2d_27_b_path.c_str(), 0,1,1024,1,1); - std::string conv2d_28_w_path = dir_prefix + std::string("conv2d_28_w.bin"); - void* conv2d_28_w = readTrainedWeights(conv2d_28_w_path.c_str(), 0,1024,512,1,1); - std::string conv2d_28_b_path = dir_prefix + std::string("conv2d_28_b.bin"); - void* conv2d_28_b = readTrainedWeights(conv2d_28_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_gamma_path = dir_prefix + std::string("batch_normalization_27_gamma.bin"); - void* batch_normalization_27_gamma = readTrainedWeights(batch_normalization_27_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_beta_path = dir_prefix + std::string("batch_normalization_27_beta.bin"); - void* batch_normalization_27_beta = readTrainedWeights(batch_normalization_27_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_mean_path = dir_prefix + std::string("batch_normalization_27_mean.bin"); - void* batch_normalization_27_mean = readTrainedWeights(batch_normalization_27_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_27_variance_path = dir_prefix + std::string("batch_normalization_27_variance.bin"); - void* batch_normalization_27_variance = readTrainedWeights(batch_normalization_27_variance_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_28_gamma_path = dir_prefix + std::string("batch_normalization_28_gamma.bin"); - void* 
batch_normalization_28_gamma = readTrainedWeights(batch_normalization_28_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_28_beta_path = dir_prefix + std::string("batch_normalization_28_beta.bin"); - void* batch_normalization_28_beta = readTrainedWeights(batch_normalization_28_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_28_mean_path = dir_prefix + std::string("batch_normalization_28_mean.bin"); - void* batch_normalization_28_mean = readTrainedWeights(batch_normalization_28_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_28_variance_path = dir_prefix + std::string("batch_normalization_28_variance.bin"); - void* batch_normalization_28_variance = readTrainedWeights(batch_normalization_28_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_29_w_path = dir_prefix + std::string("conv2d_29_w.bin"); - void* conv2d_29_w = readTrainedWeights(conv2d_29_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_29_b_path = dir_prefix + std::string("conv2d_29_b.bin"); - void* conv2d_29_b = readTrainedWeights(conv2d_29_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_29_gamma_path = dir_prefix + std::string("batch_normalization_29_gamma.bin"); - void* batch_normalization_29_gamma = readTrainedWeights(batch_normalization_29_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_29_beta_path = dir_prefix + std::string("batch_normalization_29_beta.bin"); - void* batch_normalization_29_beta = readTrainedWeights(batch_normalization_29_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_29_mean_path = dir_prefix + std::string("batch_normalization_29_mean.bin"); - void* batch_normalization_29_mean = readTrainedWeights(batch_normalization_29_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_29_variance_path = dir_prefix + std::string("batch_normalization_29_variance.bin"); - void* batch_normalization_29_variance = readTrainedWeights(batch_normalization_29_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_30_w_path = dir_prefix + std::string("conv2d_30_w.bin"); - void* conv2d_30_w = readTrainedWeights(conv2d_30_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_30_b_path = dir_prefix + std::string("conv2d_30_b.bin"); - void* conv2d_30_b = readTrainedWeights(conv2d_30_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_30_gamma_path = dir_prefix + std::string("batch_normalization_30_gamma.bin"); - void* batch_normalization_30_gamma = readTrainedWeights(batch_normalization_30_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_30_beta_path = dir_prefix + std::string("batch_normalization_30_beta.bin"); - void* batch_normalization_30_beta = readTrainedWeights(batch_normalization_30_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_30_mean_path = dir_prefix + std::string("batch_normalization_30_mean.bin"); - void* batch_normalization_30_mean = readTrainedWeights(batch_normalization_30_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_30_variance_path = dir_prefix + std::string("batch_normalization_30_variance.bin"); - void* batch_normalization_30_variance = readTrainedWeights(batch_normalization_30_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_31_w_path = dir_prefix + std::string("conv2d_31_w.bin"); - void* conv2d_31_w = readTrainedWeights(conv2d_31_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_31_b_path = dir_prefix + std::string("conv2d_31_b.bin"); - void* conv2d_31_b = 
readTrainedWeights(conv2d_31_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_31_gamma_path = dir_prefix + std::string("batch_normalization_31_gamma.bin"); - void* batch_normalization_31_gamma = readTrainedWeights(batch_normalization_31_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_31_beta_path = dir_prefix + std::string("batch_normalization_31_beta.bin"); - void* batch_normalization_31_beta = readTrainedWeights(batch_normalization_31_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_31_mean_path = dir_prefix + std::string("batch_normalization_31_mean.bin"); - void* batch_normalization_31_mean = readTrainedWeights(batch_normalization_31_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_31_variance_path = dir_prefix + std::string("batch_normalization_31_variance.bin"); - void* batch_normalization_31_variance = readTrainedWeights(batch_normalization_31_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_32_w_path = dir_prefix + std::string("conv2d_32_w.bin"); - void* conv2d_32_w = readTrainedWeights(conv2d_32_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_32_b_path = dir_prefix + std::string("conv2d_32_b.bin"); - void* conv2d_32_b = readTrainedWeights(conv2d_32_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_32_gamma_path = dir_prefix + std::string("batch_normalization_32_gamma.bin"); - void* batch_normalization_32_gamma = readTrainedWeights(batch_normalization_32_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_32_beta_path = dir_prefix + std::string("batch_normalization_32_beta.bin"); - void* batch_normalization_32_beta = readTrainedWeights(batch_normalization_32_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_32_mean_path = dir_prefix + std::string("batch_normalization_32_mean.bin"); - void* batch_normalization_32_mean = readTrainedWeights(batch_normalization_32_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_32_variance_path = dir_prefix + std::string("batch_normalization_32_variance.bin"); - void* batch_normalization_32_variance = readTrainedWeights(batch_normalization_32_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_33_w_path = dir_prefix + std::string("conv2d_33_w.bin"); - void* conv2d_33_w = readTrainedWeights(conv2d_33_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_33_b_path = dir_prefix + std::string("conv2d_33_b.bin"); - void* conv2d_33_b = readTrainedWeights(conv2d_33_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_33_gamma_path = dir_prefix + std::string("batch_normalization_33_gamma.bin"); - void* batch_normalization_33_gamma = readTrainedWeights(batch_normalization_33_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_33_beta_path = dir_prefix + std::string("batch_normalization_33_beta.bin"); - void* batch_normalization_33_beta = readTrainedWeights(batch_normalization_33_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_33_mean_path = dir_prefix + std::string("batch_normalization_33_mean.bin"); - void* batch_normalization_33_mean = readTrainedWeights(batch_normalization_33_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_33_variance_path = dir_prefix + std::string("batch_normalization_33_variance.bin"); - void* batch_normalization_33_variance = readTrainedWeights(batch_normalization_33_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_34_w_path = dir_prefix + std::string("conv2d_34_w.bin"); - void* conv2d_34_w = 
readTrainedWeights(conv2d_34_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_34_b_path = dir_prefix + std::string("conv2d_34_b.bin"); - void* conv2d_34_b = readTrainedWeights(conv2d_34_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_34_gamma_path = dir_prefix + std::string("batch_normalization_34_gamma.bin"); - void* batch_normalization_34_gamma = readTrainedWeights(batch_normalization_34_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_34_beta_path = dir_prefix + std::string("batch_normalization_34_beta.bin"); - void* batch_normalization_34_beta = readTrainedWeights(batch_normalization_34_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_34_mean_path = dir_prefix + std::string("batch_normalization_34_mean.bin"); - void* batch_normalization_34_mean = readTrainedWeights(batch_normalization_34_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_34_variance_path = dir_prefix + std::string("batch_normalization_34_variance.bin"); - void* batch_normalization_34_variance = readTrainedWeights(batch_normalization_34_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_35_w_path = dir_prefix + std::string("conv2d_35_w.bin"); - void* conv2d_35_w = readTrainedWeights(conv2d_35_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_35_b_path = dir_prefix + std::string("conv2d_35_b.bin"); - void* conv2d_35_b = readTrainedWeights(conv2d_35_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_35_gamma_path = dir_prefix + std::string("batch_normalization_35_gamma.bin"); - void* batch_normalization_35_gamma = readTrainedWeights(batch_normalization_35_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_35_beta_path = dir_prefix + std::string("batch_normalization_35_beta.bin"); - void* batch_normalization_35_beta = readTrainedWeights(batch_normalization_35_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_35_mean_path = dir_prefix + std::string("batch_normalization_35_mean.bin"); - void* batch_normalization_35_mean = readTrainedWeights(batch_normalization_35_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_35_variance_path = dir_prefix + std::string("batch_normalization_35_variance.bin"); - void* batch_normalization_35_variance = readTrainedWeights(batch_normalization_35_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_36_w_path = dir_prefix + std::string("conv2d_36_w.bin"); - void* conv2d_36_w = readTrainedWeights(conv2d_36_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_36_b_path = dir_prefix + std::string("conv2d_36_b.bin"); - void* conv2d_36_b = readTrainedWeights(conv2d_36_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_36_gamma_path = dir_prefix + std::string("batch_normalization_36_gamma.bin"); - void* batch_normalization_36_gamma = readTrainedWeights(batch_normalization_36_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_36_beta_path = dir_prefix + std::string("batch_normalization_36_beta.bin"); - void* batch_normalization_36_beta = readTrainedWeights(batch_normalization_36_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_36_mean_path = dir_prefix + std::string("batch_normalization_36_mean.bin"); - void* batch_normalization_36_mean = readTrainedWeights(batch_normalization_36_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_36_variance_path = dir_prefix + std::string("batch_normalization_36_variance.bin"); - void* batch_normalization_36_variance = 
readTrainedWeights(batch_normalization_36_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_37_w_path = dir_prefix + std::string("conv2d_37_w.bin"); - void* conv2d_37_w = readTrainedWeights(conv2d_37_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_37_b_path = dir_prefix + std::string("conv2d_37_b.bin"); - void* conv2d_37_b = readTrainedWeights(conv2d_37_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_37_gamma_path = dir_prefix + std::string("batch_normalization_37_gamma.bin"); - void* batch_normalization_37_gamma = readTrainedWeights(batch_normalization_37_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_37_beta_path = dir_prefix + std::string("batch_normalization_37_beta.bin"); - void* batch_normalization_37_beta = readTrainedWeights(batch_normalization_37_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_37_mean_path = dir_prefix + std::string("batch_normalization_37_mean.bin"); - void* batch_normalization_37_mean = readTrainedWeights(batch_normalization_37_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_37_variance_path = dir_prefix + std::string("batch_normalization_37_variance.bin"); - void* batch_normalization_37_variance = readTrainedWeights(batch_normalization_37_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_38_w_path = dir_prefix + std::string("conv2d_38_w.bin"); - void* conv2d_38_w = readTrainedWeights(conv2d_38_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_38_b_path = dir_prefix + std::string("conv2d_38_b.bin"); - void* conv2d_38_b = readTrainedWeights(conv2d_38_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_38_gamma_path = dir_prefix + std::string("batch_normalization_38_gamma.bin"); - void* batch_normalization_38_gamma = readTrainedWeights(batch_normalization_38_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_38_beta_path = dir_prefix + std::string("batch_normalization_38_beta.bin"); - void* batch_normalization_38_beta = readTrainedWeights(batch_normalization_38_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_38_mean_path = dir_prefix + std::string("batch_normalization_38_mean.bin"); - void* batch_normalization_38_mean = readTrainedWeights(batch_normalization_38_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_38_variance_path = dir_prefix + std::string("batch_normalization_38_variance.bin"); - void* batch_normalization_38_variance = readTrainedWeights(batch_normalization_38_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_39_w_path = dir_prefix + std::string("conv2d_39_w.bin"); - void* conv2d_39_w = readTrainedWeights(conv2d_39_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_39_b_path = dir_prefix + std::string("conv2d_39_b.bin"); - void* conv2d_39_b = readTrainedWeights(conv2d_39_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_39_gamma_path = dir_prefix + std::string("batch_normalization_39_gamma.bin"); - void* batch_normalization_39_gamma = readTrainedWeights(batch_normalization_39_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_39_beta_path = dir_prefix + std::string("batch_normalization_39_beta.bin"); - void* batch_normalization_39_beta = readTrainedWeights(batch_normalization_39_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_39_mean_path = dir_prefix + std::string("batch_normalization_39_mean.bin"); - void* batch_normalization_39_mean = readTrainedWeights(batch_normalization_39_mean_path.c_str(), 0,1,256,1,1); - 
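Every parameter tensor in this generated file is loaded with the same two-step pattern: build the .bin path from dir_prefix and pass the tensor's NCHW dimensions to readTrainedWeights. As a sketch only (the loadParam helper below is hypothetical and not part of the tensor runtime; the generated benchmarks keep every call fully expanded), the pattern looks like this:

// Hypothetical helper -- not in tensor_runtime.h; shown only to document the
// readTrainedWeights(path, type_flag, N, C, H, W) pattern repeated above.
// The second argument is forwarded unchanged (every generated call passes 0).
static void *loadParam(const std::string &dir_prefix, const char *name,
                       int n, int c, int h, int w) {
  std::string path = dir_prefix + std::string(name);
  return readTrainedWeights(path.c_str(), 0, n, c, h, w);
}

// Equivalent to two of the expanded lines above (dimensions taken from the diff):
//   void *conv2d_37_w = loadParam(dir_prefix, "conv2d_37_w.bin", 1024, 256, 1, 1);
//   void *batch_normalization_37_gamma =
//       loadParam(dir_prefix, "batch_normalization_37_gamma.bin", 1, 1024, 1, 1);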
std::string batch_normalization_39_variance_path = dir_prefix + std::string("batch_normalization_39_variance.bin"); - void* batch_normalization_39_variance = readTrainedWeights(batch_normalization_39_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_40_w_path = dir_prefix + std::string("conv2d_40_w.bin"); - void* conv2d_40_w = readTrainedWeights(conv2d_40_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_40_b_path = dir_prefix + std::string("conv2d_40_b.bin"); - void* conv2d_40_b = readTrainedWeights(conv2d_40_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_40_gamma_path = dir_prefix + std::string("batch_normalization_40_gamma.bin"); - void* batch_normalization_40_gamma = readTrainedWeights(batch_normalization_40_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_40_beta_path = dir_prefix + std::string("batch_normalization_40_beta.bin"); - void* batch_normalization_40_beta = readTrainedWeights(batch_normalization_40_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_40_mean_path = dir_prefix + std::string("batch_normalization_40_mean.bin"); - void* batch_normalization_40_mean = readTrainedWeights(batch_normalization_40_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_40_variance_path = dir_prefix + std::string("batch_normalization_40_variance.bin"); - void* batch_normalization_40_variance = readTrainedWeights(batch_normalization_40_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_41_w_path = dir_prefix + std::string("conv2d_41_w.bin"); - void* conv2d_41_w = readTrainedWeights(conv2d_41_w_path.c_str(), 0,256,1024,1,1); - std::string conv2d_41_b_path = dir_prefix + std::string("conv2d_41_b.bin"); - void* conv2d_41_b = readTrainedWeights(conv2d_41_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_41_gamma_path = dir_prefix + std::string("batch_normalization_41_gamma.bin"); - void* batch_normalization_41_gamma = readTrainedWeights(batch_normalization_41_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_41_beta_path = dir_prefix + std::string("batch_normalization_41_beta.bin"); - void* batch_normalization_41_beta = readTrainedWeights(batch_normalization_41_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_41_mean_path = dir_prefix + std::string("batch_normalization_41_mean.bin"); - void* batch_normalization_41_mean = readTrainedWeights(batch_normalization_41_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_41_variance_path = dir_prefix + std::string("batch_normalization_41_variance.bin"); - void* batch_normalization_41_variance = readTrainedWeights(batch_normalization_41_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_42_w_path = dir_prefix + std::string("conv2d_42_w.bin"); - void* conv2d_42_w = readTrainedWeights(conv2d_42_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_42_b_path = dir_prefix + std::string("conv2d_42_b.bin"); - void* conv2d_42_b = readTrainedWeights(conv2d_42_b_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_42_gamma_path = dir_prefix + std::string("batch_normalization_42_gamma.bin"); - void* batch_normalization_42_gamma = readTrainedWeights(batch_normalization_42_gamma_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_42_beta_path = dir_prefix + std::string("batch_normalization_42_beta.bin"); - void* batch_normalization_42_beta = readTrainedWeights(batch_normalization_42_beta_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_42_mean_path = dir_prefix + 
std::string("batch_normalization_42_mean.bin"); - void* batch_normalization_42_mean = readTrainedWeights(batch_normalization_42_mean_path.c_str(), 0,1,256,1,1); - std::string batch_normalization_42_variance_path = dir_prefix + std::string("batch_normalization_42_variance.bin"); - void* batch_normalization_42_variance = readTrainedWeights(batch_normalization_42_variance_path.c_str(), 0,1,256,1,1); - std::string conv2d_43_w_path = dir_prefix + std::string("conv2d_43_w.bin"); - void* conv2d_43_w = readTrainedWeights(conv2d_43_w_path.c_str(), 0,1024,256,1,1); - std::string conv2d_43_b_path = dir_prefix + std::string("conv2d_43_b.bin"); - void* conv2d_43_b = readTrainedWeights(conv2d_43_b_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_43_gamma_path = dir_prefix + std::string("batch_normalization_43_gamma.bin"); - void* batch_normalization_43_gamma = readTrainedWeights(batch_normalization_43_gamma_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_43_beta_path = dir_prefix + std::string("batch_normalization_43_beta.bin"); - void* batch_normalization_43_beta = readTrainedWeights(batch_normalization_43_beta_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_43_mean_path = dir_prefix + std::string("batch_normalization_43_mean.bin"); - void* batch_normalization_43_mean = readTrainedWeights(batch_normalization_43_mean_path.c_str(), 0,1,1024,1,1); - std::string batch_normalization_43_variance_path = dir_prefix + std::string("batch_normalization_43_variance.bin"); - void* batch_normalization_43_variance = readTrainedWeights(batch_normalization_43_variance_path.c_str(), 0,1,1024,1,1); - std::string conv2d_44_w_path = dir_prefix + std::string("conv2d_44_w.bin"); - void* conv2d_44_w = readTrainedWeights(conv2d_44_w_path.c_str(), 0,512,1024,1,1); - std::string conv2d_44_b_path = dir_prefix + std::string("conv2d_44_b.bin"); - void* conv2d_44_b = readTrainedWeights(conv2d_44_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_44_gamma_path = dir_prefix + std::string("batch_normalization_44_gamma.bin"); - void* batch_normalization_44_gamma = readTrainedWeights(batch_normalization_44_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_44_beta_path = dir_prefix + std::string("batch_normalization_44_beta.bin"); - void* batch_normalization_44_beta = readTrainedWeights(batch_normalization_44_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_44_mean_path = dir_prefix + std::string("batch_normalization_44_mean.bin"); - void* batch_normalization_44_mean = readTrainedWeights(batch_normalization_44_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_44_variance_path = dir_prefix + std::string("batch_normalization_44_variance.bin"); - void* batch_normalization_44_variance = readTrainedWeights(batch_normalization_44_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_45_w_path = dir_prefix + std::string("conv2d_45_w.bin"); - void* conv2d_45_w = readTrainedWeights(conv2d_45_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_45_b_path = dir_prefix + std::string("conv2d_45_b.bin"); - void* conv2d_45_b = readTrainedWeights(conv2d_45_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_45_gamma_path = dir_prefix + std::string("batch_normalization_45_gamma.bin"); - void* batch_normalization_45_gamma = readTrainedWeights(batch_normalization_45_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_45_beta_path = dir_prefix + std::string("batch_normalization_45_beta.bin"); - void* 
batch_normalization_45_beta = readTrainedWeights(batch_normalization_45_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_45_mean_path = dir_prefix + std::string("batch_normalization_45_mean.bin"); - void* batch_normalization_45_mean = readTrainedWeights(batch_normalization_45_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_45_variance_path = dir_prefix + std::string("batch_normalization_45_variance.bin"); - void* batch_normalization_45_variance = readTrainedWeights(batch_normalization_45_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_46_w_path = dir_prefix + std::string("conv2d_46_w.bin"); - void* conv2d_46_w = readTrainedWeights(conv2d_46_w_path.c_str(), 0,2048,512,1,1); - std::string conv2d_46_b_path = dir_prefix + std::string("conv2d_46_b.bin"); - void* conv2d_46_b = readTrainedWeights(conv2d_46_b_path.c_str(), 0,1,2048,1,1); - std::string conv2d_47_w_path = dir_prefix + std::string("conv2d_47_w.bin"); - void* conv2d_47_w = readTrainedWeights(conv2d_47_w_path.c_str(), 0,2048,1024,1,1); - std::string conv2d_47_b_path = dir_prefix + std::string("conv2d_47_b.bin"); - void* conv2d_47_b = readTrainedWeights(conv2d_47_b_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_46_gamma_path = dir_prefix + std::string("batch_normalization_46_gamma.bin"); - void* batch_normalization_46_gamma = readTrainedWeights(batch_normalization_46_gamma_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_46_beta_path = dir_prefix + std::string("batch_normalization_46_beta.bin"); - void* batch_normalization_46_beta = readTrainedWeights(batch_normalization_46_beta_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_46_mean_path = dir_prefix + std::string("batch_normalization_46_mean.bin"); - void* batch_normalization_46_mean = readTrainedWeights(batch_normalization_46_mean_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_46_variance_path = dir_prefix + std::string("batch_normalization_46_variance.bin"); - void* batch_normalization_46_variance = readTrainedWeights(batch_normalization_46_variance_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_47_gamma_path = dir_prefix + std::string("batch_normalization_47_gamma.bin"); - void* batch_normalization_47_gamma = readTrainedWeights(batch_normalization_47_gamma_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_47_beta_path = dir_prefix + std::string("batch_normalization_47_beta.bin"); - void* batch_normalization_47_beta = readTrainedWeights(batch_normalization_47_beta_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_47_mean_path = dir_prefix + std::string("batch_normalization_47_mean.bin"); - void* batch_normalization_47_mean = readTrainedWeights(batch_normalization_47_mean_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_47_variance_path = dir_prefix + std::string("batch_normalization_47_variance.bin"); - void* batch_normalization_47_variance = readTrainedWeights(batch_normalization_47_variance_path.c_str(), 0,1,2048,1,1); - std::string conv2d_48_w_path = dir_prefix + std::string("conv2d_48_w.bin"); - void* conv2d_48_w = readTrainedWeights(conv2d_48_w_path.c_str(), 0,512,2048,1,1); - std::string conv2d_48_b_path = dir_prefix + std::string("conv2d_48_b.bin"); - void* conv2d_48_b = readTrainedWeights(conv2d_48_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_48_gamma_path = dir_prefix + std::string("batch_normalization_48_gamma.bin"); - void* batch_normalization_48_gamma = 
readTrainedWeights(batch_normalization_48_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_48_beta_path = dir_prefix + std::string("batch_normalization_48_beta.bin"); - void* batch_normalization_48_beta = readTrainedWeights(batch_normalization_48_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_48_mean_path = dir_prefix + std::string("batch_normalization_48_mean.bin"); - void* batch_normalization_48_mean = readTrainedWeights(batch_normalization_48_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_48_variance_path = dir_prefix + std::string("batch_normalization_48_variance.bin"); - void* batch_normalization_48_variance = readTrainedWeights(batch_normalization_48_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_49_w_path = dir_prefix + std::string("conv2d_49_w.bin"); - void* conv2d_49_w = readTrainedWeights(conv2d_49_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_49_b_path = dir_prefix + std::string("conv2d_49_b.bin"); - void* conv2d_49_b = readTrainedWeights(conv2d_49_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_49_gamma_path = dir_prefix + std::string("batch_normalization_49_gamma.bin"); - void* batch_normalization_49_gamma = readTrainedWeights(batch_normalization_49_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_49_beta_path = dir_prefix + std::string("batch_normalization_49_beta.bin"); - void* batch_normalization_49_beta = readTrainedWeights(batch_normalization_49_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_49_mean_path = dir_prefix + std::string("batch_normalization_49_mean.bin"); - void* batch_normalization_49_mean = readTrainedWeights(batch_normalization_49_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_49_variance_path = dir_prefix + std::string("batch_normalization_49_variance.bin"); - void* batch_normalization_49_variance = readTrainedWeights(batch_normalization_49_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_50_w_path = dir_prefix + std::string("conv2d_50_w.bin"); - void* conv2d_50_w = readTrainedWeights(conv2d_50_w_path.c_str(), 0,2048,512,1,1); - std::string conv2d_50_b_path = dir_prefix + std::string("conv2d_50_b.bin"); - void* conv2d_50_b = readTrainedWeights(conv2d_50_b_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_50_gamma_path = dir_prefix + std::string("batch_normalization_50_gamma.bin"); - void* batch_normalization_50_gamma = readTrainedWeights(batch_normalization_50_gamma_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_50_beta_path = dir_prefix + std::string("batch_normalization_50_beta.bin"); - void* batch_normalization_50_beta = readTrainedWeights(batch_normalization_50_beta_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_50_mean_path = dir_prefix + std::string("batch_normalization_50_mean.bin"); - void* batch_normalization_50_mean = readTrainedWeights(batch_normalization_50_mean_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_50_variance_path = dir_prefix + std::string("batch_normalization_50_variance.bin"); - void* batch_normalization_50_variance = readTrainedWeights(batch_normalization_50_variance_path.c_str(), 0,1,2048,1,1); - std::string conv2d_51_w_path = dir_prefix + std::string("conv2d_51_w.bin"); - void* conv2d_51_w = readTrainedWeights(conv2d_51_w_path.c_str(), 0,512,2048,1,1); - std::string conv2d_51_b_path = dir_prefix + std::string("conv2d_51_b.bin"); - void* conv2d_51_b = readTrainedWeights(conv2d_51_b_path.c_str(), 0,1,512,1,1); - 
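Each batch_normalization_N layer above loads four per-channel tensors (gamma, beta, mean, variance), all of shape 1xCx1x1; at inference time tensorBatchNorm folds them together with the fixed epsilon of 0.001 used throughout these benchmarks. A minimal per-element sketch of that computation, assuming the standard inference-time formula (the runtime's GPU implementation is the authoritative one):

#include <cmath>

// For channel c: y = gamma[c] * (x - mean[c]) / sqrt(variance[c] + eps) + beta[c]
// This mirrors the tensorBatchNorm(x, gamma, beta, mean, variance, 0.001) calls
// in the generated code; it is a scalar illustration, not the runtime kernel.
static inline float batchNormElem(float x, float gamma, float beta, float mean,
                                  float variance, float eps) {
  return gamma * (x - mean) / std::sqrt(variance + eps) + beta;
}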
std::string batch_normalization_51_gamma_path = dir_prefix + std::string("batch_normalization_51_gamma.bin"); - void* batch_normalization_51_gamma = readTrainedWeights(batch_normalization_51_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_51_beta_path = dir_prefix + std::string("batch_normalization_51_beta.bin"); - void* batch_normalization_51_beta = readTrainedWeights(batch_normalization_51_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_51_mean_path = dir_prefix + std::string("batch_normalization_51_mean.bin"); - void* batch_normalization_51_mean = readTrainedWeights(batch_normalization_51_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_51_variance_path = dir_prefix + std::string("batch_normalization_51_variance.bin"); - void* batch_normalization_51_variance = readTrainedWeights(batch_normalization_51_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_52_w_path = dir_prefix + std::string("conv2d_52_w.bin"); - void* conv2d_52_w = readTrainedWeights(conv2d_52_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_52_b_path = dir_prefix + std::string("conv2d_52_b.bin"); - void* conv2d_52_b = readTrainedWeights(conv2d_52_b_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_52_gamma_path = dir_prefix + std::string("batch_normalization_52_gamma.bin"); - void* batch_normalization_52_gamma = readTrainedWeights(batch_normalization_52_gamma_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_52_beta_path = dir_prefix + std::string("batch_normalization_52_beta.bin"); - void* batch_normalization_52_beta = readTrainedWeights(batch_normalization_52_beta_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_52_mean_path = dir_prefix + std::string("batch_normalization_52_mean.bin"); - void* batch_normalization_52_mean = readTrainedWeights(batch_normalization_52_mean_path.c_str(), 0,1,512,1,1); - std::string batch_normalization_52_variance_path = dir_prefix + std::string("batch_normalization_52_variance.bin"); - void* batch_normalization_52_variance = readTrainedWeights(batch_normalization_52_variance_path.c_str(), 0,1,512,1,1); - std::string conv2d_53_w_path = dir_prefix + std::string("conv2d_53_w.bin"); - void* conv2d_53_w = readTrainedWeights(conv2d_53_w_path.c_str(), 0,2048,512,1,1); - std::string conv2d_53_b_path = dir_prefix + std::string("conv2d_53_b.bin"); - void* conv2d_53_b = readTrainedWeights(conv2d_53_b_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_53_gamma_path = dir_prefix + std::string("batch_normalization_53_gamma.bin"); - void* batch_normalization_53_gamma = readTrainedWeights(batch_normalization_53_gamma_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_53_beta_path = dir_prefix + std::string("batch_normalization_53_beta.bin"); - void* batch_normalization_53_beta = readTrainedWeights(batch_normalization_53_beta_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_53_mean_path = dir_prefix + std::string("batch_normalization_53_mean.bin"); - void* batch_normalization_53_mean = readTrainedWeights(batch_normalization_53_mean_path.c_str(), 0,1,2048,1,1); - std::string batch_normalization_53_variance_path = dir_prefix + std::string("batch_normalization_53_variance.bin"); - void* batch_normalization_53_variance = readTrainedWeights(batch_normalization_53_variance_path.c_str(), 0,1,2048,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,2048,1000); - std::string 
dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,1000,1,1); + int test_input_size = 500; + int batch_size = 100; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + for (int i = 0; i < batch_count; i++) { + int start = i * batch_size; + int end = (i + 1) * batch_size; - startMemTracking(); + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); - int test_input_size = 500; - int batch_size = 100; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; + void *var_2 = tensorConvolution(input, conv2d_1_w, 3, 3, 2, 2, 1, 1); + void *var_3 = tensorAdd(var_2, conv2d_1_b); + void *var_4 = tensorRelu(var_3); + void *var_5 = tensorPooling(var_4, 0, 3, 3, 0, 0, 2, 2); + void *var_6 = tensorBatchNorm( + var_5, batch_normalization_1_gamma, batch_normalization_1_beta, + batch_normalization_1_mean, batch_normalization_1_variance, 0.001); + void *var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); + void *var_8 = tensorAdd(var_7, conv2d_2_b); + void *var_9 = tensorBatchNorm( + var_8, batch_normalization_2_gamma, batch_normalization_2_beta, + batch_normalization_2_mean, batch_normalization_2_variance, 0.001); + void *var_10 = tensorRelu(var_9); + void *var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); + void *var_12 = tensorAdd(var_11, conv2d_3_b); + void *var_13 = tensorBatchNorm( + var_12, batch_normalization_3_gamma, batch_normalization_3_beta, + batch_normalization_3_mean, batch_normalization_3_variance, 0.001); + void *var_14 = tensorRelu(var_13); + void *var_15 = tensorConvolution(var_14, conv2d_4_w, 0, 0, 1, 1, 1, 1); + void *var_16 = tensorAdd(var_15, conv2d_4_b); + void *var_17 = tensorBatchNorm( + var_16, batch_normalization_4_gamma, batch_normalization_4_beta, + batch_normalization_4_mean, batch_normalization_4_variance, 0.001); + void *var_18 = tensorConvolution(var_6, conv2d_5_w, 0, 0, 1, 1, 1, 1); + void *var_19 = tensorAdd(var_18, conv2d_5_b); + void *var_20 = tensorBatchNorm( + var_19, batch_normalization_5_gamma, batch_normalization_5_beta, + batch_normalization_5_mean, batch_normalization_5_variance, 0.001); + void *var_21 = tensorAdd(var_17, var_20); + void *var_22 = tensorRelu(var_21); + void *var_23 = tensorConvolution(var_22, conv2d_6_w, 0, 0, 1, 1, 1, 1); + void *var_24 = tensorAdd(var_23, conv2d_6_b); + void *var_25 = tensorBatchNorm( + var_24, batch_normalization_6_gamma, batch_normalization_6_beta, + batch_normalization_6_mean, batch_normalization_6_variance, 0.001); + void *var_26 = tensorRelu(var_25); + void *var_27 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 1); + void *var_28 = tensorAdd(var_27, conv2d_7_b); + void *var_29 = tensorBatchNorm( + var_28, batch_normalization_7_gamma, batch_normalization_7_beta, + batch_normalization_7_mean, batch_normalization_7_variance, 0.001); + void *var_30 = tensorRelu(var_29); + void *var_31 = tensorConvolution(var_30, conv2d_8_w, 0, 0, 1, 1, 1, 1); + void *var_32 = tensorAdd(var_31, conv2d_8_b); + void *var_33 = tensorBatchNorm( + var_32, batch_normalization_8_gamma, batch_normalization_8_beta, + batch_normalization_8_mean, batch_normalization_8_variance, 0.001); + void *var_34 = tensorAdd(var_33, var_22); + void *var_35 = tensorRelu(var_34); + void *var_36 = tensorConvolution(var_35, conv2d_9_w, 0, 0, 1, 1, 1, 1); + void *var_37 = tensorAdd(var_36, conv2d_9_b); + void *var_38 = tensorBatchNorm( + var_37, batch_normalization_9_gamma, 
batch_normalization_9_beta, + batch_normalization_9_mean, batch_normalization_9_variance, 0.001); + void *var_39 = tensorRelu(var_38); + void *var_40 = tensorConvolution(var_39, conv2d_10_w, 1, 1, 1, 1, 1, 1); + void *var_41 = tensorAdd(var_40, conv2d_10_b); + void *var_42 = tensorBatchNorm( + var_41, batch_normalization_10_gamma, batch_normalization_10_beta, + batch_normalization_10_mean, batch_normalization_10_variance, 0.001); + void *var_43 = tensorRelu(var_42); + void *var_44 = tensorConvolution(var_43, conv2d_11_w, 0, 0, 1, 1, 1, 1); + void *var_45 = tensorAdd(var_44, conv2d_11_b); + void *var_46 = tensorBatchNorm( + var_45, batch_normalization_11_gamma, batch_normalization_11_beta, + batch_normalization_11_mean, batch_normalization_11_variance, 0.001); + void *var_47 = tensorAdd(var_46, var_35); + void *var_48 = tensorRelu(var_47); + void *var_49 = tensorConvolution(var_48, conv2d_12_w, 0, 0, 2, 2, 1, 1); + void *var_50 = tensorAdd(var_49, conv2d_12_b); + void *var_51 = tensorBatchNorm( + var_50, batch_normalization_12_gamma, batch_normalization_12_beta, + batch_normalization_12_mean, batch_normalization_12_variance, 0.001); + void *var_52 = tensorRelu(var_51); + void *var_53 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 1); + void *var_54 = tensorAdd(var_53, conv2d_13_b); + void *var_55 = tensorBatchNorm( + var_54, batch_normalization_13_gamma, batch_normalization_13_beta, + batch_normalization_13_mean, batch_normalization_13_variance, 0.001); + void *var_56 = tensorRelu(var_55); + void *var_57 = tensorConvolution(var_56, conv2d_14_w, 0, 0, 1, 1, 1, 1); + void *var_58 = tensorAdd(var_57, conv2d_14_b); + void *var_59 = tensorBatchNorm( + var_58, batch_normalization_14_gamma, batch_normalization_14_beta, + batch_normalization_14_mean, batch_normalization_14_variance, 0.001); + void *var_60 = tensorConvolution(var_48, conv2d_15_w, 0, 0, 2, 2, 1, 1); + void *var_61 = tensorAdd(var_60, conv2d_15_b); + void *var_62 = tensorBatchNorm( + var_61, batch_normalization_15_gamma, batch_normalization_15_beta, + batch_normalization_15_mean, batch_normalization_15_variance, 0.001); + void *var_63 = tensorAdd(var_59, var_62); + void *var_64 = tensorRelu(var_63); + void *var_65 = tensorConvolution(var_64, conv2d_16_w, 0, 0, 1, 1, 1, 1); + void *var_66 = tensorAdd(var_65, conv2d_16_b); + void *var_67 = tensorBatchNorm( + var_66, batch_normalization_16_gamma, batch_normalization_16_beta, + batch_normalization_16_mean, batch_normalization_16_variance, 0.001); + void *var_68 = tensorRelu(var_67); + void *var_69 = tensorConvolution(var_68, conv2d_17_w, 1, 1, 1, 1, 1, 1); + void *var_70 = tensorAdd(var_69, conv2d_17_b); + void *var_71 = tensorBatchNorm( + var_70, batch_normalization_17_gamma, batch_normalization_17_beta, + batch_normalization_17_mean, batch_normalization_17_variance, 0.001); + void *var_72 = tensorRelu(var_71); + void *var_73 = tensorConvolution(var_72, conv2d_18_w, 0, 0, 1, 1, 1, 1); + void *var_74 = tensorAdd(var_73, conv2d_18_b); + void *var_75 = tensorBatchNorm( + var_74, batch_normalization_18_gamma, batch_normalization_18_beta, + batch_normalization_18_mean, batch_normalization_18_variance, 0.001); + void *var_76 = tensorAdd(var_75, var_64); + void *var_77 = tensorRelu(var_76); + void *var_78 = tensorConvolution(var_77, conv2d_19_w, 0, 0, 1, 1, 1, 1); + void *var_79 = tensorAdd(var_78, conv2d_19_b); + void *var_80 = tensorBatchNorm( + var_79, batch_normalization_19_gamma, batch_normalization_19_beta, + batch_normalization_19_mean, batch_normalization_19_variance, 0.001); + 
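The forward pass being reformatted here is the fully unrolled ResNet-50 graph: each bottleneck block is a 1x1 convolution, a 3x3 convolution and a 1x1 convolution, each followed by a bias add and batch norm, with the block's input added back in before the final ReLU (stage-boundary blocks such as var_60, var_115 and var_196 instead project the shortcut through a strided 1x1 convolution). A hedged sketch of one identity block using the same runtime calls; the BNParams struct and bottleneckBlock helper are illustrative only and do not appear in the generated sources:

// Illustrative only: the benchmarks keep the graph fully unrolled.
struct BNParams { // per-layer batch-norm tensors, each of shape 1xCx1x1
  void *gamma, *beta, *mean, *variance;
};

// One identity bottleneck block: 1x1 -> 3x3 -> 1x1, then residual add and ReLU.
static void *bottleneckBlock(void *in, void *w1, void *b1, const BNParams &bn1,
                             void *w2, void *b2, const BNParams &bn2,
                             void *w3, void *b3, const BNParams &bn3) {
  void *x = tensorConvolution(in, w1, 0, 0, 1, 1, 1, 1); // 1x1 reduce
  x = tensorBatchNorm(tensorAdd(x, b1), bn1.gamma, bn1.beta, bn1.mean,
                      bn1.variance, 0.001);
  x = tensorRelu(x);
  x = tensorConvolution(x, w2, 1, 1, 1, 1, 1, 1);        // 3x3, pad 1
  x = tensorBatchNorm(tensorAdd(x, b2), bn2.gamma, bn2.beta, bn2.mean,
                      bn2.variance, 0.001);
  x = tensorRelu(x);
  x = tensorConvolution(x, w3, 0, 0, 1, 1, 1, 1);        // 1x1 expand
  x = tensorBatchNorm(tensorAdd(x, b3), bn3.gamma, bn3.beta, bn3.mean,
                      bn3.variance, 0.001);
  return tensorRelu(tensorAdd(x, in));                   // shortcut + ReLU
}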
void *var_81 = tensorRelu(var_80); + void *var_82 = tensorConvolution(var_81, conv2d_20_w, 1, 1, 1, 1, 1, 1); + void *var_83 = tensorAdd(var_82, conv2d_20_b); + void *var_84 = tensorBatchNorm( + var_83, batch_normalization_20_gamma, batch_normalization_20_beta, + batch_normalization_20_mean, batch_normalization_20_variance, 0.001); + void *var_85 = tensorRelu(var_84); + void *var_86 = tensorConvolution(var_85, conv2d_21_w, 0, 0, 1, 1, 1, 1); + void *var_87 = tensorAdd(var_86, conv2d_21_b); + void *var_88 = tensorBatchNorm( + var_87, batch_normalization_21_gamma, batch_normalization_21_beta, + batch_normalization_21_mean, batch_normalization_21_variance, 0.001); + void *var_89 = tensorAdd(var_88, var_77); + void *var_90 = tensorRelu(var_89); + void *var_91 = tensorConvolution(var_90, conv2d_22_w, 0, 0, 1, 1, 1, 1); + void *var_92 = tensorAdd(var_91, conv2d_22_b); + void *var_93 = tensorBatchNorm( + var_92, batch_normalization_22_gamma, batch_normalization_22_beta, + batch_normalization_22_mean, batch_normalization_22_variance, 0.001); + void *var_94 = tensorRelu(var_93); + void *var_95 = tensorConvolution(var_94, conv2d_23_w, 1, 1, 1, 1, 1, 1); + void *var_96 = tensorAdd(var_95, conv2d_23_b); + void *var_97 = tensorBatchNorm( + var_96, batch_normalization_23_gamma, batch_normalization_23_beta, + batch_normalization_23_mean, batch_normalization_23_variance, 0.001); + void *var_98 = tensorRelu(var_97); + void *var_99 = tensorConvolution(var_98, conv2d_24_w, 0, 0, 1, 1, 1, 1); + void *var_100 = tensorAdd(var_99, conv2d_24_b); + void *var_101 = tensorBatchNorm( + var_100, batch_normalization_24_gamma, batch_normalization_24_beta, + batch_normalization_24_mean, batch_normalization_24_variance, 0.001); + void *var_102 = tensorAdd(var_101, var_90); + void *var_103 = tensorRelu(var_102); + void *var_104 = tensorConvolution(var_103, conv2d_25_w, 0, 0, 2, 2, 1, 1); + void *var_105 = tensorAdd(var_104, conv2d_25_b); + void *var_106 = tensorBatchNorm( + var_105, batch_normalization_25_gamma, batch_normalization_25_beta, + batch_normalization_25_mean, batch_normalization_25_variance, 0.001); + void *var_107 = tensorRelu(var_106); + void *var_108 = tensorConvolution(var_107, conv2d_26_w, 1, 1, 1, 1, 1, 1); + void *var_109 = tensorAdd(var_108, conv2d_26_b); + void *var_110 = tensorBatchNorm( + var_109, batch_normalization_26_gamma, batch_normalization_26_beta, + batch_normalization_26_mean, batch_normalization_26_variance, 0.001); + void *var_111 = tensorRelu(var_110); + void *var_112 = tensorConvolution(var_111, conv2d_27_w, 0, 0, 1, 1, 1, 1); + void *var_113 = tensorAdd(var_112, conv2d_27_b); + void *var_114 = tensorBatchNorm( + var_113, batch_normalization_27_gamma, batch_normalization_27_beta, + batch_normalization_27_mean, batch_normalization_27_variance, 0.001); + void *var_115 = tensorConvolution(var_103, conv2d_28_w, 0, 0, 2, 2, 1, 1); + void *var_116 = tensorAdd(var_115, conv2d_28_b); + void *var_117 = tensorBatchNorm( + var_116, batch_normalization_28_gamma, batch_normalization_28_beta, + batch_normalization_28_mean, batch_normalization_28_variance, 0.001); + void *var_118 = tensorAdd(var_114, var_117); + void *var_119 = tensorRelu(var_118); + void *var_120 = tensorConvolution(var_119, conv2d_29_w, 0, 0, 1, 1, 1, 1); + void *var_121 = tensorAdd(var_120, conv2d_29_b); + void *var_122 = tensorBatchNorm( + var_121, batch_normalization_29_gamma, batch_normalization_29_beta, + batch_normalization_29_mean, batch_normalization_29_variance, 0.001); + void *var_123 = tensorRelu(var_122); + void *var_124 
= tensorConvolution(var_123, conv2d_30_w, 1, 1, 1, 1, 1, 1); + void *var_125 = tensorAdd(var_124, conv2d_30_b); + void *var_126 = tensorBatchNorm( + var_125, batch_normalization_30_gamma, batch_normalization_30_beta, + batch_normalization_30_mean, batch_normalization_30_variance, 0.001); + void *var_127 = tensorRelu(var_126); + void *var_128 = tensorConvolution(var_127, conv2d_31_w, 0, 0, 1, 1, 1, 1); + void *var_129 = tensorAdd(var_128, conv2d_31_b); + void *var_130 = tensorBatchNorm( + var_129, batch_normalization_31_gamma, batch_normalization_31_beta, + batch_normalization_31_mean, batch_normalization_31_variance, 0.001); + void *var_131 = tensorAdd(var_130, var_119); + void *var_132 = tensorRelu(var_131); + void *var_133 = tensorConvolution(var_132, conv2d_32_w, 0, 0, 1, 1, 1, 1); + void *var_134 = tensorAdd(var_133, conv2d_32_b); + void *var_135 = tensorBatchNorm( + var_134, batch_normalization_32_gamma, batch_normalization_32_beta, + batch_normalization_32_mean, batch_normalization_32_variance, 0.001); + void *var_136 = tensorRelu(var_135); + void *var_137 = tensorConvolution(var_136, conv2d_33_w, 1, 1, 1, 1, 1, 1); + void *var_138 = tensorAdd(var_137, conv2d_33_b); + void *var_139 = tensorBatchNorm( + var_138, batch_normalization_33_gamma, batch_normalization_33_beta, + batch_normalization_33_mean, batch_normalization_33_variance, 0.001); + void *var_140 = tensorRelu(var_139); + void *var_141 = tensorConvolution(var_140, conv2d_34_w, 0, 0, 1, 1, 1, 1); + void *var_142 = tensorAdd(var_141, conv2d_34_b); + void *var_143 = tensorBatchNorm( + var_142, batch_normalization_34_gamma, batch_normalization_34_beta, + batch_normalization_34_mean, batch_normalization_34_variance, 0.001); + void *var_144 = tensorAdd(var_143, var_132); + void *var_145 = tensorRelu(var_144); + void *var_146 = tensorConvolution(var_145, conv2d_35_w, 0, 0, 1, 1, 1, 1); + void *var_147 = tensorAdd(var_146, conv2d_35_b); + void *var_148 = tensorBatchNorm( + var_147, batch_normalization_35_gamma, batch_normalization_35_beta, + batch_normalization_35_mean, batch_normalization_35_variance, 0.001); + void *var_149 = tensorRelu(var_148); + void *var_150 = tensorConvolution(var_149, conv2d_36_w, 1, 1, 1, 1, 1, 1); + void *var_151 = tensorAdd(var_150, conv2d_36_b); + void *var_152 = tensorBatchNorm( + var_151, batch_normalization_36_gamma, batch_normalization_36_beta, + batch_normalization_36_mean, batch_normalization_36_variance, 0.001); + void *var_153 = tensorRelu(var_152); + void *var_154 = tensorConvolution(var_153, conv2d_37_w, 0, 0, 1, 1, 1, 1); + void *var_155 = tensorAdd(var_154, conv2d_37_b); + void *var_156 = tensorBatchNorm( + var_155, batch_normalization_37_gamma, batch_normalization_37_beta, + batch_normalization_37_mean, batch_normalization_37_variance, 0.001); + void *var_157 = tensorAdd(var_156, var_145); + void *var_158 = tensorRelu(var_157); + void *var_159 = tensorConvolution(var_158, conv2d_38_w, 0, 0, 1, 1, 1, 1); + void *var_160 = tensorAdd(var_159, conv2d_38_b); + void *var_161 = tensorBatchNorm( + var_160, batch_normalization_38_gamma, batch_normalization_38_beta, + batch_normalization_38_mean, batch_normalization_38_variance, 0.001); + void *var_162 = tensorRelu(var_161); + void *var_163 = tensorConvolution(var_162, conv2d_39_w, 1, 1, 1, 1, 1, 1); + void *var_164 = tensorAdd(var_163, conv2d_39_b); + void *var_165 = tensorBatchNorm( + var_164, batch_normalization_39_gamma, batch_normalization_39_beta, + batch_normalization_39_mean, batch_normalization_39_variance, 0.001); + void *var_166 = 
tensorRelu(var_165); + void *var_167 = tensorConvolution(var_166, conv2d_40_w, 0, 0, 1, 1, 1, 1); + void *var_168 = tensorAdd(var_167, conv2d_40_b); + void *var_169 = tensorBatchNorm( + var_168, batch_normalization_40_gamma, batch_normalization_40_beta, + batch_normalization_40_mean, batch_normalization_40_variance, 0.001); + void *var_170 = tensorAdd(var_169, var_158); + void *var_171 = tensorRelu(var_170); + void *var_172 = tensorConvolution(var_171, conv2d_41_w, 0, 0, 1, 1, 1, 1); + void *var_173 = tensorAdd(var_172, conv2d_41_b); + void *var_174 = tensorBatchNorm( + var_173, batch_normalization_41_gamma, batch_normalization_41_beta, + batch_normalization_41_mean, batch_normalization_41_variance, 0.001); + void *var_175 = tensorRelu(var_174); + void *var_176 = tensorConvolution(var_175, conv2d_42_w, 1, 1, 1, 1, 1, 1); + void *var_177 = tensorAdd(var_176, conv2d_42_b); + void *var_178 = tensorBatchNorm( + var_177, batch_normalization_42_gamma, batch_normalization_42_beta, + batch_normalization_42_mean, batch_normalization_42_variance, 0.001); + void *var_179 = tensorRelu(var_178); + void *var_180 = tensorConvolution(var_179, conv2d_43_w, 0, 0, 1, 1, 1, 1); + void *var_181 = tensorAdd(var_180, conv2d_43_b); + void *var_182 = tensorBatchNorm( + var_181, batch_normalization_43_gamma, batch_normalization_43_beta, + batch_normalization_43_mean, batch_normalization_43_variance, 0.001); + void *var_183 = tensorAdd(var_182, var_171); + void *var_184 = tensorRelu(var_183); + void *var_185 = tensorConvolution(var_184, conv2d_44_w, 0, 0, 2, 2, 1, 1); + void *var_186 = tensorAdd(var_185, conv2d_44_b); + void *var_187 = tensorBatchNorm( + var_186, batch_normalization_44_gamma, batch_normalization_44_beta, + batch_normalization_44_mean, batch_normalization_44_variance, 0.001); + void *var_188 = tensorRelu(var_187); + void *var_189 = tensorConvolution(var_188, conv2d_45_w, 1, 1, 1, 1, 1, 1); + void *var_190 = tensorAdd(var_189, conv2d_45_b); + void *var_191 = tensorBatchNorm( + var_190, batch_normalization_45_gamma, batch_normalization_45_beta, + batch_normalization_45_mean, batch_normalization_45_variance, 0.001); + void *var_192 = tensorRelu(var_191); + void *var_193 = tensorConvolution(var_192, conv2d_46_w, 0, 0, 1, 1, 1, 1); + void *var_194 = tensorAdd(var_193, conv2d_46_b); + void *var_195 = tensorBatchNorm( + var_194, batch_normalization_46_gamma, batch_normalization_46_beta, + batch_normalization_46_mean, batch_normalization_46_variance, 0.001); + void *var_196 = tensorConvolution(var_184, conv2d_47_w, 0, 0, 2, 2, 1, 1); + void *var_197 = tensorAdd(var_196, conv2d_47_b); + void *var_198 = tensorBatchNorm( + var_197, batch_normalization_47_gamma, batch_normalization_47_beta, + batch_normalization_47_mean, batch_normalization_47_variance, 0.001); + void *var_199 = tensorAdd(var_195, var_198); + void *var_200 = tensorRelu(var_199); + void *var_201 = tensorConvolution(var_200, conv2d_48_w, 0, 0, 1, 1, 1, 1); + void *var_202 = tensorAdd(var_201, conv2d_48_b); + void *var_203 = tensorBatchNorm( + var_202, batch_normalization_48_gamma, batch_normalization_48_beta, + batch_normalization_48_mean, batch_normalization_48_variance, 0.001); + void *var_204 = tensorRelu(var_203); + void *var_205 = tensorConvolution(var_204, conv2d_49_w, 1, 1, 1, 1, 1, 1); + void *var_206 = tensorAdd(var_205, conv2d_49_b); + void *var_207 = tensorBatchNorm( + var_206, batch_normalization_49_gamma, batch_normalization_49_beta, + batch_normalization_49_mean, batch_normalization_49_variance, 0.001); + void *var_208 = 
tensorRelu(var_207); + void *var_209 = tensorConvolution(var_208, conv2d_50_w, 0, 0, 1, 1, 1, 1); + void *var_210 = tensorAdd(var_209, conv2d_50_b); + void *var_211 = tensorBatchNorm( + var_210, batch_normalization_50_gamma, batch_normalization_50_beta, + batch_normalization_50_mean, batch_normalization_50_variance, 0.001); + void *var_212 = tensorAdd(var_211, var_200); + void *var_213 = tensorRelu(var_212); + void *var_214 = tensorConvolution(var_213, conv2d_51_w, 0, 0, 1, 1, 1, 1); + void *var_215 = tensorAdd(var_214, conv2d_51_b); + void *var_216 = tensorBatchNorm( + var_215, batch_normalization_51_gamma, batch_normalization_51_beta, + batch_normalization_51_mean, batch_normalization_51_variance, 0.001); + void *var_217 = tensorRelu(var_216); + void *var_218 = tensorConvolution(var_217, conv2d_52_w, 1, 1, 1, 1, 1, 1); + void *var_219 = tensorAdd(var_218, conv2d_52_b); + void *var_220 = tensorBatchNorm( + var_219, batch_normalization_52_gamma, batch_normalization_52_beta, + batch_normalization_52_mean, batch_normalization_52_variance, 0.001); + void *var_221 = tensorRelu(var_220); + void *var_222 = tensorConvolution(var_221, conv2d_53_w, 0, 0, 1, 1, 1, 1); + void *var_223 = tensorAdd(var_222, conv2d_53_b); + void *var_224 = tensorBatchNorm( + var_223, batch_normalization_53_gamma, batch_normalization_53_beta, + batch_normalization_53_mean, batch_normalization_53_variance, 0.001); + void *var_225 = tensorAdd(var_224, var_213); + void *var_226 = tensorRelu(var_225); + void *var_227 = tensorPooling(var_226, 1, 7, 7, 0, 0, 7, 7); + void *var_229 = tensorGemmGPU(var_227, dense_1_w); + void *var_230 = tensorAdd(var_229, dense_1_b); + void *var_231 = tensorSoftmax(var_230); - for(int i = 0; i < batch_count; i++){ + uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end); - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); - - void* var_2 = tensorConvolution(input, conv2d_1_w, 3, 3, 2, 2, 1, 1); - void* var_3 = tensorAdd(var_2, conv2d_1_b); - void* var_4 = tensorRelu(var_3); - void* var_5 = tensorPooling(var_4,0,3,3,0,0,2,2); - void* var_6 = tensorBatchNorm(var_5, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, 0.001); - void* var_7 = tensorConvolution(var_6, conv2d_2_w, 0, 0, 1, 1, 1, 1); - void* var_8 = tensorAdd(var_7, conv2d_2_b); - void* var_9 = tensorBatchNorm(var_8, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, 0.001); - void* var_10 = tensorRelu(var_9); - void* var_11 = tensorConvolution(var_10, conv2d_3_w, 1, 1, 1, 1, 1, 1); - void* var_12 = tensorAdd(var_11, conv2d_3_b); - void* var_13 = tensorBatchNorm(var_12, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, 0.001); - void* var_14 = tensorRelu(var_13); - void* var_15 = tensorConvolution(var_14, conv2d_4_w, 0, 0, 1, 1, 1, 1); - void* var_16 = tensorAdd(var_15, conv2d_4_b); - void* var_17 = tensorBatchNorm(var_16, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, 0.001); - void* var_18 = tensorConvolution(var_6, conv2d_5_w, 0, 0, 1, 1, 1, 1); - void* var_19 = tensorAdd(var_18, conv2d_5_b); - void* var_20 = tensorBatchNorm(var_19, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, 0.001); - void* var_21 
= tensorAdd(var_17, var_20); - void* var_22 = tensorRelu(var_21); - void* var_23 = tensorConvolution(var_22, conv2d_6_w, 0, 0, 1, 1, 1, 1); - void* var_24 = tensorAdd(var_23, conv2d_6_b); - void* var_25 = tensorBatchNorm(var_24, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, 0.001); - void* var_26 = tensorRelu(var_25); - void* var_27 = tensorConvolution(var_26, conv2d_7_w, 1, 1, 1, 1, 1, 1); - void* var_28 = tensorAdd(var_27, conv2d_7_b); - void* var_29 = tensorBatchNorm(var_28, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, 0.001); - void* var_30 = tensorRelu(var_29); - void* var_31 = tensorConvolution(var_30, conv2d_8_w, 0, 0, 1, 1, 1, 1); - void* var_32 = tensorAdd(var_31, conv2d_8_b); - void* var_33 = tensorBatchNorm(var_32, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, 0.001); - void* var_34 = tensorAdd(var_33, var_22); - void* var_35 = tensorRelu(var_34); - void* var_36 = tensorConvolution(var_35, conv2d_9_w, 0, 0, 1, 1, 1, 1); - void* var_37 = tensorAdd(var_36, conv2d_9_b); - void* var_38 = tensorBatchNorm(var_37, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, 0.001); - void* var_39 = tensorRelu(var_38); - void* var_40 = tensorConvolution(var_39, conv2d_10_w, 1, 1, 1, 1, 1, 1); - void* var_41 = tensorAdd(var_40, conv2d_10_b); - void* var_42 = tensorBatchNorm(var_41, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, 0.001); - void* var_43 = tensorRelu(var_42); - void* var_44 = tensorConvolution(var_43, conv2d_11_w, 0, 0, 1, 1, 1, 1); - void* var_45 = tensorAdd(var_44, conv2d_11_b); - void* var_46 = tensorBatchNorm(var_45, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, 0.001); - void* var_47 = tensorAdd(var_46, var_35); - void* var_48 = tensorRelu(var_47); - void* var_49 = tensorConvolution(var_48, conv2d_12_w, 0, 0, 2, 2, 1, 1); - void* var_50 = tensorAdd(var_49, conv2d_12_b); - void* var_51 = tensorBatchNorm(var_50, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, 0.001); - void* var_52 = tensorRelu(var_51); - void* var_53 = tensorConvolution(var_52, conv2d_13_w, 1, 1, 1, 1, 1, 1); - void* var_54 = tensorAdd(var_53, conv2d_13_b); - void* var_55 = tensorBatchNorm(var_54, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, 0.001); - void* var_56 = tensorRelu(var_55); - void* var_57 = tensorConvolution(var_56, conv2d_14_w, 0, 0, 1, 1, 1, 1); - void* var_58 = tensorAdd(var_57, conv2d_14_b); - void* var_59 = tensorBatchNorm(var_58, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, 0.001); - void* var_60 = tensorConvolution(var_48, conv2d_15_w, 0, 0, 2, 2, 1, 1); - void* var_61 = tensorAdd(var_60, conv2d_15_b); - void* var_62 = tensorBatchNorm(var_61, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, 0.001); - void* var_63 = tensorAdd(var_59, var_62); - void* var_64 = tensorRelu(var_63); - void* var_65 = tensorConvolution(var_64, conv2d_16_w, 0, 0, 1, 1, 1, 1); - void* var_66 = 
tensorAdd(var_65, conv2d_16_b); - void* var_67 = tensorBatchNorm(var_66, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, 0.001); - void* var_68 = tensorRelu(var_67); - void* var_69 = tensorConvolution(var_68, conv2d_17_w, 1, 1, 1, 1, 1, 1); - void* var_70 = tensorAdd(var_69, conv2d_17_b); - void* var_71 = tensorBatchNorm(var_70, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, 0.001); - void* var_72 = tensorRelu(var_71); - void* var_73 = tensorConvolution(var_72, conv2d_18_w, 0, 0, 1, 1, 1, 1); - void* var_74 = tensorAdd(var_73, conv2d_18_b); - void* var_75 = tensorBatchNorm(var_74, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, 0.001); - void* var_76 = tensorAdd(var_75, var_64); - void* var_77 = tensorRelu(var_76); - void* var_78 = tensorConvolution(var_77, conv2d_19_w, 0, 0, 1, 1, 1, 1); - void* var_79 = tensorAdd(var_78, conv2d_19_b); - void* var_80 = tensorBatchNorm(var_79, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, 0.001); - void* var_81 = tensorRelu(var_80); - void* var_82 = tensorConvolution(var_81, conv2d_20_w, 1, 1, 1, 1, 1, 1); - void* var_83 = tensorAdd(var_82, conv2d_20_b); - void* var_84 = tensorBatchNorm(var_83, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, 0.001); - void* var_85 = tensorRelu(var_84); - void* var_86 = tensorConvolution(var_85, conv2d_21_w, 0, 0, 1, 1, 1, 1); - void* var_87 = tensorAdd(var_86, conv2d_21_b); - void* var_88 = tensorBatchNorm(var_87, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, 0.001); - void* var_89 = tensorAdd(var_88, var_77); - void* var_90 = tensorRelu(var_89); - void* var_91 = tensorConvolution(var_90, conv2d_22_w, 0, 0, 1, 1, 1, 1); - void* var_92 = tensorAdd(var_91, conv2d_22_b); - void* var_93 = tensorBatchNorm(var_92, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, 0.001); - void* var_94 = tensorRelu(var_93); - void* var_95 = tensorConvolution(var_94, conv2d_23_w, 1, 1, 1, 1, 1, 1); - void* var_96 = tensorAdd(var_95, conv2d_23_b); - void* var_97 = tensorBatchNorm(var_96, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, 0.001); - void* var_98 = tensorRelu(var_97); - void* var_99 = tensorConvolution(var_98, conv2d_24_w, 0, 0, 1, 1, 1, 1); - void* var_100 = tensorAdd(var_99, conv2d_24_b); - void* var_101 = tensorBatchNorm(var_100, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, 0.001); - void* var_102 = tensorAdd(var_101, var_90); - void* var_103 = tensorRelu(var_102); - void* var_104 = tensorConvolution(var_103, conv2d_25_w, 0, 0, 2, 2, 1, 1); - void* var_105 = tensorAdd(var_104, conv2d_25_b); - void* var_106 = tensorBatchNorm(var_105, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, 0.001); - void* var_107 = tensorRelu(var_106); - void* var_108 = tensorConvolution(var_107, conv2d_26_w, 1, 1, 1, 1, 1, 1); - void* var_109 = tensorAdd(var_108, conv2d_26_b); - void* var_110 = tensorBatchNorm(var_109, 
batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, 0.001); - void* var_111 = tensorRelu(var_110); - void* var_112 = tensorConvolution(var_111, conv2d_27_w, 0, 0, 1, 1, 1, 1); - void* var_113 = tensorAdd(var_112, conv2d_27_b); - void* var_114 = tensorBatchNorm(var_113, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, 0.001); - void* var_115 = tensorConvolution(var_103, conv2d_28_w, 0, 0, 2, 2, 1, 1); - void* var_116 = tensorAdd(var_115, conv2d_28_b); - void* var_117 = tensorBatchNorm(var_116, batch_normalization_28_gamma, batch_normalization_28_beta, batch_normalization_28_mean, batch_normalization_28_variance, 0.001); - void* var_118 = tensorAdd(var_114, var_117); - void* var_119 = tensorRelu(var_118); - void* var_120 = tensorConvolution(var_119, conv2d_29_w, 0, 0, 1, 1, 1, 1); - void* var_121 = tensorAdd(var_120, conv2d_29_b); - void* var_122 = tensorBatchNorm(var_121, batch_normalization_29_gamma, batch_normalization_29_beta, batch_normalization_29_mean, batch_normalization_29_variance, 0.001); - void* var_123 = tensorRelu(var_122); - void* var_124 = tensorConvolution(var_123, conv2d_30_w, 1, 1, 1, 1, 1, 1); - void* var_125 = tensorAdd(var_124, conv2d_30_b); - void* var_126 = tensorBatchNorm(var_125, batch_normalization_30_gamma, batch_normalization_30_beta, batch_normalization_30_mean, batch_normalization_30_variance, 0.001); - void* var_127 = tensorRelu(var_126); - void* var_128 = tensorConvolution(var_127, conv2d_31_w, 0, 0, 1, 1, 1, 1); - void* var_129 = tensorAdd(var_128, conv2d_31_b); - void* var_130 = tensorBatchNorm(var_129, batch_normalization_31_gamma, batch_normalization_31_beta, batch_normalization_31_mean, batch_normalization_31_variance, 0.001); - void* var_131 = tensorAdd(var_130, var_119); - void* var_132 = tensorRelu(var_131); - void* var_133 = tensorConvolution(var_132, conv2d_32_w, 0, 0, 1, 1, 1, 1); - void* var_134 = tensorAdd(var_133, conv2d_32_b); - void* var_135 = tensorBatchNorm(var_134, batch_normalization_32_gamma, batch_normalization_32_beta, batch_normalization_32_mean, batch_normalization_32_variance, 0.001); - void* var_136 = tensorRelu(var_135); - void* var_137 = tensorConvolution(var_136, conv2d_33_w, 1, 1, 1, 1, 1, 1); - void* var_138 = tensorAdd(var_137, conv2d_33_b); - void* var_139 = tensorBatchNorm(var_138, batch_normalization_33_gamma, batch_normalization_33_beta, batch_normalization_33_mean, batch_normalization_33_variance, 0.001); - void* var_140 = tensorRelu(var_139); - void* var_141 = tensorConvolution(var_140, conv2d_34_w, 0, 0, 1, 1, 1, 1); - void* var_142 = tensorAdd(var_141, conv2d_34_b); - void* var_143 = tensorBatchNorm(var_142, batch_normalization_34_gamma, batch_normalization_34_beta, batch_normalization_34_mean, batch_normalization_34_variance, 0.001); - void* var_144 = tensorAdd(var_143, var_132); - void* var_145 = tensorRelu(var_144); - void* var_146 = tensorConvolution(var_145, conv2d_35_w, 0, 0, 1, 1, 1, 1); - void* var_147 = tensorAdd(var_146, conv2d_35_b); - void* var_148 = tensorBatchNorm(var_147, batch_normalization_35_gamma, batch_normalization_35_beta, batch_normalization_35_mean, batch_normalization_35_variance, 0.001); - void* var_149 = tensorRelu(var_148); - void* var_150 = tensorConvolution(var_149, conv2d_36_w, 1, 1, 1, 1, 1, 1); - void* var_151 = tensorAdd(var_150, conv2d_36_b); - void* var_152 = tensorBatchNorm(var_151, batch_normalization_36_gamma, batch_normalization_36_beta, 
batch_normalization_36_mean, batch_normalization_36_variance, 0.001); - void* var_153 = tensorRelu(var_152); - void* var_154 = tensorConvolution(var_153, conv2d_37_w, 0, 0, 1, 1, 1, 1); - void* var_155 = tensorAdd(var_154, conv2d_37_b); - void* var_156 = tensorBatchNorm(var_155, batch_normalization_37_gamma, batch_normalization_37_beta, batch_normalization_37_mean, batch_normalization_37_variance, 0.001); - void* var_157 = tensorAdd(var_156, var_145); - void* var_158 = tensorRelu(var_157); - void* var_159 = tensorConvolution(var_158, conv2d_38_w, 0, 0, 1, 1, 1, 1); - void* var_160 = tensorAdd(var_159, conv2d_38_b); - void* var_161 = tensorBatchNorm(var_160, batch_normalization_38_gamma, batch_normalization_38_beta, batch_normalization_38_mean, batch_normalization_38_variance, 0.001); - void* var_162 = tensorRelu(var_161); - void* var_163 = tensorConvolution(var_162, conv2d_39_w, 1, 1, 1, 1, 1, 1); - void* var_164 = tensorAdd(var_163, conv2d_39_b); - void* var_165 = tensorBatchNorm(var_164, batch_normalization_39_gamma, batch_normalization_39_beta, batch_normalization_39_mean, batch_normalization_39_variance, 0.001); - void* var_166 = tensorRelu(var_165); - void* var_167 = tensorConvolution(var_166, conv2d_40_w, 0, 0, 1, 1, 1, 1); - void* var_168 = tensorAdd(var_167, conv2d_40_b); - void* var_169 = tensorBatchNorm(var_168, batch_normalization_40_gamma, batch_normalization_40_beta, batch_normalization_40_mean, batch_normalization_40_variance, 0.001); - void* var_170 = tensorAdd(var_169, var_158); - void* var_171 = tensorRelu(var_170); - void* var_172 = tensorConvolution(var_171, conv2d_41_w, 0, 0, 1, 1, 1, 1); - void* var_173 = tensorAdd(var_172, conv2d_41_b); - void* var_174 = tensorBatchNorm(var_173, batch_normalization_41_gamma, batch_normalization_41_beta, batch_normalization_41_mean, batch_normalization_41_variance, 0.001); - void* var_175 = tensorRelu(var_174); - void* var_176 = tensorConvolution(var_175, conv2d_42_w, 1, 1, 1, 1, 1, 1); - void* var_177 = tensorAdd(var_176, conv2d_42_b); - void* var_178 = tensorBatchNorm(var_177, batch_normalization_42_gamma, batch_normalization_42_beta, batch_normalization_42_mean, batch_normalization_42_variance, 0.001); - void* var_179 = tensorRelu(var_178); - void* var_180 = tensorConvolution(var_179, conv2d_43_w, 0, 0, 1, 1, 1, 1); - void* var_181 = tensorAdd(var_180, conv2d_43_b); - void* var_182 = tensorBatchNorm(var_181, batch_normalization_43_gamma, batch_normalization_43_beta, batch_normalization_43_mean, batch_normalization_43_variance, 0.001); - void* var_183 = tensorAdd(var_182, var_171); - void* var_184 = tensorRelu(var_183); - void* var_185 = tensorConvolution(var_184, conv2d_44_w, 0, 0, 2, 2, 1, 1); - void* var_186 = tensorAdd(var_185, conv2d_44_b); - void* var_187 = tensorBatchNorm(var_186, batch_normalization_44_gamma, batch_normalization_44_beta, batch_normalization_44_mean, batch_normalization_44_variance, 0.001); - void* var_188 = tensorRelu(var_187); - void* var_189 = tensorConvolution(var_188, conv2d_45_w, 1, 1, 1, 1, 1, 1); - void* var_190 = tensorAdd(var_189, conv2d_45_b); - void* var_191 = tensorBatchNorm(var_190, batch_normalization_45_gamma, batch_normalization_45_beta, batch_normalization_45_mean, batch_normalization_45_variance, 0.001); - void* var_192 = tensorRelu(var_191); - void* var_193 = tensorConvolution(var_192, conv2d_46_w, 0, 0, 1, 1, 1, 1); - void* var_194 = tensorAdd(var_193, conv2d_46_b); - void* var_195 = tensorBatchNorm(var_194, batch_normalization_46_gamma, batch_normalization_46_beta, 
batch_normalization_46_mean, batch_normalization_46_variance, 0.001); - void* var_196 = tensorConvolution(var_184, conv2d_47_w, 0, 0, 2, 2, 1, 1); - void* var_197 = tensorAdd(var_196, conv2d_47_b); - void* var_198 = tensorBatchNorm(var_197, batch_normalization_47_gamma, batch_normalization_47_beta, batch_normalization_47_mean, batch_normalization_47_variance, 0.001); - void* var_199 = tensorAdd(var_195, var_198); - void* var_200 = tensorRelu(var_199); - void* var_201 = tensorConvolution(var_200, conv2d_48_w, 0, 0, 1, 1, 1, 1); - void* var_202 = tensorAdd(var_201, conv2d_48_b); - void* var_203 = tensorBatchNorm(var_202, batch_normalization_48_gamma, batch_normalization_48_beta, batch_normalization_48_mean, batch_normalization_48_variance, 0.001); - void* var_204 = tensorRelu(var_203); - void* var_205 = tensorConvolution(var_204, conv2d_49_w, 1, 1, 1, 1, 1, 1); - void* var_206 = tensorAdd(var_205, conv2d_49_b); - void* var_207 = tensorBatchNorm(var_206, batch_normalization_49_gamma, batch_normalization_49_beta, batch_normalization_49_mean, batch_normalization_49_variance, 0.001); - void* var_208 = tensorRelu(var_207); - void* var_209 = tensorConvolution(var_208, conv2d_50_w, 0, 0, 1, 1, 1, 1); - void* var_210 = tensorAdd(var_209, conv2d_50_b); - void* var_211 = tensorBatchNorm(var_210, batch_normalization_50_gamma, batch_normalization_50_beta, batch_normalization_50_mean, batch_normalization_50_variance, 0.001); - void* var_212 = tensorAdd(var_211, var_200); - void* var_213 = tensorRelu(var_212); - void* var_214 = tensorConvolution(var_213, conv2d_51_w, 0, 0, 1, 1, 1, 1); - void* var_215 = tensorAdd(var_214, conv2d_51_b); - void* var_216 = tensorBatchNorm(var_215, batch_normalization_51_gamma, batch_normalization_51_beta, batch_normalization_51_mean, batch_normalization_51_variance, 0.001); - void* var_217 = tensorRelu(var_216); - void* var_218 = tensorConvolution(var_217, conv2d_52_w, 1, 1, 1, 1, 1, 1); - void* var_219 = tensorAdd(var_218, conv2d_52_b); - void* var_220 = tensorBatchNorm(var_219, batch_normalization_52_gamma, batch_normalization_52_beta, batch_normalization_52_mean, batch_normalization_52_variance, 0.001); - void* var_221 = tensorRelu(var_220); - void* var_222 = tensorConvolution(var_221, conv2d_53_w, 0, 0, 1, 1, 1, 1); - void* var_223 = tensorAdd(var_222, conv2d_53_b); - void* var_224 = tensorBatchNorm(var_223, batch_normalization_53_gamma, batch_normalization_53_beta, batch_normalization_53_mean, batch_normalization_53_variance, 0.001); - void* var_225 = tensorAdd(var_224, var_213); - void* var_226 = tensorRelu(var_225); - void* var_227 = tensorPooling(var_226,1,7,7,0,0,7,7); - void* var_229 = tensorGemmGPU(var_227, dense_1_w); - void* var_230 = tensorAdd(var_229, dense_1_b); - void* var_231 = tensorSoftmax(var_230); - - uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy3(labels, var_231); - final_accuracy += accuracy; - freeBatchMemory(); - + float accuracy = computeAccuracy3(labels, var_231); + final_accuracy += accuracy; + freeBatchMemory(); } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); - - - llvm_hpvm_cleanupTensorRt(); + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc index 
a6dc7cbc11cf77357a749bff117489fc4b292941..7807cdced2f5472ff4dfe70c855e82da345f7953 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar10.cc @@ -1,82 +1,109 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); 
- std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,10); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,10,1,1); +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/vgg16_cifar10/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix 
+ std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 10); + std::string 
dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 10, 1, 1); startMemTracking(); @@ -85,77 +112,76 @@ int main(){ int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; - for(int i = 0; i < batch_count; i++){ + for (int i = 0; i < batch_count; i++) { int start = i * batch_size; int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_1 = tensorAdd(var_0, conv2d_1_b); - void* var_2 = tensorRelu(var_1); - void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_5 = tensorAdd(var_4, conv2d_2_b); - void* var_6 = tensorRelu(var_5); - void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_9 = tensorAdd(var_8, conv2d_3_b); - void* var_10 = tensorRelu(var_9); - void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_13 = tensorAdd(var_12, conv2d_4_b); - void* var_14 = tensorRelu(var_13); - void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); - void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorAdd(var_16, conv2d_5_b); - void* var_18 = tensorRelu(var_17); - void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_21 = tensorAdd(var_20, conv2d_6_b); - void* var_22 = tensorRelu(var_21); - void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorAdd(var_24, conv2d_7_b); - void* var_26 = tensorRelu(var_25); - void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); - void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorAdd(var_28, conv2d_8_b); - void* var_30 = tensorRelu(var_29); - void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_33 = tensorAdd(var_32, conv2d_9_b); - void* var_34 = tensorRelu(var_33); - void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); - void* var_37 = tensorAdd(var_36, conv2d_10_b); - void* var_38 = tensorRelu(var_37); - void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); - void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_41 = tensorAdd(var_40, conv2d_11_b); - void* var_42 = tensorRelu(var_41); - void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_45 = tensorAdd(var_44, conv2d_12_b); - void* var_46 = tensorRelu(var_45); - void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_49 = tensorAdd(var_48, conv2d_13_b); - void* var_50 = tensorRelu(var_49); - void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); - void* var_54 = tensorGemmGPU(var_51, dense_1_w); - void* var_55 = tensorAdd(var_54, dense_1_b); - void* var_56 = tensorRelu(var_55); - void* var_58 = tensorGemmGPU(var_56, dense_2_w); - void* var_59 = tensorAdd(var_58, dense_2_b); - void* var_60 = tensorSoftmax(var_59); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - - float accuracy = computeAccuracy2(labels,batch_size,var_60); + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_1 = tensorAdd(var_0, conv2d_1_b); + void *var_2 = tensorRelu(var_1); + void *var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_5 = 
tensorAdd(var_4, conv2d_2_b); + void *var_6 = tensorRelu(var_5); + void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2); + void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_9 = tensorAdd(var_8, conv2d_3_b); + void *var_10 = tensorRelu(var_9); + void *var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_13 = tensorAdd(var_12, conv2d_4_b); + void *var_14 = tensorRelu(var_13); + void *var_15 = tensorPooling(var_14, 0, 2, 2, 0, 0, 2, 2); + void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorAdd(var_16, conv2d_5_b); + void *var_18 = tensorRelu(var_17); + void *var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_21 = tensorAdd(var_20, conv2d_6_b); + void *var_22 = tensorRelu(var_21); + void *var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorAdd(var_24, conv2d_7_b); + void *var_26 = tensorRelu(var_25); + void *var_27 = tensorPooling(var_26, 0, 2, 2, 0, 0, 2, 2); + void *var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorAdd(var_28, conv2d_8_b); + void *var_30 = tensorRelu(var_29); + void *var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_33 = tensorAdd(var_32, conv2d_9_b); + void *var_34 = tensorRelu(var_33); + void *var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void *var_37 = tensorAdd(var_36, conv2d_10_b); + void *var_38 = tensorRelu(var_37); + void *var_39 = tensorPooling(var_38, 0, 2, 2, 0, 0, 2, 2); + void *var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_41 = tensorAdd(var_40, conv2d_11_b); + void *var_42 = tensorRelu(var_41); + void *var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_45 = tensorAdd(var_44, conv2d_12_b); + void *var_46 = tensorRelu(var_45); + void *var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_49 = tensorAdd(var_48, conv2d_13_b); + void *var_50 = tensorRelu(var_49); + void *var_51 = tensorPooling(var_50, 0, 2, 2, 0, 0, 2, 2); + void *var_54 = tensorGemmGPU(var_51, dense_1_w); + void *var_55 = tensorAdd(var_54, dense_1_b); + void *var_56 = tensorRelu(var_55); + void *var_58 = tensorGemmGPU(var_56, dense_2_w); + void *var_59 = tensorAdd(var_58, dense_2_b); + void *var_60 = tensorSoftmax(var_59); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_60); final_accuracy += accuracy; - + freeBatchMemory(); } final_accuracy = final_accuracy / batch_count; dumpFinalAccuracy(final_accuracy); - - llvm_hpvm_cleanupTensorRt(); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc index 2539f8d8722909724a9dc2890e82f4f98853f5cd..22afc20687f8d85b2033c01a375a9960a50e07d8 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_cifar100.cc @@ -1,161 +1,187 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "../../tensor_runtime/include/tensor_runtime.h" -#include "../include/utils.h" - -int main(){ - - llvm_hpvm_initTensorRt(0); - - std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); - std::string 
input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 
0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,512,512); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,512,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,512,100); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,100,1,1); - - - startMemTracking(); - - int test_input_size = 5000; - int batch_size = 5000; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; - - for(int i = 0; i < batch_count; i++){ - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); - void* var_1 = tensorAdd(var_0, conv2d_1_b); - void* var_2 = tensorRelu(var_1); - void* var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); - void* var_5 = tensorAdd(var_4, conv2d_2_b); - void* var_6 = tensorRelu(var_5); - void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_9 = tensorAdd(var_8, conv2d_3_b); - void* var_10 = tensorRelu(var_9); - void* var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_13 = tensorAdd(var_12, conv2d_4_b); - void* var_14 = tensorRelu(var_13); - void* var_15 = tensorPooling(var_14,0,2,2,0,0,2,2); - void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorAdd(var_16, conv2d_5_b); - void* var_18 = tensorRelu(var_17); - void* var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); - void* var_21 = tensorAdd(var_20, conv2d_6_b); - void* var_22 = tensorRelu(var_21); - void* var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); - void* var_25 = tensorAdd(var_24, conv2d_7_b); - void* var_26 = tensorRelu(var_25); - void* var_27 = tensorPooling(var_26,0,2,2,0,0,2,2); - void* var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); - void* var_29 = tensorAdd(var_28, conv2d_8_b); - void* var_30 = tensorRelu(var_29); - void* var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); - void* var_33 = tensorAdd(var_32, conv2d_9_b); - void* var_34 = tensorRelu(var_33); - void* var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); - void* var_37 = tensorAdd(var_36, conv2d_10_b); - void* var_38 = tensorRelu(var_37); - void* var_39 = tensorPooling(var_38,0,2,2,0,0,2,2); - void* var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); - void* var_41 = tensorAdd(var_40, conv2d_11_b); - void* var_42 = 
tensorRelu(var_41); - void* var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); - void* var_45 = tensorAdd(var_44, conv2d_12_b); - void* var_46 = tensorRelu(var_45); - void* var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); - void* var_49 = tensorAdd(var_48, conv2d_13_b); - void* var_50 = tensorRelu(var_49); - void* var_51 = tensorPooling(var_50,0,2,2,0,0,2,2); - void* var_54 = tensorGemmGPU(var_51, dense_1_w); - void* var_55 = tensorAdd(var_54, dense_1_b); - void* var_56 = tensorRelu(var_55); - void* var_58 = tensorGemmGPU(var_56, dense_2_w); - void* var_59 = tensorAdd(var_58, dense_2_b); - void* var_60 = tensorSoftmax(var_59); - - uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); - final_accuracy += accuracy; - freeBatchMemory(); - +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "../../tensor_runtime/include/tensor_runtime.h" +#include "../include/utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = model_params_path + std::string("/vgg16_cifar100/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void *conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void 
*conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 512, 512); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 512, 100); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 100, 1, 1); + + startMemTracking(); + + int test_input_size = 5000; + int batch_size = 5000; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = readInputBatch(input_path.c_str(), 0, start, end, 3, 32, 32); + + void *var_0 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 0); + void *var_1 = tensorAdd(var_0, conv2d_1_b); + void *var_2 = tensorRelu(var_1); + void *var_4 = tensorConvolution(var_2, conv2d_2_w, 1, 1, 1, 1, 1, 0); + void *var_5 = tensorAdd(var_4, conv2d_2_b); + void *var_6 = tensorRelu(var_5); + void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2); + void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 0); + void *var_9 = tensorAdd(var_8, conv2d_3_b); + void *var_10 = tensorRelu(var_9); 
+ void *var_12 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 0); + void *var_13 = tensorAdd(var_12, conv2d_4_b); + void *var_14 = tensorRelu(var_13); + void *var_15 = tensorPooling(var_14, 0, 2, 2, 0, 0, 2, 2); + void *var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); + void *var_17 = tensorAdd(var_16, conv2d_5_b); + void *var_18 = tensorRelu(var_17); + void *var_20 = tensorConvolution(var_18, conv2d_6_w, 1, 1, 1, 1, 1, 0); + void *var_21 = tensorAdd(var_20, conv2d_6_b); + void *var_22 = tensorRelu(var_21); + void *var_24 = tensorConvolution(var_22, conv2d_7_w, 1, 1, 1, 1, 1, 0); + void *var_25 = tensorAdd(var_24, conv2d_7_b); + void *var_26 = tensorRelu(var_25); + void *var_27 = tensorPooling(var_26, 0, 2, 2, 0, 0, 2, 2); + void *var_28 = tensorConvolution(var_27, conv2d_8_w, 1, 1, 1, 1, 1, 0); + void *var_29 = tensorAdd(var_28, conv2d_8_b); + void *var_30 = tensorRelu(var_29); + void *var_32 = tensorConvolution(var_30, conv2d_9_w, 1, 1, 1, 1, 1, 0); + void *var_33 = tensorAdd(var_32, conv2d_9_b); + void *var_34 = tensorRelu(var_33); + void *var_36 = tensorConvolution(var_34, conv2d_10_w, 1, 1, 1, 1, 1, 0); + void *var_37 = tensorAdd(var_36, conv2d_10_b); + void *var_38 = tensorRelu(var_37); + void *var_39 = tensorPooling(var_38, 0, 2, 2, 0, 0, 2, 2); + void *var_40 = tensorConvolution(var_39, conv2d_11_w, 1, 1, 1, 1, 1, 0); + void *var_41 = tensorAdd(var_40, conv2d_11_b); + void *var_42 = tensorRelu(var_41); + void *var_44 = tensorConvolution(var_42, conv2d_12_w, 1, 1, 1, 1, 1, 0); + void *var_45 = tensorAdd(var_44, conv2d_12_b); + void *var_46 = tensorRelu(var_45); + void *var_48 = tensorConvolution(var_46, conv2d_13_w, 1, 1, 1, 1, 1, 0); + void *var_49 = tensorAdd(var_48, conv2d_13_b); + void *var_50 = tensorRelu(var_49); + void *var_51 = tensorPooling(var_50, 0, 2, 2, 0, 0, 2, 2); + void *var_54 = tensorGemmGPU(var_51, dense_1_w); + void *var_55 = tensorAdd(var_54, dense_1_b); + void *var_56 = tensorRelu(var_55); + void *var_58 = tensorGemmGPU(var_56, dense_2_w); + void *var_59 = tensorAdd(var_58, dense_2_b); + void *var_60 = tensorSoftmax(var_59); + + uint8_t *labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, var_60, 100); + final_accuracy += accuracy; + freeBatchMemory(); } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); - llvm_hpvm_cleanupTensorRt(); + llvm_hpvm_cleanupTensorRt(); - return 0; + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc index 1d78065c5725deae9c14fc97a699fc14f55ad8ef..0e0a1dfbbca765dd323d83227476650d2d14460f 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc +++ b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/fp32/vgg16_imagenet.cc @@ -1,173 +1,199 @@ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <string.h> -#include "tensor_runtime.h" -#include "utils.h" - - - -int main(){ - - llvm_hpvm_initTensorRt(0); - - - std::string dir_prefix = std::string("/home/nvidia/sd_card/vgg16_imagenet_new/"); - std::string input_path = dir_prefix + std::string("input.bin"); - std::string labels_path = dir_prefix + std::string("labels.bin"); - std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); - void* conv2d_1_w = 
readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,3,3); - std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,64,64,3,3); - std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,128,64,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,128,128,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,128,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,128,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); - void* conv2d_6_w = readTrainedWeights(conv2d_6_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); - void* conv2d_6_b = readTrainedWeights(conv2d_6_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); - void* conv2d_7_w = readTrainedWeights(conv2d_7_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); - void* conv2d_7_b = readTrainedWeights(conv2d_7_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); - void* conv2d_8_w = readTrainedWeights(conv2d_8_w_path.c_str(), 0,512,256,3,3); - std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); - void* conv2d_8_b = readTrainedWeights(conv2d_8_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); - void* conv2d_9_w = readTrainedWeights(conv2d_9_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); - void* conv2d_9_b = readTrainedWeights(conv2d_9_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); - void* conv2d_10_w = readTrainedWeights(conv2d_10_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); - void* conv2d_10_b = readTrainedWeights(conv2d_10_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); - void* conv2d_11_w = readTrainedWeights(conv2d_11_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); - void* conv2d_11_b = readTrainedWeights(conv2d_11_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); - void* conv2d_12_w = readTrainedWeights(conv2d_12_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_12_b_path = dir_prefix + 
std::string("conv2d_12_b.bin"); - void* conv2d_12_b = readTrainedWeights(conv2d_12_b_path.c_str(), 0,1,512,1,1); - std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); - void* conv2d_13_w = readTrainedWeights(conv2d_13_w_path.c_str(), 0,512,512,3,3); - std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); - void* conv2d_13_b = readTrainedWeights(conv2d_13_b_path.c_str(), 0,1,512,1,1); - std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,25088,4096); - std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); - std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); - void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); - std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); - void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); - std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin"); - void* dense_3_w = readTrainedWeights(dense_3_w_path.c_str(), 0,1,1,4096,1000); - std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); - void* dense_3_b = readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); - - - - startMemTracking(); - - int test_input_size = 500; - int batch_size = 100; - int batch_count = test_input_size / batch_size; - float final_accuracy = 0.0; - - for(int i = 0; i < batch_count; i++){ - - int start = i * batch_size; - int end = (i + 1) * batch_size; - - void* input = readInputBatch(input_path.c_str(),0,start,end,3,224,224); - - void* var_1 = tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); - void* var_2 = tensorAdd(var_1, conv2d_1_b); - void* var_3 = tensorRelu(var_2); - void* var_4 = tensorConvolution(var_3, conv2d_2_w, 1, 1, 1, 1, 1, 1); - void* var_5 = tensorAdd(var_4, conv2d_2_b); - void* var_6 = tensorRelu(var_5); - void* var_7 = tensorPooling(var_6,0,2,2,0,0,2,2); - void* var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 1); - void* var_9 = tensorAdd(var_8, conv2d_3_b); - void* var_10 = tensorRelu(var_9); - void* var_11 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 1); - void* var_12 = tensorAdd(var_11, conv2d_4_b); - void* var_13 = tensorRelu(var_12); - void* var_14 = tensorPooling(var_13,0,2,2,0,0,2,2); - void* var_15 = tensorConvolution(var_14, conv2d_5_w, 1, 1, 1, 1, 1, 1); - void* var_16 = tensorAdd(var_15, conv2d_5_b); - void* var_17 = tensorRelu(var_16); - void* var_18 = tensorConvolution(var_17, conv2d_6_w, 1, 1, 1, 1, 1, 1); - void* var_19 = tensorAdd(var_18, conv2d_6_b); - void* var_20 = tensorRelu(var_19); - void* var_21 = tensorConvolution(var_20, conv2d_7_w, 1, 1, 1, 1, 1, 1); - void* var_22 = tensorAdd(var_21, conv2d_7_b); - void* var_23 = tensorRelu(var_22); - void* var_24 = tensorPooling(var_23,0,2,2,0,0,2,2); - void* var_25 = tensorConvolution(var_24, conv2d_8_w, 1, 1, 1, 1, 1, 1); - void* var_26 = tensorAdd(var_25, conv2d_8_b); - void* var_27 = tensorRelu(var_26); - void* var_28 = tensorConvolution(var_27, conv2d_9_w, 1, 1, 1, 1, 1, 1); - void* var_29 = tensorAdd(var_28, conv2d_9_b); - void* var_30 = tensorRelu(var_29); - void* var_31 = tensorConvolution(var_30, conv2d_10_w, 1, 1, 1, 1, 1, 1); - void* var_32 = tensorAdd(var_31, conv2d_10_b); - void* var_33 = tensorRelu(var_32); - void* var_34 = tensorPooling(var_33,0,2,2,0,0,2,2); - void* var_35 = tensorConvolution(var_34, conv2d_11_w, 1, 1, 
1, 1, 1, 1); - void* var_36 = tensorAdd(var_35, conv2d_11_b); - void* var_37 = tensorRelu(var_36); - void* var_38 = tensorConvolution(var_37, conv2d_12_w, 1, 1, 1, 1, 1, 1); - void* var_39 = tensorAdd(var_38, conv2d_12_b); - void* var_40 = tensorRelu(var_39); - void* var_41 = tensorConvolution(var_40, conv2d_13_w, 1, 1, 1, 1, 1, 1); - void* var_42 = tensorAdd(var_41, conv2d_13_b); - void* var_43 = tensorRelu(var_42); - void* var_44 = tensorPooling(var_43,0,2,2,0,0,2,2); - void* var_46 = tensorGemmGPU(var_44, dense_1_w); - void* var_47 = tensorAdd(var_46, dense_1_b); - void* var_48 = tensorRelu(var_47); - void* var_49 = tensorGemmGPU(var_48, dense_2_w); - void* var_50 = tensorAdd(var_49, dense_2_b); - void* var_51 = tensorRelu(var_50); - void* var_52 = tensorGemmGPU(var_51, dense_3_w); - void* var_53 = tensorAdd(var_52, dense_3_b); - void* var_54 = tensorSoftmax(var_53); - - uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); - - float accuracy = computeAccuracy3(labels, var_54); - final_accuracy += accuracy; - freeBatchMemory(); - +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> +#include "tensor_runtime.h" +#include "utils.h" + +int main() { + + llvm_hpvm_initTensorRt(0); + + std::string dir_prefix = + std::string("/home/nvidia/sd_card/vgg16_imagenet_new/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void *conv2d_1_w = + readTrainedWeights(conv2d_1_w_path.c_str(), 0, 64, 3, 3, 3); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void *conv2d_1_b = + readTrainedWeights(conv2d_1_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void *conv2d_2_w = + readTrainedWeights(conv2d_2_w_path.c_str(), 0, 64, 64, 3, 3); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void *conv2d_2_b = + readTrainedWeights(conv2d_2_b_path.c_str(), 0, 1, 64, 1, 1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void *conv2d_3_w = + readTrainedWeights(conv2d_3_w_path.c_str(), 0, 128, 64, 3, 3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void *conv2d_3_b = + readTrainedWeights(conv2d_3_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void *conv2d_4_w = + readTrainedWeights(conv2d_4_w_path.c_str(), 0, 128, 128, 3, 3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void *conv2d_4_b = + readTrainedWeights(conv2d_4_b_path.c_str(), 0, 1, 128, 1, 1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void *conv2d_5_w = + readTrainedWeights(conv2d_5_w_path.c_str(), 0, 256, 128, 3, 3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void *conv2d_5_b = + readTrainedWeights(conv2d_5_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_6_w_path = dir_prefix + std::string("conv2d_6_w.bin"); + void *conv2d_6_w = + readTrainedWeights(conv2d_6_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_6_b_path = dir_prefix + std::string("conv2d_6_b.bin"); + void *conv2d_6_b = + readTrainedWeights(conv2d_6_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_7_w_path = dir_prefix + std::string("conv2d_7_w.bin"); + void 
*conv2d_7_w = + readTrainedWeights(conv2d_7_w_path.c_str(), 0, 256, 256, 3, 3); + std::string conv2d_7_b_path = dir_prefix + std::string("conv2d_7_b.bin"); + void *conv2d_7_b = + readTrainedWeights(conv2d_7_b_path.c_str(), 0, 1, 256, 1, 1); + std::string conv2d_8_w_path = dir_prefix + std::string("conv2d_8_w.bin"); + void *conv2d_8_w = + readTrainedWeights(conv2d_8_w_path.c_str(), 0, 512, 256, 3, 3); + std::string conv2d_8_b_path = dir_prefix + std::string("conv2d_8_b.bin"); + void *conv2d_8_b = + readTrainedWeights(conv2d_8_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_9_w_path = dir_prefix + std::string("conv2d_9_w.bin"); + void *conv2d_9_w = + readTrainedWeights(conv2d_9_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_9_b_path = dir_prefix + std::string("conv2d_9_b.bin"); + void *conv2d_9_b = + readTrainedWeights(conv2d_9_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_10_w_path = dir_prefix + std::string("conv2d_10_w.bin"); + void *conv2d_10_w = + readTrainedWeights(conv2d_10_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_10_b_path = dir_prefix + std::string("conv2d_10_b.bin"); + void *conv2d_10_b = + readTrainedWeights(conv2d_10_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_11_w_path = dir_prefix + std::string("conv2d_11_w.bin"); + void *conv2d_11_w = + readTrainedWeights(conv2d_11_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_11_b_path = dir_prefix + std::string("conv2d_11_b.bin"); + void *conv2d_11_b = + readTrainedWeights(conv2d_11_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_12_w_path = dir_prefix + std::string("conv2d_12_w.bin"); + void *conv2d_12_w = + readTrainedWeights(conv2d_12_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_12_b_path = dir_prefix + std::string("conv2d_12_b.bin"); + void *conv2d_12_b = + readTrainedWeights(conv2d_12_b_path.c_str(), 0, 1, 512, 1, 1); + std::string conv2d_13_w_path = dir_prefix + std::string("conv2d_13_w.bin"); + void *conv2d_13_w = + readTrainedWeights(conv2d_13_w_path.c_str(), 0, 512, 512, 3, 3); + std::string conv2d_13_b_path = dir_prefix + std::string("conv2d_13_b.bin"); + void *conv2d_13_b = + readTrainedWeights(conv2d_13_b_path.c_str(), 0, 1, 512, 1, 1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void *dense_1_w = + readTrainedWeights(dense_1_w_path.c_str(), 0, 1, 1, 25088, 4096); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void *dense_1_b = + readTrainedWeights(dense_1_b_path.c_str(), 0, 1, 4096, 1, 1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void *dense_2_w = + readTrainedWeights(dense_2_w_path.c_str(), 0, 1, 1, 4096, 4096); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void *dense_2_b = + readTrainedWeights(dense_2_b_path.c_str(), 0, 1, 4096, 1, 1); + std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin"); + void *dense_3_w = + readTrainedWeights(dense_3_w_path.c_str(), 0, 1, 1, 4096, 1000); + std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); + void *dense_3_b = + readTrainedWeights(dense_3_b_path.c_str(), 0, 1, 1000, 1, 1); + + startMemTracking(); + + int test_input_size = 500; + int batch_size = 100; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + for (int i = 0; i < batch_count; i++) { + + int start = i * batch_size; + int end = (i + 1) * batch_size; + + void *input = + readInputBatch(input_path.c_str(), 0, start, end, 3, 224, 224); + + void *var_1 = 
tensorConvolution(input, conv2d_1_w, 1, 1, 1, 1, 1, 1); + void *var_2 = tensorAdd(var_1, conv2d_1_b); + void *var_3 = tensorRelu(var_2); + void *var_4 = tensorConvolution(var_3, conv2d_2_w, 1, 1, 1, 1, 1, 1); + void *var_5 = tensorAdd(var_4, conv2d_2_b); + void *var_6 = tensorRelu(var_5); + void *var_7 = tensorPooling(var_6, 0, 2, 2, 0, 0, 2, 2); + void *var_8 = tensorConvolution(var_7, conv2d_3_w, 1, 1, 1, 1, 1, 1); + void *var_9 = tensorAdd(var_8, conv2d_3_b); + void *var_10 = tensorRelu(var_9); + void *var_11 = tensorConvolution(var_10, conv2d_4_w, 1, 1, 1, 1, 1, 1); + void *var_12 = tensorAdd(var_11, conv2d_4_b); + void *var_13 = tensorRelu(var_12); + void *var_14 = tensorPooling(var_13, 0, 2, 2, 0, 0, 2, 2); + void *var_15 = tensorConvolution(var_14, conv2d_5_w, 1, 1, 1, 1, 1, 1); + void *var_16 = tensorAdd(var_15, conv2d_5_b); + void *var_17 = tensorRelu(var_16); + void *var_18 = tensorConvolution(var_17, conv2d_6_w, 1, 1, 1, 1, 1, 1); + void *var_19 = tensorAdd(var_18, conv2d_6_b); + void *var_20 = tensorRelu(var_19); + void *var_21 = tensorConvolution(var_20, conv2d_7_w, 1, 1, 1, 1, 1, 1); + void *var_22 = tensorAdd(var_21, conv2d_7_b); + void *var_23 = tensorRelu(var_22); + void *var_24 = tensorPooling(var_23, 0, 2, 2, 0, 0, 2, 2); + void *var_25 = tensorConvolution(var_24, conv2d_8_w, 1, 1, 1, 1, 1, 1); + void *var_26 = tensorAdd(var_25, conv2d_8_b); + void *var_27 = tensorRelu(var_26); + void *var_28 = tensorConvolution(var_27, conv2d_9_w, 1, 1, 1, 1, 1, 1); + void *var_29 = tensorAdd(var_28, conv2d_9_b); + void *var_30 = tensorRelu(var_29); + void *var_31 = tensorConvolution(var_30, conv2d_10_w, 1, 1, 1, 1, 1, 1); + void *var_32 = tensorAdd(var_31, conv2d_10_b); + void *var_33 = tensorRelu(var_32); + void *var_34 = tensorPooling(var_33, 0, 2, 2, 0, 0, 2, 2); + void *var_35 = tensorConvolution(var_34, conv2d_11_w, 1, 1, 1, 1, 1, 1); + void *var_36 = tensorAdd(var_35, conv2d_11_b); + void *var_37 = tensorRelu(var_36); + void *var_38 = tensorConvolution(var_37, conv2d_12_w, 1, 1, 1, 1, 1, 1); + void *var_39 = tensorAdd(var_38, conv2d_12_b); + void *var_40 = tensorRelu(var_39); + void *var_41 = tensorConvolution(var_40, conv2d_13_w, 1, 1, 1, 1, 1, 1); + void *var_42 = tensorAdd(var_41, conv2d_13_b); + void *var_43 = tensorRelu(var_42); + void *var_44 = tensorPooling(var_43, 0, 2, 2, 0, 0, 2, 2); + void *var_46 = tensorGemmGPU(var_44, dense_1_w); + void *var_47 = tensorAdd(var_46, dense_1_b); + void *var_48 = tensorRelu(var_47); + void *var_49 = tensorGemmGPU(var_48, dense_2_w); + void *var_50 = tensorAdd(var_49, dense_2_b); + void *var_51 = tensorRelu(var_50); + void *var_52 = tensorGemmGPU(var_51, dense_3_w); + void *var_53 = tensorAdd(var_52, dense_3_b); + void *var_54 = tensorSoftmax(var_53); + + uint32_t *labels = readLabelsBatch3(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy3(labels, var_54); + final_accuracy += accuracy; + freeBatchMemory(); } - final_accuracy = final_accuracy / batch_count; - dumpFinalAccuracy(final_accuracy); - - - llvm_hpvm_cleanupTensorRt(); + final_accuracy = final_accuracy / batch_count; + dumpFinalAccuracy(final_accuracy); - return 0; + llvm_hpvm_cleanupTensorRt(); + return 0; } diff --git a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc index 6793cd79f19bfe9fd192b2e6199323bb28940aa3..ea959342a4ac034deeba4191faa6620f2ec81037 100644 --- a/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc +++ 
b/hpvm/projects/hpvm-tensor-rt/dnn_sources/src/unit_tests.cc @@ -10,10 +10,7 @@ using namespace std; - - - -class UnitTestResults{ +class UnitTestResults { private: unsigned int total_tests; @@ -22,48 +19,46 @@ private: std::vector<string> failed_test_ids; public: - - UnitTestResults(){ + UnitTestResults() { total_tests = 0; failed_tests = 0; passed_tests = 0; } - void evalTestResult(Tensor* res, const float* expected_result, size_t num_elems, - float epsilon, string test_name){ + void evalTestResult(Tensor *res, const float *expected_result, + size_t num_elems, float epsilon, string test_name) { - total_tests += 1; - if(res->num_elems != num_elems){ + total_tests += 1; + if (res->num_elems != num_elems) { failed_tests += 1; failed_test_ids.push_back(test_name); return; } - float* data_ptr = (float*) res->host_data; - for (unsigned int i = 0; i < res->num_elems; i++){ - //printf("**diff value = %f ", std::abs(data_ptr[i] - expected_result[i])); - if (std::abs(data_ptr[i] - expected_result[i]) > epsilon){ - failed_tests += 1; - failed_test_ids.push_back(test_name); + float *data_ptr = (float *)res->host_data; + for (unsigned int i = 0; i < res->num_elems; i++) { + // printf("**diff value = %f ", std::abs(data_ptr[i] - + // expected_result[i])); + if (std::abs(data_ptr[i] - expected_result[i]) > epsilon) { + failed_tests += 1; + failed_test_ids.push_back(test_name); return; } } - - passed_tests += 1; + + passed_tests += 1; } - void compareTensors(Tensor* res, Tensor* gold_res, - float epsilon, string test_name){ + void compareTensors(Tensor *res, Tensor *gold_res, float epsilon, + string test_name) { - const float* expected_result = (float*) gold_res->host_data; + const float *expected_result = (float *)gold_res->host_data; unsigned int num_elems = res->num_elems; evalTestResult(res, expected_result, num_elems, epsilon, test_name); - } - - void printSummary(){ + void printSummary() { printf("\n\n\n ************* Printing Results Summary ********** \n\n"); printf("-- Total tests := %d \n", total_tests); @@ -71,147 +66,136 @@ public: printf("-- Tests Failed := %d \n", failed_tests); printf("\n\n Tests that failed : \n\n"); - for (int i = 0; i < failed_test_ids.size(); i++){ + for (int i = 0; i < failed_test_ids.size(); i++) { printf("*** Test = %s \n", failed_test_ids[i].c_str()); } } - }; - - - -void testTensorHgemm(UnitTestResults& unitTestResults){ +void testTensorHgemm(UnitTestResults &unitTestResults) { printf("***** TensorHgemm ***** \n\n"); - void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); - struct Tensor* lhs = (struct Tensor*) lhs_ptr; + void *lhs_ptr = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); + struct Tensor *lhs = (struct Tensor *)lhs_ptr; fillTensorWithOnes(lhs); - - float* data_arr = (float*) lhs->host_data; - for(int i = 0; i < lhs->num_elems; i++){ + + float *data_arr = (float *)lhs->host_data; + for (int i = 0; i < lhs->num_elems; i++) { data_arr[i] = (i / 4) + 1; } - - void* rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3); + + void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3); fillTensorWithOnes(rhs); - - void* output = tensorHalfGemm(lhs, rhs); - convertToFP32((struct Tensor*) output); + + void *output = tensorHalfGemm(lhs, rhs); + convertToFP32((struct Tensor *)output); printTensorValues(output); - const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 20, 20, 20}; + const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, + 12, 16, 16, 16, 20, 
20, 20}; - unitTestResults.evalTestResult((Tensor*) output, expected_result, 15, 0.01, "Hgemm"); + unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01, + "Hgemm"); } - - -void testTensorSgemm(UnitTestResults& unitTestResults){ +void testTensorSgemm(UnitTestResults &unitTestResults) { printf("***** TensorSgemm ***** \n\n"); - void* lhs_ptr = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); - struct Tensor* lhs = (struct Tensor*) lhs_ptr; + void *lhs_ptr = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 5, 4, 1, 1); + struct Tensor *lhs = (struct Tensor *)lhs_ptr; fillTensorWithOnes(lhs); - - float* data_arr = (float*) lhs->host_data; - for(int i = 0; i < lhs->num_elems; i++){ + + float *data_arr = (float *)lhs->host_data; + for (int i = 0; i < lhs->num_elems; i++) { data_arr[i] = (i / 4) + 1; } - void* rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3); + void *rhs = create4DTensor(CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3); fillTensorWithOnes(rhs); - - void* output = tensorGemmGPU(lhs, rhs); - printTensorValues(output); - const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, 12, 16, 16, 16, 20, 20, 20}; + void *output = tensorGemmGPU(lhs, rhs); + printTensorValues(output); - unitTestResults.evalTestResult((Tensor*) output, expected_result, 15, 0.01, "Sgemm"); + const float expected_result[15] = {4, 4, 4, 8, 8, 8, 12, 12, + 12, 16, 16, 16, 20, 20, 20}; + unitTestResults.evalTestResult((Tensor *)output, expected_result, 15, 0.01, + "Sgemm"); } +void testTensorConcatAndSplit() { + int conv_mode = 1; // CROSS_CORRELATION mode + int compute_precision = 0; // floating point precision - - -void testTensorConcatAndSplit(){ - - int conv_mode = 1; // CROSS_CORRELATION mode - int compute_precision = 0; // floating point precision - - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); fillWithOnesAndTwos(input); - void** splits = tensorSplit(input, 2, 1); + void **splits = tensorSplit(input, 2, 1); - void* conv2W = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); + void *conv2W = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); fillTensorWithOnes(conv2W); - - void** conv2fils = tensorSplit(conv2W, 2, 0); - void* conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0, - 1, 1, conv_mode, compute_precision); + void **conv2fils = tensorSplit(conv2W, 2, 0); + + void *conv2a_out = tensorConvolution(splits[0], conv2fils[0], 0, 0, 1, 1, + conv_mode, compute_precision); printTensorDims(conv2a_out); - void* conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0, - 1, 1, conv_mode, compute_precision); + void *conv2b_out = tensorConvolution(splits[1], conv2fils[1], 0, 0, 1, 1, + conv_mode, compute_precision); printTensorDims(conv2b_out); - - void* conv2_outs[2]; + + void *conv2_outs[2]; conv2_outs[0] = conv2a_out; conv2_outs[1] = conv2b_out; - void* conv2_concat_out = tensorConcat(conv2_outs, 2, 1); + void *conv2_concat_out = tensorConcat(conv2_outs, 2, 1); printTensorDims(conv2_concat_out); printTensorValues(conv2_concat_out); - } +void testLRN() { - - - - -void testLRN(){ - - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20); + void *input = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 20, 20, 20, 20); fillTensorWithOnes(input); unsigned LRN_window = 5; double LRN_alpha = 2e-05; printf("LRN_alpha = %f \n", LRN_alpha); - + double LRN_beta = 
0.75; double LRN_k = 1.0; // TEST-point - Compare TF vs CUDNN - void* lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k); + void *lrn1out = tensorLRN(input, LRN_window, LRN_alpha, LRN_beta, LRN_k); printTensorDims(lrn1out); dumpWeightsToFile("tensors_out/lrn1_test.out", lrn1out); - void* input2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7); + void *input2 = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 7, 7, 7, 7); fillTensorWithOnes(input2); LRN_window = 5; LRN_alpha = 0.5 * LRN_window; - + LRN_beta = 0.75; LRN_k = 1.0; - void* lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k); + void *lrn2out = tensorLRN(input2, LRN_window, LRN_alpha, LRN_beta, LRN_k); printTensorDims(lrn2out); - dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out); + dumpWeightsToFile("tensors_out/lrn2_test.out", lrn2out); } - - - -void testTensorAdd(){ +void testTensorAdd() { // Tensor add with equal dimensions - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); - void* bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); + void *bias = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 2, 2); fillTensorWithOnes(x); fillTensorWithOnes(bias); @@ -222,8 +206,8 @@ void testTensorAdd(){ printTensorValues(x); // Tensor addd with matching channel dimension - void* x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2); - void* bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1); + void *x2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 2, 2); + void *bias2 = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 1, 1); fillTensorWithOnes(x2); fillTensorWithOnes(bias2); @@ -231,191 +215,181 @@ void testTensorAdd(){ printTensorValues(x2); } -void testTensorConv(){ +void testTensorConv() { - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); + void *filter = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); fillTensorWithOnes(input); fillTensorWithOnes(filter); - int conv_mode = 1; // NOTE: uses CROSS_CORRELATION + int conv_mode = 1; // NOTE: uses CROSS_CORRELATION int compute_precision = 0; // floating point precision for conv - - void* conv_out = tensorConvolution(input, filter, 0, 0, - 1, 1, conv_mode, compute_precision); - printTensorValues(conv_out); + void *conv_out = tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, + compute_precision); + printTensorValues(conv_out); } +void testTensorHalfConv() { -void testTensorHalfConv(){ - - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); + void *filter = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); fillTensorWithOnes(input); fillTensorWithOnes(filter); - int conv_mode = 1; // NOTE: uses CROSS_CORRELATION + int conv_mode = 1; // NOTE: uses CROSS_CORRELATION int compute_precision = 0; // floating point precision for conv - - void* conv_out = tensorHalfConvolution(input, filter, 0, 0, - 1, 1, conv_mode, compute_precision); - printTensorValues(conv_out); + void *conv_out = tensorHalfConvolution(input, filter, 0, 0, 1, 1, 
conv_mode, + compute_precision); + printTensorValues(conv_out); } +void testTensorGroupConv() { + // NOTE: The input channel count value (param2 to Tensor and Filter) must be + // the same + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); + void *filter = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); - -void testTensorGroupConv(){ - - // NOTE: The input channel count value (param2 to Tensor and Filter) must be the same - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); - - // FIXIT: fillTensor* calls should be replaced with initTensorValue(tenosor, val) + // FIXIT: fillTensor* calls should be replaced with initTensorValue(tenosor, + // val) fillTensorWithOnes(input); fillTensorWithOnes(filter); int conv_mode = 1; // NOTE: uses CROSS_CORRELATION int conv_groups = 2; - - void* conv_out = tensorConvolution(input, filter, - 0, 0, - 1, 1, - conv_mode, conv_groups); + + void *conv_out = + tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups); printTensorValues(conv_out); - } +void testTensorHalfGroupConv() { -void testTensorHalfGroupConv(){ - - // NOTE: The input channel count value (param2 to Tensor and Filter) must be the same - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); - void* filter = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); + // NOTE: The input channel count value (param2 to Tensor and Filter) must be + // the same + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 2, 4, 4); + void *filter = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 3, 3); fillTensorWithOnes(input); fillTensorWithOnes(filter); int conv_mode = 1; // NOTE: uses CROSS_CORRELATION int conv_groups = 2; - - void* conv_out = tensorConvolution(input, filter, - 0, 0, - 1, 1, - conv_mode, conv_groups); - - convertToFP32((struct Tensor*) conv_out); + + void *conv_out = + tensorConvolution(input, filter, 0, 0, 1, 1, conv_mode, conv_groups); + + convertToFP32((struct Tensor *)conv_out); printTensorValues(conv_out); } +void testTensorPooling() { -void testTensorPooling(){ - - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); fillTensorWithOnes(x); - float* data_arr = (float*) ((Tensor*) x)->host_data; - for(int i = 0; i < ((Tensor*) x)->num_elems; i += 4){ + float *data_arr = (float *)((Tensor *)x)->host_data; + for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) { data_arr[i] = i; } - void* output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); + void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); printTensorValues(output); } +void testTensorHalfPooling() { -void testTensorHalfPooling(){ - - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 4, 4); fillTensorWithOnes(x); - float* data_arr = (float*) ((Tensor*) x)->host_data; - for(int i = 0; i < ((Tensor*) x)->num_elems; i += 4){ + float *data_arr = (float *)((Tensor *)x)->host_data; + for (int i = 0; i < ((Tensor *)x)->num_elems; i += 4) { data_arr[i] = i; } - void* output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); - convertToFP32((struct Tensor*) output); + void *output = tensorPooling(x, 0, 2, 2, 0, 0, 2, 2); + convertToFP32((struct Tensor *)output); printTensorValues(output); } +void testTensorBatchNorm() { -void 
testTensorBatchNorm(){ - - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); fillTensorWithVal(x, 3); - void* gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(gamma, 1); - void* beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(beta, 0); - void* mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(mean, 1); - void* variance = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *variance = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(variance, 1); double epsilon = 1; // NOTE: result = X - mean / sqrt(epsilon + variance) - void* output = tensorBatchNorm(x, gamma, beta, mean, variance, 1); + void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1); - printTensorValues(output); + printTensorValues(output); } +void testTensorHalfBatchNorm() { -void testTensorHalfBatchNorm(){ - - void* x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); + void *x = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 2, 2); fillTensorWithVal(x, 3); - void* gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *gamma = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(gamma, 1); - void* beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *beta = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(beta, 0); - void* mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *mean = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(mean, 1); - void* variance = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); + void *variance = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 1, 1); fillTensorWithVal(variance, 1); - double epsilon = 1; // NOTE: result = X - mean / sqrt(epsilon + variance) - void* output = tensorBatchNorm(x, gamma, beta, mean, variance, 1); - convertToFP32((struct Tensor*) output); + void *output = tensorBatchNorm(x, gamma, beta, mean, variance, 1); + convertToFP32((struct Tensor *)output); - printTensorValues(output); + printTensorValues(output); } +void testTensorRelu() { -void testTensorRelu(){ - - // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match + // NOTE: 2nd dim of bias and d2*d3*d4 for the input tensor MUST match printf("***** TensorRelu ***** \n\n"); - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 1, 2, 2); fillTensorWithNegOnes(input); - void* output = tensorRelu(input); + void *output = tensorRelu(input); printTensorValues(output); } - -void testTensorSoftmax(){ +void testTensorSoftmax() { printf("***** TensorSoftmax ***** \n\n"); - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 4, 1, 1); - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + float *host_ptr = (float *)((struct Tensor *)input)->host_data; host_ptr[0] = 
0.1; host_ptr[1] = 0.2; host_ptr[2] = 0.3; @@ -425,39 +399,36 @@ void testTensorSoftmax(){ host_ptr[6] = 0.7; host_ptr[7] = 2.5; - void* output = tensorSoftmax(input); + void *output = tensorSoftmax(input); printTensorValues(output); } +void testSoftmaxOutput(void *output_ptr) { -void testSoftmaxOutput(void* output_ptr){ + struct Tensor *output = (struct Tensor *)output_ptr; - struct Tensor* output = (struct Tensor*) output_ptr; - size_t batch_dim = output->dims.dim_sizes[0]; size_t channels = output->dims.dim_sizes[1]; - float* data = (float*) output->host_data; - for(int i = 0; i < batch_dim; i++){ + float *data = (float *)output->host_data; + for (int i = 0; i < batch_dim; i++) { float sum = 0.0; - for(int j = 0; j < channels; j++){ + for (int j = 0; j < channels; j++) { sum += data[i * channels + j]; } printf("output_sum = %f \n", sum); } - } - - -void testPromiseError(){ +void testPromiseError() { printf("***** TensorQuantize ***** \n\n"); - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); + float *host_ptr = (float *)((struct Tensor *)input)->host_data; - void* gold_tensor = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); - float* gold_ptr = (float*) ((struct Tensor*) gold_tensor)->host_data; + void *gold_tensor = + create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); + float *gold_ptr = (float *)((struct Tensor *)gold_tensor)->host_data; gold_ptr[0] = -1; gold_ptr[1] = -2; @@ -472,21 +443,20 @@ void testPromiseError(){ gold_ptr[10] = 1; gold_ptr[11] = 1; - int num_elems = 12; int num_runs = 1000; - float* result_ptr = (float*) malloc(sizeof(float) * num_elems); + float *result_ptr = (float *)malloc(sizeof(float) * num_elems); - for (int swing = 1; swing <= 7; swing++){ + for (int swing = 1; swing <= 7; swing++) { - for (int j = 0; j < num_elems; j++){ - result_ptr[j] = 0; + for (int j = 0; j < num_elems; j++) { + result_ptr[j] = 0; } float error_sum = 0.0; - - for (int i = 0; i < 1000; i++){ + + for (int i = 0; i < 1000; i++) { host_ptr[0] = -1; host_ptr[1] = -2; host_ptr[2] = -3; @@ -499,43 +469,39 @@ void testPromiseError(){ host_ptr[9] = 2; host_ptr[10] = 1; host_ptr[11] = 1; - - void* error_out = addPromiseError(input, swing); - //printTensorValues(error_out); + + void *error_out = addPromiseError(input, swing); + // printTensorValues(error_out); // Move result data back to the host hpvm_request_tensor(input, 0); - float* error_out_ptr = (float*) ((struct Tensor*) input)->host_data; + float *error_out_ptr = (float *)((struct Tensor *)input)->host_data; - for (int j = 0; j < num_elems; j++){ - result_ptr[j] += error_out_ptr[j]; - error_sum += (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]); + for (int j = 0; j < num_elems; j++) { + result_ptr[j] += error_out_ptr[j]; + error_sum += + (error_out_ptr[j] - gold_ptr[j]) * (error_out_ptr[j] - gold_ptr[j]); } } - printf ("\n\n - Swing %d results : \n", swing); - for (int j = 0; j < num_elems; j++){ + printf("\n\n - Swing %d results : \n", swing); + for (int j = 0; j < num_elems; j++) { result_ptr[j] = result_ptr[j] / num_runs; printf(" %f ", result_ptr[j]); } printf("mean_error = %f \n", error_sum / num_runs); - + printf(" \n"); } - - } - - - -void testQuantization(){ +void testQuantization() { printf("***** TensorQuantize ***** \n\n"); - void* input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 
1); + void *input = create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 6, 1, 1); - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + float *host_ptr = (float *)((struct Tensor *)input)->host_data; host_ptr[0] = -0.1; host_ptr[1] = -25; host_ptr[2] = 0.2; @@ -548,13 +514,12 @@ void testQuantization(){ host_ptr[9] = 7.2; host_ptr[10] = 2.5; host_ptr[11] = 3; - - void* quantize_result1 = quantizeTensorPromise(input, -4, 6); + void *quantize_result1 = quantizeTensorPromise(input, -4, 6); - printf ("\n ** quantizing with range min = %d max = %d \n", -4, 6); + printf("\n ** quantizing with range min = %d max = %d \n", -4, 6); printTensorValues(quantize_result1); - + host_ptr[0] = -0.1; host_ptr[1] = -25; host_ptr[2] = 0.2; @@ -568,9 +533,9 @@ void testQuantization(){ host_ptr[10] = 2.5; host_ptr[11] = 3; - void* quantize_result2 = quantizeTensorPromise(input, -2, 2); + void *quantize_result2 = quantizeTensorPromise(input, -2, 2); - printf ("\n ** quantizing with range min = %d max = %d \n", -2, 2); + printf("\n ** quantizing with range min = %d max = %d \n", -2, 2); printTensorValues(quantize_result2); host_ptr[0] = -0.1; @@ -586,13 +551,12 @@ void testQuantization(){ host_ptr[10] = 2.5; host_ptr[11] = 3; + void *quantize_result3 = quantizeTensorPromise(input, -25, 8); - void* quantize_result3 = quantizeTensorPromise(input, -25, 8); - - printf ("\n ** quantizing with range min = %d max = %d \n", -25, 8); + printf("\n ** quantizing with range min = %d max = %d \n", -25, 8); printTensorValues(quantize_result3); - printf ("\n ** quantizing with range min = %d max = %d \n", -10, 10); + printf("\n ** quantizing with range min = %d max = %d \n", -10, 10); host_ptr[0] = -0.1; host_ptr[1] = -25; @@ -607,30 +571,26 @@ void testQuantization(){ host_ptr[10] = 2.5; host_ptr[11] = 3; - - void* quantize_result4 = quantizeTensorPromise(input, -10, 10); + void *quantize_result4 = quantizeTensorPromise(input, -10, 10); printTensorValues(quantize_result4); - - void* quantize_result5 = quantizeTensorPromise(input, -10, 10); + void *quantize_result5 = quantizeTensorPromise(input, -10, 10); printTensorValues(quantize_result5); - - //void* error_out = addPromiseError(quantize_result, 1); - //printTensorValues(error_out); + // void* error_out = addPromiseError(quantize_result, 1); + // printTensorValues(error_out); } - - - -void testSampleFilter(){ +void testSampleFilter() { printf("***** Tensor Sample Filter ***** \n\n"); - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); - //fillTensorWithVal(input, 3); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 2, 2, 3, 3); + // fillTensorWithVal(input, 3); fillWithOnesAndTwos(input); - - Tensor* input2 = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, 2, 32, 32); + + Tensor *input2 = (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, + 3, 2, 32, 32); fillTensorWithVal(input2, 1); /* float* host_ptr = (float*) ((struct Tensor*) input)->host_data; @@ -649,7 +609,7 @@ void testSampleFilter(){ /* printf("\n\n"); hpvm_request_tensor(input, DEVICE); - + sampleFilter(input, 2, 1); hpvm_request_tensor(input, HOST); @@ -657,116 +617,81 @@ void testSampleFilter(){ printTensorValues(input); */ - void* exact_res = tensorConvolution(input2, input, 0, 0, - 1, 1, 1, 1); + void *exact_res = tensorConvolution(input2, input, 0, 0, 1, 1, 1, 1); printTensorValues(exact_res); - - void* res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0); - - //void* res = 
tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3); - - printTensorValues(res); - -} - - + void *res = tensorConvSampSim(input2, input, 0, 0, 1, 1, 1, 1, 4, 0); + // void* res = tensorConvApprox(input2, input, 0, 0, 1, 1, 1, 1, 1, 1, 4, 3); -void testPerforationCalls(void* input, void* filter, - int pad_h, int pad_w, - int stride_h, int stride_w, - int row, int col){ + printTensorValues(res); +} +void testPerforationCalls(void *input, void *filter, int pad_h, int pad_w, + int stride_h, int stride_w, int row, int col) { float interpolation_rate = 1.0; - for (int offset = 0; offset < 2; offset++){ - - printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d row = %d col = %d offset= %d \n\n", - pad_h, pad_w, stride_h, stride_w, row, col, offset); - - - void* res_exact = tensorConvolution(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1); - - printf ("tensorConvolution Result :"); - printTensorValues(res_exact); - + for (int offset = 0; offset < 2; offset++) { - void* res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, 1, 1); + printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d " + "row = %d col = %d offset= %d \n\n", + pad_h, pad_w, stride_h, stride_w, row, col, offset); - printf ("\nBaseline Result :"); - printTensorValues(res_exact2); + void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1); + printf("tensorConvolution Result :"); + printTensorValues(res_exact); - void* res_exact3 = tensorConvApproxHalf2(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, 1, 1); - convertToFP32((struct Tensor*) res_exact3); + void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, 1, 1, 1, 1); - printf ("\nFP16_Baseline Result :"); - printTensorValues(res_exact3); + printf("\nBaseline Result :"); + printTensorValues(res_exact2); - - void* res_sim = tensorConvPerfCuda(input, filter, - pad_h, pad_w, - stride_h, stride_w, - 1, 1, - row, col, - offset); + void *res_exact3 = tensorConvApproxHalf2( + input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1); + convertToFP32((struct Tensor *)res_exact3); - printf ("\nConvPerfCuda Result :"); - printTensorValues(res_sim); + printf("\nFP16_Baseline Result :"); + printTensorValues(res_exact3); - - void* res = tensorConvApprox(input, filter, - pad_h, pad_w, - stride_h, stride_w, - 1, 1, - row, col, - 1, offset); + void *res_sim = tensorConvPerfCuda(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, row, col, offset); + printf("\nConvPerfCuda Result :"); + printTensorValues(res_sim); - printf ("\nConvApprox Result :"); - printTensorValues(res); + void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, row, col, 1, offset); + printf("\nConvApprox Result :"); + printTensorValues(res); - void* res_half = tensorConvApproxHalf2(input, filter, - pad_h, pad_w, - stride_h, stride_w, - 1, 1, - row, col, - 1, offset); + void *res_half = + tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w, + 1, 1, row, col, 1, offset); - convertToFP32((struct Tensor*) res_half); + convertToFP32((struct Tensor *)res_half); - printf ("\nConvApproxHalf2 Result :"); - printTensorValues(res_half); + printf("\nConvApproxHalf2 Result :"); + printTensorValues(res_half); + } - } - - - printf ("\n\n\n--- End of Test \n\n\n"); + printf("\n\n\n--- End of Test \n\n\n"); } - - - - /**** Tests Perforation for a set of different inputs */ -void 
testPerforation(UnitTestResults& unitTestResults){ +void testPerforation(UnitTestResults &unitTestResults) { - printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n"); - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); fillTensorWithVal(input, 1); - - Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); - fillTensorWithVal(filter, 1); + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); + fillTensorWithVal(filter, 1); /* float* host_ptr = (float*) ((struct Tensor*) filter)->host_data; @@ -785,43 +710,33 @@ void testPerforation(UnitTestResults& unitTestResults){ host_ptr[24] = 2; host_ptr[26] = 2; */ - testPerforationCalls(input, filter, 0, 0, 1, 1, 1, 2); testPerforationCalls(input, filter, 0, 0, 1, 1, 2, 1); - testPerforationCalls(input, filter, 1, 1, 1, 1, 1, 3); testPerforationCalls(input, filter, 1, 1, 1, 1, 3, 1); - testPerforationCalls(input, filter, 1, 1, 2, 2, 1, 4); testPerforationCalls(input, filter, 1, 1, 2, 2, 4, 1); - } - - - - - - - - -void testSampling(){ +void testSampling() { printf("***** Testing Sampling ***** \n\n"); - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); fillTensorWithVal(input, 1); - //fillWithOnesAndTwos(input); - - Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); + // fillWithOnesAndTwos(input); + + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); fillTensorWithVal(filter, 1); - float* host_ptr = (float*) ((struct Tensor*) filter)->host_data; + float *host_ptr = (float *)((struct Tensor *)filter)->host_data; host_ptr[0] = 2; host_ptr[2] = 2; host_ptr[4] = 2; @@ -836,144 +751,124 @@ void testSampling(){ host_ptr[22] = 2; host_ptr[24] = 2; host_ptr[26] = 2; - //printTensorValues(input); + // printTensorValues(input); + + void *res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - void* res = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - printTensorValues(res); + void *res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1); - void* res2 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 1); - printTensorValues(res2); + void *res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0); - void* res2_sim = tensorConvSampSim(input, filter, 0, 0, 1, 1, 1, 1, 2, 0); - printTensorValues(res2_sim); - - void* res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0); - + void *res3 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 2, 0); + printTensorValues(res3); + void *res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - void* res4 = tensorConvApprox(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - printTensorValues(res4); + void *res4_half = + tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - void* res4_half = tensorConvApproxHalf2(input, filter, 0, 0, 1, 1, 1, 1, 1, 1, 4, 0); - - convertToFP32((struct Tensor*) res4_half); + convertToFP32((struct Tensor *)res4_half); printTensorValues(res4_half); - } - - - -void testSamplingCalls(void* input, void* filter, - int pad_h, int pad_w, - int stride_h, int stride_w, - int skip_every, UnitTestResults& unitTestResults){ - +void testSamplingCalls(void 
*input, void *filter, int pad_h, int pad_w, + int stride_h, int stride_w, int skip_every, + UnitTestResults &unitTestResults) { float interpolation_rate = 1.0; - for (int offset = 0; offset < 2; offset++){ - - - printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d skip_every = %d offset= %d interpolation_rate = %f \n\n", - pad_h, pad_w, stride_h, stride_w, skip_every, offset, interpolation_rate); - - - void* res_exact = tensorConvolution(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1); + for (int offset = 0; offset < 2; offset++) { - printf ("tensorConvolution Result :"); - printTensorValues(res_exact); + printf("\n\n\n**Test -- pad_h = %d pad_w = %d stride_h = %d stride_w = %d " + "skip_every = %d offset= %d interpolation_rate = %f \n\n", + pad_h, pad_w, stride_h, stride_w, skip_every, offset, + interpolation_rate); + void *res_exact = tensorConvolution(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1); - void* res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, 1, 1); + printf("tensorConvolution Result :"); + printTensorValues(res_exact); - printf ("\nBaseline Result :"); - printTensorValues(res_exact2); + void *res_exact2 = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, 1, 1, 1, 1); + printf("\nBaseline Result :"); + printTensorValues(res_exact2); - void* res_exact3 = tensorConvApproxHalf2(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, 1, 1); - convertToFP32((struct Tensor*) res_exact3); + void *res_exact3 = tensorConvApproxHalf2( + input, filter, pad_h, pad_w, stride_h, stride_w, 1, 1, 1, 1, 1, 1); + convertToFP32((struct Tensor *)res_exact3); - printf ("\nFP16_Baseline Result :"); - printTensorValues(res_exact3); + printf("\nFP16_Baseline Result :"); + printTensorValues(res_exact3); - - void* res_sim = tensorConvSampSim2(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, skip_every, offset, interpolation_rate); + void *res_sim = + tensorConvSampSim2(input, filter, pad_h, pad_w, stride_h, stride_w, 1, + 1, skip_every, offset, interpolation_rate); - printf ("\nConvSampSim Result :"); - printTensorValues(res_sim); + printf("\nConvSampSim Result :"); + printTensorValues(res_sim); - - void* res = tensorConvApprox(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, skip_every, offset); + void *res = tensorConvApprox(input, filter, pad_h, pad_w, stride_h, + stride_w, 1, 1, 1, 1, skip_every, offset); + printf("\nConvApprox Result :"); + printTensorValues(res); - printf ("\nConvApprox Result :"); - printTensorValues(res); + void *res_half = + tensorConvApproxHalf2(input, filter, pad_h, pad_w, stride_h, stride_w, + 1, 1, 1, 1, skip_every, offset); + convertToFP32((struct Tensor *)res_half); - void* res_half = tensorConvApproxHalf2(input, filter, pad_h, pad_w, - stride_h, stride_w, - 1, 1, 1, 1, skip_every, offset); + printf("\nConvApproxHalf2 Result :"); + printTensorValues(res_half); - convertToFP32((struct Tensor*) res_half); + std::string suffix = + std::string(" pad_h = ") + std::to_string(pad_h) + + std::string(" pad_w = ") + std::to_string(pad_w) + + std::string(" stride_h = ") + std::to_string(stride_h) + + std::string(" stride_w = ") + std::to_string(stride_w) + + std::string(" skip_every = ") + std::to_string(skip_every) + + std::string(" offset = ") + std::to_string(offset); - printf ("\nConvApproxHalf2 Result :"); - printTensorValues(res_half); + std::string test_name = std::string("SAMP_FP32 ") + suffix; - std::string suffix = 
std::string(" pad_h = ") + std::to_string(pad_h) - + std::string(" pad_w = ") + std::to_string(pad_w) - + std::string(" stride_h = ") + std::to_string(stride_h) - + std::string(" stride_w = ") + std::to_string(stride_w) - + std::string(" skip_every = ") + std::to_string(skip_every) - + std::string(" offset = ") + std::to_string(offset); + unitTestResults.compareTensors((Tensor *)res, (Tensor *)res_sim, 0.01, + test_name); - std::string test_name = std::string("SAMP_FP32 ") + suffix; - - unitTestResults.compareTensors((Tensor*) res, (Tensor*) res_sim, 0.01, test_name); + std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix; + unitTestResults.compareTensors((Tensor *)res_half, (Tensor *)res_sim, 0.04, + fp16_test_name); + } - std::string fp16_test_name = std::string("SAMP_FP16 ") + suffix; - unitTestResults.compareTensors((Tensor*) res_half, (Tensor*) res_sim, 0.04, fp16_test_name); - } - - - printf ("\n\n\n --- End of Test \n\n\n"); + printf("\n\n\n --- End of Test \n\n\n"); } - - /**** Tests Sample for a sample 3 * 3 Filter */ -void testSampling_3_3(UnitTestResults& unitTestResults){ +void testSampling_3_3(UnitTestResults &unitTestResults) { - printf("***** Tests Sample for a sample 3 * 3 Filter ***** \n\n"); - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 4, 4); fillTensorWithVal(input, 1); - //fillWithOnesAndTwos(input); - - Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); - fillTensorWithVal(filter, 1); + // fillWithOnesAndTwos(input); + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 3, 3, 3); + fillTensorWithVal(filter, 1); - float* host_ptr = (float*) ((struct Tensor*) filter)->host_data; + float *host_ptr = (float *)((struct Tensor *)filter)->host_data; host_ptr[0] = 2; host_ptr[2] = 2; host_ptr[4] = 2; @@ -989,7 +884,6 @@ void testSampling_3_3(UnitTestResults& unitTestResults){ host_ptr[24] = 2; host_ptr[26] = 2; - // Tests with padding = 0 stride = 1 testSamplingCalls(input, filter, 0, 0, 1, 1, 2, unitTestResults); @@ -1010,27 +904,19 @@ void testSampling_3_3(UnitTestResults& unitTestResults){ testSamplingCalls(input, filter, 1, 1, 2, 2, 3, unitTestResults); testSamplingCalls(input, filter, 1, 1, 2, 2, 4, unitTestResults); - - } - - - - - - /**** Tests Sample for a sample 1 * 1 Filter */ -void testSampling_1_1(UnitTestResults& unitTestResults){ +void testSampling_1_1(UnitTestResults &unitTestResults) { - - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2); + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 1, 9, 2, 2); fillTensorWithVal(input, 2); - //fillWithOnesAndTwos(input); - - Tensor* filter = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1); + // fillWithOnesAndTwos(input); + + Tensor *filter = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 9, 1, 1); fillTensorWithVal(filter, 2); - // Tests with padding = 0 stride = 1 testSamplingCalls(input, filter, 0, 0, 1, 1, 2, unitTestResults); @@ -1039,25 +925,20 @@ void testSampling_1_1(UnitTestResults& unitTestResults){ testSamplingCalls(input, filter, 0, 0, 1, 1, 4, unitTestResults); - // Tests with padding = 1 stride = 1 testSamplingCalls(input, filter, 1, 1, 1, 1, 2, unitTestResults); testSamplingCalls(input, filter, 1, 1, 1, 1, 3, unitTestResults); testSamplingCalls(input, filter, 1, 1, 1, 1, 4, 
unitTestResults); - - } +void *testTensorArgMax() { + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1); - -void* testTensorArgMax(){ - - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1); - - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + float *host_ptr = (float *)((struct Tensor *)input)->host_data; // Input 0 host_ptr[0] = 1; @@ -1079,37 +960,34 @@ void* testTensorArgMax(){ host_ptr[10] = 2; host_ptr[11] = 8; - void* argmax_out = tensorArgMax(input); - - // Expect Output of call below to be: + void *argmax_out = tensorArgMax(input); + + // Expect Output of call below to be: // 1 2 2 0 printTensorValues(argmax_out); - return argmax_out; + return argmax_out; } +void *testTensorSelect(void *argmax_out) { - -void* testTensorSelect(void* argmax_out){ - - void* select_out = tensorSelect(argmax_out, 2); - printf ("***** tensorSelect output \n"); + void *select_out = tensorSelect(argmax_out, 2); + printf("***** tensorSelect output \n"); printTensorValues(select_out); - return select_out; - + return select_out; } +void testTensorContract(void *select_out) { -void testTensorContract(void* select_out){ - - Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1); - float* host_ptr = (float*) ((struct Tensor*) input)->host_data; + Tensor *input = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1); + float *host_ptr = (float *)((struct Tensor *)input)->host_data; // Input 0 host_ptr[0] = 1; - host_ptr[1] = 1; + host_ptr[1] = 1; host_ptr[2] = 1; host_ptr[3] = 1; @@ -1118,51 +996,38 @@ void testTensorContract(void* select_out){ host_ptr[5] = 2; host_ptr[6] = 2; host_ptr[7] = 2; - + // Input 2 host_ptr[8] = 3; host_ptr[9] = 3; - host_ptr[10] = 3; - host_ptr[11] = 3; + host_ptr[10] = 3; + host_ptr[11] = 3; // Input 3 - host_ptr[12] = 4; + host_ptr[12] = 4; host_ptr[13] = 4; host_ptr[14] = 4; host_ptr[15] = 4; - - void* contract_out = tensorContract(input, select_out); - printf ("***** tensorContract output \n"); + void *contract_out = tensorContract(input, select_out); + printf("***** tensorContract output \n"); printTensorValues(contract_out); - } +void testNewTensorOps() { - -void testNewTensorOps(){ - - void* argmax_out = testTensorArgMax(); - void* select_out = testTensorSelect(argmax_out); + void *argmax_out = testTensorArgMax(); + void *select_out = testTensorSelect(argmax_out); testTensorContract(select_out); - } - - - - - - - -int main(){ +int main() { llvm_hpvm_initTensorRt(0); - UnitTestResults unitTestResults; - + // Function call per unit test testTensorHgemm(unitTestResults); testTensorSgemm(unitTestResults); @@ -1181,31 +1046,26 @@ int main(){ testTensorHalfPooling(); */ - + testSampling_3_3(unitTestResults); testSampling_1_1(unitTestResults); testPerforation(unitTestResults); - - unitTestResults.printSummary(); - // testTensorError(); - // testQuantization(); + // testQuantization(); // testTensorGemm(); // testTensorGemmGPU(); - // testTensorGemmBias(); + // testTensorGemmBias(); // testTensorConv2(); // testTensorConv3(); // testLRN(); // testSampleFilter(); - // testNewTensorOps(); + // testNewTensorOps(); // testQuantization(); // testPromiseError(); - - + return 0; } - diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h index 98d6d63eadc44b171b54bd09a9096d072c4be10d..1ca90cf6f724b5e42f3b8c774b23c25f7d294437 100644 --- 
a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h @@ -14,10 +14,10 @@ __global__ void convToGemmApproxHalf( // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? for (int i = 0; i < KH; i++) { for (int j = 0; j < KW; j++) { const int filter_elem_num = @@ -58,7 +58,7 @@ convToGemmPerfRow(float *const __restrict__ output, // number const int h = tx % (H_eff * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) + const int w = tx % W_out; // output width index (col number) int past_start = (h % (x - 1) >= (x - 1 - start)); const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride - V_pad; // input height index (row number) @@ -135,7 +135,7 @@ convToGemmPerfCol(float *const __restrict__ output, // number const int h = tx % (H_out * W_eff) / W_eff; // output height index (row // number) - const int w = tx % W_eff; // output width index (col number) + const int w = tx % W_eff; // output width index (col number) int past_start = (w % (x - 1)) >= (x - 1 - start); const int inH = h * V_stride - V_pad; // input height index (row number) const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride - @@ -394,7 +394,7 @@ __global__ void convToGemmPerfRowHalf( // number const int h = tx % (H_eff * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) + const int w = tx % W_out; // output width index (col number) int past_start = (h % (x - 1) >= (x - 1 - start)); const int inH = (h / (x - 1) * x + h % (x - 1) + past_start) * V_stride - V_pad; // input height index (row number) @@ -469,7 +469,7 @@ __global__ void convToGemmPerfColHalf( // number const int h = tx % (H_out * W_eff) / W_eff; // output height index (row // number) - const int w = tx % W_eff; // output width index (col number) + const int w = tx % W_eff; // output width index (col number) int past_start = (w % (x - 1)) >= (x - 1 - start); const int inH = h * V_stride - V_pad; // input height index (row number) const int inW = (w / (x - 1) * x + w % (x - 1) + past_start) * H_stride - @@ -557,10 +557,10 @@ __global__ void convToGemmApproxHalfN( // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
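    // Worked example of the index math above (illustrative, assuming
    // V_stride = H_stride = 1 and V_pad = H_pad = 1): output position
    // (h, w) = (0, 0) maps to (inH, inW) = (0*1 - 1, 0*1 - 1) = (-1, -1),
    // i.e. the first filter row/column falls in the padding region, while
    // (h, w) = (1, 1) maps to (inH, inW) = (0, 0), the top-left element of
    // the real input. The loops below walk the KH x KW window anchored at
    // that position so the convolution can be lowered to a single GEMM.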
for (int i = 0; i < KH; i++) { for (int j = 0; j < KW; j++) { const int filter_elem_num = @@ -832,10 +832,10 @@ convToGemmHalfInput(__half *const __restrict__ output, // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? for (int i = 0; i < KH; i++) { for (int j = 0; j < KW; j++) { const int filter_elem_num = @@ -873,10 +873,10 @@ convToGemmHalfInput2(__half *const __restrict__ output, // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? const int filter_elem_num = c * KH * KW; for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) { int i = l / KW; @@ -1044,10 +1044,10 @@ convToGemmFullInput(float *const __restrict__ output, // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? for (int i = 0; i < KH; i++) { for (int j = 0; j < KW; j++) { const int filter_elem_num = @@ -1085,10 +1085,10 @@ convToGemmFullInput2(float *const __restrict__ output, // number const int h = tx % (H_out * W_out) / W_out; // output height index (row // number) - const int w = tx % W_out; // output width index (col number) - const int inH = h * V_stride - V_pad; // input height index (row number) - const int inW = w * H_stride - H_pad; // input width index (col number) - if (n < N) { // is thread id within bounds? + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
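    // Sampling sketch (illustrative, assuming a 3 x 3 filter so KH * KW = 9,
    // and skip_offset = 0): the loop below starts at
    // l = (filter_elem_num % 2) + skip_offset and advances by 2, so it visits
    // only every other filter tap (5 of the 9 taps for an even start, 4 for an
    // odd start). This is the skip-every-2nd-element input-sampling
    // approximation; only the sampled taps are processed in this loop.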
const int filter_elem_num = c * KH * KW; for (int l = (filter_elem_num % 2) + skip_offset; l < KH * KW; l += 2) { int i = l / KW; diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h index 330d97600e6cdcf44bb93dbf28625cca8051c3ec..c318a8fb6aba604282cf709d09b6a6ef1a771f0e 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_runtime_utils.h @@ -3,7 +3,6 @@ #ifndef APPROXHPVM_RUNTIME_UTILS #define APPROXHPVM_RUNTIME_UTILS - #include "tensor_runtime.h" #include "tensor_cpu_runtime.h" #include "configuration.h" @@ -17,30 +16,29 @@ //--- CPU Approximation handling ---// //----------------------------------------------------------------------------// -void* handleTensorAddApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* bias) { +void *handleTensorAddApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *bias) { -if (approxTuples.size() == 1) { + if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorAddCPU(input, bias); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorAddCPU(input, bias); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorAddCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorAddCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -53,32 +51,31 @@ if (approxTuples.size() == 1) { return NULL; } -void* handleTensorMulApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* lhs, void* rhs) { +void *handleTensorMulApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *lhs, void *rhs) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorGemmCPU(lhs, rhs); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32: { 
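    // Same profiling pattern as the other CPU handlers in this file: resume
    // the profiler, run the tensor op (here tensorGemmCPU), pause, read the
    // (time, energy) pair from get_time_energy(), reset the profiler, and log
    // both values under the op's name before returning the output tensor.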
+ void *t_out; + RC->resume_profiler(); + t_out = tensorGemmCPU(lhs, rhs); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorGemmCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorGemmCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here - } + } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); abort(); @@ -89,79 +86,72 @@ void* handleTensorMulApproximationTuples_CPU( return NULL; } -void* handleTensorConvApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* filter, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w) { +void *handleTensorConvApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *filter, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxCPU(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, 1, 1); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); - return t_out; - } - case CPUNodeConfiguration::APPROX::PERFORATION : - { - PerfParams params = perfParamSet->getPerfParams(param); - INFO("perforation param = %i\n", param); - INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", - params.row, params.col, params.skip_offset); - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxCPU(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - params.row, params.col, 1, params.skip_offset); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)", pinfo.second); - return t_out; - } - case CPUNodeConfiguration::APPROX::INPUT_SAMPLING : - { - SampParams params = sampParamSet->getSampParams(param); - INFO("sampling param = %i\n", param); - INFO("params.skip_rate = %i, params.skip_offset = %i\n", - params.skip_rate, params.skip_offset); - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxCPU(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, - params.skip_rate, params.skip_offset); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = + tensorConvApproxCPU(input, filter, 
conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); + return t_out; + } + case CPUNodeConfiguration::APPROX::PERFORATION: { + PerfParams params = perfParamSet->getPerfParams(param); + INFO("perforation param = %i\n", param); + INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", + params.row, params.col, params.skip_offset); + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApproxCPU( + input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, + 1, 1, params.row, params.col, 1, params.skip_offset); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox(_perf)", + pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_perf)", + pinfo.second); + return t_out; + } + case CPUNodeConfiguration::APPROX::INPUT_SAMPLING: { + SampParams params = sampParamSet->getSampParams(param); + INFO("sampling param = %i\n", param); + INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate, + params.skip_offset); + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApproxCPU(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, + params.skip_rate, params.skip_offset); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox(_samp)", + pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox(_samp)", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -174,75 +164,73 @@ void* handleTensorConvApproximationTuples_CPU( return NULL; } -void* handleTensorGroupConvApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* filter, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups) { +void *handleTensorGroupConvApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *filter, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, int conv_mode, + int conv_groups) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorConvCutlassCPU(input, filter, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, - conv_mode, conv_groups); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } 
else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorConvCutlassCPU(input, filter, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, + conv_mode, conv_groups); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvCutlassCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvCutlassCPU", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorBatchNormApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon) { +void *handleTensorBatchNormApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr, + void *variance_ptr, double epsilon) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr, - mean_ptr, variance_ptr, epsilon); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorBatchNormCPU(input_ptr, gamma_ptr, beta_ptr, mean_ptr, + variance_ptr, epsilon); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorBatchNormCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorBatchNormCPU", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); @@ -254,161 +242,154 @@ void* handleTensorBatchNormApproximationTuples_CPU( return NULL; } -void* handleTensorReluApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input) { +void *handleTensorReluApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - 
RC->resume_profiler(); - t_out = tensorReluCPU(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorReluCPU(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorReluCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorReluCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorClippedReluApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, float min, float max) { +void *handleTensorClippedReluApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, float min, float max) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorRelu2CPU(input, min, max); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorRelu2CPU(input, min, max); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu2CPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu2CPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorTanhApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input) { +void *handleTensorTanhApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void 
*input) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorTanhCPU(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorTanhCPU(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorTanhCPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorTanhCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorPoolingApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr, int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride) { +void *handleTensorPoolingApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr, int poolFunction, int window_height, int window_width, + int vertical_pad, int horizontal_pad, int vertical_stride, + int horizontal_stride) { if (approxTuples.size() == 1) { enum CPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case CPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorPoolingCPU(input_ptr, - poolFunction, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case CPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorPoolingCPU(input_ptr, poolFunction, window_height, + window_width, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorPoolingCPU", pinfo.first); + 
RC->addToCurrentIterationComputeEnergy("tensorPoolingCPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorSoftmaxApproximationTuples_CPU( - std::vector< std::pair<CPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr) { - void* t_out; +void *handleTensorSoftmaxApproximationTuples_CPU( + std::vector<std::pair<CPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr) { + void *t_out; RC->resume_profiler(); t_out = tensorSoftmaxCPU(input_ptr); RC->pause_profiler(); @@ -423,42 +404,40 @@ void* handleTensorSoftmaxApproximationTuples_CPU( //--- GPU Approximation handling ---// //----------------------------------------------------------------------------// -void* handleTensorAddApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* bias) { +void *handleTensorAddApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *bias) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorAdd(input, bias); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfAdd(input, bias); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorAdd(input, bias); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorAdd", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfAdd(input, bias); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfAdd", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfAdd", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -471,44 +450,42 @@ void* handleTensorAddApproximationTuples( return NULL; } -void* handleTensorMulApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - 
void* lhs, void* rhs) { +void *handleTensorMulApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *lhs, void *rhs) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorGemmGPU(lhs, rhs); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfGemmGPU(lhs, rhs); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorGemmGPU(lhs, rhs); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorGemmGPU", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfGemmGPU(lhs, rhs); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfGemmGPU", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfGemmGPU", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here - } + } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); abort(); @@ -519,100 +496,88 @@ void* handleTensorMulApproximationTuples( return NULL; } -void* handleTensorConvApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* filter, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w) { +void *handleTensorConvApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *filter, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApprox(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, 1, 1); - - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - 
t_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, 1, 1); - - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::PERFORATION : - case GPUNodeConfiguration::APPROX::PERFORATION_HP : - { - PerfParams params = perfParamSet->getPerfParams(param); - INFO("perforation param = %i\n", param); - INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", - params.row, params.col, params.skip_offset); - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - params.row, params.col, 1, params.skip_offset); - - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::INPUT_SAMPLING : - case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP : - { - SampParams params = sampParamSet->getSampParams(param); - INFO("sampling param = %i\n", param); - INFO("params.skip_rate = %i, params.skip_offset = %i\n", - params.skip_rate, params.skip_offset); - void* t_out; - RC->resume_profiler(); - t_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, - params.skip_rate, params.skip_offset); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApprox", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApprox", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = + tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, 1, 1); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApproxHalf", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf", + pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::PERFORATION: + case GPUNodeConfiguration::APPROX::PERFORATION_HP: { + PerfParams params = perfParamSet->getPerfParams(param); + INFO("perforation param = %i\n", param); + INFO("params.row = %i, params.col = %i, params.skip_offset = %i\n", + params.row, params.col, params.skip_offset); + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApproxHalf2( + 
input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, + 1, 1, params.row, params.col, 1, params.skip_offset); + + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_perf)", + pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_perf)", + pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::INPUT_SAMPLING: + case GPUNodeConfiguration::APPROX::INPUT_SAMPLING_HP: { + SampParams params = sampParamSet->getSampParams(param); + INFO("sampling param = %i\n", param); + INFO("params.skip_rate = %i, params.skip_offset = %i\n", params.skip_rate, + params.skip_offset); + void *t_out; + RC->resume_profiler(); + t_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, + params.skip_rate, params.skip_offset); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvApproxHalf(_samp)", + pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvApproxHalf(_samp)", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { @@ -625,103 +590,99 @@ void* handleTensorConvApproximationTuples( return NULL; } -void* handleTensorGroupConvApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, void* filter, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups) { +void *handleTensorGroupConvApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, void *filter, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, int conv_mode, + int conv_groups) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorConvCutlass(input, filter, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, - conv_mode, conv_groups); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfConvCutlass(input, filter, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, - conv_mode, conv_groups); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported 
case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorConvCutlass(input, filter, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, conv_mode, + conv_groups); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorConvCutlass", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorConvCutlass", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfConvCutlass(input, filter, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, + conv_mode, conv_groups); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfConvCutlass", + pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfConvCutlass", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorBatchNormApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon) { +void *handleTensorBatchNormApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr, void *gamma_ptr, void *beta_ptr, void *mean_ptr, + void *variance_ptr, double epsilon) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, - mean_ptr, variance_ptr, epsilon); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, - mean_ptr, variance_ptr, epsilon); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, + variance_ptr, epsilon); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorBatchNorm", pinfo.second); + return t_out; + } + case 
GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfBatchNorm(input_ptr, gamma_ptr, beta_ptr, mean_ptr, + variance_ptr, epsilon); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfBatchNorm", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfBatchNorm", + pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here } } else if (approxTuples.size() == 2) { ERROR("Currently unsupported case"); @@ -733,215 +694,202 @@ void* handleTensorBatchNormApproximationTuples( return NULL; } -void* handleTensorReluApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input) { +void *handleTensorReluApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorRelu(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfRelu(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorRelu(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfRelu(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorClippedReluApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input, float min, 
float max) { +void *handleTensorClippedReluApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input, float min, float max) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorRelu2(input, min, max); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfRelu2(input, min, max); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorRelu2(input, min, max); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorRelu2", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfRelu2(input, min, max); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfRelu2", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfRelu2", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorTanhApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input) { +void *handleTensorTanhApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorTanh(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfTanh(input); - RC->pause_profiler(); - std::pair<double, double> pinfo = 
RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorTanh(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorTanh", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfTanh(input); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfTanh", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfTanh", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorPoolingApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr, int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride) { +void *handleTensorPoolingApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr, int poolFunction, int window_height, int window_width, + int vertical_pad, int horizontal_pad, int vertical_stride, + int horizontal_stride) { if (approxTuples.size() == 1) { enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; int param = approxTuples[0].second; switch (approx) { - case GPUNodeConfiguration::APPROX::FP32 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorPooling(input_ptr, - poolFunction, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second); - return t_out; - } - case GPUNodeConfiguration::APPROX::FP16 : - { - void* t_out; - RC->resume_profiler(); - t_out = tensorHalfPooling(input_ptr, - poolFunction, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - RC->pause_profiler(); - std::pair<double, double> pinfo = RC->get_time_energy(); - RC->reset_profiler(); - RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first); - RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second); - return t_out; - } - default : - CUSTOM_ASSERT(false && "Unknown approximation type"); - ERROR("Unknown approximation type"); - abort(); - // TODO additional approx methods implemented here - } - } 
else if (approxTuples.size() == 2) { - ERROR("Currently unsupported case"); - abort(); - } else { - ERROR("Unsupported case"); + case GPUNodeConfiguration::APPROX::FP32: { + void *t_out; + RC->resume_profiler(); + t_out = tensorPooling(input_ptr, poolFunction, window_height, + window_width, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorPooling", pinfo.second); + return t_out; + } + case GPUNodeConfiguration::APPROX::FP16: { + void *t_out; + RC->resume_profiler(); + t_out = tensorHalfPooling(input_ptr, poolFunction, window_height, + window_width, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorHalfPooling", pinfo.first); + RC->addToCurrentIterationComputeEnergy("tensorHalfPooling", pinfo.second); + return t_out; + } + default: + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); abort(); + // TODO additional approx methods implemented here } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } return NULL; } -void* handleTensorSoftmaxApproximationTuples( - std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, - void* input_ptr) { - //TODO: if approximation choices are added for softmax operation, +void *handleTensorSoftmaxApproximationTuples( + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &approxTuples, + void *input_ptr) { + // TODO: if approximation choices are added for softmax operation, // implement this like the other handle* functions - void* t_out; + void *t_out; RC->resume_profiler(); t_out = tensorSoftmax(input_ptr); RC->pause_profiler(); @@ -952,5 +900,4 @@ void* handleTensorSoftmaxApproximationTuples( return t_out; } - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h index b4f3d39fae77b214a46301ba7d6c95a5e651c44f..3b52cce9f62504753d63015a599d214194d48d98 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/configuration.h @@ -144,7 +144,8 @@ public: // - energy // - accuracy (compared to golden output) // - accuracy loss (compared to baseline) -// - a hardware choice and set or operations-approximation choices, described in setup +// - a hardware choice and set or operations-approximation choices, described in +// setup struct Configuration { std::string name; float speedup; @@ -152,7 +153,7 @@ struct Configuration { float accuracy; float accuracyLoss; std::map<std::string, NodeConfiguration *> setup; - // map for mapping visc.node.id IDs to HPVM (fused) node approx-configurations + // map for mapping visc.node.id IDs to HPVM (fused) node approx-configurations std::map<int, NodeConfiguration *> idConfigMap; Configuration(std::string &n, float f, float e, float a, float al); @@ -171,8 +172,8 @@ struct Configuration { // Comparison operator definition, in increasing accuracy loss // (for std sort, used in pareto optimal computation) struct ConfigurationLessThan { - bool operator()( - const struct Configuration 
&a, const struct Configuration &b) const; + bool operator()(const struct Configuration &a, + const struct Configuration &b) const; }; // Comparison operator definition, in increasing accuracy loss diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h index 50a0527def09f1786007a06ce0ab89ab0c7c078f..a766c02d6cc724fe91e4ef581871497cfddee788 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/rt-controller-api.h @@ -2,5 +2,6 @@ extern "C" { // Functions to be inserted with initializeTensorRT and clearTensorRT void llvm_hpvm_initializeRuntimeController(const char *); void llvm_hpvm_clearRuntimeController(); -void llvm_hpvm_invokeRtControl(void *result, const char *str, int start, int end); +void llvm_hpvm_invokeRtControl(void *result, const char *str, int start, + int end); } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h index f8b722ca38bc2ae7065da365585ca495001038d7..d070d7755c1f5982c2c9fabf1acdca83bd446870 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h @@ -1,84 +1,79 @@ -//===--------------------------- tensor_cpu_runtime.h -----------------------===// +//===--------------------------- tensor_cpu_runtime.h +//-----------------------===// // //===----------------------------------------------------------------------===// -// +// // This header file comprises of the API to the tensor routines for CPU. // This also contains the interfaces to the approximated versions of tensor // operations that are supported on CPU. 
// //===----------------------------------------------------------------------===// - #include <stdio.h> #include <cstdlib> #include <cmath> #include <memory> #include <string> - #ifndef TENSOR_CPU_HEADER #define TENSOR_CPU_HEADER +extern "C" { +/**** Initialization Routine - Must be inserted at program start (in the + * backend) ****/ +void llvm_hpvm_initTensorRtCPU(); +void llvm_hpvm_cleanupTensorRtCPU(); -extern "C"{ - /**** Initialization Routine - Must be inserted at program start (in the backend) ****/ - void llvm_hpvm_initTensorRtCPU(); - void llvm_hpvm_cleanupTensorRtCPU(); +// Routine to moving tensor data (from and to GPU,CPU) +void hpvm_request_tensorCPU(void *tensor, int destination); - // Routine to moving tensor data (from and to GPU,CPU) - void hpvm_request_tensorCPU(void* tensor, int destination); +// NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for +// cuDNN operations NOTE: The only data format supported as of now is: NCHW +// (batch_dimension, channels, Height, Width) +// void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t +// dim2_size, +/// size_t dim3_size, size_t dim4_size, bool freeMemory = true); +void initTensorData(void *tensor, void *data_ptr, size_t size_in_bytes); - // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations - // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width) - //void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size, - /// size_t dim3_size, size_t dim4_size, bool freeMemory = true); - - void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes); +/********** Tensor Operation API ******/ - /********** Tensor Operation API ******/ +// NOTE: For conv_mode, only value '1' is supported +void *tensorConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int compute_precision, int row, int col, + int skip_every, int start); - // NOTE: For conv_mode, only value '1' is supported -void* tensorConvolutionCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int row, int col, int skip_every, int start); +void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int compute_precision, int row, int col, + int skip_every, int start); -void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int row, int col, int skip_every, int start); +void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups); -void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups); - - void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon); +void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr, + void *mean_ptr, void *variance_ptr, double epsilon); +void *tensorPoolingCPU(void *input, int poolFunction, int window_height, + 
int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride); - void* tensorPoolingCPU(void* input, - int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride); +void *tensorGemmCPU(void *lhs, void *rhs); - void* tensorGemmCPU(void* lhs, void* rhs); +void *tensorAddCPU(void *x, void *bias); - void* tensorAddCPU(void* x, void* bias); +void *tensorReluCPU(void *input); - void* tensorReluCPU(void* input); +void *tensorRelu2CPU(void *input, float min, float max); - void* tensorRelu2CPU(void* input, float min, float max); - - void* tensorTanhCPU(void* input); - - void* tensorSoftmaxCPU(void* input); - -} +void *tensorTanhCPU(void *input); +void *tensorSoftmaxCPU(void *input); +} #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h index f05dab738bbfab51e21673bdf76b81596fc3b49b..1b6e986a47324ab0ab663fc8e1e5171b07c135cf 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h @@ -159,22 +159,14 @@ void *wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter, int activation_id, // Relu, Tanh, ClipRelu float out_min, float out_max); +void *wrapper_ConvLayer2( + const char *hpvm_node_id, void *input, void *filter, void *bias, + int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v, + int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id, + // NOTE: out_min, out_max are only relevant for ClippedRelu + float out_min, float out_max); -void* wrapper_ConvLayer2(const char* hpvm_node_id, - void* input, - void* filter, - void* bias, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, - int pool_size_v, int pool_size_h, - int pool_pad_v, int pool_pad_h, - int pool_stride_v, int pool_stride_h, - int activation_id, - // NOTE: out_min, out_max are only relevant for ClippedRelu - float out_min, float out_max); - - void *wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights, void *bias, int activation_id, float out_min, float out_max); @@ -204,11 +196,8 @@ void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr, void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr); - void *tensor_set_node_id(unsigned int node_id); - - - + // Utilities // TODO: separate utils in separate header void dumpAccuracyNorms(); diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc index b3abdc0ce48a507a250e7f82fc6ff7729dff3e9f..a3853fda533aa4668963826eb646f009aae02695 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_signatures.cc @@ -1,14 +1,14 @@ -//===--------------------------- tensor_signatures.cc -----------------------===// +//===--------------------------- tensor_signatures.cc +//-----------------------===// // //===----------------------------------------------------------------------===// -// +// // This file contains the declarations of the API to the HPVM tensor runtime. // This is compiled to LLVM bitcode file that is loaded by HPVM passes when // tensor-based application are compiled through HPVM. 
// //===----------------------------------------------------------------------===// - #include "tensor_runtime.h" void dummyFunction() { diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc index b272bbcab45573f03ac17305f86a99e630db2950..a0ca6f5bb0632b592b6cc6b09c9cd6068319b954 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_knobs_utils.cc @@ -27,17 +27,17 @@ PerfParamSet::PerfParamSet() { printf("- knobs_file_path = %s \n", GLOBAL_KNOBS_FILE); std::ifstream file(GLOBAL_KNOBS_FILE); - if (!file){ + if (!file) { ERROR(" Could NOT find global_knobs.txt \n"); } - + std::string line; std::string partial; std::vector<std::string> tokens; while (std::getline(file, line)) { // Read each line - //printf ("***** line === %s ", line); + // printf ("***** line === %s ", line); std::istringstream iss(line); std::string token; while (std::getline(iss, token, '\t')) { // Read each token in the line @@ -64,7 +64,7 @@ PerfParamSet::PerfParamSet() { std::getline(token_stream, tok, ','); int offset = atoi(tok.c_str()); - //printf("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob, + // printf("**** knob = %d, row = %d, col = %d, offset = %d \n\n", knob, // row, col, offset); PerfParams params(row, col, offset); perf_knob_map[knob] = params; @@ -101,10 +101,10 @@ SampParamSet::SampParamSet() { printf("- knobs_file_path = %s \n", GLOBAL_KNOBS_FILE); std::ifstream file(GLOBAL_KNOBS_FILE); - if (!file){ + if (!file) { ERROR("Could NOT find global_knobs.txt \n"); } - + std::string line; std::string partial; std::vector<std::string> tokens; @@ -124,7 +124,7 @@ SampParamSet::SampParamSet() { int index2 = token.find(","); std::string knob_str = token.substr(index2 + 1); int knob = atoi(knob_str.c_str()); - //printf("knob = %d \n", knob); + // printf("knob = %d \n", knob); std::getline(iss, token, '\t'); std::istringstream token_stream(token); @@ -140,7 +140,7 @@ SampParamSet::SampParamSet() { std::getline(token_stream, tok, ','); float interpolation_id = atof(tok.c_str()); - //printf("skip_every = %d, offset = %d \n", skip_every, offset); + // printf("skip_every = %d, offset = %d \n", skip_every, offset); SampParams params(skip_every, offset, interpolation_id); samp_knob_map[knob] = params; } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu index 41aa1852841d0b82f433c4c337e8866771fa9c50..8a8ff8435db96607917fc627036e72318409ef9b 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_simulation.cu @@ -1,14 +1,13 @@ //===--------------------------- approxs_simulator.cu ---------------------===// // //===----------------------------------------------------------------------===// -// -// This file consists of the emulations of implementation of software -// approximations for tensor convolutions. The approximations implemented are -// feature sampling and perforation for FP32 and FP16 compute precisions. +// +// This file consists of the emulations of implementation of software +// approximations for tensor convolutions. The approximations implemented are +// feature sampling and perforation for FP32 and FP16 compute precisions. 
// //===----------------------------------------------------------------------===// - #ifndef SIM_HEADER #define SIM_HEADER @@ -27,7 +26,6 @@ #include "global_data.h" #include "approx_knob_utils.h" - #include <unordered_map> #include <sstream> #include <fstream> @@ -36,77 +34,67 @@ #include <map> #include <cassert> - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void postInterpolateRow(int N, int n, int c, int h, int w, - float* data, int int_row){ +// N is new_data's size +// n, c, h, w are the dimensions of new_data +__global__ void postInterpolateRow(int N, int n, int c, int h, int w, + float *data, int int_row) { int index = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - for(int i = index; i < N; i += stride){ + for (int i = index; i < N; i += stride) { int col = ((i % (c * h * w)) % (h * w)) % w; int row = ((i % (c * h * w)) % (h * w)) / w; int ch = (i % (c * h * w)) / (h * w); int n = i / (c * h * w); - if((row % int_row == 1) && (row != 0) && (row != h-1)) + if ((row % int_row == 1) && (row != 0) && (row != h - 1)) data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (data[n * (c * h * w) + ch * (h * w) + (row - 1) * (w) + col] + - data[n * (c * h * w) + ch * (h * w) + (row + 1) * (w) + col]) / 2; - + (data[n * (c * h * w) + ch * (h * w) + (row - 1) * (w) + col] + + data[n * (c * h * w) + ch * (h * w) + (row + 1) * (w) + col]) / + 2; } } - - -__global__ -void postInterpolateCol(int N, int n, int c, int h, int w, - float* data, int int_col){ +__global__ void postInterpolateCol(int N, int n, int c, int h, int w, + float *data, int int_col) { int index = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - for(int i = index; i < N; i += stride){ + for (int i = index; i < N; i += stride) { int col = ((i % (c * h * w)) % (h * w)) % w; int row = ((i % (c * h * w)) % (h * w)) / w; int ch = (i % (c * h * w)) / (h * w); int n = i / (c * h * w); - if((col % int_col == 1) && (col != 0) && (col != w-1)) + if ((col % int_col == 1) && (col != 0) && (col != w - 1)) data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (data[n * (c * h * w) + ch * (h * w) + row * (w) + (col-1) ] + - data[n * (c * h * w) + ch * (h * w) + row * (w) + (col+1) ])/2; - + (data[n * (c * h * w) + ch * (h * w) + row * (w) + (col - 1)] + + data[n * (c * h * w) + ch * (h * w) + row * (w) + (col + 1)]) / + 2; } } - - - // A 'Simulation' of perforated tensor convolution -void* tensorConvPerfSim(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int row, int col){ - +void *tensorConvPerfSim(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col) { INFO("*** TensorConvolution \n"); profileEvent("tensorConv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - - if(conv_mode == 0) + + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; float alpha = 1.0f, beta = 0.0f; @@ -114,13 +102,13 @@ void* tensorConvPerfSim(void* input_ptr, void* filter_ptr, hostToDeviceCopy(input); hostToDeviceCopy(filter); - INFO("vertical_stride 
= %lu, horizontal_stride = %lu \n", - vertical_stride, horizontal_stride); + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } @@ -130,134 +118,111 @@ void* tensorConvPerfSim(void* input_ptr, void* filter_ptr, int new_v = vertical_stride + 0; int new_h = horizontal_stride + 0; cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - new_v, new_h, // conv strides - 1, 1, // upscaling values - mode , // mode is configurable - computeType)); // defines compute precision + + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + new_v, new_h, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output; - if(input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - else if(input->data_format == CUDNN_TENSOR_NHWC){ + Tensor *output; + if (input->data_format == CUDNN_TENSOR_NCHW) + output = (Tensor *)create4DTensor((cudnnDataType_t)input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + else if (input->data_format == CUDNN_TENSOR_NHWC) { DEBUG("* NHWC Format \n"); - output = (Tensor*) create4DTensor((cudnnDataType_t) input->data_type, - CUDNN_TENSOR_NHWC, n, h, w, c); - } - else + output = (Tensor *)create4DTensor((cudnnDataType_t)input->data_type, + CUDNN_TENSOR_NHWC, n, h, w, c); + } else ERROR("Unsupported Tensor Type"); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], - output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = " + "%d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! 
\n"); - - - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support + // is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // FIXIT: Algo shouldn't be hardcoded convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, filter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); h = (2 * vertical_pad + input->dims.dim_sizes[2] - - filter->dims.dim_sizes[2]) / vertical_stride + 1; - - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - - filter->dims.dim_sizes[3]) / horizontal_stride + 1; + filter->dims.dim_sizes[2]) / + vertical_stride + + 1; + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - + filter->dims.dim_sizes[3]) / + horizontal_stride + + 1; - int numBlocks = (n * c * h * w + 127) / 128; + int numBlocks = (n * c * h * w + 127) / 128; if (row > 0) - postInterpolateRow<<<numBlocks,128>>>(n * c * h * w, n, c, h, w, - (float *) output->gpu_data, row); + postInterpolateRow<<<numBlocks, 128>>>(n * c * h * w, n, c, h, w, + (float *)output->gpu_data, row); if (col > 0) - postInterpolateCol<<<numBlocks,128>>>(n * c * h * w, n, c, h, w, - (float *) output->gpu_data, col); - + postInterpolateCol<<<numBlocks, 128>>>(n * c * h * w, n, c, h, w, + (float *)output->gpu_data, col); profileEvent("tensorConv_end", true); return output; } - - - - -//N is new_data's size -//n, c, h, w are the dimensions of new_data -__global__ -void sampleFilterElems(int N, - int n, int c, int h, int w, - float* data, - int skip_elem, int skip_offset, - float mul_factor, - float* newData){ +// N is new_data's size +// n, c, h, w are the dimensions of new_data +__global__ void sampleFilterElems(int N, int n, int c, int h, int w, + float *data, int skip_elem, int skip_offset, + float mul_factor, float *newData) { int index 
= blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ + for (int i = index; i < N; i += stride) { int col = ((i % (c * h * w)) % (h * w)) % w; int row = ((i % (c * h * w)) % (h * w)) / w; int ch = (i % (c * h * w)) / (h * w); @@ -265,75 +230,60 @@ void sampleFilterElems(int N, int local_index = (ch * (h * w)) + (row * w) + col; - if(skip_elem == 3 && h == 3 && w == 3){ + if (skip_elem == 3 && h == 3 && w == 3) { skip_offset = (skip_offset + ch) % w; // wrap around skip offsets } - if(local_index % skip_elem == skip_offset) - newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0; + if (local_index % skip_elem == skip_offset) + newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = 0; else newData[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - data[n * (c * h * w) + ch * (h * w) + row * (w) + col] * mul_factor; - + data[n * (c * h * w) + ch * (h * w) + row * (w) + col] * mul_factor; } } - - - - -void sampleFilter(Tensor* newFilter, Tensor* filter, - int skip_rate, int skip_offset){ +void sampleFilter(Tensor *newFilter, Tensor *filter, int skip_rate, + int skip_offset) { int n = filter->dims.dim_sizes[0]; int c = filter->dims.dim_sizes[1]; int h = filter->dims.dim_sizes[2]; int w = filter->dims.dim_sizes[3]; - - int numBlocks = (n * c * h * w + 127) / 128; - int N = n * c * h * w; - float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); + int numBlocks = (n * c * h * w + 127) / 128; + int N = n * c * h * w; - //float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); - //mul_factor = (mul_factor + 1.0) / 2; + float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); - - DEBUG ("mul_factor = %f \n", mul_factor); + // float mul_factor = (skip_rate * 1.0) / (skip_rate - 1); + // mul_factor = (mul_factor + 1.0) / 2; - - sampleFilterElems<<<numBlocks,128>>>(N, - n, c, h, w, - (float *) filter->gpu_data, - skip_rate, skip_offset, mul_factor, - (float *) newFilter->gpu_data); + DEBUG("mul_factor = %f \n", mul_factor); + sampleFilterElems<<<numBlocks, 128>>>( + N, n, c, h, w, (float *)filter->gpu_data, skip_rate, skip_offset, + mul_factor, (float *)newFilter->gpu_data); } - - // A 'Simulation' of perforated tensor convolution -void* tensorConvSampSim(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int skip_rate, int skip_offset){ - +void *tensorConvSampSim(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int skip_rate, int skip_offset) { INFO("*** TensorConvolution \n"); profileEvent("tensorConv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; - cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t convAlgo; + cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - - if(conv_mode == 0) + + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; float alpha = 1.0f, beta = 0.0f; @@ -344,24 +294,22 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr, convertToFP32(input); convertToFP32(filter); - Tensor* newFilter; - newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0], - filter->dims.dim_sizes[1], filter->dims.dim_sizes[2], - 
filter->dims.dim_sizes[3]); - + Tensor *newFilter; + newFilter = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0], + filter->dims.dim_sizes[1], filter->dims.dim_sizes[2], + filter->dims.dim_sizes[3]); // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling sampleFilter(newFilter, filter, skip_rate, skip_offset); - - INFO("vertical_stride = %lu, horizontal_stride = %lu \n", - vertical_stride, horizontal_stride); + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } @@ -371,147 +319,116 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr, int new_v = vertical_stride + 0; int new_h = horizontal_stride + 0; cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - new_v, new_h, // conv strides - 1, 1, // upscaling values - mode , // mode is configurable - computeType)); // defines compute precision + + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + new_v, new_h, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output; - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - + Tensor *output; + output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], - output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = " + "%d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! 
\n"); - - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support + // is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // NOTE: Using GEMM-based Algo convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + filter->filter_desc, newFilter->gpu_data, convDesc, convAlgo, workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, newFilter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - - - freeTensor(newFilter); profileEvent("tensorConv_end", true); return output; } - - - - - - - - - -void sampleFilter2(Tensor* newFilter, Tensor* filter, - int skip_rate, int skip_offset, float interpolation_rate){ +void sampleFilter2(Tensor *newFilter, Tensor *filter, int skip_rate, + int skip_offset, float interpolation_rate) { int n = filter->dims.dim_sizes[0]; int c = filter->dims.dim_sizes[1]; int h = filter->dims.dim_sizes[2]; int w = filter->dims.dim_sizes[3]; - - int numBlocks = (n * c * h * w + 127) / 128; + + int numBlocks = (n * c * h * w + 127) / 128; int N = n * c * h * w; float mul_factor; mul_factor = (skip_rate * 1.0) / (skip_rate - 1); mul_factor = 1 + (interpolation_rate * (mul_factor - 1.0)); - DEBUG ("mul_factor = %f \n", mul_factor); - - sampleFilterElems<<<numBlocks,128>>>(N, - n, c, h, w, - (float *) filter->gpu_data, - skip_rate, skip_offset, mul_factor, - (float *) newFilter->gpu_data); -} - + DEBUG("mul_factor = %f \n", mul_factor); + sampleFilterElems<<<numBlocks, 128>>>( + N, n, c, h, w, (float *)filter->gpu_data, skip_rate, skip_offset, + mul_factor, (float *)newFilter->gpu_data); +} // A 'Simulation' of perforated tensor convolution -void* tensorConvSampSim2(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int skip_rate, int skip_offset, float interpolation_rate){ - +void 
*tensorConvSampSim2(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int skip_rate, int skip_offset, + float interpolation_rate) { INFO("*** TensorConvolution \n"); profileEvent("tensorConv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; - cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t convAlgo; + cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - - if(conv_mode == 0) + + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; float alpha = 1.0f, beta = 0.0f; @@ -522,24 +439,22 @@ void* tensorConvSampSim2(void* input_ptr, void* filter_ptr, convertToFP32(input); convertToFP32(filter); - Tensor* newFilter; - newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0], - filter->dims.dim_sizes[1], filter->dims.dim_sizes[2], - filter->dims.dim_sizes[3]); - + Tensor *newFilter; + newFilter = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0], + filter->dims.dim_sizes[1], filter->dims.dim_sizes[2], + filter->dims.dim_sizes[3]); // Zeroing (+Scaling) Filter elements to 'Simulate' input sampling sampleFilter2(newFilter, filter, skip_rate, skip_offset, interpolation_rate); - - INFO("vertical_stride = %lu, horizontal_stride = %lu \n", - vertical_stride, horizontal_stride); + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } @@ -549,166 +464,135 @@ void* tensorConvSampSim2(void* input_ptr, void* filter_ptr, int new_v = vertical_stride + 0; int new_h = horizontal_stride + 0; cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - new_v, new_h, // conv strides - 1, 1, // upscaling values - mode , // mode is configurable - computeType)); // defines compute precision + + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + new_v, new_h, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output; - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - + Tensor *output; + output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d 
\n", - output->data_type, output->data_format, output->dims.dim_sizes[0], - output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = " + "%d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! \n"); - - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support + // is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // NOTE: Using GEMM-based Algo convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + filter->filter_desc, newFilter->gpu_data, convDesc, convAlgo, workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, newFilter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - - - freeTensor(newFilter); profileEvent("tensorConv_end", true); return output; } +/************ NOTE: API for ApproxHPVM Wrapper runtime *******/ +void *PROMISE_Conv(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, int pool_stride, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + Tensor *input_t = (Tensor *)input; + Tensor *filter_t = (Tensor *)filter; + Tensor *bias_t = (Tensor *)bias; - - - - - -/************ NOTE: API for ApproxHPVM Wrapper runtime *******/ - - -void* PROMISE_Conv(void* input, float i_min, float i_max, - void* filter, float w_min, 
float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, int pool_stride, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - - Tensor* input_t = (Tensor*) input; - Tensor* filter_t = (Tensor*) filter; - Tensor* bias_t = (Tensor*) bias; - int orig_type = input_t->cur_type; DEBUG("FP32 conversions \n"); - + convertToFP32(input_t); convertToFP32(filter_t); convertToFP32(bias_t); DEBUG("DONE FP32 conversions \n"); - - if(swing < 8){ + if (swing < 8) { input = quantizeTensorPromise(input, i_min, i_max); filter = quantizeTensorPromise(filter, w_min, w_max); - if(bias != NULL) + if (bias != NULL) bias = quantizeTensorPromise(bias, b_min, b_max); // aRead error - + input = addPromiseError(input, swing); } - - void* conv_out; - conv_out = tensorConvolution(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 0); - - void* conv_add; - if(bias != NULL){ + void *conv_out; + conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 0); + + void *conv_add; + if (bias != NULL) { conv_add = tensorAdd(conv_out, bias); - } - else{ + } else { conv_add = conv_out; } - void* pool_out; + void *pool_out; // NOTE: Skip pooling on negative pool sizes - if(pool_size > 0){ - //FIXME: Currently only using MaxPooling - //-- pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_size, pool_size); - pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, pool_stride, pool_stride); - } - else{ + if (pool_size > 0) { + // FIXME: Currently only using MaxPooling + //-- pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, + // pool_size, pool_size); + pool_out = tensorPooling(conv_add, 0, pool_size, pool_size, 0, 0, + pool_stride, pool_stride); + } else { pool_out = conv_add; } - - void* activation_out; - switch(activation_id){ + + void *activation_out; + switch (activation_id) { case -1: activation_out = pool_out; INFO("NO Activation Function \n"); @@ -727,68 +611,54 @@ void* PROMISE_Conv(void* input, float i_min, float i_max, break; } - - if(swing < 8 && activation_id != -1){ + if (swing < 8 && activation_id != -1) { activation_out = quantizeTensorPromise(activation_out, out_min, out_max); } - - - //NOTE: Convert back to FP16 if original type - if (orig_type == half_type){ - convertToFP16((Tensor*) activation_out); + // NOTE: Convert back to FP16 if original type + if (orig_type == half_type) { + convertToFP16((Tensor *)activation_out); } - return activation_out; } +void *PROMISE_FC(void *input, float i_min, float i_max, void *weights, + float w_min, float w_max, void *bias, float b_min, float b_max, + int activation_id, float out_min, float out_max, int swing) { + Tensor *input_t = (Tensor *)input; + Tensor *weights_t = (Tensor *)weights; + Tensor *bias_t = (Tensor *)bias; -void* PROMISE_FC(void* input, float i_min, float i_max, - void* weights, float w_min, float w_max, - void* bias, float b_min, float b_max, - int activation_id, - float out_min, float out_max, int swing){ - - - Tensor* input_t = (Tensor*) input; - Tensor* weights_t = (Tensor*) weights; - Tensor* bias_t = (Tensor*) bias; - int orig_type = input_t->cur_type; - + convertToFP32(input_t); convertToFP32(weights_t); convertToFP32(bias_t); - - - if(swing < 8){ + + if (swing < 8) { input = quantizeTensorPromise(input, i_min, i_max); weights = quantizeTensorPromise(weights, w_min, w_max); - if(bias != NULL) + if 
(bias != NULL) bias = quantizeTensorPromise(bias, b_min, b_max); // NOTE: Modelling aRead error in PROMISE input = addPromiseError(input, swing); } - - - void* gemm_out; + void *gemm_out; gemm_out = tensorGemmGPU(input, weights); - - void* gemmbias_out; - if(bias != NULL){ + void *gemmbias_out; + if (bias != NULL) { gemmbias_out = tensorAdd(gemm_out, bias); - } - else{ + } else { gemmbias_out = gemm_out; } - - void* activation_out; - switch(activation_id){ + + void *activation_out; + switch (activation_id) { case -1: activation_out = gemmbias_out; @@ -807,86 +677,71 @@ void* PROMISE_FC(void* input, float i_min, float i_max, ERROR("Activation id %d NOT supported \n", activation_out); break; } - - - if(swing < 8 && activation_id != -1){ + + if (swing < 8 && activation_id != -1) { activation_out = quantizeTensorPromise(activation_out, out_min, out_max); } - - //NOTE: Convert back to FP16 if original type - if (orig_type == half_type){ - convertToFP16((Tensor*) activation_out); + // NOTE: Convert back to FP16 if original type + if (orig_type == half_type) { + convertToFP16((Tensor *)activation_out); } - - return activation_out; } - - - - -// NOTE: Enabling the macro below is used for testing against the old PROMISE wrapper +// NOTE: Enabling the macro below is used for testing against the old PROMISE +// wrapper //#define OLD_MODEL #ifndef OLD_MODEL +bool isPromiseLayer(int swing) { - -bool isPromiseLayer(int swing){ - - if(swing < 8) + if (swing < 8) return true; else - return false; + return false; } +bool isGPULayer(int swing) { -bool isGPULayer(int swing){ - - if(swing > 10 ) // PROMISE layers are 1-7 + if (swing > 10) // PROMISE layers are 1-7 return true; else - return false; + return false; } +bool isFullPrecision(int swing) { -bool isFullPrecision(int swing){ - - if(swing == 11) + if (swing == 11) return true; else - return false; + return false; } +bool isHalfPrecision(int swing) { - -bool isHalfPrecision(int swing){ - - if(swing == 12) + if (swing == 12) return true; else - return false; + return false; } +bool isPerforation(int swing) { -bool isPerforation(int swing){ - - if(swing >= 100 && swing <= 200) + if (swing >= 100 && swing <= 200) return true; else - return false; + return false; } +bool isSampling(int swing) { -bool isSampling(int swing){ - - if(swing >= 200 && swing <= 300) + if (swing >= 200 && swing <= 300) return true; else - return false; + return false; } bool isReductionSampling(int swing) { @@ -894,300 +749,227 @@ bool isReductionSampling(int swing) { if (swing >= 41 && swing <= 49) return true; else - return false; + return false; } -int getSwing(int swing){ +int getSwing(int swing) { - #ifdef PROMISE_TUNER_ENABLED +#ifdef PROMISE_TUNER_ENABLED // NOTE: Skip reading file-based error levels for ApproxHPVM wrapper runtime - if(!approxhpvm_runtime_mode){ - - if(op_counter >= total_ops){ + if (!approxhpvm_runtime_mode) { + + if (op_counter >= total_ops) { ERROR("No accuracy flag found \n"); } - + swing = op_accuracies[op_counter]; op_counter++; } - #endif +#endif - DEBUG("---- swing_value = %d \n", swing); + DEBUG("---- swing_value = %d \n", swing); - return swing; + return swing; } - - - -//bool FP16_tuning = false; - +// bool FP16_tuning = false; /***** API for Autotuner Use - Not the ApproxHPVM Wrapper API */ - - -void initializeAutotuner(){ +void initializeAutotuner() { DEBUG("initializing tuner .... 
\n"); - + sampParamSet = new SampParamSet; - perfParamSet = new PerfParamSet; + perfParamSet = new PerfParamSet; } +void *Autotuner_SampConv(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + + SampParams params = sampParamSet->getSampParams(swing); -void* Autotuner_SampConv(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ + DEBUG("params.skip_rate = %d, params.skip_offset = %d \n", params.skip_rate, + params.skip_offset); + void *conv_out; + + if (!FP16_tuning) { - SampParams params = sampParamSet->getSampParams(swing); - - DEBUG("params.skip_rate = %d, params.skip_offset = %d \n", - params.skip_rate, params.skip_offset); - - void* conv_out; - - if (!FP16_tuning){ - /* conv_out = tensorConvSampSim(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, - params.skip_rate, params.skip_offset); + conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, + params.skip_rate, params.skip_offset); */ - - if (SIMULATION_MODE){ - conv_out = tensorConvSampSim2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, - params.skip_rate, params.skip_offset, params.interpolation_id); + if (SIMULATION_MODE) { + conv_out = tensorConvSampSim2( + input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, + 1, 1, params.skip_rate, params.skip_offset, params.interpolation_id); } - else { - conv_out = tensorConvApprox(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, - 1, 1, params.skip_rate, params.skip_offset); + conv_out = tensorConvApprox(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, + params.skip_rate, params.skip_offset); } - - - } - else{ - - conv_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - 1, 1, - params.skip_rate, params.skip_offset); - + + } else { + + conv_out = tensorConvApproxHalf2(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, 1, 1, + params.skip_rate, params.skip_offset); } return conv_out; } - - - -void* Autotuner_PerforatedConv(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - +void *Autotuner_PerforatedConv(void *input, float i_min, float i_max, + void *filter, float w_min, float w_max, + void *bias, float b_min, float b_max, + int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { PerfParams params = perfParamSet->getPerfParams(swing); - + DEBUG("params.row = %d, params.col = %d, params.skip_offset = %d \n", - params.row, params.col, params.skip_offset); - + params.row, params.col, params.skip_offset); - void* conv_out; - - if (!FP16_tuning){ + void 
*conv_out; + if (!FP16_tuning) { - if (SIMULATION_MODE){ + if (SIMULATION_MODE) { - conv_out = tensorConvPerfCuda(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, 1, 1, - params.row, params.col, params.skip_offset); + conv_out = tensorConvPerfCuda(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 1, + params.row, params.col, params.skip_offset); + } else { + + conv_out = tensorConvApprox( + input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, + 1, 1, params.row, params.col, 1, params.skip_offset); } - else{ - - conv_out = tensorConvApprox(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - params.row, params.col, - 1, params.skip_offset); - } - - - } - else{ - conv_out = tensorConvApproxHalf2(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 1, - params.row, params.col, - 1, params.skip_offset); + } else { + conv_out = tensorConvApproxHalf2( + input, filter, conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, 1, + 1, params.row, params.col, 1, params.skip_offset); } - - return conv_out; -} - - - + return conv_out; +} +void *Autotuner_ConvOp(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { -void* Autotuner_ConvOp(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ + void *conv_out; + if (isPerforation(swing)) { - - void* conv_out; - if(isPerforation(swing)){ + conv_out = Autotuner_PerforatedConv( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, + conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, pool_id, + pool_size, activation_id, out_min, out_max, swing); - conv_out = Autotuner_PerforatedConv(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, - activation_id, - out_min, out_max, swing); - } - else if(isSampling(swing)){ + else if (isSampling(swing)) { - conv_out = Autotuner_SampConv(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, - activation_id, - out_min, out_max, swing); + conv_out = Autotuner_SampConv( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, + conv_pad_h, conv_pad_w, conv_stride_h, conv_stride_w, pool_id, + pool_size, activation_id, out_min, out_max, swing); } - - else if (isHalfPrecision(swing)){ + else if (isHalfPrecision(swing)) { - if (FP16_tuning){ - - conv_out = tensorHalfConvolution(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 0); - } - else{ - conv_out = tensorConvolution(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 0); + if (FP16_tuning) { + + conv_out = tensorHalfConvolution(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 0); + } else { + conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 0); } - - } - else if (isFullPrecision(swing)){ - conv_out = 
tensorConvolution(input, filter, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - 1, 0); } + else if (isFullPrecision(swing)) { + conv_out = tensorConvolution(input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w, 1, 0); + } - return conv_out; + return conv_out; } +void *Autotuner_Add(void *input, void *bias, int swing) { + void *conv_add; + if (bias != NULL) { -void* Autotuner_Add(void* input, void* bias, int swing){ - - void* conv_add; - if(bias != NULL){ - - if( isFullPrecision(swing) || !(FP16_tuning) ){ + if (isFullPrecision(swing) || !(FP16_tuning)) { conv_add = tensorAdd(input, bias); - } - else { + } else { conv_add = tensorHalfAdd(input, bias); } - } - else{ + } else { conv_add = input; } return conv_add; } +void *Autotuner_Pooling(void *input, int pool_size, int pool_stride, + int swing) { + void *pool_out; -void* Autotuner_Pooling(void* input, - int pool_size, int pool_stride, - int swing){ + if (pool_size > 0) { - void* pool_out; - - if(pool_size > 0){ - - //FIXME: Currently only using MaxPooling - if( isFullPrecision(swing) || !(FP16_tuning) ){ - pool_out = tensorPooling(input, 0, pool_size, pool_size, - 0, 0, pool_stride, pool_stride); + // FIXME: Currently only using MaxPooling + if (isFullPrecision(swing) || !(FP16_tuning)) { + pool_out = tensorPooling(input, 0, pool_size, pool_size, 0, 0, + pool_stride, pool_stride); } - + else { - pool_out = tensorHalfPooling(input, 0, pool_size, pool_size, - 0, 0, pool_stride, pool_stride); + pool_out = tensorHalfPooling(input, 0, pool_size, pool_size, 0, 0, + pool_stride, pool_stride); } - - - } - else{ + + } else { pool_out = input; } - - + return pool_out; } +void *Autotuner_Activation(void *input, int activation_id, int out_min, + int out_max, int swing) { + void *activation_out; + if (isFullPrecision(swing) || (!FP16_tuning)) { -void* Autotuner_Activation(void* input, int activation_id, - int out_min, int out_max, int swing){ - - void* activation_out; - - if ( isFullPrecision(swing) || (!FP16_tuning) ){ - - switch(activation_id){ + switch (activation_id) { case -1: activation_out = input; INFO("NO Activation Function \n"); @@ -1206,10 +988,10 @@ void* Autotuner_Activation(void* input, int activation_id, break; } } - - else{ - switch(activation_id){ + else { + + switch (activation_id) { case -1: activation_out = input; INFO("NO Activation Function \n"); @@ -1227,167 +1009,116 @@ void* Autotuner_Activation(void* input, int activation_id, ERROR("Activation id %d NOT supported \n", activation_out); break; } - } - return activation_out; } -void* Autotuner_GPU_ConvLayer(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, int pool_stride, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - - void* conv_out = Autotuner_ConvOp(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, - activation_id, - out_min, out_max, swing); - - - void* conv_add = Autotuner_Add(conv_out, bias, swing); - - void* pool_out = Autotuner_Pooling(conv_add, pool_size, pool_stride, swing); - - void* activation_out = Autotuner_Activation(pool_out, activation_id, out_min, out_max, swing); - - - return activation_out; -} +void *Autotuner_GPU_ConvLayer(void *input, float i_min, float i_max, + void *filter, float w_min, float w_max, + void 
*bias, float b_min, float b_max, + int conv_pad_h, int conv_pad_w, int conv_stride_h, + int conv_stride_w, int pool_id, int pool_size, + int pool_stride, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + + void *conv_out = Autotuner_ConvOp( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h, + conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, + activation_id, out_min, out_max, swing); + void *conv_add = Autotuner_Add(conv_out, bias, swing); + + void *pool_out = Autotuner_Pooling(conv_add, pool_size, pool_stride, swing); + + void *activation_out = + Autotuner_Activation(pool_out, activation_id, out_min, out_max, swing); + + return activation_out; +} /**** Top-level API for Handling Convolution Layers The granularity of handling is at a layer-level - not tensor-op level - + ***/ -void* Autotuner_ConvLayer(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, int pool_stride, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - if(FP16_tuning){ - if(ONLINE_PROFILING){ +void *Autotuner_ConvLayer(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, int pool_stride, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + + if (FP16_tuning) { + if (ONLINE_PROFILING) { ERROR("Online Profiling cannot be enabled with PROMISE Simulation \n"); } } - swing = getSwing(swing); - - if(isPromiseLayer(swing)){ - - return PROMISE_Conv(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, pool_stride, - activation_id, - out_min, out_max, swing); + swing = getSwing(swing); + + if (isPromiseLayer(swing)) { + + return PROMISE_Conv(input, i_min, i_max, filter, w_min, w_max, bias, b_min, + b_max, conv_pad_h, conv_pad_w, conv_stride_h, + conv_stride_w, pool_id, pool_size, pool_stride, + activation_id, out_min, out_max, swing); } assert(isGPULayer(swing)); - return Autotuner_GPU_ConvLayer(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, pool_stride, - activation_id, - out_min, out_max, swing); - + return Autotuner_GPU_ConvLayer( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h, + conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, pool_stride, + activation_id, out_min, out_max, swing); } - - - - /**** Top-level API Unchanged for backwards compatibility ***/ -void* ConvLayer_PROMISE(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - - return Autotuner_ConvLayer(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, pool_size, // FIXIT: Assumption pool_size == pool_strides - activation_id, - out_min, out_max, swing); - - +void *ConvLayer_PROMISE(void 
*input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + + return Autotuner_ConvLayer( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h, + conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, + pool_size, // FIXIT: Assumption pool_size == pool_strides + activation_id, out_min, out_max, swing); } - - - -void* ConvLayer_PROMISE2(void* input, float i_min, float i_max, - void* filter, float w_min, float w_max, - void* bias, float b_min, float b_max, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, int pool_stride, - int activation_id, // Relu, Tanh, ClipRelu - float out_min, float out_max, int swing){ - - - return Autotuner_ConvLayer(input, i_min, i_max, - filter, w_min, w_max, - bias, b_min, b_max, - conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w, - pool_id, pool_size, pool_stride, - activation_id, - out_min, out_max, swing); - - +void *ConvLayer_PROMISE2(void *input, float i_min, float i_max, void *filter, + float w_min, float w_max, void *bias, float b_min, + float b_max, int conv_pad_h, int conv_pad_w, + int conv_stride_h, int conv_stride_w, int pool_id, + int pool_size, int pool_stride, + int activation_id, // Relu, Tanh, ClipRelu + float out_min, float out_max, int swing) { + + return Autotuner_ConvLayer( + input, i_min, i_max, filter, w_min, w_max, bias, b_min, b_max, conv_pad_h, + conv_pad_w, conv_stride_h, conv_stride_w, pool_id, pool_size, pool_stride, + activation_id, out_min, out_max, swing); } +void * +FCLayer_PROMISE(void *input, float i_min, float i_max, void *weights, + float w_min, float w_max, void *bias, float b_min, float b_max, + int activation_id, float out_min, float out_max, + int swing) { // NOTE: min_val, max_val apply to 'ClippedRelu' + swing = getSwing(swing); + if (isPromiseLayer(swing)) { - - - -void* FCLayer_PROMISE(void* input, float i_min, float i_max, - void* weights, float w_min, float w_max, - void* bias, float b_min, float b_max, - int activation_id, - float out_min, float out_max, int swing){ //NOTE: min_val, max_val apply to 'ClippedRelu' - - - swing = getSwing(swing); - - if(isPromiseLayer(swing)){ - - return PROMISE_FC(input, i_min, i_max, - weights, w_min, w_max, - bias, b_min, b_max, - activation_id, - out_min, out_max, swing); + return PROMISE_FC(input, i_min, i_max, weights, w_min, w_max, bias, b_min, + b_max, activation_id, out_min, out_max, swing); } assert(isGPULayer(swing)); @@ -1433,18 +1164,12 @@ void* FCLayer_PROMISE(void* input, float i_min, float i_max, } return activation_out; - } #endif - - #ifdef OLD_MODEL #endif -#endif - - - +#endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu index c1848f126750808a9438a4d2cf7729d1bf420fd1..b97e5beadb7822cce12bdf2ee4d16407cd0483c4 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques.cu @@ -1,13 +1,12 @@ //===--------------------------- approxtechniques.cu ---------------------===// // //===----------------------------------------------------------------------===// -// +// // This file consists of the custom implementation of software approximations // 
for tensor convolutions. The approximations implemented are feature sampling -// and perforation for FP32 and FP16 compute precisions. +// and perforation for FP32 and FP16 compute precisions. // //===----------------------------------------------------------------------===// - #include "tensor_utils.h" #include "approx_utils.h" @@ -17,406 +16,465 @@ #include "fp16_conversion.h" #include "profiling.h" -extern "C"{ - -__global__ void convToGemm(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int num_filter_elem) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +extern "C" { + +__global__ void convToGemm(float *const __restrict__ output, + const float *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int num_filter_elem) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) const int inH = h * V_stride - V_pad; const int inW = w * H_stride - H_pad; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[out_index] = 0; + output[out_index] = 0; } } } } -__global__ void convToGemmFullInput(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number)_ - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w 
* H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter elemen - if(filter_elem_num % skip_every != skip_every-1-skip_offset) { - int output_col = filter_elem_num - - ((filter_elem_num + skip_every)/skip_every); - if(skip_every == 1) output_col = filter_elem_num; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((output_col*N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((output_col*N + n) * H_out + h) * W_out + w] = 0; - } - } - } +__global__ void convToGemmFullInput( + float *const __restrict__ output, const float *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int skip_every, const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number)_ + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter elemen + if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) { + int output_col = + filter_elem_num - ((filter_elem_num + skip_every) / skip_every); + if (skip_every == 1) + output_col = filter_elem_num; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col * N + n) * H_out + h) * W_out + w] = 0; } + } + } + } } -__global__ void convToGemmHalfInputNew(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
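      // The KH x KW loop below performs an im2col-style expansion for one output
      // pixel, dropping every filter element whose index satisfies
      // filter_elem_num % skip_every == skip_offset and packing the surviving
      // elements into consecutive GEMM columns through output_col.
      // For example, assuming skip_every = 3 and skip_offset = 1:
      //   filter_elem_num : 0  2  3  5  6  8   (elements 1, 4, 7 are dropped)
      //   output_col      : 0  1  2  3  4  5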
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(filter_elem_num % skip_every != skip_offset) { - int output_col = filter_elem_num - - (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset)); - if(skip_every == 1) output_col = filter_elem_num; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((output_col*N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((output_col*N + n) * H_out + h) * W_out + w] = 0; - } - } - } +__global__ void +convToGemmHalfInputNew(__half *const __restrict__ output, + const __half *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (filter_elem_num % skip_every != skip_offset) { + int output_col = + filter_elem_num - (filter_elem_num / skip_every + + (filter_elem_num % skip_every > skip_offset)); + if (skip_every == 1) + output_col = filter_elem_num; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col * N + n) * H_out + h) * W_out + w] = 0; + } } + } + } } - -__global__ -void convToGemmHalf(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, - const int V_pad, const int H_pad, - const int H_out, const int W_out, - const int V_stride, const int H_stride){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread i - const int n = tx / (C * H_out * W_out); //output image numbe - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan numbe - const int h = tx % (H_out * W_out) / W_out; //output height index (row number - const int w = tx % W_out; //output width index (col number - const int inH = h * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
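      // The loop below is the unsampled im2col path: filter element (c, i, j)
      // maps to GEMM row filter_elem_num = (c * KH + i) * KW + j, the output is
      // laid out as [C * KH * KW][N][H_out][W_out], and positions falling
      // outside the padded input are written as zero.
      // A minimal launch sketch, assuming hypothetical device buffers
      // im2col_buf and input_buf for a 1x3x32x32 FP16 input, a 3x3 window,
      // unit stride and padding 1 (so H_out = W_out = 32):
      //   const int threads = 1 * 3 * 32 * 32;
      //   convToGemmHalf<<<(threads + 127) / 128, 128>>>(
      //       im2col_buf, input_buf, 1, 3, 32, 32, 3, 3, 1, 1, 32, 32, 1, 1);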
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - } else { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0; - } - } +__global__ void convToGemmHalf(__half *const __restrict__ output, + const __half *const __restrict input, + const int N, const int C, const int H, + const int W, const int KH, const int KW, + const int V_pad, const int H_pad, + const int H_out, const int W_out, + const int V_stride, const int H_stride) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread i + const int n = tx / (C * H_out * W_out); // output image numbe + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan numbe + const int h = tx % (H_out * W_out) / W_out; // output height index (row number + const int w = tx % W_out; // output width index (col number + const int inH = h * V_stride - V_pad; + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + } else { + output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0; } + } } + } } -__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
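      // The loop below drops filter elements whose index is skip_offset plus a
      // multiple of skip_every; the survivors (including every element below
      // skip_offset) are renumbered into reduced_filter_elem consecutive
      // columns and written in a per-image
      // [N][reduced_filter_elem][H_out][W_out] layout.
      // For example, assuming skip_every = 3 and skip_offset = 2, elements
      // 2, 5, 8, ... are dropped and elements 0, 1, 3, 4, 6, 7 map to columns
      // 0, 1, 2, 3, 4, 5.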
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = condition * filter_elem_num - + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; - //((output_col*N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputNewIrregular( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if ((filter_elem_num - skip_offset) % skip_every) { + const int condition = (filter_elem_num < skip_offset); + const int output_col = + condition * filter_elem_num + + (!condition) * + (filter_elem_num - + ((filter_elem_num + 1 - skip_offset) / skip_every) - + ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); + const int out_index = + ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; + //((output_col*N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } + } } + } } -__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
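      // Identical element-dropping and column renumbering to
      // convToGemmHalfInputNewIrregular above; the only difference is the
      // output layout, which here is [reduced_filter_elem][N][H_out][W_out]
      // (out_index = ((output_col * N + n) * H_out + h) * W_out + w).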
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = condition * filter_elem_num - + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - - const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputNewIrregular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if ((filter_elem_num - skip_offset) % skip_every) { + const int condition = (filter_elem_num < skip_offset); + const int output_col = + condition * filter_elem_num + + (!condition) * + (filter_elem_num - + ((filter_elem_num + 1 - skip_offset) / skip_every) - + ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); + + const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; + + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } + } } + } } - - -__global__ void convToGemmHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int num_filter_elem) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void convToGemmHalf2(__half *const __restrict__ output, + const __half *const __restrict input, + const int N, const int C, const int H, + const int W, const int KH, const int KW, + const int V_pad, const int H_pad, + const int H_out, const int W_out, + const int V_stride, const int H_stride, + const int num_filter_elem) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const 
int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) const int inH = h * V_stride - V_pad; const int inW = w * H_stride - H_pad; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[out_index] = 0; + output[out_index] = 0; } } } } -__global__ void convToGemmPerfRow(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void +convToGemmPerfRow(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int x, const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) int h_index; - if(h < start) { - h_index = h; + if (h < start) { + h_index = h; } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; + const int inW = w * H_stride - H_pad; // input width index (col 
number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } } } } -__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w, - float *old_data, float *new_data, int x, int start){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //filter number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - w]) / 2; - } else { - int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, + int w, float *old_data, float *new_data, + int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // filter number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - w]) / 2; + } else { + int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void convToGemmPerfCol(float * const __restrict__ 
output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) +__global__ void +convToGemmPerfCol(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int x, const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) int w_index; - if(w < start) { + if (w < start) { w_index = w; } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0; + const int inW = w_index * H_stride - H_pad; + const int inH = h * V_stride - V_pad; // input height index (row number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = 0; } } } } -__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w, - float *old_data, float *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //output chan number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h 
* old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - 1]) / 2; - } else { - int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, + int w, float *old_data, float *new_data, + int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // output chan number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - 1]) / 2; + } else { + int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void convToGemmPerfRowHalf( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image number + if (n < N) { 
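The convToGemmPerfRow/Col kernels above perforate the output: every x-th output row (or column), starting at start, is never computed, so the GEMM runs over H_eff (or W_eff) positions instead of H_out, and approxInterpolateRow/Col then rebuilds the full tensor by copying the computed rows and averaging the two neighbours of each skipped one (rows 0 and h-1 fall back to the nearest computed row). A small host-side sketch of the index bookkeeping, with illustrative helper names:

// True when an original output row is skipped by the perforated GEMM -- the
// same condition approxInterpolateRow uses to decide which rows to average
// (boundary rows 0 and h-1 are special-cased to a straight copy).
static inline bool rowIsSkipped(int row, int x, int start) {
  return row >= start && (row - start) % x == 0;
}

// Maps a compacted row index (0 .. H_eff-1) back to the original output row it
// was computed for -- the same arithmetic as h_index in convToGemmPerfRow.
static inline int perfSourceRow(int h, int x, int start) {
  if (h < start)
    return h;
  return ((h - start + 1) * x) / (x - 1) +
         (((h - start + 1) * x) % (x - 1) > 0) + start - 1;
}

// Example (x = 3, start = 1): original rows 1, 4, 7, ... are skipped; compacted
// rows 0, 1, 2, 3, 4, ... were computed for original rows 0, 2, 3, 5, 6, ...,
// and each skipped row is filled with 0.5 * (row_above + row_below).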
+ const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) int h_index; - if(h < start) { - h_index = h; + if (h < start) { + h_index = h; } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + const int inW = w * H_stride - H_pad; // input width index (col number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else output[out_index] = 0; } @@ -424,844 +482,903 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, } } -__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image numbe - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - int h_index; - if(h < start) { - h_index = h; - } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w; - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - - } - } - +__global__ void convToGemmPerfRowHalf2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image numbe + if (n < N) { 
+ const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + int h_index; + if (h < start) { + h_index = h; + } else { + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; + } + const int inH = h_index * V_stride - V_pad; + const int inW = w * H_stride - H_pad; // input width index (col number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((filter_elem_num * N + n) * H_eff + h) * W_out + w; + + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; + } } + } } -__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - - const int ch = tx % (c * h * w) / (h * w); //filter number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + + const int ch = tx % (c * h * w) / (h * w); // filter number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index 
= + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + } else { + int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - - const int ch = tx % (c * h * w) / (h * w); //filter number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - const int row_index = row - ((row + 1 - start) / x); - const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - const int output_index = ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateRowHalf2(int N, int old_h, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + + const int ch = tx % (c * h * w) / (h * w); // filter number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + const int row_index = row - ((row + 1 - start) / x); + const int output_index = + ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + } else { + const int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + const int output_index = + ch * (b * old_h * w) + n * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h 
* w) + row * (w) + col] = + old_data[output_index]; } + } } - -__global__ void convToGemmPerfColHalf(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) +__global__ void convToGemmPerfColHalf( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) int w_index; - if(w < start) { + if (w < start) { w_index = w; } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + const int inH = h * V_stride - V_pad; // input height index (row number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else output[out_index] = 0; - } } } } -__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) - int w_index; - 
if(w < start) { - w_index = w; - } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen - const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmPerfColHalf2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) + int w_index; + if (w < start) { + w_index = w; + } else { + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + } + const int inW = w_index * H_stride - H_pad; + const int inH = h * V_stride - V_pad; // input height index (row number) + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter elemen + const int out_index = + ((filter_elem_num * N + n) * H_out + h) * W_eff + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; + } } + } } - -__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //output chan number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = n * (c * h * old_w) + ch * (h * old_w) 
+ row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } - } +__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // output chan number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); + } else { + int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; + } + } } -__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //output chan number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col]; - - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1]; - - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)]; - - } else if((col - start) % x == 0) { - const int col_index = col - ((col + 1 - start) / x); - const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - const int output_index = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < 
N) { + const int ch = tx % (c * h * w) / (h * w); // output chan number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col]; + + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + + old_w - 1]; + + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)]; + + } else if ((col - start) % x == 0) { + const int col_index = col - ((col + 1 - start) / x); + const int output_index = + ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); + } else { + const int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + const int output_index = + ch * (b * h * old_w) + n * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } +__global__ void +convToGemmFullInputRegular(float *const __restrict__ output, + const float *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int ch = (fi * C) / reduced_filter_elem; + const int offset = (skip_offset + ch) % skip_every; + int in_index; + if (fi < offset) { + in_index = fi; + } else { + in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1; + } -__global__ void convToGemmFullInputRegular(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int ch = (fi * C) / reduced_filter_elem; - const int offset = (skip_offset + ch) % skip_every; - int 
in_index; - if(fi < offset) { - in_index = fi; - } else { - in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - } - - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; } } + } } -__global__ void convToGemmFullInputIrregular(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - int in_index; - if(fi < skip_offset) { - in_index = fi; - } else { - in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmFullInputIrregular( + float *const __restrict__ output, const float *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + int in_index; + if (fi < skip_offset) { + in_index = fi; + } else { + in_index = + ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset 
- 1; + } + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } - - + } } -__global__ void createReducedFiltersFullRegular(float * output, - const float * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter +__global__ void createReducedFiltersFullRegular( + float *output, const float *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int channels, const int skip_every, const int skip_offset, + const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter const int ch = (offset * channels) / reduced_filter_elem; const int channel_offset = (skip_offset + ch) % skip_every; - int in_index; - if(offset < channel_offset) { - in_index = offset; - } - else { - in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1; - } - - output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index]; + int in_index; + if (offset < channel_offset) { + in_index = offset; + } else { + in_index = + ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + channel_offset - 1; + } + + output[fIdx * reduced_filter_elem + offset] = + fac * input[num_filter_elem * fIdx + in_index]; } } -__global__ void createReducedFiltersFullIrregular(float * output, - const float * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter - int in_index; - if(offset < skip_offset) { - in_index = offset; - } else { - in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - } - output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index]; +__global__ void createReducedFiltersFullIrregular( + float *output, const float *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int skip_every, const int skip_offset, const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter + int in_index; + if (offset < skip_offset) { + in_index = 
offset; + } else { + in_index = + ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1; } + output[fIdx * reduced_filter_elem + offset] = + fac * input[num_filter_elem * fIdx + in_index]; + } } -__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int ki = 0; ki < reduced_filter_elem / C; ki++) { - const int fi = ch * (reduced_filter_elem / C) + ki; - const int offset = (skip_offset + ch) % skip_every; - - const bool condition = (fi < offset); - const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); - - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } +__global__ void +convToGemmHalfInputRegular(__half *const __restrict__ output, + const __half *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int ch = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int ki = 0; ki < reduced_filter_elem / C; ki++) { + const int fi = ch * (reduced_filter_elem / C) + ki; + const int offset = (skip_offset + ch) % skip_every; + + const bool condition = (fi < offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1); + + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + 
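The createReducedFiltersFull* kernels above build the matching compacted filter matrix: each position of the reduced filter is mapped back to the original element it samples and the kept weight is scaled by fac; the Regular variants rotate the sampling offset per channel via (skip_offset + ch) % skip_every, while the Irregular variants apply skip_offset globally. A minimal sketch of the expansion formula, mirroring the in-kernel arithmetic (the helper name is illustrative):

// Maps a position in the compacted (reduced_filter_elem-wide) filter back to
// the original filter element it samples -- the inverse of the compaction
// used when building the reduced patch matrix.
static inline int expandedFilterIndex(int pos, int skip_every,
                                      int skip_offset) {
  if (pos < skip_offset)
    return pos;
  return ((pos - skip_offset + 1) * skip_every) / (skip_every - 1) +
         (((pos - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) +
         skip_offset - 1;
}

// Example (skip_every = 3, skip_offset = 1): compacted positions 0, 1, 2, 3, 4
// read original filter elements 0, 2, 3, 5, 6, and each kept weight is scaled
// by fac (e.g. fac = skip_every / (skip_every - 1.0f) would preserve the
// expected magnitude of the dot product when dropped and kept elements are
// statistically similar).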
output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; } } + } } -__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int ki = 0; ki < reduced_filter_elem / C; ki++) { - - const int fi = ch * (reduced_filter_elem / C) + ki; - const int offset = (skip_offset + ch) % skip_every; - const int condition = (fi < offset); - const int in_index = condition * fi + (! condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); - - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } - else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputRegular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int ch = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int ki = 0; ki < reduced_filter_elem / C; ki++) { + + const int fi = ch * (reduced_filter_elem / C) + ki; + const int offset = (skip_offset + ch) % skip_every; + const int condition = (fi < offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1); + + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = ((fi * N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output, - 
const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } - else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputIrregular( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int condition = (fi < skip_offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1); + + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - 
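Two distinct output layouts appear in these conv-to-GEMM kernels: the plain variants keep each image's reduced patch matrix contiguous, whereas the *2 variants (and the matching approxInterpolate*Half2 readers) interleave the batch so that all images' values for one filter element are contiguous, presumably to suit the particular GEMM call that consumes each layout. Written out as index helpers for clarity (names are illustrative; both keep the full parameter list only for symmetry):

// Layout of convToGemmHalfInputRegular / ...Irregular and the Full* kernels:
// each image n owns a contiguous reduced_filter_elem x (H_out*W_out) block.
static inline long imageMajorIndex(int n, int fi, int h, int w, int N,
                                   int reduced_filter_elem, int H_out,
                                   int W_out) {
  (void)N; // unused here, kept for symmetry
  return (((long)n * reduced_filter_elem + fi) * H_out + h) * W_out + w;
}

// Layout of the *2 variants (convToGemmHalfInputRegular2 / Irregular2,
// convToGemmPerfRowHalf2 / PerfColHalf2): one filter element's values for the
// whole batch are contiguous, i.e. the batch is folded into the column count.
static inline long filterMajorIndex(int n, int fi, int h, int w, int N,
                                    int reduced_filter_elem, int H_out,
                                    int W_out) {
  (void)reduced_filter_elem; // unused here, kept for symmetry
  return (((long)fi * N + n) * H_out + h) * W_out + w;
}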
if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputIrregular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int condition = (fi < skip_offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1); + + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = ((fi * N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } +__global__ void createReducedFiltersHalfRegular( + __half *output, const __half *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int channels, const int skip_every, const int skip_offset, + const float fac) { -__global__ void createReducedFiltersHalfRegular(__half * output, - const __half * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, - const int skip_every, const int skip_offset, const float fac) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter + const int fIdx = tx / reduced_filter_elem; // filter index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter const int ch = (offset * channels) / 
reduced_filter_elem; const int channel_offset = (skip_offset + ch) % skip_every; const int condition = (offset < channel_offset); - const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1); - - output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); - } - + const int in_index = + condition * offset + + (!condition) * + (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + channel_offset - 1); + + output[fIdx * reduced_filter_elem + offset] = + __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); + } } -__global__ void createReducedFiltersHalfIrregular(__half * output, - const __half * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { +__global__ void createReducedFiltersHalfIrregular( + __half *output, const __half *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int skip_every, const int skip_offset, const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter index - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - - if(fIdx < NF) { + if (fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter + const int offset = tx % reduced_filter_elem; // offset within filter const int condition = (offset < skip_offset); - - int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - - output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); - } - -} + int in_index = + condition * offset + + (!condition) * + (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + skip_offset - 1); + output[fIdx * reduced_filter_elem + offset] = + __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); + } +} -//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols -__global__ void convToGemmApprox(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element? - const int output_col = filter_elem_num - (filter_elem_num/skip_every); //cal output column, taking skipping into account - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0; - } +// produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem +// cols +__global__ void +convToGemmApprox(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (filter_elem_num % skip_every != + skip_every - 1) { // are we including this filter element? + const int output_col = + filter_elem_num - + (filter_elem_num / + skip_every); // cal output column, taking skipping into account + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * reduced_filter_elem + output_col) * H_out + h) * + W_out + + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * reduced_filter_elem + output_col) * H_out + h) * + W_out + + w] = 0; + } } } } } - /// This function serves as an API with the custom implementation of convolution -/// with the perforation and filter sampling support. The compute precison is FP32. -/// This routine is invoked by the tuner for tuning approximations for convolutions. +/// with the perforation and filter sampling support. The compute precison is +/// FP32. This routine is invoked by the tuner for tuning approximations for +/// convolutions. 
/// -void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int start){ - - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty +void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int start) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } - - Tensor* output; + + Tensor *output; // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); convertToFP32(input); convertToFP32(filter); - + long int n, c, h, w; // output dimensions n = input->dims.dim_sizes[0]; - c = filter->dims.dim_sizes[0]; //number of filters + c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; int rem_row = (h - start) % row > 0; int h_eff = h - ((h - start) / row) - rem_row; - - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; int rem_col = (w - start) % col > 0; int w_eff = w - ((w - start) / col) - rem_col; - Tensor* new_output; - if(row > 1){ - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); + Tensor *new_output; + if (row > 1) { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float* convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - - convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, - h, w, - vertical_stride, horizontal_stride, - row, start, h_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + + convToGemmPerfRow<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + row, start, h_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - &alpha, - convData, h_eff * w, - num_filter_elem * h_eff * w, - (float *)filter->gpu_data, - num_filter_elem, 0, - &beta, - (float *)output->gpu_data, - h_eff * w, c * h_eff * w, - n)); - 
- new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); + + new_output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - //interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w, - (float *) output->gpu_data, - (float *) new_output->gpu_data, - row, start); + // interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateRow<<<numBlocks, 128>>>( + n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, row, start); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } - else if(col > 1){ - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + } else if (col > 1) { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - col, start, w_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfCol<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + col, start, w_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - &alpha, - convData, - h * w_eff, num_filter_elem * h * w_eff, - (float *)filter->gpu_data, - num_filter_elem, 0, - &beta, - (float *)output->gpu_data, - h * w_eff, c * h * w_eff, - n)); - - new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + &alpha, convData, h * w_eff, num_filter_elem * h * w_eff, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); + + new_output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device 
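+    // Column-perforation path: `output` holds the (n, c, h, w_eff) GEMM result;
+    // approxInterpolateCol below reconstructs the skipped columns into the
+    // full-size (n, c, h, w) tensor `new_output`.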
changeTensorPlacement(new_output, DEVICE); - //interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data, - col, start); + // interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateCol<<<numBlocks, 128>>>( + n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, col, start); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else { - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + } else { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - convToGemmApprox<<<gridSize, blockSize>>>(convData, - (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - num_filter_elem, c * h * w); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + convToGemmApprox<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + num_filter_elem, c * h * w); checkCudaErrors(cudaDeviceSynchronize()); - //Do the matrix multiplication - //Want to multiply convData by filter->gpu_data[f * chan * KH * KW] - + // Do the matrix multiplication + // Want to multiply convData by filter->gpu_data[f * chan * KH * KW] + float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h * w, c * h * w, - n)); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem, + &alpha, convData, h * w, num_filter_elem * h * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w, c * h * w, n)); new_output = output; cudaFree(convData); } - //Event("Conv_end"); //, true); + // Event("Conv_end"); //, true); return new_output; } -__global__ -void switchMatrixFull(int N, int n, int c, int h, int w, - float *old_data, float *new_data){ - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < N){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n_new = i / (c * h * w); - - new_data[((n_new * c + ch) * h + row ) * w + col] = - old_data[((ch * n + n_new) * h + row ) * w + col]; - } -} +__global__ void switchMatrixFull(int N, int n, int c, int h, int w, + float 
*old_data, float *new_data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n_new = i / (c * h * w); + + new_data[((n_new * c + ch) * h + row) * w + col] = + old_data[((ch * n + n_new) * h + row) * w + col]; + } +} /// This function serves as an API with the custom implementation of convolution -/// with the perforation and filter sampling support. The compute precison is FP32. +/// with the perforation and filter sampling support. The compute precison is +/// FP32. /// -void* tensorConvApprox(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int skip_every, int offset){ +void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int skip_every, int offset) { //////INFO("*** TensorConvolution approximation \n"); - //Event("Conv"); + // Event("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } @@ -1275,15 +1392,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, ////Event("H2F_end"); const int n = input->dims.dim_sizes[0]; - const int c = filter->dims.dim_sizes[0]; //number of filters + const int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + const int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const int w = + (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); ////INFO("batch: %d\n", n); @@ -1296,619 +1416,572 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, ////INFO("horizontal_stride: %d\n", horizontal_stride); ////INFO("output height: %d\n", h); ////INFO("output width: %d\n", w); - if(row > 1) { + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; - Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - 
////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w)); - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, - vertical_stride, horizontal_stride, - row, offset, h_eff); + ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h_eff * w)); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + convToGemmPerfRow<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + row, offset, h_eff); checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - &alpha, - convData, h_eff * w, num_filter_elem * h_eff * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h_eff * w, c * h_eff * w, - n)); - //interpolate + + float alpha = 1.0f, beta = 0.0f; + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); + // interpolate int blocksize = 128; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (float *) output->gpu_data, - (float *) new_output->gpu_data, - row, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + approxInterpolateRow<<<numBlocks, blocksize>>>( + n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, row, offset); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(col > 1) { + } else if (col > 1) { const int rem_col = (w - offset) % col > 0; const int w_eff = w - ((w - offset) / col) - rem_col; - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff)); - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - col, offset, w_eff); + ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w_eff)); + const int gridSize = + 
(n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfCol<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + col, offset, w_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - &alpha, - convData, h * w_eff, num_filter_elem * h * w_eff, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h * w_eff, c * h * w_eff, - n)); - - //interpolate + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + &alpha, convData, h * w_eff, num_filter_elem * h * w_eff, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); + + // interpolate int blocksize = 128; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data, - col, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + approxInterpolateCol<<<numBlocks, blocksize>>>( + n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, col, offset); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(skip_every > 1) { - //reduced number after skipping + } else if (skip_every > 1) { + // reduced number after skipping const int remainder = ((num_filter_elem - offset) % skip_every > 0); - const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; + const int reduced_filter_elem = + num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; - float* convData; + float *convData; size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - float* reducedFilter; - checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); - + float *reducedFilter; + checkCudaErrors( + cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); + const int filtBlockSize = 128; ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); - const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; - const float fac = ((float) skip_every) / ((float) skip_every - 1); + const int filtGridSize = + (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const float fac = ((float)skip_every) / ((float)skip_every - 1); //////INFO("fac: %f\n", fac); const int blockSize = 128; - //////INFO("n * h * w : %d\n", (n * h * w )); - const int gridSize = (n * h * w + blockSize - 1) / blockSize; - if(!(KH * KW % skip_every)) { - // ////INFO("REGULAR FILTERING\n"); - createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (float *)filter->gpu_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - convToGemmFullInputRegular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, 
horizontal_stride, - reduced_filter_elem, skip_every, offset); + //////INFO("n * h * w : %d\n", (n * h * w )); + const int gridSize = (n * h * w + blockSize - 1) / blockSize; + if (!(KH * KW % skip_every)) { + // ////INFO("REGULAR FILTERING\n"); + createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); + checkCudaErrors(cudaDeviceSynchronize()); + convToGemmFullInputRegular<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, reduced_filter_elem, skip_every, offset); } else { - // ////INFO("IRREGULAR FILTERING\n"); - createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (float *)filter->gpu_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + // ////INFO("IRREGULAR FILTERING\n"); + createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); + checkCudaErrors(cudaDeviceSynchronize()); + convToGemmFullInputIrregular<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, reduced_filter_elem, skip_every, offset); } checkCudaErrors(cudaDeviceSynchronize()); - + const float alpha = 1.0; const float beta = 0.0; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, reduced_filter_elem, - &alpha, - convData, h * w, reduced_filter_elem * h * w, - reducedFilter, reduced_filter_elem, 0, - &beta, - (float *)new_output->gpu_data, h * w, c * h * w, - n)); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem, + &alpha, convData, h * w, reduced_filter_elem * h * w, reducedFilter, + reduced_filter_elem, 0, &beta, (float *)new_output->gpu_data, h * w, + c * h * w, n)); cudaFree(convData); cudaFree(reducedFilter); } else { - //INFO("FP32 BASELINE\n"); - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + // INFO("FP32 BASELINE\n"); + Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * input->dims.dim_sizes[1] * h * w)); - convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, 
vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - skip_every, offset);//num_filter_elem); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w)); + convToGemmFullInput<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + skip_every, offset); // num_filter_elem); checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - /* - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)new_output->gpu_data, h * w, c * h * w, - n)); - */ - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, num_filter_elem, - &alpha, - convData, - CUDA_R_32F, n * h * w, - (float *) filter->gpu_data, CUDA_R_32F, - num_filter_elem, - &beta, - (float *) output->gpu_data, - CUDA_R_32F, n * h * w, - CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - const int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data); - + + float alpha = 1.0f, beta = 0.0f; + /* + checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, + CUBLAS_OP_N, CUBLAS_OP_N, + h * w, c, num_filter_elem, + &alpha, + convData, h * w, num_filter_elem * h + * w, (float *)filter->gpu_data, num_filter_elem, 0, &beta, (float + *)new_output->gpu_data, h * w, c * h * w, n)); + */ + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, + &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data, + CUDA_R_32F, num_filter_elem, &beta, (float *)output->gpu_data, + CUDA_R_32F, n * h * w, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixFull<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (float *)output->gpu_data, + (float *)new_output->gpu_data); + checkCudaErrors(cudaDeviceSynchronize()); cudaFree(convData); } - //Event("Conv_end"); + // Event("Conv_end"); return new_output; } -__global__ -void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){ - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < N){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n_new = i / (c * h * w); - - new_data[((n_new * c + ch) * h + row ) * w + col] = - old_data[((ch * n + n_new) * h + row ) * w + col]; - } -} +__global__ void switchMatrixHalf(int N, int n, int c, int h, int w, + __half *old_data, __half *new_data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n_new = i / (c * h * w); + + new_data[((n_new * c + ch) * h + row) * w + col] = + old_data[((ch * n + n_new) * h + row) * w + col]; + } +} -/// This function serves as an API to custom implementation of the +/// This function serves as an API to custom implementation of the /// half-precision convolution with the perforation and filter 
sampling -/// support. +/// support. /// -void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int row, int col, int skip_every, int offset) { - - //INFO("*** TensorConvolution half approximation \n"); - // profileEvent("#Conv"); - - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty +void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups, int row, int col, int skip_every, + int offset) { + + // INFO("*** TensorConvolution half approximation \n"); + // profileEvent("#Conv"); + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } hostToDeviceCopy(input); hostToDeviceCopy(filter); - + profileEvent("F2H_start"); - convertToFP16(input); - convertToFP16(filter); + convertToFP16(input); + convertToFP16(filter); profileEvent("F2H_end"); - + const long int n = input->dims.dim_sizes[0]; - const long int c = filter->dims.dim_sizes[0]; //number of filters + const long int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + const long int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const long int w = + (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(new_output, DEVICE); - //INFO("batch: %d\n", n); + // INFO("batch: %d\n", n); // INFO("channels: %d\n", input->dims.dim_sizes[1]); // INFO("num_filters: %d\n", c); // INFO("kernel height: %d\n", KH); - // INFO("kernel width: %d\n", KW); + // INFO("kernel width: %d\n", KW); // INFO("num_filter_elem: %d\n", num_filter_elem); - //INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); - //INFO("vertical_stride: %d\n", vertical_stride); - //INFO("horizontal_stride: %d\n", horizontal_stride); + // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); + // INFO("vertical_stride: %d\n", vertical_stride); + // INFO("horizontal_stride: %d\n", horizontal_stride); // INFO("output height: %d\n", h); // INFO("output width: %d\n", w); - //INFO("skip_every: %d\n", skip_every); + // INFO("skip_every: %d\n", skip_every); const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if(row > 1){ + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; - - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, - n, c, h_eff, w); + + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w); 
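+    // Row-perforation path: output_half is sized (n, c, h_eff, w); the GEMM
+    // below writes the perforated result here, and the approxInterpolateRowHalf
+    // kernels then reconstruct the skipped rows into the full-size new_output.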
changeTensorPlacement(output_half, DEVICE); - __half * convData; + __half *convData; long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - + const int patchBlockSize = 256; - const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) / patchBlockSize; + const int numPatchBlocks = + (n * input->dims.dim_sizes[1] * h_eff * w + patchBlockSize - 1) / + patchBlockSize; const int interpolationBlocksize = 256; - const int numInterpolationBlocks = (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; - if(h * w <= 64) { - //INFO("H *W <= 64\n"); - convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, row, offset, h_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h_eff * w, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h_eff * w, - (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - approxInterpolateRowHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - row, offset); - checkCudaErrors(cudaDeviceSynchronize()); - - } else { - //INFO("H *W > 64\n"); - convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, row, offset, h_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - alpha_half, - convData, h_eff * w, num_filter_elem * h_eff * w, - (__half *)filter->gpu_half_data, num_filter_elem, 0, - beta_half, - (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, - n)); - - approxInterpolateRowHalf<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - row, offset); - checkCudaErrors(cudaDeviceSynchronize()); + const int numInterpolationBlocks = + (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; + if (h * w <= 64) { + // INFO("H *W <= 64\n"); + convToGemmPerfRowHalf2<<<numPatchBlocks, patchBlockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, row, offset, h_eff); + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c, + num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F, + n * h_eff * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + approxInterpolateRowHalf2<<<numInterpolationBlocks, + interpolationBlocksize>>>( + n * c * h * w, h_eff, n, c, h, w, + 
(__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, row, offset); + checkCudaErrors(cudaDeviceSynchronize()); + } else { + // INFO("H *W > 64\n"); + convToGemmPerfRowHalf<<<numPatchBlocks, patchBlockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, row, offset, h_eff); + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + alpha_half, convData, h_eff * w, num_filter_elem * h_eff * w, + (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half, + (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, n)); + + approxInterpolateRowHalf<<<numInterpolationBlocks, + interpolationBlocksize>>>( + n * c * h * w, h_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, row, offset); + checkCudaErrors(cudaDeviceSynchronize()); } freeTensor(output_half); cudaFree(convData); -} else if(col > 1) { + } else if (col > 1) { const int rem_col = (w - offset) % col > 0; const int w_eff = w - ((w - offset) / col) - rem_col; - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff); changeTensorPlacement(output_half, DEVICE); - - __half * convData; + + __half *convData; long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - + const int patchBlockSize = 256; - const int numPatchBlocks = (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) / patchBlockSize; + const int numPatchBlocks = + (n * input->dims.dim_sizes[1] * h * w_eff + patchBlockSize - 1) / + patchBlockSize; const int interpolationBlocksize = 256; - const int numInterpolationBlocks = (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; - if(h * w <= 64) { - //INFO("H *W <= 64\n"); - convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, col, offset, w_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w_eff, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w_eff, - (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - approxInterpolateColHalf2<<<numInterpolationBlocks, interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - col, offset); - checkCudaErrors(cudaDeviceSynchronize()); + const int numInterpolationBlocks = + (n * c * h * w + interpolationBlocksize - 1) / interpolationBlocksize; + if (h * w <= 64) { + // INFO("H *W <= 64\n"); + convToGemmPerfColHalf2<<<numPatchBlocks, patchBlockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + 
horizontal_stride, col, offset, w_eff); + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c, + num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F, + n * h * w_eff, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + approxInterpolateColHalf2<<<numInterpolationBlocks, + interpolationBlocksize>>>( + n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, col, offset); + checkCudaErrors(cudaDeviceSynchronize()); } else { - //INFO("H *W > 64\n"); - convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, col, offset, w_eff); - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - alpha_half, - convData, h * w_eff, num_filter_elem * h * w_eff, - (__half *)filter->gpu_half_data, num_filter_elem, 0, - beta_half, - (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, - n)); - - approxInterpolateColHalf<<<numInterpolationBlocks,interpolationBlocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - col, offset); - checkCudaErrors(cudaDeviceSynchronize()); + // INFO("H *W > 64\n"); + convToGemmPerfColHalf<<<numPatchBlocks, patchBlockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, col, offset, w_eff); + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + alpha_half, convData, h * w_eff, num_filter_elem * h * w_eff, + (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half, + (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, n)); + + approxInterpolateColHalf<<<numInterpolationBlocks, + interpolationBlocksize>>>( + n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, col, offset); + checkCudaErrors(cudaDeviceSynchronize()); } freeTensor(output_half); cudaFree(convData); - } else if(skip_every > 1) { + } else if (skip_every > 1) { const int remainder = ((num_filter_elem - offset) % skip_every > 0); - const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; + const int reduced_filter_elem = + num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; - __half* convData; + __half *convData; size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - __half* reducedFilter; - checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem)); + __half *reducedFilter; + checkCudaErrors( + cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem)); const int filtBlockSize = 256; - const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; - const float fac = ((float) skip_every) / ((float) 
skip_every - 1); + const int filtGridSize = + (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const float fac = ((float)skip_every) / ((float)skip_every - 1); const int blockSize = 256; - //const int gridSize = (n * h * w + blockSize - 1) / blockSize; - // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); - // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); + // const int gridSize = (n * h * w + blockSize - 1) / blockSize; + // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); + // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if(c * num_filter_elem < 500000) {//250) {//c * reduced_filter_elem < 150000) { - if(!(KH * KW % skip_every)) { - //INFO("---REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); + if (c * num_filter_elem < + 500000) { // 250) {//c * reduced_filter_elem < 150000) { + if (!(KH * KW % skip_every)) { + // INFO("---REGULAR FILTERING\n"); + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputRegular<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); } else { - //INFO("---IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); + // INFO("---IRREGULAR FILTERING\n"); + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - //convToGemmHalfInputIrregular - convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, reduced_filter_elem, - alpha_half, - convData, h * w, reduced_filter_elem * h * w, - reducedFilter, 
reduced_filter_elem, 0, - beta_half, - (__half *)new_output->gpu_half_data, h * w, c * h * w, - n)); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + // convToGemmHalfInputIrregular + convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem, + alpha_half, convData, h * w, reduced_filter_elem * h * w, + reducedFilter, reduced_filter_elem, 0, beta_half, + (__half *)new_output->gpu_half_data, h * w, c * h * w, n)); } else { - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - changeTensorPlacement(output_half, DEVICE); - - if(!(KH * KW % skip_every)) { - //INFO("REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } else { - // INFO("IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, reduced_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w, - reducedFilter, CUDA_R_16F, reduced_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data); - checkCudaErrors(cudaDeviceSynchronize()); - - freeTensor(output_half); + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w); + changeTensorPlacement(output_half, DEVICE); + + if (!(KH * KW % skip_every)) { + // INFO("REGULAR FILTERING\n"); + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, 
offset, + fac); + checkCudaErrors(cudaDeviceSynchronize()); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputRegular2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } else { + // INFO("IRREGULAR FILTERING\n"); + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); + checkCudaErrors(cudaDeviceSynchronize()); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, + reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w, + reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half, + (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data); + checkCudaErrors(cudaDeviceSynchronize()); + + freeTensor(output_half); } - + cudaFree(convData); cudaFree(reducedFilter); } else { - //INFO("FP16 BASELINE\n"); - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - changeTensorPlacement(output, DEVICE); - __half * convData; - long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - - const int blockSize = 256; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //convToGemmHalf - convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, num_filter_elem, - skip_every, offset); - checkCudaErrors(cudaDeviceSynchronize()); - - const __half alf = approx_float_to_half(1.0); - const __half bet = approx_float_to_half(0.0); - const __half *alpha_half = &alf; - const __half *beta_half = &bet; - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w, - (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - const int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data, - (__half *)new_output->gpu_half_data); - checkCudaErrors(cudaDeviceSynchronize()); - - freeTensor(output); - cudaFree(convData); + // INFO("FP16 BASELINE\n"); + Tensor *output = (Tensor 
*)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + changeTensorPlacement(output, DEVICE); + __half *convData; + long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 256; + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + // convToGemmHalf + convToGemmHalfInputNew<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + num_filter_elem, skip_every, offset); + checkCudaErrors(cudaDeviceSynchronize()); + + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha_half = &alf; + const __half *beta_half = &bet; + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, + alpha_half, convData, CUDA_R_16F, n * h * w, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half, + (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (__half *)output->gpu_half_data, + (__half *)new_output->gpu_half_data); + checkCudaErrors(cudaDeviceSynchronize()); + + freeTensor(output); + cudaFree(convData); } profileEvent("H2F_start"); diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu index 6e9f88bb54e5655b18d72fc88e5a08a2478ea9fc..bdcfb2c5684d1584e1a520194066fc20e3724632 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/approx_techniques2_tuned.cu @@ -7,429 +7,489 @@ #include "fp16_conversion.h" #include "profiling.h" -extern "C"{ - -__global__ void convToGemm(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int num_filter_elem) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +extern "C" { + +__global__ void convToGemm(float *const __restrict__ output, + const float *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int num_filter_elem) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) const int inH = h * V_stride - V_pad; const 
int inW = w * H_stride - H_pad; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[out_index] = 0; + output[out_index] = 0; } } } } -__global__ void convToGemmFullInput(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number)_ - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter elemen - if(filter_elem_num % skip_every != skip_every-1-skip_offset) { - int output_col = filter_elem_num - - ((filter_elem_num + skip_every)/skip_every); - if(skip_every == 1) - output_col = filter_elem_num; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((output_col*N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((output_col*N + n) * H_out + h) * W_out + w] = 0; - } - } - } +__global__ void convToGemmFullInput( + float *const __restrict__ output, const float *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int skip_every, const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number)_ + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
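+    // Perforated im2col: filter elements whose index falls on the
+    // (skip_every, skip_offset) pattern are dropped, and the surviving
+    // elements are compacted into contiguous rows of the GEMM input.
+    // Out-of-range input pixels contribute zeros (implicit padding).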
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter elemen + if (filter_elem_num % skip_every != skip_every - 1 - skip_offset) { + int output_col = + filter_elem_num - ((filter_elem_num + skip_every) / skip_every); + if (skip_every == 1) + output_col = filter_elem_num; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col * N + n) * H_out + h) * W_out + w] = 0; } + } + } + } } -__global__ void convToGemmHalfInputNew(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(filter_elem_num % skip_every != skip_offset) { - int output_col = filter_elem_num - - (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset)); - if(skip_every == 1) - output_col = filter_elem_num; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((output_col*N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((output_col*N + n) * H_out + h) * W_out + w] = 0; - } - } - } +__global__ void +convToGemmHalfInputNew(__half *const __restrict__ output, + const __half *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
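+    // Half-precision perforated im2col: skip filter elements congruent to
+    // skip_offset (mod skip_every) and renumber the remaining ones into a
+    // dense set of rows; padded input locations are written as zeros.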
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (filter_elem_num % skip_every != skip_offset) { + int output_col = + filter_elem_num - (filter_elem_num / skip_every + + (filter_elem_num % skip_every > skip_offset)); + if (skip_every == 1) + output_col = filter_elem_num; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col * N + n) * H_out + h) * W_out + w] = 0; + } } + } + } } - -__global__ -void convToGemmHalf(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, - const int V_pad, const int H_pad, - const int H_out, const int W_out, - const int V_stride, const int H_stride){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread i - const int n = tx / (C * H_out * W_out); //output image numbe - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan numbe - const int h = tx % (H_out * W_out) / W_out; //output height index (row number - const int w = tx % W_out; //output width index (col number - const int inH = h * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - } else { - output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0; - } - } +__global__ void convToGemmHalf(__half *const __restrict__ output, + const __half *const __restrict input, + const int N, const int C, const int H, + const int W, const int KH, const int KW, + const int V_pad, const int H_pad, + const int H_out, const int W_out, + const int V_stride, const int H_stride) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread i + const int n = tx / (C * H_out * W_out); // output image numbe + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan numbe + const int h = tx % (H_out * W_out) / W_out; // output height index (row number + const int w = tx % W_out; // output width index (col number + const int inH = h * V_stride - V_pad; + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
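+    // Plain half-precision im2col: copy the full KH x KW receptive field
+    // for this output pixel, writing zeros where the window falls outside
+    // the padded input.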
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + } else { + output[((filter_elem_num * N + n) * H_out + h) * W_out + w] = 0; } + } } + } } -__global__ void convToGemmHalfInputNewIrregular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - //const int ki = c * KH * KW + i; - //const int kj = c * KH * KW + j; - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = condition * filter_elem_num - + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - //if(filter_elem_num % skip_every != skip_offset) { - // int output_col = filter_elem_num - - // (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset)); - //if(skip_every == 1) - // output_col = filter_elem_num; - const int out_index = ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; - //((output_col*N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputNewIrregular( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
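+    // Irregular perforation: drop filter elements congruent to skip_offset
+    // (mod skip_every), renumber the survivors into reduced_filter_elem
+    // rows, and store the result per image (batch-major layout).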
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + // const int ki = c * KH * KW + i; + // const int kj = c * KH * KW + j; + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if ((filter_elem_num - skip_offset) % skip_every) { + const int condition = (filter_elem_num < skip_offset); + const int output_col = + condition * filter_elem_num + + (!condition) * + (filter_elem_num - + ((filter_elem_num + 1 - skip_offset) / skip_every) - + ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); + // if(filter_elem_num % skip_every != skip_offset) { + // int output_col = filter_elem_num - + // (filter_elem_num/skip_every + (filter_elem_num % skip_every > + // skip_offset)); + // if(skip_every == 1) + // output_col = filter_elem_num; + const int out_index = + ((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; + //((output_col*N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } + } } + } } -__global__ void convToGemmHalfInputNewIrregular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - if(n < N) { //is thread id within bounds? 
- for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - //const int ki = c * KH * KW + i; - //const int kj = c * KH * KW + j; - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - if((filter_elem_num - skip_offset) % skip_every) { - const int condition = (filter_elem_num < skip_offset); - const int output_col = condition * filter_elem_num - + (!condition) * (filter_elem_num - ((filter_elem_num + 1 - skip_offset) / skip_every) - - ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); - //if(filter_elem_num % skip_every != skip_offset) { - // int output_col = filter_elem_num - - // (filter_elem_num/skip_every + (filter_elem_num % skip_every > skip_offset)); - //if(skip_every == 1) - // output_col = filter_elem_num; - const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; - //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; - //((output_col*N + n) * H_out + h) * W_out + w - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputNewIrregular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); // output chan + // number + const int h = tx % (H_out * W_out) / W_out; // output height index (row + // number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + if (n < N) { // is thread id within bounds? 
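+    // Same skip pattern as convToGemmHalfInputNewIrregular, but the packed
+    // matrix is laid out filter-element-major (output_col varies slowest)
+    // rather than per image.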
+ for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + // const int ki = c * KH * KW + i; + // const int kj = c * KH * KW + j; + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + if ((filter_elem_num - skip_offset) % skip_every) { + const int condition = (filter_elem_num < skip_offset); + const int output_col = + condition * filter_elem_num + + (!condition) * + (filter_elem_num - + ((filter_elem_num + 1 - skip_offset) / skip_every) - + ((filter_elem_num + 1 - skip_offset) % skip_every > 0)); + // if(filter_elem_num % skip_every != skip_offset) { + // int output_col = filter_elem_num - + // (filter_elem_num/skip_every + (filter_elem_num % skip_every > + // skip_offset)); + // if(skip_every == 1) + // output_col = filter_elem_num; + const int out_index = ((output_col * N + n) * H_out + h) * W_out + w; + //((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w; + //((output_col*N + n) * H_out + h) * W_out + w + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } + } } + } } - - -__global__ void convToGemmHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int num_filter_elem) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void convToGemmHalf2(__half *const __restrict__ output, + const __half *const __restrict input, + const int N, const int C, const int H, + const int W, const int KH, const int KW, + const int V_pad, const int H_pad, + const int H_out, const int W_out, + const int V_stride, const int H_stride, + const int num_filter_elem) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) const int inH = h * V_stride - V_pad; const int inW = w * H_stride - H_pad; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + (c * KH + i) * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[out_index] = 0; + output[out_index] = 0; } } } } -__global__ void convToGemmPerfRow(float * const 
__restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void +convToGemmPerfRow(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int x, const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) int h_index; - if(h < start) { - h_index = h; + if (h < start) { + h_index = h; } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - //#pragma unroll - //for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i* KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; + const int inW = w * H_stride - H_pad; // input width index (col number) + //#pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; } } } } -__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, int w, - float *old_data, float *new_data, int x, int start){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //filter number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * 
(w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - w]) / 2; - } else { - int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateRow(int N, int old_h, int j, int c, int h, + int w, float *old_data, float *new_data, + int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // filter number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - w]) / 2; + } else { + int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void convToGemmPerfCol(float * const __restrict__ output, - const float * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) +__global__ void +convToGemmPerfCol(float *const __restrict__ output, + const float *const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, + const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int x, const int start, const int W_eff) { + + const int tx = blockDim.x * 
blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) int w_index; - if(w < start) { + if (w < start) { w_index = w; } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) + const int inW = w_index * H_stride - H_pad; + const int inH = h * V_stride - V_pad; // input height index (row number) //#pragma unroll - //for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0; + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = 0; } } } } -__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, int w, - float *old_data, float *new_data, int x, int start) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (c * h * w); //output image number - if(n < N) { - const int ch = tx % (c * h * w) / (h * w); //output chan number - const int row = tx % (h * w) / w; //output height index (row number) - const int col = tx % w; //output width index (col number) - - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - (old_data[output_index] + old_data[output_index - 1]) / 2; - } else { - int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateCol(int N, int old_w, int b, int c, int h, + int w, float 
*old_data, float *new_data, + int x, int start) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (c * h * w); // output image number + if (n < N) { + const int ch = tx % (c * h * w) / (h * w); // output chan number + const int row = tx % (h * w) / w; // output height index (row number) + const int col = tx % w; // output width index (col number) + + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + (old_data[output_index] + old_data[output_index - 1]) / 2; + } else { + int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } -__global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image number - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) +__global__ void convToGemmPerfRowHalf( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image number + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) int h_index; - if(h < start) { - h_index = h; + if (h < start) { + h_index = h; } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - // #pragma unroll - //for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i 
* KW + j; //index of this filter element - const int out_index = ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + const int inW = w * H_stride - H_pad; // input width index (col number) + // #pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else output[out_index] = 0; } @@ -437,872 +497,941 @@ __global__ void convToGemmPerfRowHalf(__half * const __restrict__ output, } } -__global__ void convToGemmPerfRowHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int H_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_eff * W_out); //output image numbe - if(n < N) { - const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number - const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - int h_index; - if(h < start) { - h_index = h; - } else { - h_index = ((h - start + 1) * x) / (x - 1) + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inH = h_index * V_stride - V_pad; - const int inW = w * H_stride - H_pad; //input width index (col number) - // #pragma unroll - //for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - const int out_index = ((filter_elem_num * N + n) * H_eff + h) * W_out + w; - //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmPerfRowHalf2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int H_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_eff * W_out); // output image numbe + if (n < N) { + const int c = + tx % (C * H_eff * W_out) / (H_eff * W_out); // output chan number + const int h = + tx % (H_eff * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + int h_index; + if (h < start) { + h_index = h; + } else { + h_index = ((h - start + 1) * x) / (x - 1) + + (((h - start + 1) * x) % (x - 1) > 0) + start - 1; } -} - -__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, 
int w, - __half *old_data, __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id - //const int n = tx / (c * h * w); //output image number - const int stride = blockDim.x * gridDim.x; - //if(n < N) { - for(int i = index; i < N; i += stride){ - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); - - //const int ch = tx % (c * h * w) / (h * w); //filter number - //const int row = tx % (h * w) / w; //output height index (row number) - //const int col = tx % w; //output width index (col number) - - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - int row_index = row - ((row + 1 - start) / x); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - int output_index = n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } + const int inH = h_index * V_stride - V_pad; + const int inW = w * H_stride - H_pad; // input width index (col number) + // #pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + const int out_index = + ((filter_elem_num * N + n) * H_eff + h) * W_out + w; + //((n * C * KH * KW + filter_elem_num) * H_eff + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; + } } + } } -__global__ void approxInterpolateRowHalf2(int N, int old_h, int j, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id - //const int n = tx / (c * h * w); //output image numbe - const int stride = blockDim.x * gridDim.x; - //if(n < N) { - for(int i = index; i < N; i += stride){ - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); - - //const int ch = tx % (c * h * w) / (h * w); //filter number - //const int row = tx % (h * w) / w; //output height index (row number) - //const int col = tx % w; //output width index (col number - if(row < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * old_h * w) + n * (old_h * w) + row * (w) + col]; - } else if(row == h-1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col]; - } else if (row 
== 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * old_h * w) + n * (old_h * w) + 0 * (w) + col]; - } else if((row - start) % x == 0) { - const int row_index = row - ((row + 1 - start) / x); - const int output_index = ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; - //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); - } else { - const int row_index = row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); - const int output_index = ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; - //n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateRowHalf(int N, int old_h, int j, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id + // const int n = tx / (c * h * w); //output image number + const int stride = blockDim.x * gridDim.x; + // if(n < N) { + for (int i = index; i < N; i += stride) { + const int col = ((i % (c * h * w)) % (h * w)) % w; + const int row = ((i % (c * h * w)) % (h * w)) / w; + const int ch = (i % (c * h * w)) / (h * w); + const int n = i / (c * h * w); + + // const int ch = tx % (c * h * w) / (h * w); //filter number + // const int row = tx % (h * w) / w; //output height index (row number) + // const int col = tx % w; //output width index (col number) + + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * old_h * w) + ch * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + int row_index = row - ((row + 1 - start) / x); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + } else { + int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + int output_index = + n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } +__global__ void approxInterpolateRowHalf2(int N, int old_h, int j, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { -__global__ void convToGemmPerfColHalf(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ + const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id + // const int n = tx / (c * h * w); //output image numbe + const int stride = blockDim.x * gridDim.x; + // if(n < N) { + for (int i = index; i < N; i += stride) { + const int col = ((i % (c * h * w)) % (h * w)) % w; + const int row = ((i % (c * h * w)) % 
(h * w)) / w; + const int ch = (i % (c * h * w)) / (h * w); + const int n = i / (c * h * w); + + // const int ch = tx % (c * h * w) / (h * w); //filter number + // const int row = tx % (h * w) / w; //output height index (row number) + // const int col = tx % w; //output width index (col number + if (row < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * old_h * w) + n * (old_h * w) + row * (w) + col]; + } else if (row == h - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + + col]; + } else if (row == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * old_h * w) + n * (old_h * w) + 0 * (w) + col]; + } else if ((row - start) % x == 0) { + const int row_index = row - ((row + 1 - start) / x); + const int output_index = + ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; + // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - w]), 2); + } else { + const int row_index = + row - ((row + 1 - start) / x) - ((row + 1 - start) % x > 0); + const int output_index = + ch * (n * old_h * w) + n * (old_h * w) + row_index * (w) + col; + // n * (c * old_h * w) + ch * (old_h * w) + row_index * (w) + col; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; + } + } +} - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) +__global__ void convToGemmPerfColHalf( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) int w_index; - if(w < start) { + if (w < start) { w_index = w; } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - //#pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter element - - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = - input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + const int inH = h * V_stride - V_pad; // input height index (row 
number) + //#pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter element + + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; else - output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + w] = 0; - + output[((n * C * KH * KW + filter_elem_num) * H_out + h) * W_eff + + w] = 0; } } } } -__global__ void convToGemmPerfColHalf2(__half * const __restrict__ output, - const __half * const __restrict input, const int N, const int C, - const int H, const int W, const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, const int W_out, const int V_stride, - const int H_stride, const int x, const int start, const int W_eff){ - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_eff); //output image number - if(n < N) { - const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number - const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) - const int w = tx % W_eff; //output width index (col number) - int w_index; - if(w < start) { - w_index = w; - } else { - w_index = ((w - start + 1) * x) / (x - 1) + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; - } - const int inW = w_index * H_stride - H_pad; - const int inH = h * V_stride - V_pad; //input height index (row number) - //#pragma unroll - // for (int ki = 0; ki < KH * KW; ki++) { - // int i = ki / KW; - // int j = ki % KW; - for(int i = 0; i < KH; i++) { - for(int j = 0; j < KW; j++) { - const int filter_elem_num = c * KH * KW + i * KW + j; //index of this filter elemen - const int out_index = ((filter_elem_num * N + n) * H_out + h) * W_eff + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) - output[out_index] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; - else - output[out_index] = 0; - } - } +__global__ void convToGemmPerfColHalf2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, const int x, + const int start, const int W_eff) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_eff); // output image number + if (n < N) { + const int c = + tx % (C * H_out * W_eff) / (H_out * W_eff); // output chan number + const int h = + tx % (H_out * W_eff) / W_eff; // output height index (row number) + const int w = tx % W_eff; // output width index (col number) + int w_index; + if (w < start) { + w_index = w; + } else { + w_index = ((w - start + 1) * x) / (x - 1) + + (((w - start + 1) * x) % (x - 1) > 0) + start - 1; } + const int inW = w_index * H_stride - H_pad; + const int inH = h * V_stride - V_pad; // input height index (row number) + //#pragma unroll + // for (int ki = 0; ki < KH * KW; ki++) { + // int i = ki / KW; + // int j = ki % KW; + for (int i = 0; i < KH; i++) { + for (int j = 0; j < KW; j++) { + const int filter_elem_num = + c * KH * KW + i * KW + j; // index of this filter elemen + const int out_index = + ((filter_elem_num * N + n) * H_out + h) * W_eff + w; + if (inH + i >= 0 
&& inH + i < H && inW + j >= 0 && inW + j < W) + output[out_index] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[out_index] = 0; + } + } + } } +__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { -__global__ void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { + const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int stride = blockDim.x * gridDim.x; - const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); + for (int i = index; i < N; i += stride) { + const int col = ((i % (c * h * w)) % (h * w)) % w; + const int row = ((i % (c * h * w)) % (h * w)) / w; + const int ch = (i % (c * h * w)) / (h * w); + const int n = i / (c * h * w); - //const int n = tx / (c * h * w); //output image number - //if(n < N) { - //const int ch = tx % (c * h * w) / (h * w); //output chan number - //const int row = tx % (h * w) / w; //output height index (row number) + // const int n = tx / (c * h * w); //output image number + // if(n < N) { + // const int ch = tx % (c * h * w) / (h * w); //output chan number + // const int row = tx % (h * w) / w; //output height index (row number) // const int col = tx % w; //output width index (col number) - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 0) { - int col_index = col - ((col + 1 - start) / x); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } - } + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + int col_index = col - ((col + 1 - start) / x); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); + } else { + int col_index = + col - 
((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + int output_index = + n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; + } + } } -__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, int w, - __half *old_data, __half *new_data, int x, int start) { - - const int index = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int stride = blockDim.x * gridDim.x; - - for(int i = index; i < N; i += stride){ - const int col = ((i % (c * h * w)) % (h * w)) % w; - const int row = ((i % (c * h * w)) % (h * w)) / w; - const int ch = (i % (c * h * w)) / (h * w); - const int n = i / (c * h * w); - //const int n = tx / (c * h * w); //output image number - //if(n < N) { - //const int ch = tx % (c * h * w) / (h * w); //output chan number - //const int row = tx % (h * w) / w; //output height index (row number) - // const int col = tx % w; //output width index (col number) - if(col < start) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] - = old_data[ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col]; - //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; - } else if(col == w - 1) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1]; - //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; - } else if (col == 0) { - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w)]; - //n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; - } else if((col - start) % x == 0) { - const int col_index = col - ((col + 1 - start) / x); - const int output_index = ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; - //n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = - __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); - } else { - const int col_index = col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); - const int output_index = ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; - //const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; - new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = old_data[output_index]; - } +__global__ void approxInterpolateColHalf2(int N, int old_w, int b, int c, int h, + int w, __half *old_data, + __half *new_data, int x, int start) { + + const int index = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int stride = blockDim.x * gridDim.x; + + for (int i = index; i < N; i += stride) { + const int col = ((i % (c * h * w)) % (h * w)) % w; + const int row = ((i % (c * h * w)) % (h * w)) / w; + const int ch = (i % (c * h * w)) / (h * w); + const int n = i / (c * h * w); + // const int n = tx / (c * h * w); //output image number + // if(n < N) { + // const int ch = tx % (c * h * w) / (h * w); //output chan number + // const int row = tx % (h * w) / w; //output height index (row number) + // const int col = tx % w; //output width index (col number) + if (col < start) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col]; + // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col]; + } else if (col == w - 1) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + 
col] = + old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w) + + old_w - 1]; + // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w) + old_w - 1]; + } else if (col == 0) { + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (n * h * old_w) + n * (h * old_w) + row * (old_w)]; + // n * (c * h * old_w) + ch * (h * old_w) + row * (old_w)]; + } else if ((col - start) % x == 0) { + const int col_index = col - ((col + 1 - start) / x); + const int output_index = + ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; + // n * (c * h * old_w) + ch * (h * old_w) + row * old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[output_index], old_data[output_index - 1]), 2); + } else { + const int col_index = + col - ((col + 1 - start) / x) - ((col + 1 - start) % x > 0); + const int output_index = + ch * (n * h * old_w) + n * (h * old_w) + row * old_w + col_index; + // const int output_index = n * (c * h * old_w) + ch * (h * old_w) + row * + // old_w + col_index; + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[output_index]; } + } } - -__global__ void convToGemmFullInputRegular(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int ch = (fi * C) / reduced_filter_elem; - const int offset = (skip_offset + ch) % skip_every; - int in_index; - if(fi < offset) { - in_index = fi; - } else { - in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - } - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } +__global__ void +convToGemmFullInputRegular(float *const __restrict__ output, + const float *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index 
(col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int ch = (fi * C) / reduced_filter_elem; + const int offset = (skip_offset + ch) % skip_every; + int in_index; + if (fi < offset) { + in_index = fi; + } else { + in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1; + } + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; } } + } } -__global__ void convToGemmFullInputIrregular(float * const __restrict__ output, - const float * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - int in_index; - if(fi < skip_offset) { - in_index = fi; - } else { - in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmFullInputIrregular( + float *const __restrict__ output, const float *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + int in_index; + if (fi < skip_offset) { + in_index = fi; + } else { + in_index = + ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1; + } + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = 
+ ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void createReducedFiltersFullRegular(float * output, - const float * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter +__global__ void createReducedFiltersFullRegular( + float *output, const float *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int channels, const int skip_every, const int skip_offset, + const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter const int ch = (offset * channels) / reduced_filter_elem; const int channel_offset = (skip_offset + ch) % skip_every; - int in_index; - if(offset < channel_offset) { - in_index = offset; - } else { - in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1; - } - output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index]; + int in_index; + if (offset < channel_offset) { + in_index = offset; + } else { + in_index = + ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + channel_offset - 1; + } + output[fIdx * reduced_filter_elem + offset] = + fac * input[num_filter_elem * fIdx + in_index]; } } -__global__ void createReducedFiltersFullIrregular(float * output, - const float * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int fIdx = tx / reduced_filter_elem; //filter index - if(fIdx < NF) { - const int offset = tx % reduced_filter_elem; //offset within filter - int in_index; - if(offset < skip_offset) { - in_index = offset; - } else { - in_index = ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - } - output[fIdx * reduced_filter_elem + offset] = fac * input[num_filter_elem * fIdx + in_index]; +__global__ void createReducedFiltersFullIrregular( + float *output, const float *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int skip_every, const int skip_offset, const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int fIdx = tx / reduced_filter_elem; // filter index + if (fIdx < NF) { + const int offset = tx % reduced_filter_elem; // offset within filter + int in_index; + if (offset < skip_offset) { + in_index = offset; + } else { + in_index = + ((offset - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - skip_offset + 1) * skip_every) % (skip_every - 
1) > 0) + + skip_offset - 1; } + output[fIdx * reduced_filter_elem + offset] = + fac * input[num_filter_elem * fIdx + in_index]; + } } -__global__ void convToGemmHalfInputRegular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - //for(int fi = 0; fi < reduced_filter_elem; fi++) { - //const int ch = (fi * C) / reduced_filter_elem; - for(int ki = 0; ki < reduced_filter_elem / C; ki++) { - const int fi = ch * (reduced_filter_elem / C) + ki; - const int offset = (skip_offset + ch) % skip_every; - //int in_index; - const bool condition = (fi < offset); - const int in_index = condition * fi + (!condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); - //if(fi < offset) { - // in_index = fi; - //} else { - // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - // } - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } +__global__ void +convToGemmHalfInputRegular(__half *const __restrict__ output, + const __half *const __restrict input, const int N, + const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, + const int H_out, const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every, const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int ch = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + // for(int fi = 0; fi < reduced_filter_elem; fi++) { + // const int ch = (fi * C) / reduced_filter_elem; + for (int ki = 0; ki < reduced_filter_elem / C; ki++) { + const int fi = ch * (reduced_filter_elem / C) + ki; + const int offset = (skip_offset + ch) % skip_every; + // int in_index; + const bool condition = (fi < offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * 
skip_every) % (skip_every - 1) > 0) + + offset - 1); + // if(fi < offset) { + // in_index = fi; + //} else { + // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + // + offset - 1; + // } + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; } } + } } -__global__ void convToGemmHalfInputRegular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (C * H_out * W_out); //output image number - if(n < N) { - const int ch = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col number) - - #pragma unroll - for(int ki = 0; ki < reduced_filter_elem / C; ki++) { - const int fi = ch * (reduced_filter_elem / C) + ki; - //for(int fi = 0; fi < reduced_filter_elem; fi++) { - // const int ch = (fi * C) / reduced_filter_elem; - const int offset = (skip_offset + ch) % skip_every; - const int condition = (fi < offset); - const int in_index = condition * fi + (! 
condition) * (((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1); - // int in_index; - //if(fi < offset) { - // in_index = fi; - //} else { - // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - // } - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputRegular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (C * H_out * W_out); // output image number + if (n < N) { + const int ch = + tx % (C * H_out * W_out) / (H_out * W_out); // output chan number + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int ki = 0; ki < reduced_filter_elem / C; ki++) { + const int fi = ch * (reduced_filter_elem / C) + ki; + // for(int fi = 0; fi < reduced_filter_elem; fi++) { + // const int ch = (fi * C) / reduced_filter_elem; + const int offset = (skip_offset + ch) % skip_every; + const int condition = (fi < offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1); + // int in_index; + // if(fi < offset) { + // in_index = fi; + //} else { + // in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) + // + (((fi - offset + 1) * skip_every) % (skip_every - 1) > + // 0) + offset - 1; + // } + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = ((fi * N + n) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void convToGemmHalfInputIrregular(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * H_stride - H_pad; //input width index (col 
number) - - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = condition * fi + (! condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - //int in_index; - //if(fi < skip_offset) { - // in_index = fi; - //} else { - // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - // } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputIrregular( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) + +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int condition = (fi < skip_offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1); + // int in_index; + // if(fi < skip_offset) { + // in_index = fi; + //} else { + // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + // + (((fi - skip_offset + 1) * skip_every) % (skip_every - + // 1) > 0) + skip_offset - 1; + // } + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = + ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } -__global__ void convToGemmHalfInputIrregular2(__half * const __restrict__ output, - const __half * const __restrict input, - const int N, const int C, - const int H, const int W, - const int KH, const int KW, const int V_pad, - const int H_pad, const int H_out, - const int W_out, const int V_stride, - const int H_stride, const int reduced_filter_elem, - const int skip_every, const int skip_offset) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int n = tx / (H_out * W_out); //output image number - if(n < N) { - const int h = tx % (H_out * W_out) / W_out; //output height index (row number) - const int w = tx % W_out; //output width index (col number) - const int inH = h * V_stride - V_pad; //input height index (row number) - const int inW = w * 
H_stride - H_pad; //input width index (col number) - #pragma unroll - for(int fi = 0; fi < reduced_filter_elem; fi++) { - const int condition = (fi < skip_offset); - const int in_index = condition * fi + (!condition) * (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - // int in_index; - // if(fi < skip_offset) { - // in_index = fi; - // } else { - // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) - // + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1; - // } - const int ch = in_index / (KW * KH); - const int i = (in_index % (KW * KH)) / KW; - const int j = in_index % KW; - const int out_index = ((fi * N + n) * H_out + h) * W_out + w; - //const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * W_out + w; - if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { - output[out_index] = input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; - } else { - output[out_index] = 0; - } - } +__global__ void convToGemmHalfInputIrregular2( + __half *const __restrict__ output, const __half *const __restrict input, + const int N, const int C, const int H, const int W, const int KH, + const int KW, const int V_pad, const int H_pad, const int H_out, + const int W_out, const int V_stride, const int H_stride, + const int reduced_filter_elem, const int skip_every, + const int skip_offset) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int n = tx / (H_out * W_out); // output image number + if (n < N) { + const int h = + tx % (H_out * W_out) / W_out; // output height index (row number) + const int w = tx % W_out; // output width index (col number) + const int inH = h * V_stride - V_pad; // input height index (row number) + const int inW = w * H_stride - H_pad; // input width index (col number) +#pragma unroll + for (int fi = 0; fi < reduced_filter_elem; fi++) { + const int condition = (fi < skip_offset); + const int in_index = + condition * fi + + (!condition) * + (((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + + skip_offset - 1); + // int in_index; + // if(fi < skip_offset) { + // in_index = fi; + // } else { + // in_index = ((fi - skip_offset + 1) * skip_every) / (skip_every - 1) + // + (((fi - skip_offset + 1) * skip_every) % (skip_every - 1) + // > 0) + skip_offset - 1; + // } + const int ch = in_index / (KW * KH); + const int i = (in_index % (KW * KH)) / KW; + const int j = in_index % KW; + const int out_index = ((fi * N + n) * H_out + h) * W_out + w; + // const int out_index = ((n * reduced_filter_elem + fi) * H_out + h) * + // W_out + w; + if (inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) { + output[out_index] = + input[((n * C + ch) * H + (inH + i)) * W + (inW + j)]; + } else { + output[out_index] = 0; + } } + } } +__global__ void createReducedFiltersHalfRegular( + __half *output, const __half *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int channels, const int skip_every, const int skip_offset, + const float fac) { -__global__ void createReducedFiltersHalfRegular(__half * output, - const __half * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int channels, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id 
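+  // Sampling scheme used below: in_index maps each position of the reduced
+  // (sampled) filter back to its position in the original filter, dropping
+  // every skip_every-th element starting at the per-channel offset
+  // (e.g. skip_every = 3, channel_offset = 1 keeps original positions
+  // 0, 2, 3, 5, 6, ... and drops 1, 4, 7, ...). The kept weights are scaled
+  // by fac (callers pass skip_every / (skip_every - 1)) to compensate for
+  // the dropped elements.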
+ const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id const int stride = blockDim.x * gridDim.x; - + //#pragma unroll for (int i = tx; i < NF; i += stride) { - const int fIdx = i / reduced_filter_elem; //filter index - //if(fIdx < NF) { - const int offset = i % reduced_filter_elem; //offset within filter + const int fIdx = i / reduced_filter_elem; // filter index + // if(fIdx < NF) { + const int offset = i % reduced_filter_elem; // offset within filter const int ch = (offset * channels) / reduced_filter_elem; const int channel_offset = (skip_offset + ch) % skip_every; const int condition = (offset < channel_offset); - const int in_index = condition * offset + (!condition) * (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset - 1); - - // int in_index; - // if(offset < channel_offset) { - // in_index = offset; - //} else { - // in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - 1) - // + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > 0) + channel_offset -1; + const int in_index = + condition * offset + + (!condition) * + (((offset - channel_offset + 1) * skip_every) / (skip_every - 1) + + (((offset - channel_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + channel_offset - 1); + + // int in_index; + // if(offset < channel_offset) { + // in_index = offset; + //} else { + // in_index = ((offset - channel_offset + 1) * skip_every) / (skip_every - + // 1) + // + (((offset - channel_offset + 1) * skip_every) % (skip_every - + // 1) > 0) + channel_offset -1; // } - output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); + output[fIdx * reduced_filter_elem + offset] = + __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); } } -__global__ void createReducedFiltersHalfIrregular(__half * output, - const __half * const __restrict input, const int NF, - const int num_filter_elem, const int reduced_filter_elem, - const int skip_every, const int skip_offset, const float fac) { - - const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id - const int stride = blockDim.x * gridDim.x; - //#pragma unroll - for (int i = tx; i < NF; i += stride) { - - const int fIdx = i / reduced_filter_elem; //filter index +__global__ void createReducedFiltersHalfIrregular( + __half *output, const __half *const __restrict input, const int NF, + const int num_filter_elem, const int reduced_filter_elem, + const int skip_every, const int skip_offset, const float fac) { + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; // thread id + const int stride = blockDim.x * gridDim.x; + //#pragma unroll + for (int i = tx; i < NF; i += stride) { + + const int fIdx = i / reduced_filter_elem; // filter index // if(fIdx < NF) { - const int offset = i % reduced_filter_elem; //offset within filter - const int condition = (offset < skip_offset); - int in_index = condition * offset + (!condition) * (((offset - skip_offset + 1) * skip_every) / (skip_every - 1) - + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > 0) + skip_offset - 1); - //} - output[fIdx * reduced_filter_elem + offset] = __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); + const int offset = i % reduced_filter_elem; // offset within filter + const int condition = (offset < skip_offset); + int in_index = + condition * offset + + (!condition) * + (((offset - skip_offset + 1) * skip_every) 
/ (skip_every - 1) + + (((offset - skip_offset + 1) * skip_every) % (skip_every - 1) > + 0) + + skip_offset - 1); + //} + output[fIdx * reduced_filter_elem + offset] = + __hmul(__float2half_rn(fac), input[num_filter_elem * fIdx + in_index]); //} } } -void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int start){ +void *tensorConvPerfCuda(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int start) { //////INFO("*** TensorConvolution (output perforation) \n"); - //Event("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + // Event("Conv"); + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } - - Tensor* output; + + Tensor *output; // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); - //Event("H2F_start"); + // Event("H2F_start"); convertToFP32(input); convertToFP32(filter); - //Event("H2F_end"); - + // Event("H2F_end"); + long int n, c, h, w; // output dimensions n = input->dims.dim_sizes[0]; - c = filter->dims.dim_sizes[0]; //number of filters + c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; int rem_row = (h - start) % row > 0; int h_eff = h - ((h - start) / row) - rem_row; - - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; int rem_col = (w - start) % col > 0; int w_eff = w - ((w - start) / col) - rem_col; - Tensor* new_output; - if(row > 1){ - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); + Tensor *new_output; + if (row > 1) { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float* convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - - convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, - h, w, - vertical_stride, horizontal_stride, - row, start, h_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + + convToGemmPerfRow<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, 
vertical_stride, horizontal_stride, + row, start, h_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - &alpha, - convData, h_eff * w, - num_filter_elem * h_eff * w, - (float *)filter->gpu_data, - num_filter_elem, 0, - &beta, - (float *)output->gpu_data, - h_eff * w, c * h_eff * w, - n)); - - new_output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); + + new_output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - //interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateRow<<<numBlocks,128>>>(n * c * h * w, h_eff, n, c, h, w, - (float *) output->gpu_data, - (float *) new_output->gpu_data, - row, start); + // interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateRow<<<numBlocks, 128>>>( + n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, row, start); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(col > 1){ - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + } else if (col > 1) { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - col, start, w_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfCol<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + col, start, w_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - &alpha, - convData, - h * w_eff, num_filter_elem * h * w_eff, - (float *)filter->gpu_data, - num_filter_elem, 0, - &beta, - (float *)output->gpu_data, - h * w_eff, c * h * w_eff, - n)); - - new_output = (Tensor*) 
create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + &alpha, convData, h * w_eff, num_filter_elem * h * w_eff, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); + + new_output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - //interpolate - int numBlocks = (n * c * h * w + 127) / 128; - approxInterpolateCol<<<numBlocks,128>>>(n * c * h * w, w_eff, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data, - col, start); + // interpolate + int numBlocks = (n * c * h * w + 127) / 128; + approxInterpolateCol<<<numBlocks, 128>>>( + n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, col, start); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else { - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + } else { + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - //total number of filter elem + // total number of filter elem const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - convToGemmApprox<<<gridSize, blockSize>>>(convData, - (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - num_filter_elem, c * h * w); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + convToGemmApprox<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + num_filter_elem, c * h * w); checkCudaErrors(cudaDeviceSynchronize()); - //Do the matrix multiplication - //Want to multiply convData by filter->gpu_data[f * chan * KH * KW] - + // Do the matrix multiplication + // Want to multiply convData by filter->gpu_data[f * chan * KH * KW] + float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h * w, c * h * w, - n)); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, num_filter_elem, + &alpha, convData, h * w, num_filter_elem * h * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w, c * h * w, n)); new_output = output; cudaFree(convData); } - //Event("Conv_end"); //, true); + 
// Event("Conv_end"); //, true); return new_output; } -__global__ -void switchMatrixFull(int N, int n, int c, int h, int w, - float *old_data, float *new_data){ - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < N){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n_new = i / (c * h * w); - - new_data[((n_new * c + ch) * h + row ) * w + col] = - old_data[((ch * n + n_new) * h + row ) * w + col]; - } -} +__global__ void switchMatrixFull(int N, int n, int c, int h, int w, + float *old_data, float *new_data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n_new = i / (c * h * w); -void* tensorConvApprox(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, int conv_groups, - int row, int col, int skip_every, int offset){ + new_data[((n_new * c + ch) * h + row) * w + col] = + old_data[((ch * n + n_new) * h + row) * w + col]; + } +} + +void *tensorConvApprox(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int skip_every, int offset) { //////INFO("*** TensorConvolution approximation \n"); - //Event("Conv"); + // Event("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } @@ -1316,15 +1445,18 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, ////Event("H2F_end"); const int n = input->dims.dim_sizes[0]; - const int c = filter->dims.dim_sizes[0]; //number of filters + const int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + const int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const int w = + (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); ////INFO("batch: %d\n", n); @@ -1337,327 +1469,299 @@ void* tensorConvApprox(void* input_ptr, void* filter_ptr, ////INFO("horizontal_stride: %d\n", horizontal_stride); ////INFO("output height: %d\n", h); ////INFO("output width: %d\n", w); - if(row > 1) { + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; - Tensor *output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h_eff, w); + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + 
CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w)); - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - convToGemmPerfRow<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, - vertical_stride, horizontal_stride, - row, offset, h_eff); + ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h_eff * w)); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + convToGemmPerfRow<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + row, offset, h_eff); checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - &alpha, - convData, h_eff * w, num_filter_elem * h_eff * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h_eff * w, c * h_eff * w, - n)); - //interpolate + + float alpha = 1.0f, beta = 0.0f; + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + &alpha, convData, h_eff * w, num_filter_elem * h_eff * w, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h_eff * w, c * h_eff * w, n)); + // interpolate int blocksize = 128; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - approxInterpolateRow<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (float *) output->gpu_data, - (float *) new_output->gpu_data, - row, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + approxInterpolateRow<<<numBlocks, blocksize>>>( + n * c * h * w, h_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, row, offset); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(col > 1) { + } else if (col > 1) { const int rem_col = (w - offset) % col > 0; const int w_eff = w - ((w - offset) / col) - rem_col; - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff)); - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - - convToGemmPerfCol<<<gridSize, blockSize>>>(convData, (float 
*)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, - vertical_pad, horizontal_pad, h, w, - vertical_stride, horizontal_stride, - col, offset, w_eff); + ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w_eff)); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfCol<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + col, offset, w_eff); checkCudaErrors(cudaDeviceSynchronize()); float alpha = 1.0f, beta = 0.0f; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - &alpha, - convData, h * w_eff, num_filter_elem * h * w_eff, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)output->gpu_data, h * w_eff, c * h * w_eff, - n)); - - //interpolate + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + &alpha, convData, h * w_eff, num_filter_elem * h * w_eff, + (float *)filter->gpu_data, num_filter_elem, 0, &beta, + (float *)output->gpu_data, h * w_eff, c * h * w_eff, n)); + + // interpolate int blocksize = 128; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - approxInterpolateCol<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data, - col, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + approxInterpolateCol<<<numBlocks, blocksize>>>( + n * c * h * w, w_eff, n, c, h, w, (float *)output->gpu_data, + (float *)new_output->gpu_data, col, offset); cudaDeviceSynchronize(); freeTensor(output); cudaFree(convData); - } else if(skip_every > 1) { - //reduced number after skipping + } else if (skip_every > 1) { + // reduced number after skipping const int remainder = ((num_filter_elem - offset) % skip_every > 0); - const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; + const int reduced_filter_elem = + num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; - float* convData; + float *convData; size_t convDataSize = sizeof(float) * n * reduced_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - float* reducedFilter; - checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); - + float *reducedFilter; + checkCudaErrors( + cudaMalloc(&reducedFilter, sizeof(float) * c * reduced_filter_elem)); + const int filtBlockSize = 128; ////INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); - const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; - const float fac = ((float) skip_every) / ((float) skip_every - 1); + const int filtGridSize = + (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const float fac = ((float)skip_every) / ((float)skip_every - 1); //////INFO("fac: %f\n", fac); const int blockSize = 128; - //////INFO("n * h * w : %d\n", (n * h * w )); - const int gridSize = (n * h * w + blockSize - 1) / blockSize; - if(!(KH * KW % skip_every)) { - // ////INFO("REGULAR FILTERING\n"); - createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (float *)filter->gpu_data, - c, num_filter_elem, - 
reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - convToGemmFullInputRegular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + //////INFO("n * h * w : %d\n", (n * h * w )); + const int gridSize = (n * h * w + blockSize - 1) / blockSize; + if (!(KH * KW % skip_every)) { + // ////INFO("REGULAR FILTERING\n"); + createReducedFiltersFullRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); + checkCudaErrors(cudaDeviceSynchronize()); + convToGemmFullInputRegular<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, reduced_filter_elem, skip_every, offset); } else { - // ////INFO("IRREGULAR FILTERING\n"); - createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (float *)filter->gpu_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - convToGemmFullInputIrregular<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + // ////INFO("IRREGULAR FILTERING\n"); + createReducedFiltersFullIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (float *)filter->gpu_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); + checkCudaErrors(cudaDeviceSynchronize()); + convToGemmFullInputIrregular<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, reduced_filter_elem, skip_every, offset); } checkCudaErrors(cudaDeviceSynchronize()); - + const float alpha = 1.0; const float beta = 0.0; - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, reduced_filter_elem, - &alpha, - convData, h * w, reduced_filter_elem * h * w, - reducedFilter, reduced_filter_elem, 0, - &beta, - (float *)new_output->gpu_data, h * w, c * h * w, - n)); + checkCudaErrors(cublasSgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem, + &alpha, convData, h * w, reduced_filter_elem * h * w, reducedFilter, + reduced_filter_elem, 0, &beta, (float *)new_output->gpu_data, h * w, + c * h * w, n)); cudaFree(convData); cudaFree(reducedFilter); } else { - //INFO("FP32 BASELINE\n"); - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + // INFO("FP32 BASELINE\n"); + Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); changeTensorPlacement(new_output, DEVICE); - float * convData; + float *convData; long int convDataSize = sizeof(float) * n * num_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); const int blockSize = 128; - const int 
gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * input->dims.dim_sizes[1] * h * w)); - convToGemmFullInput<<<gridSize, blockSize>>>(convData, (float *)input->gpu_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - skip_every, offset);//num_filter_elem); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + //////INFO("n * input->dims.dim_sizes[1] * h * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w)); + convToGemmFullInput<<<gridSize, blockSize>>>( + convData, (float *)input->gpu_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + skip_every, offset); // num_filter_elem); checkCudaErrors(cudaDeviceSynchronize()); - - float alpha = 1.0f, beta = 0.0f; - /* - checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, num_filter_elem, - &alpha, - convData, h * w, num_filter_elem * h * w, - (float *)filter->gpu_data, num_filter_elem, 0, - &beta, - (float *)new_output->gpu_data, h * w, c * h * w, - n)); - */ - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, num_filter_elem, - &alpha, - convData, - CUDA_R_32F, n * h * w, - (float *) filter->gpu_data, CUDA_R_32F, - num_filter_elem, - &beta, - (float *) output->gpu_data, - CUDA_R_32F, n * h * w, - CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - const int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixFull<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, - (float *)output->gpu_data, - (float *)new_output->gpu_data); - + + float alpha = 1.0f, beta = 0.0f; + /* + checkCudaErrors(cublasSgemmStridedBatched(cublasHandle, + CUBLAS_OP_N, CUBLAS_OP_N, + h * w, c, num_filter_elem, + &alpha, + convData, h * w, num_filter_elem * h + * w, (float *)filter->gpu_data, num_filter_elem, 0, &beta, (float + *)new_output->gpu_data, h * w, c * h * w, n)); + */ + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, + &alpha, convData, CUDA_R_32F, n * h * w, (float *)filter->gpu_data, + CUDA_R_32F, num_filter_elem, &beta, (float *)output->gpu_data, + CUDA_R_32F, n * h * w, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixFull<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (float *)output->gpu_data, + (float *)new_output->gpu_data); + checkCudaErrors(cudaDeviceSynchronize()); cudaFree(convData); } - //Event("Conv_end"); + // Event("Conv_end"); return new_output; } -__global__ -void switchMatrixHalf(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){ - - int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < N){ - int col = ((i % (c * h * w)) % (h * w)) % w; - int row = ((i % (c * h * w)) % (h * w)) / w; - int ch = (i % (c * h * w)) / (h * w); - int n_new = i / (c * h * w); - - new_data[((n_new * c + ch) * h + row ) * w + col] = - old_data[((ch * n + n_new) * h + row ) * w + col]; - } -} +__global__ void switchMatrixHalf(int N, int n, int c, int h, int w, + __half *old_data, __half *new_data) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < N) { + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * 
w)) / (h * w); + int n_new = i / (c * h * w); -void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, - int row, int col, int skip_every, int offset) { + new_data[((n_new * c + ch) * h + row) * w + col] = + old_data[((ch * n + n_new) * h + row) * w + col]; + } +} + +void *tensorConvApproxHalf2(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups, int row, int col, int skip_every, + int offset) { - //INFO("*** TensorConvolution half approximation \n"); - // profileEvent("#Conv"); + // INFO("*** TensorConvolution half approximation \n"); + // profileEvent("#Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } hostToDeviceCopy(input); hostToDeviceCopy(filter); - // INFO("CONVERT\n"); + // INFO("CONVERT\n"); profileEvent("F2H_start"); - convertToFP16(input); - convertToFP16(filter); + convertToFP16(input); + convertToFP16(filter); profileEvent("F2H_end"); -//INFO("CONVERTED\n"); + // INFO("CONVERTED\n"); const long int n = input->dims.dim_sizes[0]; - const long int c = filter->dims.dim_sizes[0]; //number of filters + const long int c = filter->dims.dim_sizes[0]; // number of filters const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - const long int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - const long int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + const long int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + const long int w = + (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + + 1; const long int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; - Tensor *new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *new_output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(new_output, DEVICE); - //INFO("batch: %d\n", n); + // INFO("batch: %d\n", n); // INFO("channels: %d\n", input->dims.dim_sizes[1]); // INFO("num_filters: %d\n", c); // INFO("kernel height: %d\n", KH); - // INFO("kernel width: %d\n", KW); + // INFO("kernel width: %d\n", KW); // INFO("num_filter_elem: %d\n", num_filter_elem); - //INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); - //INFO("vertical_stride: %d\n", vertical_stride); - //INFO("horizontal_stride: %d\n", horizontal_stride); + // INFO("num_filters * num_filter_elem: %d\n", c * num_filter_elem); + // INFO("vertical_stride: %d\n", vertical_stride); + // INFO("horizontal_stride: %d\n", horizontal_stride); // INFO("output height: %d\n", h); // INFO("output width: %d\n", w); - //INFO("skip_every: %d\n", skip_every); - if(row > 1){ + // INFO("skip_every: %d\n", skip_every); + if (row > 1) { const int rem_row = (h - offset) % row > 0; const int h_eff = h - ((h - offset) / row) - rem_row; - - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, - n, c, h_eff, w); + + Tensor 
*output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h_eff, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output_half, DEVICE); - __half * convData; + __half *convData; long int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * input->dims.dim_sizes[1] * h_eff * w)); + ////INFO("n * input->dims.dim_sizes[1] * h_eff * w: %d\n", (n * + /// input->dims.dim_sizes[1] * h_eff * w)); const int blockSize = 256; - const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; - - if(h * w <= 64) { - convToGemmPerfRowHalf2<<<gridSize, blockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, row, offset, h_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + + if (h * w <= 64) { + convToGemmPerfRowHalf2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, row, offset, h_eff); } else { - convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, row, offset, h_eff); + convToGemmPerfRowHalf<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, row, offset, h_eff); } checkCudaErrors(cudaDeviceSynchronize()); @@ -1665,74 +1769,68 @@ void* tensorConvApproxHalf2(void* input_ptr, void* filter_ptr, const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if(h * w <= 64) { - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h_eff * w, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h_eff * w, - (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + if (h * w <= 64) { + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h_eff * w, c, + num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h_eff * w, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F, + n * h_eff * w, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h_eff * w, c, num_filter_elem, - alpha_half, - convData, h_eff * w, num_filter_elem * h_eff * w, - (__half *)filter->gpu_half_data, num_filter_elem, 0, - beta_half, - (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, - n)); + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h_eff * w, c, num_filter_elem, + alpha_half, convData, h_eff * w, num_filter_elem * h_eff * w, + (__half *)filter->gpu_half_data, 
num_filter_elem, 0, beta_half, + (__half *)output_half->gpu_half_data, h_eff * w, c * h_eff * w, n)); } - //interpolate + // interpolate int blocksize = 256; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - if(h * w <= 64) { - approxInterpolateRowHalf2<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - row, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + if (h * w <= 64) { + approxInterpolateRowHalf2<<<numBlocks, blocksize>>>( + n * c * h * w, h_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, row, offset); } else { - approxInterpolateRowHalf<<<numBlocks,blocksize>>>(n * c * h * w, h_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - row, offset); + approxInterpolateRowHalf<<<numBlocks, blocksize>>>( + n * c * h * w, h_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, row, offset); } checkCudaErrors(cudaDeviceSynchronize()); freeTensor(output_half); cudaFree(convData); -} else if(col > 1) { + } else if (col > 1) { const int rem_col = (w - offset) % col > 0; const int w_eff = w - ((w - offset) / col) - rem_col; - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w_eff); + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w_eff); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output_half, DEVICE); - - __half * convData; + + __half *convData; long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * input->dims.dim_sizes[1] * h * w_eff)); + ////INFO("n * input->dims.dim_sizes[1] * h * w_eff: %d\n", (n * + /// input->dims.dim_sizes[1] * h * w_eff)); const int blockSize = 256; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; - if(h * w <= 64) { - convToGemmPerfColHalf2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, col, offset, w_eff); + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + if (h * w <= 64) { + convToGemmPerfColHalf2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, col, offset, w_eff); } else { - convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, col, offset, w_eff); + convToGemmPerfColHalf<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, + horizontal_stride, col, offset, w_eff); } checkCudaErrors(cudaDeviceSynchronize()); @@ -1740,229 +1838,211 @@ void* tensorConvApproxHalf2(void* input_ptr, void* 
filter_ptr, const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if(h * w <= 64) { - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w_eff, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w_eff, - (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + if (h * w <= 64) { + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w_eff, c, + num_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w_eff, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, (__half *)output_half->gpu_half_data, CUDA_R_16F, + n * h * w_eff, CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w_eff, c, num_filter_elem, - alpha_half, - convData, h * w_eff, num_filter_elem * h * w_eff, - (__half *)filter->gpu_half_data, num_filter_elem, 0, - beta_half, - (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, - n)); + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w_eff, c, num_filter_elem, + alpha_half, convData, h * w_eff, num_filter_elem * h * w_eff, + (__half *)filter->gpu_half_data, num_filter_elem, 0, beta_half, + (__half *)output_half->gpu_half_data, h * w_eff, c * h * w_eff, n)); } - //interpolate + // interpolate int blocksize = 256; - int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; - if(h * w <= 64) { - approxInterpolateColHalf2<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - col, offset); + int numBlocks = (n * c * h * w + blocksize - 1) / blocksize; + if (h * w <= 64) { + approxInterpolateColHalf2<<<numBlocks, blocksize>>>( + n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, col, offset); } else { - approxInterpolateColHalf<<<numBlocks,blocksize>>>(n * c * h * w, w_eff, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data, - col, offset); - } - checkCudaErrors(cudaDeviceSynchronize()); + approxInterpolateColHalf<<<numBlocks, blocksize>>>( + n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, col, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); freeTensor(output_half); cudaFree(convData); - } else if(skip_every > 1) { + } else if (skip_every > 1) { const int remainder = ((num_filter_elem - offset) % skip_every > 0); - const int reduced_filter_elem = num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; + const int reduced_filter_elem = + num_filter_elem - ((num_filter_elem - offset) / skip_every) - remainder; - __half* convData; + __half *convData; size_t convDataSize = sizeof(__half) * n * reduced_filter_elem * h * w; checkCudaErrors(cudaMalloc(&convData, convDataSize)); - __half* reducedFilter; - checkCudaErrors(cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem)); + __half *reducedFilter; + checkCudaErrors( + cudaMalloc(&reducedFilter, sizeof(__half) * c * reduced_filter_elem)); const int filtBlockSize = 256; - const int filtGridSize = (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; - const float fac = ((float) skip_every) / ((float) skip_every - 
1); + const int filtGridSize = + (c * reduced_filter_elem + filtBlockSize - 1) / filtBlockSize; + const float fac = ((float)skip_every) / ((float)skip_every - 1); const int blockSize = 256; - //const int gridSize = (n * h * w + blockSize - 1) / blockSize; - // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); - // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); + // const int gridSize = (n * h * w + blockSize - 1) / blockSize; + // INFO("reduced_filter_elem: %d\n", (reduced_filter_elem)); + // INFO("c * reduced_filter_elem: %d\n", (c * reduced_filter_elem)); const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - if(c * num_filter_elem < 500000) {//250) {//c * reduced_filter_elem < 150000) { - if(!(KH * KW % skip_every)) { - //INFO("REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); + if (c * num_filter_elem < + 500000) { // 250) {//c * reduced_filter_elem < 150000) { + if (!(KH * KW % skip_every)) { + // INFO("REGULAR FILTERING\n"); + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputRegular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputRegular<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); } else { - //INFO("IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); + // INFO("IRREGULAR FILTERING\n"); + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - //convToGemmHalfInputIrregular - convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasHgemmStridedBatched(cublasHandle, - CUBLAS_OP_N, CUBLAS_OP_N, - h * w, c, reduced_filter_elem, - alpha_half, - convData, h * w, reduced_filter_elem * h * w, - reducedFilter, reduced_filter_elem, 0, - 
beta_half, - (__half *)new_output->gpu_half_data, h * w, c * h * w, - n)); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + // convToGemmHalfInputIrregular + convToGemmHalfInputNewIrregular<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasHgemmStridedBatched( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, h * w, c, reduced_filter_elem, + alpha_half, convData, h * w, reduced_filter_elem * h * w, + reducedFilter, reduced_filter_elem, 0, beta_half, + (__half *)new_output->gpu_half_data, h * w, c * h * w, n)); } else { - Tensor *output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - changeTensorPlacement(output_half, DEVICE); - - if(!(KH * KW % skip_every)) { - // INFO("REGULAR FILTERING\n"); - createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - input->dims.dim_sizes[1], skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputRegular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } else { - //INFO("IRREGULAR FILTERING\n"); - createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>(reducedFilter, - (__half *)filter->gpu_half_data, - c, num_filter_elem, - reduced_filter_elem, - skip_every, offset, fac); - checkCudaErrors(cudaDeviceSynchronize()); - - const int gridSize = (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; - convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, horizontal_pad, - h, w, vertical_stride, horizontal_stride, - reduced_filter_elem, skip_every, offset); - } - checkCudaErrors(cudaDeviceSynchronize()); - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, reduced_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w, - reducedFilter, CUDA_R_16F, reduced_filter_elem, - beta_half, - (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - - int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, - (__half *)output_half->gpu_half_data, - (__half *)new_output->gpu_half_data); - checkCudaErrors(cudaDeviceSynchronize()); - - freeTensor(output_half); + Tensor *output_half = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, CUDNN_TENSOR_NCHW, n, c, h, w); + changeTensorPlacement(output_half, DEVICE); + + if (!(KH * KW % skip_every)) { + // INFO("REGULAR FILTERING\n"); + createReducedFiltersHalfRegular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, input->dims.dim_sizes[1], skip_every, offset, + fac); + 
checkCudaErrors(cudaDeviceSynchronize()); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputRegular2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } else { + // INFO("IRREGULAR FILTERING\n"); + createReducedFiltersHalfIrregular<<<filtGridSize, filtBlockSize>>>( + reducedFilter, (__half *)filter->gpu_half_data, c, num_filter_elem, + reduced_filter_elem, skip_every, offset, fac); + checkCudaErrors(cudaDeviceSynchronize()); + + const int gridSize = + (n * h * w * input->dims.dim_sizes[1] + blockSize - 1) / blockSize; + convToGemmHalfInputNewIrregular2<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, + w, vertical_stride, horizontal_stride, reduced_filter_elem, + skip_every, offset); + } + checkCudaErrors(cudaDeviceSynchronize()); + + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, + reduced_filter_elem, alpha_half, convData, CUDA_R_16F, n * h * w, + reducedFilter, CUDA_R_16F, reduced_filter_elem, beta_half, + (__half *)output_half->gpu_half_data, CUDA_R_16F, n * h * w, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data); + checkCudaErrors(cudaDeviceSynchronize()); + + freeTensor(output_half); } - + cudaFree(convData); cudaFree(reducedFilter); } else { // INFO("BASELINE\n"); - Tensor *output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - - changeTensorPlacement(output, DEVICE); - __half * convData; - long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w; - checkCudaErrors(cudaMalloc(&convData, convDataSize)); - - const int blockSize = 256; - const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; - //convToGemmHalf - convToGemmHalfInputNew<<<gridSize, blockSize>>>(convData, - (__half *)input->gpu_half_data, n, - input->dims.dim_sizes[1], - input->dims.dim_sizes[2], - input->dims.dim_sizes[3], - KH, KW, vertical_pad, - horizontal_pad, h, w, vertical_stride, - horizontal_stride, num_filter_elem, - skip_every, offset); - checkCudaErrors(cudaDeviceSynchronize()); - - const __half alf = approx_float_to_half(1.0); - const __half bet = approx_float_to_half(0.0); - const __half *alpha_half = &alf; - const __half *beta_half = &bet; - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n * h * w, c, num_filter_elem, - alpha_half, - convData, CUDA_R_16F, n * h * w, - (__half *) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, - beta_half, - (__half *) output->gpu_half_data, CUDA_R_16F, n * h * w, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - const int numBlocks = (n * c * h * w + 255) / 256; - switchMatrixHalf<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, (__half *)output->gpu_half_data, - (__half *)new_output->gpu_half_data); - checkCudaErrors(cudaDeviceSynchronize()); - - freeTensor(output); - cudaFree(convData); + Tensor *output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + 
changeTensorPlacement(output, DEVICE); + __half *convData; + long int convDataSize = sizeof(__half) * n * num_filter_elem * h * w; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 256; + const int gridSize = + (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + // convToGemmHalf + convToGemmHalfInputNew<<<gridSize, blockSize>>>( + convData, (__half *)input->gpu_half_data, n, input->dims.dim_sizes[1], + input->dims.dim_sizes[2], input->dims.dim_sizes[3], KH, KW, + vertical_pad, horizontal_pad, h, w, vertical_stride, horizontal_stride, + num_filter_elem, skip_every, offset); + checkCudaErrors(cudaDeviceSynchronize()); + + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha_half = &alf; + const __half *beta_half = &bet; + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n * h * w, c, num_filter_elem, + alpha_half, convData, CUDA_R_16F, n * h * w, + (__half *)filter->gpu_half_data, CUDA_R_16F, num_filter_elem, beta_half, + (__half *)output->gpu_half_data, CUDA_R_16F, n * h * w, CUDA_R_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + const int numBlocks = (n * c * h * w + 255) / 256; + switchMatrixHalf<<<numBlocks, 256>>>(n * c * h * w, n, c, h, w, + (__half *)output->gpu_half_data, + (__half *)new_output->gpu_half_data); + checkCudaErrors(cudaDeviceSynchronize()); + + freeTensor(output); + cudaFree(convData); } -// INFO("CONV DONE\n"); + // INFO("CONV DONE\n"); profileEvent("H2F_start"); convertToFP32_offline(new_output); - //convertToFP32(input); - //convertToFP32(filter); + // convertToFP32(input); + // convertToFP32(filter); profileEvent("H2F_end"); - //profileEvent("#Conv_end"); - //INFO("CONVOLUTION END\n"); + // profileEvent("#Conv_end"); + // INFO("CONVOLUTION END\n"); return new_output; } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp index fd1492fe68e8833ea4cdca4d5df6518b6ec3b37c..c18ffcea26f93fe752500983f4d4a3fcfe59ded2 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/configuration.cpp @@ -1,13 +1,13 @@ -//===--------------------------- configuration.cpp -------------------------===// +//===--------------------------- configuration.cpp +//-------------------------===// // //===----------------------------------------------------------------------===// -// -// This file consists of the definitions of API to get information about +// +// This file consists of the definitions of API to get information about // configurations for rest of the tensor runtime to use. 
// //===----------------------------------------------------------------------===// - #include "configuration.h" using G_APPROX = GPUNodeConfiguration::APPROX; @@ -31,9 +31,8 @@ void GPUNodeConfiguration::pushNewTensorOperation(G_TENSOR_OP top) { void GPUNodeConfiguration::pushNewApproximationChoiceForOperation( G_APPROX approx, int u) { unsigned size = ApproxChoices.size(); - CUSTOM_ASSERT( - size >= 1 && - "Cannot apply approximation choice to non existent operation."); + CUSTOM_ASSERT(size >= 1 && + "Cannot apply approximation choice to non existent operation."); ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u)); } @@ -55,9 +54,8 @@ void CPUNodeConfiguration::pushNewTensorOperation(C_TENSOR_OP top) { void CPUNodeConfiguration::pushNewApproximationChoiceForOperation( C_APPROX approx, int u) { unsigned size = ApproxChoices.size(); - CUSTOM_ASSERT( - size >= 1 && - "Cannot apply approximation choice to non existent operation."); + CUSTOM_ASSERT(size >= 1 && + "Cannot apply approximation choice to non existent operation."); ApproxChoices[size - 1].second.push_back(std::make_pair(approx, u)); } @@ -71,8 +69,8 @@ CPUNodeConfiguration::CPUNodeConfiguration() { } CPUNodeConfiguration::~CPUNodeConfiguration() {} -Configuration::Configuration( - std::string &n, float f, float e, float a, float al) +Configuration::Configuration(std::string &n, float f, float e, float a, + float al) : name(n), speedup(f), energy(e), accuracy(a), accuracyLoss(al) {} float Configuration::getSpeedup() { return speedup; } @@ -82,20 +80,20 @@ float Configuration::getEnergy() { return energy; } float Configuration::getAccuracy() { return accuracy; } float Configuration::getAccuracyLoss() { return accuracyLoss; } -bool ConfigurationLessThan:: -operator()(const struct Configuration &a, const struct Configuration &b) const { +bool ConfigurationLessThan::operator()(const struct Configuration &a, + const struct Configuration &b) const { return (a.accuracyLoss < b.accuracyLoss); } -bool ConfigurationLessThan_AL:: -operator()(const struct Configuration *a, const float &b) const { +bool ConfigurationLessThan_AL::operator()(const struct Configuration *a, + const float &b) const { return (a->accuracyLoss < b); } -bool ConfigurationLessThan_SP:: -operator()(const struct Configuration *a, const float &b) const { +bool ConfigurationLessThan_SP::operator()(const struct Configuration *a, + const float &b) const { return (a->speedup < b); } -bool ConfigurationLessThan_E:: -operator()(const struct Configuration *a, const float &b) const { +bool ConfigurationLessThan_E::operator()(const struct Configuration *a, + const float &b) const { return (a->energy < b); } @@ -286,9 +284,8 @@ void CPUNodeConfiguration::print() { void Configuration::print() { printf("+++++\n"); - printf( - "%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy, - accuracyLoss); + printf("%s %f %f %f %f\n", name.c_str(), speedup, energy, accuracy, + accuracyLoss); for (std::map<std::string, NodeConfiguration *>::const_iterator it = setup.begin(); it != setup.end(); ++it) { diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc index 3e4aecb824a93b932ef2146380b86496f71b0f28..1abf5432b99b2c23acc57001f389ebad851c3846 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/debug.cc @@ -5,7 +5,7 @@ #define LOG_DEBUG 0 // Sets the debug logging to true #define LOG_INFO 1 // Sets the info logging to true -#define 
LOG_ERROR 1 // Print Errors +#define LOG_ERROR 1 // Print Errors #define ASSERT_FLAG // Sets assertions to true (opposite of NDEBUG macro) #include "debug.h" diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu index 0e05813bb6eb5de86057bf3b2066c8fd98642e8d..032443bd7a63a1640e463c0457dd362e09733be3 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu @@ -12,8 +12,8 @@ #define CASE_FUNC(ename, fname) \ case MathOp::ename: { \ void *v_func_ptr = nullptr; \ - checkCudaErrors(cudaMemcpyFromSymbol( \ - &v_func_ptr, _internal::fname##_ptr, sizeof(void *))); \ + checkCudaErrors(cudaMemcpyFromSymbol(&v_func_ptr, _internal::fname##_ptr, \ + sizeof(void *))); \ return v_func_ptr; \ } @@ -120,7 +120,7 @@ template <> void *mathOpToFunc<float2>(MathOp op) { CASE_FUNC(Mul, f2mul) default: ERROR("Float2 function not found\n"); - return nullptr; // For some compilers + return nullptr; // For some compilers } } @@ -129,7 +129,7 @@ template <> void *mathOpToFunc<half2>(MathOp op) { CASE_FUNC(Mul, h2mul) default: ERROR("Half2 function not found\n"); - return nullptr; // For some compilers + return nullptr; // For some compilers } } @@ -151,7 +151,7 @@ template <> void *mathOpToFunc<float>(MathOp op) { default: ERROR("Float function not found\n"); } - return nullptr; // For some compilers + return nullptr; // For some compilers } template <> void *mathOpToFunc<half>(MathOp op) { @@ -169,7 +169,7 @@ template <> void *mathOpToFunc<half>(MathOp op) { default: ERROR("Half function not found\n"); } - return nullptr; // For some compilers + return nullptr; // For some compilers } template <> half reduceOpToIdentity<half>(MathOp op) { @@ -185,5 +185,5 @@ template <> half reduceOpToIdentity<half>(MathOp op) { default: ERROR("Operator does not have id value\n"); } - return 0.0f; // For some compilers + return 0.0f; // For some compilers } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu index 4afed4c287e5c282fd6a4f43f7c4231e6b558fb4..638e06e786a8d8e4c587d4bda5d0223fa386f39a 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/error.cu @@ -2,7 +2,6 @@ #ifndef ERROR_HEADER #define ERROR_HEADER - #include <stdio.h> #include <stdarg.h> #include <cstdio> @@ -23,7 +22,6 @@ #include <math.h> #include <assert.h> - #include "debug.h" #include "tensor.h" #include "profiling.h" @@ -31,39 +29,33 @@ #include "global_data.h" #include "error.h" +extern "C" { +void readSkipTensors(int *skip_tensor_ids, int op_count) { -extern "C"{ - - -void readSkipTensors(int* skip_tensor_ids, int op_count){ - - for(int i = 0; i < op_count; i++){ + for (int i = 0; i < op_count; i++) { int tensor_id = skip_tensor_ids[i]; skip_tensors[tensor_id] = 1; } - } - - -void readOpenTunerFlags(const char* file_name){ +void readOpenTunerFlags(const char *file_name) { total_ops = 0; op_counter = 0; op_accuracies.clear(); - - FILE* fp = fopen(file_name, "r"); - if(fp == NULL){ + + FILE *fp = fopen(file_name, "r"); + if (fp == NULL) { DEBUG("\n WARNING: File 'opentuner_flags' not found \n\n\n"); return; } - + int retVal = 200; - while(retVal != EOF){ + while (retVal != EOF) { int op_acc; - if(fp != NULL) + if (fp != NULL) retVal = fscanf(fp, "%d", &op_acc); else op_acc = 0; @@ -75,24 +67,23 @@ void readOpenTunerFlags(const char* 
file_name){ fclose(fp); } - -void readQuantRanges(char* file_name){ +void readQuantRanges(char *file_name) { total_ops = 0; op_counter = 0; quant_ranges.clear(); - - FILE* fp = fopen(file_name, "r"); - if(fp == NULL){ + + FILE *fp = fopen(file_name, "r"); + if (fp == NULL) { ERROR("File %s not found \n", file_name); } - + int retVal = 200; - while(retVal != EOF && retVal != -1){ + while (retVal != EOF && retVal != -1) { int min; int max; - if(fp != NULL){ + if (fp != NULL) { retVal = fscanf(fp, "%d", &min); printf("min =% d \n", min); @@ -100,22 +91,18 @@ void readQuantRanges(char* file_name){ printf("max =% d \n", max); } - if(retVal != -1){ - struct Range* range = (struct Range*) malloc(sizeof(struct Range)); + if (retVal != -1) { + struct Range *range = (struct Range *)malloc(sizeof(struct Range)); range->min = min; range->max = max; quant_ranges.push_back(range); total_ops++; } } - + fclose(fp); } - - - - /*__device__ inline void atomicAdd(float* address, float value) { @@ -133,11 +120,7 @@ void readQuantRanges(char* file_name){ }; */ - - - - -Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){ +Norm_t *calculateNorms(Tensor *x, Tensor *x_orig) { deviceToHostCopy(x); deviceToHostCopy(x_orig); @@ -148,18 +131,18 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){ float inf_norm = -1.0; double total = 0.0; - float* arr1 = (float*) x->host_data; - float* arr2 = (float*) x_orig->host_data; - - for(unsigned int i = 0; i < x->num_elems; i++){ + float *arr1 = (float *)x->host_data; + float *arr2 = (float *)x_orig->host_data; + + for (unsigned int i = 0; i < x->num_elems; i++) { total = total + arr2[i]; - + float diff = abs(arr1[i] - arr2[i]); l1_norm += diff; - l2_norm += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]); + l2_norm += (arr1[i] - arr2[i]) * (arr1[i] - arr2[i]); - if(inf_norm < diff) + if (inf_norm < diff) inf_norm = diff; } @@ -170,12 +153,11 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){ l1_norm = l1_norm / distribution_mean; l2_norm = l2_norm / distribution_mean; - - Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t)); + Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t)); norms->l1_norm = l1_norm; norms->l2_norm = l2_norm; - norms->inf_norm = inf_norm; - + norms->inf_norm = inf_norm; + INFO("l1_norm = %f \n", l1_norm); INFO("l2_norm = %f \n", l2_norm); INFO("inf_norm = %f \n", inf_norm); @@ -183,9 +165,7 @@ Norm_t* calculateNorms(Tensor* x, Tensor* x_orig){ return norms; } - - -Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ +Norm_t *calculateNorms2(Tensor *x, Tensor *x_orig) { deviceToHostCopy(x); deviceToHostCopy(x_orig); @@ -196,50 +176,49 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ double l1_norm_A = 0.0; double l1_norm_B = 0.0; - + double l2_norm_A = 0.0; double l2_norm_B = 0.0; float inf_norm = -1.0; float orig_inf_norm = -1.0; double total_diff = 0.0; double total_diff_squared = 0.0; - - float* arr1 = (float*) x->host_data; - float* arr2 = (float*) x_orig->host_data; - - for(unsigned int i = 0; i < x->num_elems; i++){ - if(arr2[i] != 0.0) + float *arr1 = (float *)x->host_data; + float *arr2 = (float *)x_orig->host_data; + + for (unsigned int i = 0; i < x->num_elems; i++) { + + if (arr2[i] != 0.0) l0_norm_A = l0_norm_A + 1.0; - if(arr1[i] != 0.0) + if (arr1[i] != 0.0) l0_norm_B = l0_norm_B + 1.0; - + l1_norm_A = l1_norm_A + abs(arr2[i]); l1_norm_B = l1_norm_B + abs(arr1[i]); l2_norm_A = l2_norm_A + (arr2[i] * arr2[i]); l2_norm_B = l2_norm_B + (arr1[i] * arr1[i]); - + float diff = abs(arr1[i] - arr2[i]); total_diff = total_diff + diff; float diff_squared = 
diff * diff; - total_diff_squared = total_diff_squared + diff_squared; - + total_diff_squared = total_diff_squared + diff_squared; - if(orig_inf_norm < diff){ + if (orig_inf_norm < diff) { orig_inf_norm = diff; } - + // Relative difference value - float normalized_diff = diff / arr2[i]; - if(inf_norm < normalized_diff){ + float normalized_diff = diff / arr2[i]; + if (inf_norm < normalized_diff) { inf_norm = normalized_diff; - } + } } // Relative L1 and Mean L1 norms of the difference Matrix - float mean_l1 = ( total_diff ) / x->num_elems; - float relative_l1 = ( total_diff ) / l1_norm_A; + float mean_l1 = (total_diff) / x->num_elems; + float relative_l1 = (total_diff) / l1_norm_A; // Computing Relative L2 norm - i.e., Euclidean distance double norm_root_A = sqrt(l2_norm_A); @@ -248,8 +227,9 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ float relative_l2 = diff_root / norm_root_A; // Packing computed norms in Norm_t struct - Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t)); - // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware + Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t)); + // Mean metrics - not normalized for the distribution - suitable for precision + // tuning hardware norms->mean_l1 = mean_l1; norms->mean_l2 = mean_l2; norms->orig_inf_norm = orig_inf_norm; @@ -257,8 +237,8 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ // Relative metrics (relative to distribution) - suitable for PROMISE norms->l1_norm = relative_l1; norms->l2_norm = relative_l2; - norms->inf_norm = inf_norm; - + norms->inf_norm = inf_norm; + INFO("l1_norm = %f \n", relative_l1); INFO("l2_norm = %f \n", relative_l2); INFO("inf_norm = %f \n", inf_norm); @@ -266,33 +246,28 @@ Norm_t* calculateNorms2(Tensor* x, Tensor* x_orig){ return norms; } - - - - -__global__ void normComputeKernel(float* A, float * B, double* l1_A, double* l2_A, - double* l1_diff, double* l2_diff, unsigned int n){ +__global__ void normComputeKernel(float *A, float *B, double *l1_A, + double *l2_A, double *l1_diff, + double *l2_diff, unsigned int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < n){ - + if (i < n) { + double diff = fabsf(A[i] - B[i]); - double diff_squared = diff * diff; + double diff_squared = diff * diff; - atomicAdd( l1_A, fabsf(A[i]) ); - atomicAdd( l2_A, (A[i] * A[i]) ); + atomicAdd(l1_A, fabsf(A[i])); + atomicAdd(l2_A, (A[i] * A[i])); - atomicAdd( l1_diff, diff); - atomicAdd( l2_diff, diff_squared); + atomicAdd(l1_diff, diff); + atomicAdd(l2_diff, diff_squared); } } - - __inline__ __device__ double warpReduceSum(double val) { - for (int offset = warpSize/2; offset > 0; offset /= 2) + for (int offset = warpSize / 2; offset > 0; offset /= 2) val += __shfl_down_sync(0xFFFFFFFF, val, offset); return val; @@ -304,36 +279,34 @@ __inline__ __device__ double blockReduceSum(double val) { int lane = threadIdx.x % warpSize; int wid = threadIdx.x / warpSize; - val = warpReduceSum(val); // Each warp performs partial reduction + val = warpReduceSum(val); // Each warp performs partial reduction if (lane == 0) - shared[wid]=val; // Write reduced value to shared memory + shared[wid] = val; // Write reduced value to shared memory - - __syncthreads(); // Wait for all partial reductions + __syncthreads(); // Wait for all partial reductions - - //read from shared memory only if that warp existed + // read from shared memory only if that warp existed val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0; - if (wid == 0) val = warpReduceSum(val); //Final reduce within first warp + if (wid == 0) + val = warpReduceSum(val); // Final reduce within first warp return val; - } - - -__global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N, - double* A_l1, double* A_l2, - double* diff_l1, double* diff_l2) { +__global__ void deviceReduceBlockAtomicKernel(float *A, float *B, int N, + double *A_l1, double *A_l2, + double *diff_l1, + double *diff_l2) { double sum_A_l1 = double(0); double sum_A_l2 = double(0); double sum_diff_l1 = double(0); double sum_diff_l2 = double(0); - for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; + i += blockDim.x * gridDim.x) { sum_A_l1 += fabsf(A[i]); sum_A_l2 += (A[i] * A[i]); @@ -347,31 +320,28 @@ __global__ void deviceReduceBlockAtomicKernel(float* A, float* B, int N, sum_A_l2 = blockReduceSum(sum_A_l2); sum_diff_l1 = blockReduceSum(sum_diff_l1); sum_diff_l2 = blockReduceSum(sum_diff_l2); - - if (threadIdx.x == 0){ + + if (threadIdx.x == 0) { atomicAdd(A_l1, sum_A_l1); atomicAdd(A_l2, sum_A_l2); atomicAdd(diff_l1, sum_diff_l1); atomicAdd(diff_l2, sum_diff_l2); - } + } } - -void deviceReduce(float* A, float* B, int N, - double* A_l1, double* A_l2, - double* diff_l1, double* diff_l2) { +void deviceReduce(float *A, float *B, int N, double *A_l1, double *A_l2, + double *diff_l1, double *diff_l2) { int threads = 512; int blocks = min((N + threads - 1) / threads, 1024); - deviceReduceBlockAtomicKernel<<<blocks, threads>>>(A, B, N, A_l1, A_l2, diff_l1, diff_l2); + deviceReduceBlockAtomicKernel<<<blocks, threads>>>(A, B, N, A_l1, A_l2, + diff_l1, diff_l2); //-- deviceReduceKernel<<<1, 1024>>>(out, out, blocks); } - - // Compute Norms on the GPU -Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){ +Norm_t *calculateNormsTreeReduction(Tensor *x, Tensor *x_orig) { hostToDeviceCopy(x); hostToDeviceCopy(x_orig); @@ -388,26 +358,27 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){ double *l2_norm_A_d; double *l1_diff_d; double *l2_diff_d; - - cudaMalloc( (void**) &l1_norm_A_d, sizeof(double)); - cudaMalloc( (void**) &l2_norm_A_d, sizeof(double)); - cudaMalloc( (void**) &l1_diff_d, sizeof(double)); - cudaMalloc( (void**) &l2_diff_d, sizeof(double)); - - - float* arr1 = (float*) x->gpu_data; - float* arr2 = (float*) x_orig->gpu_data; - - //normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems); - deviceReduce(arr1, arr2, x->num_elems, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d); - + + cudaMalloc((void **)&l1_norm_A_d, sizeof(double)); + cudaMalloc((void **)&l2_norm_A_d, sizeof(double)); + cudaMalloc((void **)&l1_diff_d, sizeof(double)); + cudaMalloc((void **)&l2_diff_d, sizeof(double)); + + float *arr1 = (float *)x->gpu_data; + float *arr2 = (float *)x_orig->gpu_data; + + // normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, + // l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems); + deviceReduce(arr1, arr2, x->num_elems, l1_norm_A_d, l2_norm_A_d, l1_diff_d, + l2_diff_d); + cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost); INFO("l1_norm_A = %f, l2_norm_A = %f, l1_diff = %f, l2_diff = %f \n", - l1_norm_A, 
l2_norm_A,l1_diff, l2_diff); + l1_norm_A, l2_norm_A, l1_diff, l2_diff); // Relative L1 and Mean L1 norms of the difference Matrix float mean_l1 = l1_diff / x->num_elems; @@ -420,34 +391,32 @@ Norm_t* calculateNormsTreeReduction(Tensor* x, Tensor* x_orig){ float relative_l2 = diff_root / norm_root_A; // Packing computed norms in Norm_t struct - Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t)); - // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware + Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t)); + // Mean metrics - not normalized for the distribution - suitable for precision + // tuning hardware norms->mean_l1 = mean_l1; norms->mean_l2 = mean_l2; norms->orig_inf_norm = 0.0; - // Relative metrics (relative to distribution) + // Relative metrics (relative to distribution) norms->l1_norm = relative_l1; norms->l2_norm = relative_l2; - norms->inf_norm = 0.0; - + norms->inf_norm = 0.0; + INFO("l1_norm = %f \n", relative_l1); INFO("l2_norm = %f \n", relative_l2); return norms; } - - - // Compute Norms on the GPU -Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){ +Norm_t *calculateNormsGPU(Tensor *x, Tensor *x_orig) { hostToDeviceCopy(x); hostToDeviceCopy(x_orig); // FIXIT: Move all floats to doubles - overflow is possible - + double l1_norm_A; double l2_norm_A; @@ -459,27 +428,26 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){ double *l2_norm_A_d; double *l1_diff_d; double *l2_diff_d; - - cudaMalloc( (void**) &l1_norm_A_d, sizeof(double)); - cudaMalloc( (void**) &l2_norm_A_d, sizeof(double)); - cudaMalloc( (void**) &l1_diff_d, sizeof(double)); - cudaMalloc( (void**) &l2_diff_d, sizeof(double)); - - - float* arr1 = (float*) x->gpu_data; - float* arr2 = (float*) x_orig->gpu_data; + + cudaMalloc((void **)&l1_norm_A_d, sizeof(double)); + cudaMalloc((void **)&l2_norm_A_d, sizeof(double)); + cudaMalloc((void **)&l1_diff_d, sizeof(double)); + cudaMalloc((void **)&l2_diff_d, sizeof(double)); + + float *arr1 = (float *)x->gpu_data; + float *arr2 = (float *)x_orig->gpu_data; int blockSize = 1024; - int gridSize = (int) ceil ((float) x->num_elems / blockSize); + int gridSize = (int)ceil((float)x->num_elems / blockSize); INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize); - normComputeKernel<<<gridSize, blockSize>>>(arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems); + normComputeKernel<<<gridSize, blockSize>>>( + arr1, arr2, l1_norm_A_d, l2_norm_A_d, l1_diff_d, l2_diff_d, x->num_elems); cudaMemcpy(&l1_norm_A, l1_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l2_norm_A, l2_norm_A_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l1_diff, l1_diff_d, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(&l2_diff, l2_diff_d, sizeof(double), cudaMemcpyDeviceToHost); - // Relative L1 and Mean L1 norms of the difference Matrix float mean_l1 = l1_diff / x->num_elems; @@ -492,8 +460,9 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){ float relative_l2 = diff_root / norm_root_A; // Packing computed norms in Norm_t struct - Norm_t* norms = (Norm_t*) malloc(sizeof(Norm_t)); - // Mean metrics - not normalized for the distribution - suitable for precision tuning hardware + Norm_t *norms = (Norm_t *)malloc(sizeof(Norm_t)); + // Mean metrics - not normalized for the distribution - suitable for precision + // tuning hardware norms->mean_l1 = mean_l1; norms->mean_l2 = mean_l2; norms->orig_inf_norm = 0.0; @@ -501,54 +470,47 @@ Norm_t* calculateNormsGPU(Tensor* x, Tensor* x_orig){ // Relative metrics (relative 
to distribution) - suitable for PROMISE norms->l1_norm = relative_l1; norms->l2_norm = relative_l2; - norms->inf_norm = 0.0; - + norms->inf_norm = 0.0; + INFO("l1_norm = %f \n", relative_l1); INFO("l2_norm = %f \n", relative_l2); return norms; } - - - -__global__ void vecConstMul(float* A, float mul_factor, int n){ +__global__ void vecConstMul(float *A, float mul_factor, int n) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n) - A[id] = A[id] * mul_factor; + if (id < n) + A[id] = A[id] * mul_factor; } - -__global__ void vecRound(float* A, int n){ +__global__ void vecRound(float *A, int n) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n) - A[id] = roundf(A[id]); + if (id < n) + A[id] = roundf(A[id]); } - -__global__ void vecConstDiv(float* A, float div_factor, int n){ +__global__ void vecConstDiv(float *A, float div_factor, int n) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n) - A[id] = A[id] / div_factor; + if (id < n) + A[id] = A[id] / div_factor; } - - -__global__ void vecMul(float* A, float* B, int n){ +__global__ void vecMul(float *A, float *B, int n) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n) - B[id] = A[id] * B[id]; + if (id < n) + B[id] = A[id] * B[id]; } -void initPromiseRandValues(Tensor* bias, int error_scale){ +void initPromiseRandValues(Tensor *bias, int error_scale) { float scaling_values[10]; @@ -558,98 +520,91 @@ void initPromiseRandValues(Tensor* bias, int error_scale){ scaling_values[2] = 0.336; scaling_values[3] = 0.21; scaling_values[4] = 0.168; - scaling_values[5] = 0.14; + scaling_values[5] = 0.14; scaling_values[6] = 0.11; scaling_values[7] = 0.0784; scaling_values[8] = 0.005; scaling_values[9] = 0.000; - curandGenerator_t gen; struct timespec ts; - if(timespec_get(&ts, TIME_UTC) == 0){ + if (timespec_get(&ts, TIME_UTC) == 0) { printf("crashed \n"); abort(); } curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec^ts.tv_sec); - curandGenerateNormal(gen, - (float*) bias->gpu_data, - bias->num_elems, 0.0, - 1.0 * scaling_values[error_scale]); - + curandSetPseudoRandomGeneratorSeed(gen, ts.tv_nsec ^ ts.tv_sec); + curandGenerateNormal(gen, (float *)bias->gpu_data, bias->num_elems, 0.0, + 1.0 * scaling_values[error_scale]); } - // NOTE: Assumption is that x_ptr is FP32 tensor - doesn't work with FP16 // Routine for Adding PROMISE bitline swing error -void* addPromiseError(void* x_ptr, int error_scale){ +void *addPromiseError(void *x_ptr, int error_scale) { - if(error_scale > 10 || error_scale < 0){ + if (error_scale > 10 || error_scale < 0) { ERROR("Error Scale out of bounds for PROMISE - 8 Swing values \n"); } - - INFO("*** addPromiseError \n"); + + INFO("*** addPromiseError \n"); profileEvent("addPromiseError"); - Tensor* x = (Tensor*) x_ptr; - - size_t* dim_sizes = x->dims.dim_sizes; - Tensor* bias = (Tensor*) create4DTensor(x->cur_type, x->data_format, - dim_sizes[0], dim_sizes[1], - dim_sizes[2], dim_sizes[3]); - + Tensor *x = (Tensor *)x_ptr; + + size_t *dim_sizes = x->dims.dim_sizes; + Tensor *bias = + (Tensor *)create4DTensor(x->cur_type, x->data_format, dim_sizes[0], + dim_sizes[1], dim_sizes[2], dim_sizes[3]); + // NOTE: Error scale is used to generate the bias matrix - initPromiseRandValues(bias, error_scale); + initPromiseRandValues(bias, error_scale); hostToDeviceCopy(x); - //hostToDeviceCopy(bias); - + // hostToDeviceCopy(bias); + int blockSize = 1024; - int gridSize = (int) ceil ((float) x->num_elems / blockSize); + int gridSize = 
(int)ceil((float)x->num_elems / blockSize); INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize); // NOTE: Check if a large gridSize will work with really large tensors - vecMul<<<gridSize, blockSize>>>((float*) x->gpu_data, (float*) bias->gpu_data, x->num_elems); - + vecMul<<<gridSize, blockSize>>>((float *)x->gpu_data, (float *)bias->gpu_data, + x->num_elems); + float alpha = 1.0f; - //float beta = 0.0f; + // float beta = 0.0f; checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc, - bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data)); + bias->gpu_data, &alpha, x->tensor_desc, + x->gpu_data)); profileEvent("addPromiseError_end", true); - - return (void*) x; -} - - + return (void *)x; +} -__global__ void quantizeAndClip(float* A, int n, float mul_factor, float min, float max){ +__global__ void quantizeAndClip(float *A, int n, float mul_factor, float min, + float max) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n){ + if (id < n) { int temp = (A[id] - min) / mul_factor; float result = temp * 1.0 * mul_factor; result = result + min; A[id] = result; - if(A[id] > max){ + if (A[id] > max) { A[id] = max; } - if(A[id] < min){ + if (A[id] < min) { A[id] = min; } - } } - -__global__ void quantizeElem(float* A, int n, float mul_factor, float min){ +__global__ void quantizeElem(float *A, int n, float mul_factor, float min) { int id = blockIdx.x * blockDim.x + threadIdx.x; - if(id < n){ + if (id < n) { int temp = (A[id] - min) / mul_factor; float result = temp * 1.0 * mul_factor; result = result + min; @@ -657,32 +612,27 @@ __global__ void quantizeElem(float* A, int n, float mul_factor, float min){ } } - -void* quantizeTensorPromise(void* input_ptr, float min, float max){ +void *quantizeTensorPromise(void *input_ptr, float min, float max) { INFO("QuantizeTensorPROMISE \n"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; - int quantize_range = 256; float input_range = max - min; float mul_factor = input_range / quantize_range; INFO("mul_factor = %f \n", mul_factor); int blockSize = 1024; - int gridSize = (int) ceil ((float) input->num_elems / blockSize); + int gridSize = (int)ceil((float)input->num_elems / blockSize); INFO("blockSize = %d, gridSize = %d \n", blockSize, gridSize); hostToDeviceCopy(input); - quantizeAndClip<<<gridSize, blockSize>>>((float*) input->gpu_data, - input->num_elems, mul_factor, min, max); + quantizeAndClip<<<gridSize, blockSize>>>( + (float *)input->gpu_data, input->num_elems, mul_factor, min, max); - return input; } - } - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu index 4392839f7f6dbca8df4352a19fdd689d6f8e3d5e..00334f8ecc821fdb3209e48aa94785aad0a54f37 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/fp16_gemm.cu @@ -1,7 +1,7 @@ //===--------------------------- fp16_gemm.cu -----------------------------===// // //===----------------------------------------------------------------------===// -// +// // This file consists of the custom implementation of quantization kernels. // This helps HPVM to switch compute precision for tensor operations between // FP32 and FP16. 
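For reference, every kernel in this file is launched with the same ceil-division configuration used in the gridSize computations above: enough fixed-size blocks to cover all elements, with a bounds check inside the kernel for the partially filled last block. A minimal, self-contained sketch of that pattern follows; scaleKernel and the values used are hypothetical and for illustration only, not part of the runtime.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical element-wise kernel, used only to illustrate the launch pattern.
__global__ void scaleKernel(float *a, float factor, unsigned elements) {
  const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < elements) // the last block may be only partially full
    a[idx] = a[idx] * factor;
}

int main() {
  const unsigned elements = 1000;
  const unsigned block_size = 512;
  // Ceil division: (1000 + 511) / 512 = 2 blocks cover all 1000 elements.
  const unsigned num_blocks = (elements + block_size - 1) / block_size;

  float *a;
  cudaMallocManaged(&a, elements * sizeof(float));
  for (unsigned i = 0; i < elements; i++)
    a[i] = 1.0f;

  scaleKernel<<<num_blocks, block_size>>>(a, 2.0f, elements);
  cudaDeviceSynchronize();

  printf("a[%u] = %f \n", elements - 1, a[elements - 1]); // expected: 2.0
  cudaFree(a);
  return 0;
}

The integer form (elements + block_size - 1) / block_size used below gives the same block count as the (int)ceil((float)num_elems / blockSize) form used above for these tensor sizes, without the float round trip.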
@@ -17,236 +17,199 @@ #include <cuda_fp16.h> #include "fp16_emu.h" - - inline cudaError_t checkCuda(cudaError_t result) { - if (result != cudaSuccess) - std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n"; - return result; + if (result != cudaSuccess) + std::cerr << "CUDA Runtime Error: " << cudaGetErrorString(result) << "\n"; + return result; } inline cublasStatus_t checkCublas(cublasStatus_t result) { - if (result != CUBLAS_STATUS_SUCCESS) - std::cerr << "cuBLAS Error: " << result << "\n"; - return result; + if (result != CUBLAS_STATUS_SUCCESS) + std::cerr << "cuBLAS Error: " << result << "\n"; + return result; } template <typename T> -inline void printArray(const T * const __restrict__ array, +inline void printArray(const T *const __restrict__ array, const unsigned elements) { - for (unsigned i = 0; i < elements; i++) - std::cout << std::to_string(array[i]) << "\n"; + for (unsigned i = 0; i < elements; i++) + std::cout << std::to_string(array[i]) << "\n"; } // initialization template <typename T> -__global__ void initKernel(T * const __restrict__ array, +__global__ void initKernel(T *const __restrict__ array, const unsigned elements) { - const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < elements) - array[idx] = 1.2; + const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < elements) + array[idx] = 1.2; } template <typename T> -void init(T * const __restrict__ array, - const unsigned elements) { - const unsigned block_size = 512; - const unsigned num_blocks = (elements + block_size - 1) / block_size; - initKernel<<<num_blocks, block_size>>>(array, elements); - checkCuda(cudaDeviceSynchronize()); +void init(T *const __restrict__ array, const unsigned elements) { + const unsigned block_size = 512; + const unsigned num_blocks = (elements + block_size - 1) / block_size; + initKernel<<<num_blocks, block_size>>>(array, elements); + checkCuda(cudaDeviceSynchronize()); } // float to half -__global__ void f2hKernel(const float * const __restrict__ input, +__global__ void f2hKernel(const float *const __restrict__ input, const unsigned elements, - half * const __restrict__ output) { - const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < elements) - output[idx] = __float2half_rn(input[idx]); + half *const __restrict__ output) { + const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < elements) + output[idx] = __float2half_rn(input[idx]); } -void f2h(const float * const __restrict__ input, - const unsigned elements, - half * const __restrict__ output) { - const unsigned block_size = 512; - const unsigned num_blocks = (elements + block_size - 1) / block_size; - f2hKernel<<<num_blocks, block_size>>>(input, elements, output); - checkCuda(cudaDeviceSynchronize()); +void f2h(const float *const __restrict__ input, const unsigned elements, + half *const __restrict__ output) { + const unsigned block_size = 512; + const unsigned num_blocks = (elements + block_size - 1) / block_size; + f2hKernel<<<num_blocks, block_size>>>(input, elements, output); + checkCuda(cudaDeviceSynchronize()); } // half to float -__global__ void h2fKernel(const half * const __restrict__ input, +__global__ void h2fKernel(const half *const __restrict__ input, const unsigned elements, - float * const __restrict__ output) { - const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < elements) - output[idx] = __half2float(input[idx]); + float *const __restrict__ output) { + const unsigned idx = blockIdx.x * blockDim.x + 
threadIdx.x; + if (idx < elements) + output[idx] = __half2float(input[idx]); } -void h2f(const half * const __restrict__ input, - const unsigned elements, - float * const __restrict__ output) { - const unsigned block_size = 512; - const unsigned num_blocks = (elements + block_size - 1) / block_size; - h2fKernel<<<num_blocks, block_size>>>(input, elements, output); - checkCuda(cudaDeviceSynchronize()); +void h2f(const half *const __restrict__ input, const unsigned elements, + float *const __restrict__ output) { + const unsigned block_size = 512; + const unsigned num_blocks = (elements + block_size - 1) / block_size; + h2fKernel<<<num_blocks, block_size>>>(input, elements, output); + checkCuda(cudaDeviceSynchronize()); } -void sgemm(const float * const __restrict__ a, - const unsigned num_rows_a, - const unsigned num_cols_a, - const float * const __restrict__ b, - const unsigned num_rows_b, - const unsigned num_cols_b, - float * const __restrict__ c) { - const unsigned iterations = 10; - float kernel_time; - cudaEvent_t start; - cudaEvent_t stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - cublasHandle_t handle; - checkCublas(cublasCreate(&handle)); - - // Enable Tensor Cores - checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); - - const float alpha_ = 1.0; - const float beta_ = 0.0; - const float *alpha = &alpha_; - const float *beta = &beta_; - - cudaEventRecord(start, 0); - for (unsigned i = 0; i < iterations; i++) { - checkCublas(cublasGemmEx(handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - // Dimensions - num_rows_a, - num_cols_b, - num_cols_a, - alpha, - // A - a, - CUDA_R_32F, - num_rows_a, - // B - b, - CUDA_R_32F, - num_rows_b, - beta, - // C - c, - CUDA_R_32F, - num_rows_a, - // Compute precision and algorithm - CUDA_R_32F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&kernel_time, start, stop); - - std::cout << "FP32 GEMM: " << std::to_string(kernel_time / iterations) << " ms\n"; +void sgemm(const float *const __restrict__ a, const unsigned num_rows_a, + const unsigned num_cols_a, const float *const __restrict__ b, + const unsigned num_rows_b, const unsigned num_cols_b, + float *const __restrict__ c) { + const unsigned iterations = 10; + float kernel_time; + cudaEvent_t start; + cudaEvent_t stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cublasHandle_t handle; + checkCublas(cublasCreate(&handle)); + + // Enable Tensor Cores + checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); + + const float alpha_ = 1.0; + const float beta_ = 0.0; + const float *alpha = &alpha_; + const float *beta = &beta_; + + cudaEventRecord(start, 0); + for (unsigned i = 0; i < iterations; i++) { + checkCublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, + // Dimensions + num_rows_a, num_cols_b, num_cols_a, alpha, + // A + a, CUDA_R_32F, num_rows_a, + // B + b, CUDA_R_32F, num_rows_b, beta, + // C + c, CUDA_R_32F, num_rows_a, + // Compute precision and algorithm + CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&kernel_time, start, stop); + + std::cout << "FP32 GEMM: " << std::to_string(kernel_time / iterations) + << " ms\n"; } -void hgemm(const float * const __restrict__ af, - const unsigned num_rows_a, - const unsigned num_cols_a, - const float * const __restrict__ bf, - const unsigned num_rows_b, - const unsigned num_cols_b, - float * const __restrict__ cf) { - const unsigned iterations = 10; - - const 
unsigned num_elements_a = num_rows_a * num_cols_a; - const unsigned num_elements_b = num_rows_b * num_cols_b; - const unsigned num_elements_c = num_rows_a * num_cols_b; - - float to_fp16_time; - float to_fp32_time; - float kernel_time; - float total_time; - - cudaEvent_t start; - cudaEvent_t stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - half *a; - half *b; - half *c; - - checkCuda(cudaMallocManaged(&a, sizeof(half) * num_elements_a)); - checkCuda(cudaMallocManaged(&b, sizeof(half) * num_elements_b)); - checkCuda(cudaMallocManaged(&c, sizeof(half) * num_elements_c)); - - init(a, num_elements_a); - init(b, num_elements_b); - init(c, num_elements_c); - - // Convert floats to halfs - cudaEventRecord(start, 0); - f2h(af, num_elements_a, a); - f2h(bf, num_elements_b, b); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&to_fp16_time, start, stop); - - cublasHandle_t handle; - checkCublas(cublasCreate(&handle)); - checkCublas(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH)); - - const half alpha_ = cpu_float2half_rn(1.0); - const half beta_ = cpu_float2half_rn(0.0); - const half *alpha = &alpha_; - const half *beta = &beta_; - - cudaEventRecord(start, 0); - for (unsigned i = 0; i < iterations; i++) { - checkCublas(cublasGemmEx(handle, - CUBLAS_OP_N, - CUBLAS_OP_N, - // Dimensions - num_rows_a, - num_cols_b, - num_cols_a, - alpha, - // A - a, - CUDA_R_16F, - num_rows_a, - // B - b, - CUDA_R_16F, - num_rows_b, - beta, - // C - c, - CUDA_R_16F, - num_rows_a, - // Compute precision and algorithm - CUDA_R_16F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - } - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&kernel_time, start, stop); - - cudaEventRecord(start, 0); - h2f(c, num_elements_c, cf); - cudaEventRecord(stop, 0); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&to_fp32_time, start, stop); - - total_time = to_fp16_time + (kernel_time / iterations) + to_fp32_time; - std::cout << "FP16 GEMM: " << std::to_string(total_time) << " ms\n"; - std::cout << "\tTo FP16: " << std::to_string(to_fp16_time) << " ms\n"; - std::cout << "\tKernel : " << std::to_string(kernel_time / iterations) << " ms\n"; - std::cout << "\tTo FP32: " << std::to_string(to_fp32_time) << " ms\n"; +void hgemm(const float *const __restrict__ af, const unsigned num_rows_a, + const unsigned num_cols_a, const float *const __restrict__ bf, + const unsigned num_rows_b, const unsigned num_cols_b, + float *const __restrict__ cf) { + const unsigned iterations = 10; + + const unsigned num_elements_a = num_rows_a * num_cols_a; + const unsigned num_elements_b = num_rows_b * num_cols_b; + const unsigned num_elements_c = num_rows_a * num_cols_b; + + float to_fp16_time; + float to_fp32_time; + float kernel_time; + float total_time; + + cudaEvent_t start; + cudaEvent_t stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + half *a; + half *b; + half *c; + + checkCuda(cudaMallocManaged(&a, sizeof(half) * num_elements_a)); + checkCuda(cudaMallocManaged(&b, sizeof(half) * num_elements_b)); + checkCuda(cudaMallocManaged(&c, sizeof(half) * num_elements_c)); + + init(a, num_elements_a); + init(b, num_elements_b); + init(c, num_elements_c); + + // Convert floats to halfs + cudaEventRecord(start, 0); + f2h(af, num_elements_a, a); + f2h(bf, num_elements_b, b); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&to_fp16_time, start, stop); + + cublasHandle_t handle; + checkCublas(cublasCreate(&handle)); + checkCublas(cublasSetMathMode(handle, 
CUBLAS_TENSOR_OP_MATH)); + + const half alpha_ = cpu_float2half_rn(1.0); + const half beta_ = cpu_float2half_rn(0.0); + const half *alpha = &alpha_; + const half *beta = &beta_; + + cudaEventRecord(start, 0); + for (unsigned i = 0; i < iterations; i++) { + checkCublas(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, + // Dimensions + num_rows_a, num_cols_b, num_cols_a, alpha, + // A + a, CUDA_R_16F, num_rows_a, + // B + b, CUDA_R_16F, num_rows_b, beta, + // C + c, CUDA_R_16F, num_rows_a, + // Compute precision and algorithm + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&kernel_time, start, stop); + + cudaEventRecord(start, 0); + h2f(c, num_elements_c, cf); + cudaEventRecord(stop, 0); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&to_fp32_time, start, stop); + + total_time = to_fp16_time + (kernel_time / iterations) + to_fp32_time; + std::cout << "FP16 GEMM: " << std::to_string(total_time) << " ms\n"; + std::cout << "\tTo FP16: " << std::to_string(to_fp16_time) << " ms\n"; + std::cout << "\tKernel : " << std::to_string(kernel_time / iterations) + << " ms\n"; + std::cout << "\tTo FP32: " << std::to_string(to_fp32_time) << " ms\n"; } - - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc index 4902043b7ce6a1240981224d98dc7dac70361500..b812a51d7eacf6c6ddf9760aecea0344407d1f84 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/global_data.cc @@ -48,4 +48,3 @@ PerfParamSet *perfParamSet; SampParamSet *sampParamSet; unsigned int currentTensorID = -1; - diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu index ab8896369ab4a8b4b6e2bc44a3896034001300cc..6a3fcc12e014205aaf81e2cae0906ed6cfbff33e 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/group_conv.cu @@ -1,14 +1,14 @@ -//===--------------------------- group_conv.cu -----------------------------===// +//===--------------------------- group_conv.cu +//-----------------------------===// // //===----------------------------------------------------------------------===// -// -// This file group convolutions with FP16 and FP32 compute precisions. +// +// This file group convolutions with FP16 and FP32 compute precisions. // Note that group convolutions, unlike regular convolutions, are not // approximable in any other way in HPVM. 
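// In the depthwise kernels below, output channel m is computed from input
// channel m alone, using the m-th KH x KW filter (note the w[m * KH * KW]
// indexing), and each thread accumulates results for up to 8 images of the
// batch at once.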
// //===----------------------------------------------------------------------===// - #include "tensor_utils.h" #include "fp16_gemm.h" #include "debug.h" @@ -17,31 +17,26 @@ #include "op_overheads.h" #include "error.h" +extern "C" { -extern "C"{ - - +__global__ void depthwise_convNew8( + float *const __restrict__ y, const float *const __restrict__ x, + const float *const __restrict__ w, const int B, const int M, const int H, + const int W, const int KH, const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, const int H_stride, const int W_stride) { -__global__ void depthwise_convNew8(float* const __restrict__ y, - const float* const __restrict__ x, - const float* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) -{ - - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] +#define y4d(i3, i2, i1, i0) \ + y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] +#define x4d(i3, i2, i1, i0) \ + x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] const int num = 8; const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - - if(m < M){ - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out); + + if (m < M) { + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); const int start_h = (tx / W_out) * H_stride - H_pad; const int start_w = (tx % W_out) * W_stride - W_pad; @@ -54,80 +49,73 @@ __global__ void depthwise_convNew8(float* const __restrict__ y, float c5 = 0; float c6 = 0; float c7 = 0; - - const float* weights = &w[m * KH * KW]; + + const float *weights = &w[m * KH * KW]; for (int k = 0; k < KH * KW; k++) { int p = k / KW; int q = k % KW; - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; - if(b + 1 < B) - c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; - if(b + 2 < B) - c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; - if(b + 3 < B) - c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; - if(b + 4 < B) - c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; - if(b + 5 < B) - c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; - if(b + 6 < B) - c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; - if(b + 7 < B) - c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; - - + if (start_h + p > -1 && start_h + p < H && start_w + q > -1 && + start_w + q < W) { + + c0 += x4d(b, m, start_h + p, start_w + q) * weights[k]; + if (b + 1 < B) + c1 += x4d(b + 1, m, start_h + p, start_w + q) * weights[k]; + if (b + 2 < B) + c2 += x4d(b + 2, m, start_h + p, start_w + q) * weights[k]; + if (b + 3 < B) + c3 += x4d(b + 3, m, start_h + p, start_w + q) * weights[k]; + if (b + 4 < B) + c4 += x4d(b + 4, m, start_h + p, start_w + q) * weights[k]; + if (b + 5 < B) + c5 += x4d(b + 5, m, start_h + p, start_w + q) * weights[k]; + if (b + 6 < B) + c6 += x4d(b + 6, m, start_h + p, start_w + q) * weights[k]; + if (b + 7 < B) + c7 += x4d(b + 7, m, start_h + p, start_w + q) * weights[k]; } } - y4d(b, m, 0, tx) = c0; - if(b + 1 < B) + y4d(b, m, 0, tx) = 
c0; + if (b + 1 < B) y4d(b + 1, m, 0, tx) = c1; - if(b + 2 < B) + if (b + 2 < B) y4d(b + 2, m, 0, tx) = c2; - if(b + 3 < B) + if (b + 3 < B) y4d(b + 3, m, 0, tx) = c3; - if(b + 4 < B) + if (b + 4 < B) y4d(b + 4, m, 0, tx) = c4; - if(b + 5 < B) + if (b + 5 < B) y4d(b + 5, m, 0, tx) = c5; - if(b + 6 < B) + if (b + 6 < B) y4d(b + 6, m, 0, tx) = c6; - if(b + 7 < B) + if (b + 7 < B) y4d(b + 7, m, 0, tx) = c7; } - - #undef y4d - #undef x4d -} - - +#undef y4d +#undef x4d +} -__global__ void depthwise_convNew8_half2(__half* const __restrict__ y, - const __half* const __restrict__ x, - const __half* const __restrict__ w, - const int B, const int M, - const int H, const int W, const int KH, - const int KW, const int H_out, const int W_out, - const int H_pad, const int W_pad, - const int H_stride, const int W_stride) -{ +__global__ void depthwise_convNew8_half2( + __half *const __restrict__ y, const __half *const __restrict__ x, + const __half *const __restrict__ w, const int B, const int M, const int H, + const int W, const int KH, const int KW, const int H_out, const int W_out, + const int H_pad, const int W_pad, const int H_stride, const int W_stride) { - #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] - #define x4d(i3, i2, i1, i0) x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] +#define y4d(i3, i2, i1, i0) \ + y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] +#define x4d(i3, i2, i1, i0) \ + x[(i3) * (M * H * W) + (i2) * (H * W) + (i1) * (W) + i0] const int num = 8; const int b = num * blockIdx.x; - const int m = (blockIdx.y * blockDim.x + threadIdx.x)/ (H_out * W_out); - - if(m < M){ - const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); + const int m = (blockIdx.y * blockDim.x + threadIdx.x) / (H_out * W_out); + + if (m < M) { + const int tx = (blockIdx.y * blockDim.x + threadIdx.x) % (H_out * W_out); const int start_h = (tx / W_out) * H_stride - H_pad; const int start_w = (tx % W_out) * W_stride - W_pad; @@ -136,111 +124,112 @@ __global__ void depthwise_convNew8_half2(__half* const __restrict__ y, __half2 c1 = __half2half2(0); __half2 c2 = __half2half2(0); __half2 c3 = __half2half2(0); - - const __half* weights = &w[m * KH * KW]; + + const __half *weights = &w[m * KH * KW]; for (int k = 0; k < KH * KW; k++) { int p = k / KW; int q = k % KW; - if (start_h + p > -1 && start_h + p < H && - start_w + q > -1 && start_w + q < W) { - - - __half2 t1; - __half2 t2; - __half2 t3; - __half2 t4; - if(b + 7 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); - t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), x4d(b + 6, m, start_h + p, start_w + q)); - } - else if(b + 6 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); - t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q)); - - } - else if(b + 5 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, 
start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), x4d(b + 4, m, start_h + p, start_w + q)); - } - else if(b + 4 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q)); - - } - else if(b + 3 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), x4d(b + 2, m, start_h + p, start_w + q)); - } - else if(b + 2 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q)); - - } - else if(b + 1 < B){ - t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), x4d(b, m, start_h + p, start_w + q)); - } - else{ - t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q)); - - } - - - c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0); - c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1); - c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2); - c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3); - + if (start_h + p > -1 && start_h + p < H && start_w + q > -1 && + start_w + q < W) { + + __half2 t1; + __half2 t2; + __half2 t3; + __half2 t4; + if (b + 7 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), + x4d(b + 4, m, start_h + p, start_w + q)); + t4 = __halves2half2(x4d(b + 7, m, start_h + p, start_w + q), + x4d(b + 6, m, start_h + p, start_w + q)); + } else if (b + 6 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), + x4d(b + 4, m, start_h + p, start_w + q)); + t4 = __halves2half2(0, x4d(b + 6, m, start_h + p, start_w + q)); + + } else if (b + 5 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(x4d(b + 5, m, start_h + p, start_w + q), + x4d(b + 4, m, start_h + p, start_w + q)); + } else if (b + 4 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + t3 = __halves2half2(0, x4d(b + 4, m, start_h + p, start_w + q)); + + } else if (b + 3 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(x4d(b + 3, m, start_h + p, start_w + q), + x4d(b + 2, m, start_h + p, start_w + q)); + } else if (b + 2 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, start_w + q)); + t2 = __halves2half2(0, x4d(b + 2, m, start_h + p, start_w + q)); + + } else if (b + 1 < B) { + t1 = __halves2half2(x4d(b + 1, m, start_h + p, start_w + q), + x4d(b, m, start_h + p, 
start_w + q)); + } else { + t1 = __halves2half2(0, x4d(b, m, start_h + p, start_w + q)); + } + + c0 = __hfma2(t1, __halves2half2(weights[k], weights[k]), c0); + c1 = __hfma2(t2, __halves2half2(weights[k], weights[k]), c1); + c2 = __hfma2(t3, __halves2half2(weights[k], weights[k]), c2); + c3 = __hfma2(t4, __halves2half2(weights[k], weights[k]), c3); } } - y4d(b, m, 0, tx) = __high2half(c0); - if(b + 1 < B) + y4d(b, m, 0, tx) = __high2half(c0); + if (b + 1 < B) y4d(b + 1, m, 0, tx) = __low2half(c0); - if(b + 2 < B) + if (b + 2 < B) y4d(b + 2, m, 0, tx) = __high2half(c1); - if(b + 3 < B) + if (b + 3 < B) y4d(b + 3, m, 0, tx) = __low2half(c1); - if(b + 4 < B) + if (b + 4 < B) y4d(b + 4, m, 0, tx) = __high2half(c2); - if(b + 5 < B) + if (b + 5 < B) y4d(b + 5, m, 0, tx) = __low2half(c2); - if(b + 6 < B) + if (b + 6 < B) y4d(b + 6, m, 0, tx) = __high2half(c3); - if(b + 7 < B) + if (b + 7 < B) y4d(b + 7, m, 0, tx) = __low2half(c3); } - - #undef y4d - #undef x4d -} - - -void* tensorConvCutlass(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ +#undef y4d +#undef x4d +} +void *tensorConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups) { INFO("*** TensorConvolution \n"); profileEvent("Conv"); - Tensor* input = (Tensor*)input_ptr; - Tensor* filter = (Tensor*)filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; - //FIXME: Current hack to preserve backward compatibilty + // FIXME: Current hack to preserve backward compatibilty if (conv_groups == 0) { conv_groups = 1; } - Tensor* output; + Tensor *output; hostToDeviceCopy(input); hostToDeviceCopy(filter); @@ -248,43 +237,43 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, convertToFP32(input); convertToFP32(filter); - if (conv_groups > 32) { - // TODO: Support other cases; + // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); - int n, c, h, w; // output dimensions + int n, c, h, w; // output dimensions n = input->dims.dim_sizes[0]; c = input->dims.dim_sizes[1]; const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + + 1; + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / + horizontal_stride + + 1; - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - - int blockSize; blockSize = 64; - - dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); + + dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize); dim3 block(blockSize); - depthwise_convNew8<<<grid, block>>> ((float*)output->gpu_data, - (float*)input->gpu_data, (float*)filter->gpu_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, vertical_pad, horizontal_pad, - vertical_stride, 
horizontal_stride); + depthwise_convNew8<<<grid, block>>>( + (float *)output->gpu_data, (float *)input->gpu_data, + (float *)filter->gpu_data, input->dims.dim_sizes[0], + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, h, w, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); - } - else { + } else { cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; @@ -297,130 +286,119 @@ void* tensorConvCutlass(void* input_ptr, void* filter_ptr, // FIXIT: Need to be more aware of the implications of alpha and beta float alpha = 1.0f, beta = 0.0f; - // TODO: Support other cases; + // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); - INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride); + INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); // NOTE: Adding support for grouped convolution checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - cudnnDataType_t computeType = CUDNN_DATA_FLOAT; // FIXIT: Think if upscaling values need to be configurable? - // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision - - int n, c, h, w; // output dimensions - // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); + // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE + // should be used? 
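    // With dilation fixed at 1x1, the output size returned by
    // cudnnGetConvolution2dForwardOutputDim below follows the usual formula,
    //   h_out = 1 + (h_in + 2 * vertical_pad - KH) / vertical_stride
    // (and analogously for w_out), matching the hand-computed h/w in the
    // depthwise branch above.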
+ checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision + int n, c, h, w; // output dimensions + // Find dimension of convolution output + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, + w); if (input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); else if (input->data_format == CUDNN_TENSOR_NHWC) { DEBUG("* NHWC Format \n"); - output = (Tensor*)create4DTensor((cudnnDataType_t) float_type, //input->data_type, - CUDNN_TENSOR_NHWC, n, h, w, c); - } - else + output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, // input->data_type, + CUDNN_TENSOR_NHWC, n, h, w, c); + } else ERROR("Unsupported Tensor Type"); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H " + "= %d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); if (convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! 
\n"); - - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN + // support is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // FIXIT: Algo shouldn't be hardcoded convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, filter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); } cudaDeviceSynchronize(); profileEvent("Conv_end", true); return output; - - } // FIXME: Need to properly fix the new HALF type conversion -void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ +void *tensorHalfConvCutlass(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups) { INFO("*** TensorHConvolution \n"); profileEvent("#Conv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - - if(conv_mode == 0) + + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; // FIXIT: Need to be more aware of the implications of alpha and beta @@ -432,33 +410,34 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, hostToDeviceCopy(input); hostToDeviceCopy(filter); - // Float-Half Conversions profileEvent("F2H_start"); convertToFP16(input); - convertToFP16(filter); + convertToFP16(filter); profileEvent("F2H_end"); /******* END OF INPUT DATA CONVERSIONS*/ - Tensor *output; - if(conv_groups > 1){ + if (conv_groups > 1) { 
int n = input->dims.dim_sizes[0]; int c = input->dims.dim_sizes[1]; const int KH = filter->dims.dim_sizes[2]; const int KW = filter->dims.dim_sizes[3]; - int h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; - int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; - - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - + int h = + (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + + 1; + int w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / + horizontal_stride + + 1; + + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, + w); - output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + output = (Tensor *)create4DTensor((cudnnDataType_t)half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); - // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor @@ -466,117 +445,90 @@ void* tensorHalfConvCutlass(void* input_ptr, void* filter_ptr, int blockSize; blockSize = 128; - dim3 grid(((n + 7)/ 8), (c * h * w + blockSize - 1)/ blockSize); + dim3 grid(((n + 7) / 8), (c * h * w + blockSize - 1) / blockSize); dim3 block(blockSize); - depthwise_convNew8_half2<<<grid, block>>> ((__half*) output->gpu_half_data, - (__half*) input->gpu_half_data, - (__half*) filter->gpu_half_data, - input->dims.dim_sizes[0], input->dims.dim_sizes[1], - input->dims.dim_sizes[2], input->dims.dim_sizes[3], - KH, KW, h, w, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); + depthwise_convNew8_half2<<<grid, block>>>( + (__half *)output->gpu_half_data, (__half *)input->gpu_half_data, + (__half *)filter->gpu_half_data, input->dims.dim_sizes[0], + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, h, w, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride); cudaDeviceSynchronize(); - - } - else{ + } else { checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } - + // NOTE: Adding support for grouped convolution checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_half_desc, - filter->filter_half_desc, - &n, &c, &h, &w)); - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_half_desc, filter->filter_half_desc, &n, &c, &h, + &w)); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, + w); - output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, //input->data_type, - CUDNN_TENSOR_NCHW, 
n, c, h, w); + output = (Tensor *)create4DTensor( + (cudnnDataType_t)half_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); - // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", - output->data_type, output->data_format, - output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W " + "= %d, C = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) ERROR("NULL descriptor! \n"); - // NOTE: The following algo works with TRUE half precision convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + // convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_half_desc, - filter->filter_half_desc, - convDesc, - output->tensor_half_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_half_desc, filter->filter_half_desc, + convDesc, output->tensor_half_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace DEBUG("workspace size = %d \n", workspace_size); - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); - - - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, - &alpha, - input->tensor_half_desc, - input->gpu_half_data, - filter->filter_half_desc, - filter->gpu_half_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, - output->tensor_half_desc, - output->gpu_half_data)); - + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_half_desc, input->gpu_half_data, + filter->filter_half_desc, filter->gpu_half_data, convDesc, convAlgo, + workspace, workspace_size, &beta, output->tensor_half_desc, + output->gpu_half_data)); } - + profileEvent("H2F_start"); convertToFP32_offline(output); - - profileEvent("H2F_end"); + profileEvent("H2F_end"); profileEvent("#Conv_end"); - return output; - } - - - -}// End of Extern C - +} // End of Extern C diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu index e706080051a41dac1f7486027fcb9225793921bf..8324b18e044b37ee697a624e60ec77eb4bc7a8d5 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/half_precision_api.cu @@ -1,9 +1,11 @@ -//===--------------------------- half_precision_api.cu --------------------------===// +//===--------------------------- half_precision_api.cu +//--------------------------===// // //===----------------------------------------------------------------------===// -// -// This file consists of the custom implementation of tensor precision changing -// kernels useful for approximated and non-approximated versions of tensor +// +// This file 
consists of the custom implementation of tensor precision +// changing +// kernels useful for approximated and non-approximated versions of tensor // operations. This file also contains API for tensor operations operating on // tensors with half-precision. // @@ -12,7 +14,6 @@ #ifndef HALF_API_HEADER #define HALF_API_HEADER - #include <stdio.h> #include <stdarg.h> #include <cstdio> @@ -37,7 +38,6 @@ #include <cuda_fp16.h> #include <driver_types.h> - // Tensor runtime header files #include "../include/tensor_runtime.h" #include "../include/tensor_utils.h" @@ -48,15 +48,13 @@ #include "../include/fp16_gemm.h" #include "../include/fp16_conversion.h" - - -void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){ +void *tensorHalfGemm(void *lhs_ptr, void *rhs_ptr) { INFO("*** TensorHalfGemm \n"); profileEvent("#Mul"); - Tensor* lhs = (Tensor*) lhs_ptr; - Tensor* rhs = (Tensor*) rhs_ptr; + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; DEBUG("rhs->dims.num_dims = %d \n", rhs->dims.num_dims); DEBUG("lhs->dims.num_dims = %d \n", lhs->dims.num_dims); @@ -64,65 +62,60 @@ void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){ hostToDeviceCopy(lhs); hostToDeviceCopy(rhs); - profileEvent("F2H_start"); convertToFP16(lhs); convertToFP16(rhs); - - profileEvent("F2H_end"); + profileEvent("F2H_end"); // 'm' holds the batch dimension - assuming NCHW format Tensors int m = lhs->dims.dim_sizes[0]; // The rhs last dimension must contain the neurons - int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons int k = 1; - for (int j = 1 ; j < lhs->dims.num_dims; j++){ + for (int j = 1; j < lhs->dims.num_dims; j++) { k = k * lhs->dims.dim_sizes[j]; // input neurons } - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2]; + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; // Dimension-note: Check if k is same across the two tensors DEBUG("m = %d, n = %d, k = %d \n", m, n, k); - if(rhs_k != k){ + if (rhs_k != k) { ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k); } - // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines - Tensor* output = (Tensor*) create4DTensor(half_type, CUDNN_TENSOR_NCHW, - m, n, 1, 1); + // NOTE: Creating a 4D tensor to be compatible with later called cuDNN + // routines + Tensor *output = + (Tensor *)create4DTensor(half_type, CUDNN_TENSOR_NCHW, m, n, 1, 1); changeTensorPlacement(output, DEVICE); - //convertToFP16(output); - + // convertToFP16(output); // INFO: cuBlas uses column-major format // INFO: The leading dimension is just the FIRST Dimension - // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN expects + // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN + // expects const __half alf = approx_float_to_half(1.0); const __half bet = approx_float_to_half(0.0); const __half *alpha_half = &alf; const __half *beta_half = &bet; - - checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n, m, k, - alpha_half, - (__half*) rhs->gpu_half_data, CUDA_R_16F, n, - (__half*) lhs->gpu_half_data, CUDA_R_16F, k, - beta_half, - (__half*) output->gpu_half_data, CUDA_R_16F, n, - CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); - + checkCudaErrors(cublasGemmEx( + cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, alpha_half, + (__half *)rhs->gpu_half_data, CUDA_R_16F, n, (__half *)lhs->gpu_half_data, + CUDA_R_16F, k, beta_half, (__half *)output->gpu_half_data, CUDA_R_16F, n, + CUDA_R_16F, 
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); profileEvent("H2F_start"); convertToFP32_offline(output); - //h2f((half*) output_half->gpu_data, output->num_elems, (float*) output->gpu_data); + // h2f((half*) output_half->gpu_data, output->num_elems, (float*) + // output->gpu_data); profileEvent("H2F_end"); @@ -131,32 +124,28 @@ void* tensorHalfGemm(void* lhs_ptr, void* rhs_ptr){ return output; } - - -void* tensorHalfGemmGPU(void* lhs_ptr, void* rhs_ptr){ +void *tensorHalfGemmGPU(void *lhs_ptr, void *rhs_ptr) { return tensorHalfGemm(lhs_ptr, rhs_ptr); } - - // FIXIT: Generalize all of the routines for types {half, float, double} -void* tensorHalfConvolution(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ +void *tensorHalfConvolution(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups) { INFO("*** TensorHConvolution \n"); profileEvent("#Conv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - if(conv_mode == 0) + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; // FIXIT: Need to be more aware of the implications of alpha and beta @@ -168,7 +157,6 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr, hostToDeviceCopy(input); hostToDeviceCopy(filter); - /***** CONVERSIONS from FP32 to FP16 - on the GPU */ profileEvent("F2H_start"); @@ -178,95 +166,76 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr, profileEvent("F2H_end"); /******* END OF INPUT DATA CONVERSIONS*/ - - checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; } - + // NOTE: Adding support for grouped convolution checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - // FIXIT: Think if upscaling values need to be configurable? // IMP-FIXIT: CUDNN Cross correlation is only used in the Lenet context - // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE should be used? - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode, // mode is configurable - computeType)); // defines compute precision + // IMP-FIXIT: Either make mode configurable OR see if CUDNN_CONVOLUTION MODE + // should be used? 
+ checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision int n, c, h, w; // output dimensions // Find dimension of convolution output - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - - DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output = (Tensor*) create4DTensor((cudnnDataType_t) half_type, // input->data_type, - CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *output = + (Tensor *)create4DTensor((cudnnDataType_t)half_type, // input->data_type, + CUDNN_TENSOR_NCHW, n, c, h, w); // NOTE: Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - //convertToFP16(output); + // convertToFP16(output); - // NOTE: Necessary to insert the above call for every output tensor - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = %d, C = %d \n", - output->data_type, output->data_format, - output->dims.dim_sizes[0], output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, H = %d, W = " + "%d, C = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); - if(convDesc == NULL || input->tensor_half_desc == NULL || - filter->filter_half_desc == NULL || output->tensor_half_desc == NULL) + if (convDesc == NULL || input->tensor_half_desc == NULL || + filter->filter_half_desc == NULL || output->tensor_half_desc == NULL) ERROR("NULL descriptor! 
\n"); - // NOTE: The following algo works with TRUE half precision convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - //convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; + // convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; - size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_half_desc, - filter->filter_half_desc, - convDesc, - output->tensor_half_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_half_desc, filter->filter_half_desc, convDesc, + output->tensor_half_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace DEBUG("workspace size = %d \n", workspace_size); - void* workspace; + void *workspace; checkCudaErrors(cudaMalloc(&workspace, workspace_size)); - - - - checkCUDNN(cudnnConvolutionForward(cudnnHandle, - &alpha, - input->tensor_half_desc, - input->gpu_half_data, - filter->filter_half_desc, - filter->gpu_half_data, - convDesc, convAlgo, - workspace, workspace_size, - &beta, - output->tensor_half_desc, - output->gpu_half_data)); + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_half_desc, input->gpu_half_data, + filter->filter_half_desc, filter->gpu_half_data, convDesc, convAlgo, + workspace, workspace_size, &beta, output->tensor_half_desc, + output->gpu_half_data)); profileEvent("H2F_start"); @@ -279,21 +248,18 @@ void* tensorHalfConvolution(void* input_ptr, void* filter_ptr, return output; } - - - -void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon){ +void *tensorHalfBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr, + void *mean_ptr, void *variance_ptr, double epsilon) { INFO("*** TensorHalfBatchNorm \n"); profileEvent("#BatchNorm"); - Tensor* input = (Tensor*) input_ptr; - Tensor* gamma = (Tensor*) gamma_ptr; - Tensor* beta = (Tensor*) beta_ptr; - Tensor* mean = (Tensor*) mean_ptr; - Tensor* variance = (Tensor*) variance_ptr; - + Tensor *input = (Tensor *)input_ptr; + Tensor *gamma = (Tensor *)gamma_ptr; + Tensor *beta = (Tensor *)beta_ptr; + Tensor *mean = (Tensor *)mean_ptr; + Tensor *variance = (Tensor *)variance_ptr; + float alpha_val = 1.0f, beta_val = 0.0f; hostToDeviceCopy(input); hostToDeviceCopy(gamma); @@ -301,56 +267,37 @@ void* tensorHalfBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr, hostToDeviceCopy(mean); hostToDeviceCopy(variance); - profileEvent("F2H_start"); convertToFP16(input); profileEvent("F2H_end"); - - - - checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle, CUDNN_BATCHNORM_SPATIAL, - &alpha_val, &beta_val, - input->tensor_half_desc, - input->gpu_half_data, - input->tensor_half_desc, - input->gpu_half_data, - gamma->tensor_desc, gamma->gpu_data, - beta->gpu_data, mean->gpu_data, - variance->gpu_data, epsilon)); - + checkCUDNN(cudnnBatchNormalizationForwardInference( + cudnnHandle, CUDNN_BATCHNORM_SPATIAL, &alpha_val, &beta_val, + input->tensor_half_desc, input->gpu_half_data, input->tensor_half_desc, + input->gpu_half_data, gamma->tensor_desc, gamma->gpu_data, beta->gpu_data, + mean->gpu_data, variance->gpu_data, epsilon)); profileEvent("H2F_start"); convertToFP32_offline(input); - - profileEvent("H2F_end"); + profileEvent("H2F_end"); - profileEvent("#tensorHalfBatchNorm_end", true); - return input; } - - - -void* tensorHalfPooling(void* input_ptr, - int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - 
int vertical_stride, int horizontal_stride){ - - +void *tensorHalfPooling(void *input_ptr, int poolFunction, int window_height, + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { INFO("*** TensorHalfPooling \n"); profileEvent("#Pool"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; hostToDeviceCopy(input); @@ -366,218 +313,185 @@ void* tensorHalfPooling(void* input_ptr, // FIXIT: Need to be more aware of the implications of alpha and beta float alpha = 1.0f, beta = 0.0f; - checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); int n = input->dims.dim_sizes[0]; int c = input->dims.dim_sizes[1]; - int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / vertical_stride; + int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / + vertical_stride; h = h + 1; - int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / horizontal_stride; + int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / + horizontal_stride; w = w + 1; DEBUG("n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); // FIXIT: Don't be specific to floats - Tensor* output = (Tensor*) create4DTensor(half_type, CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *output = + (Tensor *)create4DTensor(half_type, CUDNN_TENSOR_NCHW, n, c, h, w); // Changing output tensor placement from host to device changeTensorPlacement(output, DEVICE); - //convertToFP16(output); + // convertToFP16(output); // FIXIT: Fix being specific to CUDNN_DATA_FLOAT and NCHW format // FIXIT: Is this setTensor even needed? checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_half_desc, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_HALF, - n, c, - h, w)); + CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, n, + c, h, w)); cudnnPoolingMode_t pool_mode; - if(poolFunction == 0) + if (poolFunction == 0) pool_mode = CUDNN_POOLING_MAX; - else if(poolFunction == 1) + else if (poolFunction == 1) pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - // FIXIT: Make the pool function (max, min, avg) configurable - checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc, - pool_mode, - CUDNN_PROPAGATE_NAN, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride)); - - - checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, - input->tensor_half_desc, - input->gpu_half_data, &beta, - output->tensor_half_desc, output->gpu_half_data)); - + checkCUDNN(cudnnSetPooling2dDescriptor( + poolDesc, pool_mode, CUDNN_PROPAGATE_NAN, window_height, window_width, + vertical_pad, horizontal_pad, vertical_stride, horizontal_stride)); + checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, + input->tensor_half_desc, input->gpu_half_data, + &beta, output->tensor_half_desc, + output->gpu_half_data)); profileEvent("H2F_start"); convertToFP32_offline(output); - + profileEvent("H2F_end"); - profileEvent("#tensorHalfPooling_end", true); return output; } - - - - -void* tensorHalfRelu2(void* input_ptr, float min, float max){ +void *tensorHalfRelu2(void *input_ptr, float min, float max) { INFO("*** TensorClippedRelu \n"); profileEvent("#Relu"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; cudnnActivationDescriptor_t reluDesc; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); - //**** Floating point to half conversions profileEvent("F2H_start"); convertToFP16(input); - + profileEvent("F2H_end"); /*** End of data type conversion **/ - checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc)); - 
checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, - CUDNN_PROPAGATE_NAN, 2.0)); - - checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha, - input->tensor_half_desc, input->gpu_half_data, &beta, - input->tensor_half_desc, input->gpu_half_data)); + checkCUDNN(cudnnSetActivationDescriptor( + reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_PROPAGATE_NAN, 2.0)); + checkCUDNN(cudnnActivationForward( + cudnnHandle, reluDesc, &alpha, input->tensor_half_desc, + input->gpu_half_data, &beta, input->tensor_half_desc, + input->gpu_half_data)); profileEvent("H2F_start"); // NOTE: Transforming half precision output to single precision convertToFP32_offline(input); - + profileEvent("H2F_end"); profileEvent("#tensorHalfClippedRelu_end"); - return input; } - - - -void* tensorHalfRelu(void* input_ptr){ +void *tensorHalfRelu(void *input_ptr) { INFO("*** TensorHalfRelu \n"); profileEvent("#Relu"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; cudnnActivationDescriptor_t reluDesc; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); - //**** Floating point to half conversions profileEvent("F2H_start"); convertToFP16(input); - + profileEvent("F2H_end"); /*** End of data type conversion **/ - checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc)); checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU, - CUDNN_PROPAGATE_NAN, 0.0)); + CUDNN_PROPAGATE_NAN, 0.0)); - checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha, - input->tensor_half_desc, input->gpu_half_data, &beta, - input->tensor_half_desc, input->gpu_half_data)); + checkCUDNN(cudnnActivationForward( + cudnnHandle, reluDesc, &alpha, input->tensor_half_desc, + input->gpu_half_data, &beta, input->tensor_half_desc, + input->gpu_half_data)); - profileEvent("H2F_start"); convertToFP32_offline(input); - + profileEvent("H2F_end"); - profileEvent("#tensorHalfRelu_end"); - return input; } - - - - - -void* tensorHalfTanh(void* input_ptr){ +void *tensorHalfTanh(void *input_ptr) { INFO("*** TensorHalfTanh \n"); profileEvent("#Tanh"); - - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; cudnnActivationDescriptor_t tanhDesc; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); - //**** Data conversion from float to half profileEvent("F2H_start"); convertToFP16(input); - + profileEvent("F2H_end"); /**** End of data type conversion ****/ - checkCUDNN(cudnnCreateActivationDescriptor(&tanhDesc)); checkCUDNN(cudnnSetActivationDescriptor(tanhDesc, CUDNN_ACTIVATION_TANH, - CUDNN_PROPAGATE_NAN, 0.0)); + CUDNN_PROPAGATE_NAN, 0.0)); - checkCUDNN(cudnnActivationForward(cudnnHandle, tanhDesc, &alpha, - input->tensor_half_desc, input->gpu_half_data, &beta, - input->tensor_half_desc, input->gpu_half_data)); + checkCUDNN(cudnnActivationForward( + cudnnHandle, tanhDesc, &alpha, input->tensor_half_desc, + input->gpu_half_data, &beta, input->tensor_half_desc, + input->gpu_half_data)); profileEvent("H2F_start"); convertToFP32_offline(input); - + profileEvent("H2F_end"); - profileEvent("#tensorHalfTanh_end"); - return input; } +void *tensorHalfAdd(void *x_ptr, void *bias_ptr) { - -void* tensorHalfAdd(void* x_ptr, void* bias_ptr){ - - Tensor* x = (Tensor*) x_ptr; - Tensor* bias = (Tensor*) bias_ptr; + Tensor *x = (Tensor *)x_ptr; + Tensor *bias = (Tensor *)bias_ptr; INFO("*** TensorHalfAdd \n"); profileEvent("#Add"); @@ -587,36 +501,29 @@ void* tensorHalfAdd(void* x_ptr, void* bias_ptr){ hostToDeviceCopy(x); hostToDeviceCopy(bias); - //**** Data 
conversion from float to half profileEvent("F2H_start"); convertToFP16(x); convertToFP16(bias); - + profileEvent("F2H_end"); /*** End of data type conversions ****/ - // FIXIT: routine fails for 3D tensors checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_half_desc, - bias->gpu_half_data, &alpha, - x->tensor_half_desc, x->gpu_half_data)); - + bias->gpu_half_data, &alpha, x->tensor_half_desc, + x->gpu_half_data)); profileEvent("H2F_start"); convertToFP32_offline(x); - + profileEvent("H2F_end"); - profileEvent("#tensorHalfAdd_end"); - return x; } - - #endif diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp index ae92f12335f79307765e188e549e2ab80247ccf0..c7237c0076f82009d6f6d7590c43d4e79571ec1f 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/hpvm-rt-controller.cpp @@ -1,47 +1,45 @@ -//===--------------------------- hpvm-rt-controller.cpp ---------------------===// +//===--------------------------- hpvm-rt-controller.cpp +//---------------------===// // //===----------------------------------------------------------------------===// -// -// This file contains code for that allows the tensor runtime to adapt +// +// This file contains code for that allows the tensor runtime to adapt // in response to external changes in conditions (such as frequency changes) // by helping to choose correct approximation configurations. It also provides // routines for the rest of the runtime to get performance and energy profiling. // //===----------------------------------------------------------------------===// - #include "hpvm-rt-controller.h" #include "global_data.h" #include <fstream> //-------- Functionality to read and update frequency on Jetson board -------// /*const char* available_freqs[] = {"140250000", "229500000", "318750000", - "408000000", "497250000", "586500000", + "408000000", "497250000", "586500000", "675750000", "765000000", "854250000", "943500000", "1032750000", "1122000000", "1211250000", "1300500000"}; */ - const int available_freqs[] = { -140250000, // 0 -229500000, // 1 -318750000, // 2 -408000000, // 3 -497250000, // 4 -586500000, // 5 -675750000, // 6 -765000000, // 7 -854250000, // 8 -943500000, // 9 -1032750000,// 10 -1122000000,// 11 -1211250000,// 12 -1300500000 // 13 + 140250000, // 0 + 229500000, // 1 + 318750000, // 2 + 408000000, // 3 + 497250000, // 4 + 586500000, // 5 + 675750000, // 6 + 765000000, // 7 + 854250000, // 8 + 943500000, // 9 + 1032750000, // 10 + 1122000000, // 11 + 1211250000, // 12 + 1300500000 // 13 }; - /*void updateJetsonGPUFreq(int freq_level) { if (freq_level < 0 || freq_level > 13) { @@ -49,7 +47,7 @@ const int available_freqs[] = { abort(); } - const char* freq_val = available_freqs[freq_level]; + const char* freq_val = available_freqs[freq_level]; printf("freq-val[0] = %s \n", freq_val); FILE* max_file = @@ -59,7 +57,7 @@ const int available_freqs[] = { } fwrite(freq_val, strlen(freq_val), 1, max_file); fclose(max_file); - + FILE* min_file = fopen("/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq", "w+"); if (min_file == NULL){ @@ -80,7 +78,7 @@ unsigned long int readJetsonGPUFreq() { char buf[50]; char* ptr; - + fread(buf, 50, 1, cur_freq_file); unsigned long cur_freq = strtoul(buf, &ptr, 10); fclose(cur_freq_file); @@ -89,14 +87,15 @@ unsigned long int readJetsonGPUFreq() { */ - // Sets frequency void setFreq(unsigned freq_index) { 
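  // Looks up the requested level in the available_freqs table above and
  // writes that value to both the min_freq and max_freq devfreq nodes of the
  // Jetson GP10B GPU (paths below), which effectively pins the GPU clock at
  // the selected frequency. For example, setFreq(13) selects
  // available_freqs[13] = 1300500000 Hz, the highest level in the table; the
  // caller, RuntimeController::updateFrequency(), only invokes this under
  // JETSON_EXECUTION.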
unsigned target_freq = available_freqs[freq_index]; - - const char * const min_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"; - const char * const max_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"; + + const char *const min_freq_file = + "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/min_freq"; + const char *const max_freq_file = + "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/max_freq"; std::ofstream min_stream; std::ofstream max_stream; @@ -115,7 +114,8 @@ void setFreq(unsigned freq_index) { unsigned recordFreq() { // Current frequency file - const char * const cur_freq_file = "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"; + const char *const cur_freq_file = + "/sys/devices/17000000.gp10b/devfreq/17000000.gp10b/cur_freq"; std::ifstream cur_stream; cur_stream.open(cur_freq_file, std::ifstream::in); @@ -128,10 +128,6 @@ unsigned recordFreq() { return cur_freq; } - - - - //---------------------------------------------------------------------------// /* @@ -145,13 +141,13 @@ bool fileExists(const std::string &file) { // There will be no frequency request for the first batch // Therefore, we skip the first element by initializing to 1, not 0. -FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) : - idx_list(il), rep_factor(rf), count(1), idx(0) {} +FrequencyIndexList::FrequencyIndexList(std::vector<int> il, unsigned rf) + : idx_list(il), rep_factor(rf), count(1), idx(0) {} unsigned FrequencyIndexList::getNextIndex() { if (count == rep_factor) { count = 0; - idx = (idx+1) % idx_list.size(); + idx = (idx + 1) % idx_list.size(); } count++; return idx_list[idx]; @@ -218,7 +214,7 @@ void ProfileInfo::readIterationFrequency() { frequency_current_iteration = recordFreq(); #else frequency_current_iteration = 0; -#endif //JETSON_EXECUTION +#endif // JETSON_EXECUTION } unsigned long ProfileInfo::getIterationFrequency() { @@ -285,15 +281,14 @@ void ProfileInfo::printToFile() { // to have equal sizes, in outer and inner vectors both, // and all time_info and energy_info vectors must have the same size. 
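  // The iteration count is taken from tensor_time_info; the assertion below
  // guards against any per-iteration vector (tensor/control/config time and
  // energy, plus frequency) drifting out of sync before the profile is
  // written out.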
unsigned iterations = tensor_time_info.size(); - CUSTOM_ASSERT( - (tensor_time_info.size() == iterations) && - (tensor_energy_info.size() == iterations) && - (control_time_info.size() == iterations) && - (control_energy_info.size() == iterations) && - (config_time_info.size() == iterations) && - (config_energy_info.size() == iterations) && - (frequency_info.size() == iterations) && - "time_info, energy_info, frequency_info size: \ + CUSTOM_ASSERT((tensor_time_info.size() == iterations) && + (tensor_energy_info.size() == iterations) && + (control_time_info.size() == iterations) && + (control_energy_info.size() == iterations) && + (config_time_info.size() == iterations) && + (config_energy_info.size() == iterations) && + (frequency_info.size() == iterations) && + "time_info, energy_info, frequency_info size: \ iteration number does not match."); for (unsigned i = 0; i < tensor_time_info.size(); i++) { @@ -343,8 +338,8 @@ ProfileInfo::ProfileInfo() time_control_current_iteration(0.0), time_config_current_iteration(0.0), energy_compute_current_iteration(0.0), energy_control_current_iteration(0.0), - energy_config_current_iteration(0.0), - frequency_current_iteration(0), in_iteration(false) {} + energy_config_current_iteration(0.0), frequency_current_iteration(0), + in_iteration(false) {} Slowdowns::Slowdowns() { idx = 0; @@ -386,52 +381,50 @@ void RuntimeController::stop_profiler() { profiler->stop_profiler(); } // For testing purposes only - do not use widely -std::vector<struct Configuration *> &RuntimeController:: -getSpeedupConfigurations() { +std::vector<struct Configuration *> & +RuntimeController::getSpeedupConfigurations() { return SpeedupConfigurations; } // For testing purposes only - do not use widely -std::vector<struct Configuration *> &RuntimeController:: -getEnergyConfigurations() { +std::vector<struct Configuration *> & +RuntimeController::getEnergyConfigurations() { return EnergyConfigurations; } // For testing purposes only - do not use widely -std::vector<struct Configuration *> &RuntimeController:: -getThreeDCurveConfigurations() { +std::vector<struct Configuration *> & +RuntimeController::getThreeDCurveConfigurations() { return ThreeDCurveConfigurations; } // For testing purposes only - do not use widely unsigned RuntimeController::getConfigurationIdx() { return configurationIdx; } double RuntimeController::getCurrentConfigurationSpeedup() { - return (double) (*Configurations)[configurationIdx]->speedup; + return (double)(*Configurations)[configurationIdx]->speedup; } double RuntimeController::getCurrentConfigurationEnergy() { - return (double) (*Configurations)[configurationIdx]->energy; + return (double)(*Configurations)[configurationIdx]->energy; } double RuntimeController::getCurrentConfigurationAccuracy() { - return (double) (*Configurations)[configurationIdx]->accuracy; + return (double)(*Configurations)[configurationIdx]->accuracy; } double RuntimeController::getCurrentConfigurationAccuracyLoss() { - return (double) (*Configurations)[configurationIdx]->accuracyLoss; + return (double)(*Configurations)[configurationIdx]->accuracyLoss; } NodeConfiguration *RuntimeController::getNodeConfiguration(const char *data) { // if visc.node.id Not specified for this HPVM Node - if (currentTensorID == -1){ + if (currentTensorID == -1) { std::string s(data); // All nodes are expected to have a configuration return (*Configurations)[configurationIdx]->setup.at(s); - } - else{ - DEBUG("-- currentTensorID = \%u \n", currentTensorID); + } else { + DEBUG("-- currentTensorID = \%u \n", 
currentTensorID); return (*Configurations)[configurationIdx]->idConfigMap.at(currentTensorID); } - } void RuntimeController::init(const char *Cstr) { @@ -440,7 +433,8 @@ void RuntimeController::init(const char *Cstr) { setProfileInfoFilename(Cstr); readConfigurationFile(Cstr); - // NOTE: Configurations is pareto-configs. InitialConfigurations is the full list (config file) + // NOTE: Configurations is pareto-configs. InitialConfigurations is the full + // list (config file) Configurations = NULL; computeParetoConfigurationPoints(); // compute3DParetoConfigurationPoints(); Not using 3D curve @@ -461,8 +455,10 @@ void RuntimeController::init(const char *Cstr) { // Pseudo random variable (when we did few experiments) // or true random numbers for probabilistic control pseudo_rd = 0.0; - std::random_device rd; //Will be used to obtain a seed for the random number engine - generator = std::mt19937 (rd()); //Standard mersenne_twister_engine seeded with rd() + std::random_device + rd; // Will be used to obtain a seed for the random number engine + generator = + std::mt19937(rd()); // Standard mersenne_twister_engine seeded with rd() distr = std::uniform_real_distribution<>(0.0, 1.0); g_freq = available_freqs[13]; @@ -484,8 +480,8 @@ void RuntimeController::end_iteration() { PI->end_iteration(); } -void RuntimeController::addToCurrentIterationComputeTime( - const char *s, double t) { +void RuntimeController::addToCurrentIterationComputeTime(const char *s, + double t) { if (PI) PI->addToCurrentIterationComputeTime(s, t); } @@ -500,8 +496,8 @@ void RuntimeController::addToCurrentIterationConfigTime(double t) { PI->addToCurrentIterationConfigTime(t); } -void RuntimeController::addToCurrentIterationComputeEnergy( - const char *s, double e) { +void RuntimeController::addToCurrentIterationComputeEnergy(const char *s, + double e) { if (PI) PI->addToCurrentIterationComputeEnergy(s, e); } @@ -539,8 +535,8 @@ void RuntimeController::updateFrequency() { //--- updateJetsonGPUFreq(freq_idx); setFreq(freq_idx); - -#endif //JETSON_EXECUTION + +#endif // JETSON_EXECUTION } void RuntimeController::writeProfileInfo() { @@ -573,11 +569,9 @@ std::pair<double, double> RuntimeController::fc_profile( const unsigned num_rows_a, const unsigned num_cols_a, const unsigned num_rows_b, const unsigned num_cols_b, const unsigned voltage_swing, const unsigned patch_factor) { - return ( - promise ? promise->fc_profile( - num_rows_a, num_cols_a, num_rows_b, num_cols_b, - voltage_swing, patch_factor) - : std::make_pair(0.0, 0.0)); + return (promise ? promise->fc_profile(num_rows_a, num_cols_a, num_rows_b, + num_cols_b, voltage_swing, patch_factor) + : std::make_pair(0.0, 0.0)); } std::pair<double, double> RuntimeController::conv_profile( @@ -585,17 +579,16 @@ std::pair<double, double> RuntimeController::conv_profile( const unsigned c_out, const unsigned c_in, const unsigned k_h, const unsigned k_w, const unsigned s_h, const unsigned s_w, const unsigned voltage_swing, const unsigned patch_factor) { - return ( - promise ? promise->conv_profile( - n, c, h, w, c_out, c_in, k_h, k_w, s_h, s_w, voltage_swing, - patch_factor) - : std::make_pair(0.0, 0.0)); + return (promise ? 
promise->conv_profile(n, c, h, w, c_out, c_in, k_h, k_w, + s_h, s_w, voltage_swing, patch_factor) + : std::make_pair(0.0, 0.0)); } // Constructor and descructor RuntimeController::RuntimeController() { configurationIdx = 0; - FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, 10); + FIL = new FrequencyIndexList({13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + 10); #ifdef ACTIVE_PROFILING PI = new ProfileInfo(); profiler = new Profiler(); @@ -679,16 +672,14 @@ void RuntimeController::readConfigurationFile(const char *str) { std::getline(qin, first_line); DEBUG("first_line: %s\n", first_line.c_str()); - try{ + try { baseline_time = std::stod(first_line); DEBUG("Baseline time: %lf\n\n", baseline_time); - } - catch(...){ + } catch (...) { ERROR("Please Add/Fix Baseline Time at Top of Config File.. "); } - - unsigned int firstTensorID = 1; + unsigned int firstTensorID = 1; for (std::string line; std::getline(qin, line);) { DEBUG("line: %s\n", line.c_str()); @@ -721,10 +712,10 @@ void RuntimeController::readConfigurationFile(const char *str) { // Read first line, to create the new configuration struct readingFirstLine = false; firstTensorID = 1; // reset first tensor ID for new config - - InitialConfigurations.push_back(Configuration( - tokens[0], std::stof(tokens[1]), std::stof(tokens[2]), - std::stof(tokens[3]), std::stof(tokens[4]))); + + InitialConfigurations.push_back( + Configuration(tokens[0], std::stof(tokens[1]), std::stof(tokens[2]), + std::stof(tokens[3]), std::stof(tokens[4]))); continue; } @@ -732,9 +723,8 @@ void RuntimeController::readConfigurationFile(const char *str) { DEBUG("Found gpu configuration\n"); // There must be at least one operation, with an approximation option - CUSTOM_ASSERT( - (tokens.size() >= 5) && - "Not enough operations - approximation options."); + CUSTOM_ASSERT((tokens.size() >= 5) && + "Not enough operations - approximation options."); GPUNodeConfiguration *NodeConf = new GPUNodeConfiguration(); InitialConfigurations.back().setup.insert( @@ -745,7 +735,7 @@ void RuntimeController::readConfigurationFile(const char *str) { InitialConfigurations.back().idConfigMap.insert( std::make_pair(firstTensorID, NodeConf)); DEBUG("*** firstTensorID = %d \n\n", firstTensorID); - + unsigned idx = 2; while (idx < tokens.size()) { if (tokens[idx] == "add") { @@ -894,14 +884,13 @@ void RuntimeController::readConfigurationFile(const char *str) { // Update first TensorID using number of tensor ops in current node firstTensorID += NodeConf->getApproxChoices().size(); - + } else if (tokens[1] == "cpu") { DEBUG("Found gpu configuration\n"); // There must be at least one operation, with an approximation option - CUSTOM_ASSERT( - (tokens.size() >= 5) && - "Not enough operations - approximation options."); + CUSTOM_ASSERT((tokens.size() >= 5) && + "Not enough operations - approximation options."); CPUNodeConfiguration *NodeConf = new CPUNodeConfiguration(); InitialConfigurations.back().setup.insert( @@ -1017,9 +1006,8 @@ void RuntimeController::computeParetoConfigurationPoints() { // Sort the configurations according to accuracy loss INFO("Sorting autotuner configurations...\n"); - std::sort( - InitialConfigurations.begin() + 1, InitialConfigurations.end(), - ConfigurationLessThan()); + std::sort(InitialConfigurations.begin() + 1, InitialConfigurations.end(), + ConfigurationLessThan()); INFO("Done sorting.\n"); for (unsigned start_idx = 1; start_idx < InitialConfigurations.size();) { @@ -1053,14 +1041,12 @@ void 
RuntimeController::computeParetoConfigurationPoints() { en_idx = i; } } - DEBUG( - "accuracy loss = %f, speedup = %f, at sp_idx = %d\n", - InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx); + DEBUG("accuracy loss = %f, speedup = %f, at sp_idx = %d\n", + InitialConfigurations[sp_idx].accuracyLoss, sp, sp_idx); // Found best speedup for this accuracy point (not dominated by any of // these). - DEBUG( - "accuracy loss = %f, energy = %f, at en_idx = %d\n", - InitialConfigurations[en_idx].accuracyLoss, en, en_idx); + DEBUG("accuracy loss = %f, energy = %f, at en_idx = %d\n", + InitialConfigurations[en_idx].accuracyLoss, en, en_idx); // Found best energy for this accuracy point (not dominated by any of // these). @@ -1130,9 +1116,8 @@ void RuntimeController::compute3DParetoConfigurationPoints() { // Sort the configurations according to accuracy loss INFO("Sorting autotuner configurations...\n"); - std::sort( - InitialConfigurations.begin(), InitialConfigurations.end(), - ConfigurationLessThan()); + std::sort(InitialConfigurations.begin(), InitialConfigurations.end(), + ConfigurationLessThan()); INFO("Done sorting.\n"); for (unsigned start_idx = 0; start_idx < InitialConfigurations.size();) { @@ -1166,11 +1151,10 @@ void RuntimeController::compute3DParetoConfigurationPoints() { } } if (!dominated) { - DEBUG( - "accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n", - InitialConfigurations[i].accuracyLoss, - InitialConfigurations[i].speedup, InitialConfigurations[i].energy, - i); + DEBUG("accuracy loss = %f, speedup = %f, energy = %f, at idx = %d\n", + InitialConfigurations[i].accuracyLoss, + InitialConfigurations[i].speedup, InitialConfigurations[i].energy, + i); Indices.push_back(i); } } @@ -1229,31 +1213,22 @@ void RuntimeController::printConfigurations( } } -unsigned long RuntimeController::getLastFrequency() { - return g_freq; -} +unsigned long RuntimeController::getLastFrequency() { return g_freq; } -void RuntimeController::setLastFrequency(unsigned long f) { - g_freq = f; -} +void RuntimeController::setLastFrequency(unsigned long f) { g_freq = f; } -double RuntimeController::getLastSpeedup() { - return g_speedup; -} +double RuntimeController::getLastSpeedup() { return g_speedup; } -void RuntimeController::setLastSpeedup(double s) { - g_speedup = s; -} +void RuntimeController::setLastSpeedup(double s) { g_speedup = s; } void RuntimeController::findNextConfiguration() { configurationIdx = (configurationIdx + 1) % Configurations->size(); - DEBUG( - "findNextConfiguration: Updated configurationIdx to %u.\n", - configurationIdx); + DEBUG("findNextConfiguration: Updated configurationIdx to %u.\n", + configurationIdx); } -void RuntimeController::findTargetConfiguration( - float goal, enum SEARCH_KIND sk) { +void RuntimeController::findTargetConfiguration(float goal, + enum SEARCH_KIND sk) { // We search in range begin(), end()-1 . It is OK to decrement end(), because // the configurations vector always points to one of the pareto curves, and // they are never empty - we have always pushed at least one configuration. 
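  // The switch below selects which pareto curve 'Configurations' points at
  // (SpeedupConfigurations for SPEEDUP and ACCURACY_LOSS,
  // EnergyConfigurations for ENERGY) and binary-searches it with
  // std::lower_bound using the matching comparator; for ACCURACY_LOSS the
  // index is stepped back by one if the configuration found overshoots the
  // goal. A hypothetical call such as findTargetConfiguration(2.0, SPEEDUP)
  // runs this search over the speedup curve for a 2x speedup goal and records
  // the resulting position in configurationIdx.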
@@ -1264,25 +1239,25 @@ void RuntimeController::findTargetConfiguration( case SPEEDUP: { // Assigning one of Pareto configs to 'Configurations' class attribute Configurations = &SpeedupConfigurations; - low_it = std::lower_bound( - Configurations->begin(), Configurations->end() - 1, goal, - ConfigurationLessThan_SP()); + low_it = + std::lower_bound(Configurations->begin(), Configurations->end() - 1, + goal, ConfigurationLessThan_SP()); configurationIdx = low_it - Configurations->begin(); break; } case ENERGY: { Configurations = &EnergyConfigurations; - low_it = std::lower_bound( - Configurations->begin(), Configurations->end() - 1, goal, - ConfigurationLessThan_E()); + low_it = + std::lower_bound(Configurations->begin(), Configurations->end() - 1, + goal, ConfigurationLessThan_E()); configurationIdx = low_it - Configurations->begin(); break; } case ACCURACY_LOSS: { Configurations = &SpeedupConfigurations; - low_it = std::lower_bound( - Configurations->begin(), Configurations->end() - 1, goal, - ConfigurationLessThan_AL()); + low_it = + std::lower_bound(Configurations->begin(), Configurations->end() - 1, + goal, ConfigurationLessThan_AL()); if ((*low_it)->accuracyLoss > goal) --low_it; configurationIdx = low_it - Configurations->begin(); @@ -1297,9 +1272,8 @@ void RuntimeController::findTargetConfiguration( // After search, low_it points to the Configuration to the element with the // goal value or the immediately lower value if it does not exist - DEBUG( - "findTargetConfiguration: Updated configurationIdx to %u.\n", - configurationIdx); + DEBUG("findTargetConfiguration: Updated configurationIdx to %u.\n", + configurationIdx); } void RuntimeController::adjustTargetConfiguration(float goal) { @@ -1310,8 +1284,8 @@ void RuntimeController::adjustTargetConfiguration(float goal) { // Find configuration before the selected one. // There is always one, unless goal is 1. Then, we would pick baseline, and // both upper and lower should be the same configuration, at index 0. - unsigned prev_conf_idx = configurationIdx > 0 ? configurationIdx - 1 - : configurationIdx; + unsigned prev_conf_idx = + configurationIdx > 0 ? configurationIdx - 1 : configurationIdx; // Get the two configurations' speedup, and compute the appropriate ranges float curr_conf_speedup = (*Configurations)[configurationIdx]->speedup; float prev_conf_speedup = (*Configurations)[prev_conf_idx]->speedup; @@ -1330,32 +1304,32 @@ void RuntimeController::adjustTargetConfiguration(float goal) { //***--- Probability adjustment strategy 1 ---***// // No big adjustments at edges of probability range -// float adjust_val = 0.0; -// if (low_pb < high_pb) { -// adjust_val = low_pb * 0.2; -// } else { -// adjust_val = high_pb * 0.2; -// } -// low_pb -= adjust_val; -// high_pb += adjust_val; + // float adjust_val = 0.0; + // if (low_pb < high_pb) { + // adjust_val = low_pb * 0.2; + // } else { + // adjust_val = high_pb * 0.2; + // } + // low_pb -= adjust_val; + // high_pb += adjust_val; //***--- ---***// //***--- Probability adjustment strategy 2 ---***// // No big adjustment at high edge of probability range -// float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2; -// low_pb -= adjust_val; -// high_pb += adjust_val; + // float adjust_val = high_pb * 0.2 > 0.1 ? 0.1 : high_pb * 0.2; + // low_pb -= adjust_val; + // high_pb += adjust_val; //***--- ---***// //***--- Probability adjustment strategy 3 ---***// - //Similar to 2, but higher always increases, more significantly -// float adjust_val = low_pb * 0.5 > 0.1 ? 
0.1 : low_pb * 0.5; -// low_pb -= adjust_val; -// high_pb += adjust_val; + // Similar to 2, but higher always increases, more significantly + // float adjust_val = low_pb * 0.5 > 0.1 ? 0.1 : low_pb * 0.5; + // low_pb -= adjust_val; + // high_pb += adjust_val; //***--- ---***// //***--- Probability adjustment strategy 4 ---***// - //Similar to 2, but higher always increases, more significantly + // Similar to 2, but higher always increases, more significantly // Low end, high end a bit less aggressive than total range float adjust_val = low_pb * 0.6 > 0.2 ? 0.2 : low_pb * 0.6; adjust_val = adjust_val > high_pb ? high_pb : adjust_val; @@ -1364,20 +1338,18 @@ void RuntimeController::adjustTargetConfiguration(float goal) { //***--- ---***// } - DEBUG( - "**---- adjustTargetConfiguration: upper conf = %s with probability: " - "%f.\n", - ((*Configurations)[configurationIdx]->name).c_str(), high_pb); - DEBUG( - "**---- adjustTargetConfiguration: lower conf = %s with probability: " - "%f.\n\n", - ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb); + DEBUG("**---- adjustTargetConfiguration: upper conf = %s with probability: " + "%f.\n", + ((*Configurations)[configurationIdx]->name).c_str(), high_pb); + DEBUG("**---- adjustTargetConfiguration: lower conf = %s with probability: " + "%f.\n\n", + ((*Configurations)[prev_conf_idx]->name).c_str(), low_pb); // Select a random number from 0 to 1 // We assign the (0..low_pb) to the lower configuration, and the (low_pb..1) // to the upper // float rd = static_cast <float> (rand()) / static_cast <float> (RAND_MAX) ; - //float rd = pseudo_rd; + // float rd = pseudo_rd; float rd = distr(generator); if (rd < low_pb) { // If the probability is in the low range @@ -1411,8 +1383,8 @@ extern "C" void llvm_hpvm_clearRuntimeController() { //*** Methods to compute accuracy of a tensor by the runtime controller ***// uint32_t *labels_from_file = NULL; -uint32_t * -hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, int end) { +uint32_t *hpvm_rt_readLabelsBatch_cached(const char *labels_file, int start, + int end) { // Initialize buffer if (!labels_from_file) { @@ -1485,10 +1457,10 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) { float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; printf("****** Accuracy = %f \n\n", accuracy); - - average_accuracy = accuracy + (average_accuracy * num_executations); + + average_accuracy = accuracy + (average_accuracy * num_executations); num_executations++; - average_accuracy = average_accuracy/num_executations; + average_accuracy = average_accuracy / num_executations; FILE *fp = fopen("final_accuracy", "w+"); if (fp != NULL) { @@ -1510,8 +1482,8 @@ float hpvm_rt_computeAccuracy3(uint32_t *labels, void *result_ptr) { //#define llvm_hpvm_invokeRtControl_ADJUST_PR llvm_hpvm_invokeRtControl //#define llvm_hpvm_invokeRtControl_ITERATE llvm_hpvm_invokeRtControl -extern "C" void llvm_hpvm_invokeRtControl_BASE( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_BASE(void *result, const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1528,16 +1500,15 @@ extern "C" void llvm_hpvm_invokeRtControl_BASE( RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - INFO( - "current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, 
current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ITERATE( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_ITERATE(void *result, const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1561,16 +1532,15 @@ extern "C" void llvm_hpvm_invokeRtControl_ITERATE( RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - INFO( - "current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ADJUST( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_ADJUST(void *result, const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1613,17 +1583,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST( RC->addToCurrentIterationConfigEnergy(pinfo2.second); //* */ - INFO( - "current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("target speedup = %lf\n\n", target_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR(void *result, + const char *str, int start, + int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1667,17 +1637,17 @@ extern "C" void llvm_hpvm_invokeRtControl_ADJUST_PR( RC->addToCurrentIterationConfigEnergy(pinfo2.second); //* */ - INFO( - "current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("target speedup = %lf\n\n", target_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN(void *result, + const char *str, int start, + int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1704,21 +1674,20 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN( float next_conf_speedup = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - INFO( - "current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("slowdown (target speedup) = %f\n", slowdown); INFO("Previous configuration: %s\n", 
prev_conf_name.c_str()); - INFO( - "Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); + INFO("Swapping to next configuration: %s with speedup %f\n\n", + next_conf_name.c_str(), next_conf_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR(void *result, + const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1746,21 +1715,19 @@ extern "C" void llvm_hpvm_invokeRtControl_SLOWDOWN_PR( float next_conf_speedup = RC->getSpeedupConfigurations()[RC->getConfigurationIdx()]->speedup; - INFO( - "current iteration time = %f, current iteration energy = %f\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n", + current_iteration_time, current_iteration_energy); INFO("slowdown (target speedup) = %f\n", slowdown); INFO("Previous configuration: %s\n", prev_conf_name.c_str()); - INFO( - "Swapping to next configuration: %s with speedup %f\n\n", - next_conf_name.c_str(), next_conf_speedup); + INFO("Swapping to next configuration: %s with speedup %f\n\n", + next_conf_name.c_str(), next_conf_speedup); // Note the end of iteration RC->end_iteration(); } -extern "C" void llvm_hpvm_invokeRtControl_RAND( - void *result, const char *str, int start, int end) { +extern "C" void llvm_hpvm_invokeRtControl_RAND(void *result, const char *str, + int start, int end) { uint32_t *labels_cached = hpvm_rt_readLabelsBatch_cached(str, start, end); hpvm_rt_computeAccuracy3(labels_cached, result); @@ -1778,9 +1745,8 @@ extern "C" void llvm_hpvm_invokeRtControl_RAND( RC->addToCurrentIterationControlTime(pinfo.first); RC->addToCurrentIterationControlEnergy(pinfo.second); - INFO( - "current iteration time = %f, current iteration energy = %f\n\n", - current_iteration_time, current_iteration_energy); + INFO("current iteration time = %f, current iteration energy = %f\n\n", + current_iteration_time, current_iteration_energy); // Note the end of iteration RC->end_iteration(); @@ -1791,7 +1757,7 @@ static void writeVectorToFile(const char *path, const std::vector<T> &vec) { std::ofstream of(path, std::ofstream::out | std::ofstream::app); if (!of.good()) ERROR("Cannot write to %s file", path); - for (float f: vec) + for (float f : vec) of << f << ' '; of << '\n'; } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc index 74ee15c2dcb916f9a7a24fdc1318255e626844b3..b322ee2be37b60487e15c9109d4230adf1ad84e2 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/init_api.cc @@ -68,9 +68,7 @@ void llvm_hpvm_initApproxhpvmRt(int gpuid) { void llvm_hpvm_cleanupApproxhpvmRt() {} -void dumpAccuracyNorms() { - dump_result("accuracy_summary"); -} +void dumpAccuracyNorms() { dump_result("accuracy_summary"); } // Returns the number of GPUs active on the platform unsigned int getGPUCount() { diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc index ad1d2e137d19d1c158afb031f35f278d9cdefaa0..08f13bf0f891e03f3d13e0c2f2e8bc97bacb3b64 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc +++ 
b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/profiling.cc @@ -1,13 +1,12 @@ //===----------------------------- profling.cc ---------------------------===// // //===----------------------------------------------------------------------===// -// +// // This file contains code provides the definition of the interface for // applications to start and stop profiling for energy and performance. // //===----------------------------------------------------------------------===// - #ifndef PROFILING_HEADER #define PROFILING_HEADER @@ -52,7 +51,7 @@ void stopProfiling() { void profileEvent(const char *event_name, bool compare_previous = false) { checkCudaErrors(cudaDeviceSynchronize()); - + auto it = func_counters.find(event_name); if (it == func_counters.end()) { func_counters[event_name] = 1; @@ -73,7 +72,7 @@ void profileEvent(const char *event_name, bool compare_previous = false) { time_reading - zero_time; DEBUG("AbsoluteTime, Event = %s, Time = %f \n", event_name, - current_time.count()); + current_time.count()); profile_data.append(event_name); profile_data.append(event_count); profile_data.append("\t"); @@ -86,14 +85,13 @@ void profileEvent(const char *event_name, bool compare_previous = false) { profile_data.append("\t"); profile_data.append(std::to_string(duration_time.count())); DEBUG("TimeDuration, Event = %s, Time = %f \n", event_name, - duration_time.count()); + duration_time.count()); } profile_data.append("\n"); previous_time = time_reading; // set the previous time reading to the current // profiled time - } } diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc index 9250810a2010a235074c0d29b8fe8bd63650324c..7a1acd2ba03871e015d289a999a0ea9a05ed5cd8 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc @@ -1,11 +1,11 @@ //===--------------------------- tensor_runtime_cpu.cc --------------------===// // //===----------------------------------------------------------------------===// -// -// This file consists of the custom implementation of non-approximated and -// approximated versions of tensor operations to execute on CPUs. The -// software approximations implemented for tensor convolutions are feature -// sampling and perforation for FP32 compute precisions only. +// +// This file consists of the custom implementation of non-approximated and +// approximated versions of tensor operations to execute on CPUs. The +// software approximations implemented for tensor convolutions are feature +// sampling and perforation for FP32 compute precisions only. 
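// The dense path (tensorRegularConvolutionCPU) lowers each input window into
// an im2col-style patch buffer and multiplies it against the filters with
// OpenMP-parallelized loops and SIMD reductions. In the sampled path
// (tensorRegularFilterSamplingConvolutionCPU), every skip_every-th filter
// element (with a per-channel offset derived from 'start') is dropped and the
// surviving weights are rescaled by skip_every / (skip_every - 1) so each dot
// product keeps roughly the same expected magnitude.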
// //===----------------------------------------------------------------------===// @@ -29,7 +29,7 @@ #include <string> #include <vector> #include <math.h> -#include<bits/stdc++.h> +#include <bits/stdc++.h> #include <pthread.h> #include <omp.h> @@ -39,1081 +39,1140 @@ #include "tensor_cpu_runtime.h" void llvm_hpvm_initTensorRtCPU() { - // NOTE: Do Nothing + // NOTE: Do Nothing } void llvm_hpvm_cleanupTensorRtCPU() { - // NOTE: Do Nothing + // NOTE: Do Nothing } void hpvm_request_tensorCPU(void *tensor, int destination) { - // NOTE: Do Nothing + // NOTE: Do Nothing } - + std::vector<void *> PtrVect; void freeBatchMemory() { - for(auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) { - free(*it); - } - PtrVect.erase(PtrVect.begin(), PtrVect.end()); + for (auto it = PtrVect.rbegin(); it != PtrVect.rend(); it++) { + free(*it); + } + PtrVect.erase(PtrVect.begin(), PtrVect.end()); } - -int getTypeSizeCPU(int data_type) __attribute__((always_inline)); +int getTypeSizeCPU(int data_type) __attribute__((always_inline)); inline int getTypeSizeCPU(int data_type) { - return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1); + return (data_type == 0) ? 4 : ((data_type == 1) ? 2 : 1); } -void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) __attribute__((always_inline)); -inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) { - int type_size = getTypeSizeCPU(data_type); - size_t size_in_bytes = type_size * num_elems; - tensor->size_in_bytes = size_in_bytes; +void setSizeInBytesCPU(struct Tensor *tensor, int data_type, size_t num_elems) + __attribute__((always_inline)); +inline void setSizeInBytesCPU(struct Tensor *tensor, int data_type, + size_t num_elems) { + int type_size = getTypeSizeCPU(data_type); + size_t size_in_bytes = type_size * num_elems; + tensor->size_in_bytes = size_in_bytes; } -void allocateMemCPU(struct Tensor *tensor, int data_type, - size_t num_elems, bool freeMemory = true) __attribute__((always_inline)); -inline void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, bool freeMemory) { - setSizeInBytesCPU(tensor, data_type, num_elems); - tensor->data_type = data_type; - tensor->num_elems = num_elems; - tensor->host_data = (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host - if(freeMemory) - PtrVect.push_back(tensor->host_data); +void allocateMemCPU(struct Tensor *tensor, int data_type, size_t num_elems, + bool freeMemory = true) __attribute__((always_inline)); +inline void allocateMemCPU(struct Tensor *tensor, int data_type, + size_t num_elems, bool freeMemory) { + setSizeInBytesCPU(tensor, data_type, num_elems); + tensor->data_type = data_type; + tensor->num_elems = num_elems; + tensor->host_data = + (void *)malloc(tensor->size_in_bytes); // Allocate memory on the host + if (freeMemory) + PtrVect.push_back(tensor->host_data); } -void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) __attribute__((always_inline)); -inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { - Tensor *tensor = (Tensor *)tensor_ptr; - if (tensor->size_in_bytes != size_in_bytes) { - printf("The destination and source sizes don't match"); - } - memcpy(tensor->host_data, data_ptr, size_in_bytes); // Is this efficient enough? 
+void initTensorDataCPU(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) + __attribute__((always_inline)); +inline void initTensorDataCPU(void *tensor_ptr, void *data_ptr, + size_t size_in_bytes) { + Tensor *tensor = (Tensor *)tensor_ptr; + if (tensor->size_in_bytes != size_in_bytes) { + printf("The destination and source sizes don't match"); + } + memcpy(tensor->host_data, data_ptr, + size_in_bytes); // Is this efficient enough? } void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size, - bool freeMemory = true) __attribute__((always_inline)); -inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, - size_t dim4_size, bool freeMemory) { - struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - if(freeMemory) - PtrVect.push_back(tensor); - allocateMemCPU(tensor, data_type, num_elems, freeMemory); - - // Setting the tensor dimensions - size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - dim_sizes[2] = dim3_size; - dim_sizes[3] = dim4_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 4; - tensor->data_placement = HOST; - return tensor; + size_t dim2_size, size_t dim3_size, size_t dim4_size, + bool freeMemory = true) __attribute__((always_inline)); +inline void *create4DTensorCPU(int data_type, int data_format, size_t dim1_size, + size_t dim2_size, size_t dim3_size, + size_t dim4_size, bool freeMemory) { + struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; + if (freeMemory) + PtrVect.push_back(tensor); + allocateMemCPU(tensor, data_type, num_elems, freeMemory); + + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + dim_sizes[2] = dim3_size; + dim_sizes[3] = dim4_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 4; + tensor->data_placement = HOST; + return tensor; } -void* tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, - int horizontal_stride, int conv_mode, - int compute_precision) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int output_size = output_width * output_height; - printf("--CREATE 4D TENSOR\n"); - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - output_height, output_width); - float * __restrict__ output_data = (float *)output->host_data; - printf("CREATED 4D TENSOR\n"); - long int conv_data_size = - sizeof(float) * num_filter_elem 
* output_height * output_width * batch_size; - float *host_data = (float *) malloc(conv_data_size); - printf("host data: %p\n", host_data); - printf("number of batches: %d\n", batch_size); - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int ch = 0; ch < channels; ch++) { - for(int h = 0; h < output_height; h++) { - for(int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for(int i = 0; i < kernel_height; i++) { - for(int j = 0; j < kernel_width; j++) { - const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size - + output_index * num_filter_elem + filter_elem_num; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } - } +void *tensorRegularConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int output_width = 1 + ((image_width - kernel_width + 2 * horizontal_pad) / + horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int output_size = output_width * output_height; + printf("--CREATE 4D TENSOR\n"); + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, output_width); + float *__restrict__ output_data = (float *)output->host_data; + printf("CREATED 4D TENSOR\n"); + long int conv_data_size = sizeof(float) * num_filter_elem * output_height * + output_width * batch_size; + float *host_data = (float *)malloc(conv_data_size); + printf("host data: %p\n", host_data); + printf("number of batches: %d\n", batch_size); + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + for (int h = 0; h < output_height; h++) { + for (int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for (int i = 0; i < kernel_height; i++) { + for (int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + + filter_elem_num; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + 
} } + } } - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; - } + } + } + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = + k + num_filter_elem * m + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; } + output_data[b * (output_size * num_filters) + p * output_size + m] = + sum; + } } - free(host_data); - printf("END: %p\n", output); - return output; + } + free(host_data); + printf("END: %p\n", output); + return output; } -void* tensorRegularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int skip_every, int start) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - const int batch_size = input->dims.dim_sizes[0]; - const int channels = input->dims.dim_sizes[1]; - const int image_height = input->dims.dim_sizes[2]; - const int image_width = input->dims.dim_sizes[3]; - const int num_filters = filter->dims.dim_sizes[0]; - const int kernel_height = filter->dims.dim_sizes[2]; - const int kernel_width = filter->dims.dim_sizes[3]; - const int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - const int output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - const int num_filter_elem = kernel_height * kernel_width * channels; - - const int remainder = ((num_filter_elem - start) % skip_every > 0); - const int reduced_num_filter_elem = - num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; - const int output_size = output_width * output_height; - - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - output_height, output_width); - float * __restrict__ output_data = (float *)output->host_data; - - const long int host_data_size = sizeof(float) * reduced_num_filter_elem - * output_height * output_width * batch_size; - float *host_data = (float *) malloc(host_data_size); - - const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem; - float *reduced_kernels = (float *) malloc(reduced_filer_size); - - float fac = (((float) skip_every) / ((float) skip_every - 1)); - int reduced_filter_dim = reduced_num_filter_elem / channels; - - // Create reduced filter - omp_set_num_threads(4); - #pragma omp parallel for - for(int f = 0; f < num_filters; f++) { - for(int i = 0; i < reduced_num_filter_elem; i++) { - int ch = i / reduced_filter_dim; - int offset = (start + ch) % skip_every; - int in_index; - if(i < offset) { - in_index = i; - } else { - in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) - + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset -1; - } - reduced_kernels[f * reduced_num_filter_elem + i] = - fac * host_filter[num_filter_elem * f + in_index]; +void 
*tensorRegularFilterSamplingConvolutionCPU( + void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, int conv_mode, + int compute_precision, int skip_every, int start) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int num_filter_elem = kernel_height * kernel_width * channels; + + const int remainder = ((num_filter_elem - start) % skip_every > 0); + const int reduced_num_filter_elem = + num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, output_width); + float *__restrict__ output_data = (float *)output->host_data; + + const long int host_data_size = sizeof(float) * reduced_num_filter_elem * + output_height * output_width * batch_size; + float *host_data = (float *)malloc(host_data_size); + + const int reduced_filer_size = + sizeof(float) * num_filters * reduced_num_filter_elem; + float *reduced_kernels = (float *)malloc(reduced_filer_size); + + float fac = (((float)skip_every) / ((float)skip_every - 1)); + int reduced_filter_dim = reduced_num_filter_elem / channels; + + // Create reduced filter + omp_set_num_threads(4); +#pragma omp parallel for + for (int f = 0; f < num_filters; f++) { + for (int i = 0; i < reduced_num_filter_elem; i++) { + int ch = i / reduced_filter_dim; + int offset = (start + ch) % skip_every; + int in_index; + if (i < offset) { + in_index = i; + } else { + in_index = ((i - offset + 1) * skip_every) / (skip_every - 1) + + (((i - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1; + } + reduced_kernels[f * reduced_num_filter_elem + i] = + fac * host_filter[num_filter_elem * f + in_index]; + } + } + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int h = 0; h < output_height; h++) { + for (int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for (int fi = 0; fi < reduced_num_filter_elem; fi++) { + int in_index; + const int ch = fi / reduced_filter_dim; + const int offset = (start + ch) % skip_every; + if (fi < offset) { + in_index = fi; + } else { + in_index = + ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1; + } + const int i = + (in_index % (kernel_width * kernel_height)) / kernel_width; + const int j = in_index % kernel_width; + const int output_index = h * output_width + w; + const int out_index = b * reduced_num_filter_elem * output_size + + output_index * reduced_num_filter_elem + fi; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + 
inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int h = 0; h < output_height; h++) { - for(int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for(int fi = 0; fi < reduced_num_filter_elem; fi++) { - int in_index; - const int ch = fi / reduced_filter_dim; - const int offset = (start + ch) % skip_every; - if(fi < offset) { - in_index = fi; - } else { - in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - } - const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; - const int j = in_index % kernel_width; - const int output_index = h * output_width + w; - const int out_index = b * reduced_num_filter_elem * output_size - + output_index * reduced_num_filter_elem + fi; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < reduced_num_filter_elem; ++k) { + int input_index = k + reduced_num_filter_elem * m + + b * reduced_num_filter_elem * output_size; + sum += host_data[input_index] * + reduced_kernels[p * reduced_num_filter_elem + k]; } - - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < reduced_num_filter_elem; ++k) { - int input_index = k + reduced_num_filter_elem * m - + b * reduced_num_filter_elem * output_size; - sum += host_data[input_index] - * reduced_kernels[p * reduced_num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; - } - } - + output_data[b * (output_size * num_filters) + p * output_size + m] = + sum; + } } - free(reduced_kernels); - free(host_data); - - return output; + } + free(reduced_kernels); + free(host_data); + + return output; } -void* tensorIrregularFilterSamplingConvolutionCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int skip_every, int start) { - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - const int batch_size = input->dims.dim_sizes[0]; - const int channels = input->dims.dim_sizes[1]; - const int image_height = input->dims.dim_sizes[2]; - const int image_width = input->dims.dim_sizes[3]; - const int num_filters = filter->dims.dim_sizes[0]; - const int kernel_height = filter->dims.dim_sizes[2]; - const int kernel_width = filter->dims.dim_sizes[3]; - const int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - const int output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - const int num_filter_elem = 
kernel_height * kernel_width * channels; - - const int remainder = ((num_filter_elem - start) % skip_every > 0); - const int reduced_num_filter_elem = - num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; - const int output_size = output_width * output_height; - - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - output_height, output_width); - float * __restrict__ output_data = (float *)output->host_data; - - const long int host_data_size = sizeof(float) * reduced_num_filter_elem - * output_height * output_width * batch_size; - float *host_data = (float *) malloc(host_data_size); - - const int reduced_filer_size = sizeof(float) * num_filters * reduced_num_filter_elem; - float *reduced_kernels = (float *) malloc(reduced_filer_size); - - float fac = (((float) skip_every) / ((float) skip_every - 1)); - int reduced_filter_dim = reduced_num_filter_elem / channels; - - // Create Reduced filter - omp_set_num_threads(4); - #pragma omp parallel for - for(int f = 0; f < num_filters; f++) { - for(int i = 0; i < start; i++) { - reduced_kernels[f * reduced_num_filter_elem + i] = - host_filter[num_filter_elem * f + i]; - } - #pragma omp simd - for(int i = start; i < reduced_num_filter_elem; i++) { - int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) - + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + start - 1; - reduced_kernels[f * reduced_num_filter_elem + i] = - fac * host_filter[num_filter_elem * f + in_index]; +void *tensorIrregularFilterSamplingConvolutionCPU( + void *input_ptr, void *filter_ptr, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, int conv_mode, + int compute_precision, int skip_every, int start) { + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int num_filter_elem = kernel_height * kernel_width * channels; + + const int remainder = ((num_filter_elem - start) % skip_every > 0); + const int reduced_num_filter_elem = + num_filter_elem - ((num_filter_elem - start) / skip_every) - remainder; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, num_filters, + output_height, output_width); + float *__restrict__ output_data = (float *)output->host_data; + + const long int host_data_size = sizeof(float) * reduced_num_filter_elem * + output_height * output_width * batch_size; + float *host_data = (float *)malloc(host_data_size); + + const int reduced_filer_size = + sizeof(float) * num_filters * reduced_num_filter_elem; + float *reduced_kernels = (float *)malloc(reduced_filer_size); + + float fac = (((float)skip_every) / ((float)skip_every - 1)); + int reduced_filter_dim = reduced_num_filter_elem / channels; + + // Create Reduced filter + omp_set_num_threads(4); +#pragma omp 
parallel for + for (int f = 0; f < num_filters; f++) { + for (int i = 0; i < start; i++) { + reduced_kernels[f * reduced_num_filter_elem + i] = + host_filter[num_filter_elem * f + i]; + } +#pragma omp simd + for (int i = start; i < reduced_num_filter_elem; i++) { + int in_index = ((i - start + 1) * skip_every) / (skip_every - 1) + + (((i - start + 1) * skip_every) % (skip_every - 1) > 0) + + start - 1; + reduced_kernels[f * reduced_num_filter_elem + i] = + fac * host_filter[num_filter_elem * f + in_index]; + } + } + +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int h = 0; h < output_height; h++) { + for (int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for (int fi = 0; fi < reduced_num_filter_elem; fi++) { + int in_index; + int offset = start; + if (fi < offset) { + in_index = fi; + } else { + in_index = + ((fi - offset + 1) * skip_every) / (skip_every - 1) + + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + + offset - 1; + } + const int ch = in_index / (kernel_width * kernel_height); + const int i = + (in_index % (kernel_width * kernel_height)) / kernel_width; + const int j = in_index % kernel_width; + const int output_index = h * output_width + w; + const int out_index = b * reduced_num_filter_elem * output_size + + output_index * reduced_num_filter_elem + fi; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int h = 0; h < output_height; h++) { - for(int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for(int fi = 0; fi < reduced_num_filter_elem; fi++) { - int in_index; - int offset = start; - if(fi < offset) { - in_index = fi; - } else { - in_index = ((fi - offset + 1) * skip_every) / (skip_every - 1) - + (((fi - offset + 1) * skip_every) % (skip_every - 1) > 0) + offset - 1; - } - const int ch = in_index / (kernel_width * kernel_height); - const int i = (in_index % (kernel_width * kernel_height)) / kernel_width; - const int j = in_index % kernel_width; - const int output_index = h * output_width + w; - const int out_index = b * reduced_num_filter_elem * output_size - + output_index * reduced_num_filter_elem + fi; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < reduced_num_filter_elem; ++k) { + int input_index = k + reduced_num_filter_elem * m + + b * reduced_num_filter_elem * output_size; + sum += host_data[input_index] * + reduced_kernels[p * reduced_num_filter_elem + k]; } - - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < reduced_num_filter_elem; ++k) { - int input_index = k + reduced_num_filter_elem * m - + b * reduced_num_filter_elem * output_size; - sum 
+= host_data[input_index] - * reduced_kernels[p * reduced_num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; - } - } - + output_data[b * (output_size * num_filters) + p * output_size + m] = + sum; + } } - free(reduced_kernels); - free(host_data); - - return output; -} + } + free(reduced_kernels); + free(host_data); -void* tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, int row, int start) { - - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - - int full_output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int full_output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int full_output_size = full_output_height * full_output_width; - - Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - full_output_height, full_output_width); - float * __restrict__ full_output_data = (float *)full_output->host_data; - - int remainder = (full_output_height - start) % row > 0; - int output_height = - full_output_height - ((full_output_height - start) / row) - remainder; - - int output_width = full_output_width; - float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters - * output_height * output_width); - int output_size = output_width * output_height; - long int host_data_size = sizeof(float) * num_filter_elem * output_height - * output_width * batch_size; - float *host_data = (float *) malloc(host_data_size); + return output; +} - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int ch = 0; ch < channels; ch++) { - for(int h = 0; h < output_height; h++) { - int inH; - if(h < start) { - inH = h * vertical_stride - vertical_pad; - } else { - int h_index = ((h - start + 1) * row) / (row - 1) - + (((h - start + 1) * row) % (row - 1) > 0) + start - 1; - inH = h_index * vertical_stride - vertical_pad; - } - for(int w = 0; w < output_width; w++) { - int inW = w * horizontal_stride - horizontal_pad; - for(int i = 0; i < kernel_height; i++) { - for(int j = 0; j < kernel_width; j++) { - const int filter_elem_num = - (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size - + output_index * num_filter_elem + filter_elem_num; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } - } +void *tensorRowPerfConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, int row, + int start) { + + Tensor *input 
= (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + + int full_output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int full_output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int full_output_size = full_output_height * full_output_width; + + Tensor *full_output = (Tensor *)create4DTensorCPU( + 0, 0, batch_size, num_filters, full_output_height, full_output_width); + float *__restrict__ full_output_data = (float *)full_output->host_data; + + int remainder = (full_output_height - start) % row > 0; + int output_height = + full_output_height - ((full_output_height - start) / row) - remainder; + + int output_width = full_output_width; + float *output_data = (float *)malloc( + sizeof(float) * batch_size * num_filters * output_height * output_width); + int output_size = output_width * output_height; + long int host_data_size = sizeof(float) * num_filter_elem * output_height * + output_width * batch_size; + float *host_data = (float *)malloc(host_data_size); + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + for (int h = 0; h < output_height; h++) { + int inH; + if (h < start) { + inH = h * vertical_stride - vertical_pad; + } else { + int h_index = ((h - start + 1) * row) / (row - 1) + + (((h - start + 1) * row) % (row - 1) > 0) + start - 1; + inH = h_index * vertical_stride - vertical_pad; + } + for (int w = 0; w < output_width; w++) { + int inW = w * horizontal_stride - horizontal_pad; + for (int i = 0; i < kernel_height; i++) { + for (int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + + filter_elem_num; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } + } + } - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = k + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = + k + num_filter_elem * m + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; } + output_data[b * (output_size * 
num_filters) + p * output_size + m] = + sum; + } + } - // Interpolate - for (int p = 0; p < num_filters; ++p) { - for(int h = 0; h < full_output_height; h++) { - for(int w = 0; w < full_output_width; w++) { - int full_output_index = b * num_filters * full_output_size - + p * full_output_size + h * full_output_width + w; - if(h < start) { - int output_index = b * num_filters * output_size - + p * output_size + h * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if(h == full_output_height - 1) { - int output_index = b * num_filters * output_size + p * output_size - + (output_height - 1) * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if(h == 0) { - int output_index = b * num_filters * output_size - + p * output_size + 0 * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if((h - start) % row == 0) { - int row_index = h - ((h + 1 - start) / row); - int output_index = b * num_filters * output_size + p * output_size - + row_index * output_width + w; - full_output_data[full_output_index] = - (output_data[output_index] + output_data[output_index - output_width]) / 2; - } else { - int remainder = ((h + 1 - start) % row) > 0; - int row_index = h - ((h + 1 - start) / row) - remainder; - int output_index = b * num_filters * output_size + p * output_size - + row_index * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } - } - } - } + // Interpolate + for (int p = 0; p < num_filters; ++p) { + for (int h = 0; h < full_output_height; h++) { + for (int w = 0; w < full_output_width; w++) { + int full_output_index = b * num_filters * full_output_size + + p * full_output_size + h * full_output_width + + w; + if (h < start) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if (h == full_output_height - 1) { + int output_index = b * num_filters * output_size + p * output_size + + (output_height - 1) * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if (h == 0) { + int output_index = b * num_filters * output_size + p * output_size + + 0 * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if ((h - start) % row == 0) { + int row_index = h - ((h + 1 - start) / row); + int output_index = b * num_filters * output_size + p * output_size + + row_index * output_width + w; + full_output_data[full_output_index] = + (output_data[output_index] + + output_data[output_index - output_width]) / + 2; + } else { + int remainder = ((h + 1 - start) % row) > 0; + int row_index = h - ((h + 1 - start) / row) - remainder; + int output_index = b * num_filters * output_size + p * output_size + + row_index * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } + } + } } - free(output_data); - free(host_data); + } + free(output_data); + free(host_data); - return full_output; + return full_output; } -void* tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, int vertical_pad, - int horizontal_pad, int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, int col, int start) { - - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - 
int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int num_filters = filter->dims.dim_sizes[0]; - int kernel_height = filter->dims.dim_sizes[2]; - int kernel_width = filter->dims.dim_sizes[3]; - int full_output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - int full_output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - int num_filter_elem = kernel_height * kernel_width * channels; - int full_output_size = full_output_height * full_output_width; - - Tensor *full_output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, - full_output_height, full_output_width); - float * __restrict__ full_output_data = (float *)full_output->host_data; - - int remainder = (full_output_width - start) % col > 0; - int output_width = full_output_width - ((full_output_width - start) / col) - remainder; - - int output_height = full_output_height; - float *output_data = (float *) malloc(sizeof(float) * batch_size * num_filters - * output_height * output_width); - int output_size = output_width * output_height; - long int host_data_size = sizeof(float) * num_filter_elem * output_height - * output_width * batch_size; - float *host_data = (float *) malloc(host_data_size); - - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int ch = 0; ch < channels; ch++) { - for(int h = 0; h < output_height; h++) { - int inH = h * vertical_stride - vertical_pad; - for(int w = 0; w < output_width; w++) { - int inW; - if(w < start) { - inW = w * horizontal_stride - horizontal_pad; - } else { - int w_index = ((w - start + 1) * col) / (col - 1) - + (((w - start + 1) * col) % (col - 1) > 0) + start - 1; - inW = w_index * horizontal_stride - horizontal_pad; - } - for(int i = 0; i < kernel_height; i++) { - for(int j = 0; j < kernel_width; j++) { - const int filter_elem_num = - (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size - + output_index * num_filter_elem + filter_elem_num; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } - } - } - } - - // Tensor Multiply - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < num_filter_elem; ++k) { - int input_index = k + num_filter_elem * m - + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; - } - output_data[b * (output_size * num_filters) + p * output_size + m] = sum; +void *tensorColPerfConvolutionCPU(void *input_ptr, void *filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision, int col, + int start) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = 
input->dims.dim_sizes[3]; + int num_filters = filter->dims.dim_sizes[0]; + int kernel_height = filter->dims.dim_sizes[2]; + int kernel_width = filter->dims.dim_sizes[3]; + int full_output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + int full_output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + int num_filter_elem = kernel_height * kernel_width * channels; + int full_output_size = full_output_height * full_output_width; + + Tensor *full_output = (Tensor *)create4DTensorCPU( + 0, 0, batch_size, num_filters, full_output_height, full_output_width); + float *__restrict__ full_output_data = (float *)full_output->host_data; + + int remainder = (full_output_width - start) % col > 0; + int output_width = + full_output_width - ((full_output_width - start) / col) - remainder; + + int output_height = full_output_height; + float *output_data = (float *)malloc( + sizeof(float) * batch_size * num_filters * output_height * output_width); + int output_size = output_width * output_height; + long int host_data_size = sizeof(float) * num_filter_elem * output_height * + output_width * batch_size; + float *host_data = (float *)malloc(host_data_size); + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + for (int h = 0; h < output_height; h++) { + int inH = h * vertical_stride - vertical_pad; + for (int w = 0; w < output_width; w++) { + int inW; + if (w < start) { + inW = w * horizontal_stride - horizontal_pad; + } else { + int w_index = ((w - start + 1) * col) / (col - 1) + + (((w - start + 1) * col) % (col - 1) > 0) + start - 1; + inW = w_index * horizontal_stride - horizontal_pad; + } + for (int i = 0; i < kernel_height; i++) { + for (int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + + filter_elem_num; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } + } + } - // Interpolate - for (int p = 0; p < num_filters; ++p) { - for(int h = 0; h < full_output_height; h++) { - for(int w = 0; w < full_output_width; w++) { - int full_output_index = b * num_filters * full_output_size - + p * full_output_size + h * full_output_width + w; - if(w < start) { - int output_index = b * num_filters * output_size - + p * output_size + h * output_width + w; - full_output_data[full_output_index] = output_data[output_index]; - } else if(w == full_output_width - 1) { - int output_index = b * num_filters * output_size + p * output_size - + h * output_width + output_width - 1; - full_output_data[full_output_index] = output_data[output_index]; - } else if(w == 0) { - int output_index = b * num_filters * output_size + p * output_size - + h * output_width + 0; - full_output_data[full_output_index] = output_data[output_index]; - } else if((w - start) % col == 0) { - int col_index = w - ((w + 1 - start) / col); - int output_index = b * num_filters * output_size + p * output_size - + h * output_width + col_index; - full_output_data[full_output_index] = - (output_data[output_index] + output_data[output_index - 1]) / 2; - } else { - int remainder = ((w + 1 - 
start) % col) > 0; - int col_index = w - ((w + 1 - start) / col) - remainder; - int output_index = b * num_filters * output_size + p * output_size - + h * output_width + col_index; - full_output_data[full_output_index] = output_data[output_index]; - } - } - } + // Tensor Multiply + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < num_filter_elem; ++k) { + int input_index = + k + num_filter_elem * m + b * num_filter_elem * output_size; + sum += host_data[input_index] * host_filter[p * num_filter_elem + k]; } + output_data[b * (output_size * num_filters) + p * output_size + m] = + sum; + } } - free(output_data); - free(host_data); - return full_output; -} - -void* tensorConvApproxCPU(void *input_ptr, void *filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int compute_precision, - int row, int col, int skip_every, int start) { - if(row > 1) { - printf("ROW PERFORATION\n"); - return tensorRowPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad, - horizontal_pad, vertical_stride, horizontal_stride, conv_mode, - compute_precision, row, start); - } - if(col > 1) { - printf("COL PERFORATION\n"); - return tensorColPerfConvolutionCPU(input_ptr, filter_ptr, vertical_pad, - horizontal_pad, vertical_stride, horizontal_stride, conv_mode, - compute_precision, col, start); - } - if(skip_every > 1) { - printf("INPUT FILTERING\n"); - Tensor *input = (Tensor *)input_ptr; - Tensor *filter = (Tensor *)filter_ptr; - - const int kernel_height = filter->dims.dim_sizes[2]; - const int kernel_width = filter->dims.dim_sizes[3]; - - if(!(kernel_height * kernel_width % skip_every)) { - return tensorRegularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, - vertical_pad, horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, - compute_precision, skip_every, start); + // Interpolate + for (int p = 0; p < num_filters; ++p) { + for (int h = 0; h < full_output_height; h++) { + for (int w = 0; w < full_output_width; w++) { + int full_output_index = b * num_filters * full_output_size + + p * full_output_size + h * full_output_width + + w; + if (w < start) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + w; + full_output_data[full_output_index] = output_data[output_index]; + } else if (w == full_output_width - 1) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + output_width - 1; + full_output_data[full_output_index] = output_data[output_index]; + } else if (w == 0) { + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + 0; + full_output_data[full_output_index] = output_data[output_index]; + } else if ((w - start) % col == 0) { + int col_index = w - ((w + 1 - start) / col); + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + col_index; + full_output_data[full_output_index] = + (output_data[output_index] + output_data[output_index - 1]) / 2; + } else { + int remainder = ((w + 1 - start) % col) > 0; + int col_index = w - ((w + 1 - start) / col) - remainder; + int output_index = b * num_filters * output_size + p * output_size + + h * output_width + col_index; + full_output_data[full_output_index] = output_data[output_index]; + } } - return tensorIrregularFilterSamplingConvolutionCPU(input_ptr, filter_ptr, - vertical_pad, horizontal_pad, vertical_stride, - horizontal_stride, 
conv_mode, - compute_precision, skip_every, start); + } } - printf("---REGULAR CONV\n"); - return tensorRegularConvolutionCPU(input_ptr, filter_ptr, vertical_pad, - horizontal_pad, vertical_stride, - horizontal_stride, conv_mode, compute_precision); + } + free(output_data); + free(host_data); + + return full_output; } -void* tensorConvCutlassCPU(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - +void *tensorConvApproxCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int compute_precision, int row, int col, + int skip_every, int start) { + if (row > 1) { + printf("ROW PERFORATION\n"); + return tensorRowPerfConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision, row, start); + } + if (col > 1) { + printf("COL PERFORATION\n"); + return tensorColPerfConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision, col, start); + } + if (skip_every > 1) { + printf("INPUT FILTERING\n"); Tensor *input = (Tensor *)input_ptr; Tensor *filter = (Tensor *)filter_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_filter = (float *)filter->host_data; - - const int batch_size = input->dims.dim_sizes[0]; - const int channels = input->dims.dim_sizes[1]; - const int image_height = input->dims.dim_sizes[2]; - const int image_width = input->dims.dim_sizes[3]; - const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; const int kernel_width = filter->dims.dim_sizes[3]; - const int output_height = - 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); - const int output_width = - 1 + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); - const int filter_dim = kernel_height * kernel_width; - const int num_filter_elem = filter_dim * channels; - const int output_size = output_width * output_height; - - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, num_filters, channels, - output_height * output_width); - float * __restrict__ output_data = (float *)output->host_data; - - const long int conv_data_size = - sizeof(float) * num_filter_elem * output_height * output_width * batch_size; - float *host_data = (float *) malloc(conv_data_size); - - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int ch = 0; ch < channels; ch++) { - for(int h = 0; h < output_height; h++) { - for(int w = 0; w < output_width; w++) { - const int inH = h * vertical_stride - vertical_pad; - const int inW = w * horizontal_stride - horizontal_pad; - for(int i = 0; i < kernel_height; i++) { - for(int j = 0; j < kernel_width; j++) { - const int filter_elem_num = (ch * kernel_height + i) * kernel_width + j; - const int output_index = h * output_width + w; - const int out_index = b * num_filter_elem * output_size - + output_index * num_filter_elem + filter_elem_num; - if(inH + i >= 0 && inH + i < image_height - && inW + j >= 0 && inW + j < image_width) { - host_data[out_index] = - host_image[((b * channels + ch) * image_height - + (inH + i)) * image_width + (inW + j)]; - } else { - host_data[out_index] = 0; - } - } - } - } + + if (!(kernel_height * kernel_width % skip_every)) { + return 
tensorRegularFilterSamplingConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision, skip_every, start); + } + return tensorIrregularFilterSamplingConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision, skip_every, start); + } + printf("---REGULAR CONV\n"); + return tensorRegularConvolutionCPU( + input_ptr, filter_ptr, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride, conv_mode, compute_precision); +} + +void *tensorConvCutlassCPU(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_filter = (float *)filter->host_data; + + const int batch_size = input->dims.dim_sizes[0]; + const int channels = input->dims.dim_sizes[1]; + const int image_height = input->dims.dim_sizes[2]; + const int image_width = input->dims.dim_sizes[3]; + const int num_filters = filter->dims.dim_sizes[0]; + const int kernel_height = filter->dims.dim_sizes[2]; + const int kernel_width = filter->dims.dim_sizes[3]; + const int output_height = + 1 + ((image_height - kernel_height + 2 * vertical_pad) / vertical_stride); + const int output_width = + 1 + + ((image_width - kernel_width + 2 * horizontal_pad) / horizontal_stride); + const int filter_dim = kernel_height * kernel_width; + const int num_filter_elem = filter_dim * channels; + const int output_size = output_width * output_height; + + Tensor *output = (Tensor *)create4DTensorCPU( + 0, 0, batch_size, num_filters, channels, output_height * output_width); + float *__restrict__ output_data = (float *)output->host_data; + + const long int conv_data_size = sizeof(float) * num_filter_elem * + output_height * output_width * batch_size; + float *host_data = (float *)malloc(conv_data_size); + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + for (int h = 0; h < output_height; h++) { + for (int w = 0; w < output_width; w++) { + const int inH = h * vertical_stride - vertical_pad; + const int inW = w * horizontal_stride - horizontal_pad; + for (int i = 0; i < kernel_height; i++) { + for (int j = 0; j < kernel_width; j++) { + const int filter_elem_num = + (ch * kernel_height + i) * kernel_width + j; + const int output_index = h * output_width + w; + const int out_index = b * num_filter_elem * output_size + + output_index * num_filter_elem + + filter_elem_num; + if (inH + i >= 0 && inH + i < image_height && inW + j >= 0 && + inW + j < image_width) { + host_data[out_index] = + host_image[((b * channels + ch) * image_height + + (inH + i)) * + image_width + + (inW + j)]; + } else { + host_data[out_index] = 0; + } } + } } - for (int p = 0; p < num_filters; ++p) { - for (int m = 0; m < output_size; ++m) { - for (int ch = 0; ch < channels; ch++) { - float sum = 0; - #pragma omp simd reduction(+:sum) - for (int k = 0; k < filter_dim; ++k) { - int input_index = k + ch * filter_dim + num_filter_elem * m + b * num_filter_elem * output_size; - sum += host_data[input_index] * host_filter[p * num_filter_elem + ch * filter_dim + k]; - } - output_data[b * (output_size * num_filters * channels) + p * output_size * channels + ch * output_size + m] = sum; - } - } + } 
+ } + for (int p = 0; p < num_filters; ++p) { + for (int m = 0; m < output_size; ++m) { + for (int ch = 0; ch < channels; ch++) { + float sum = 0; +#pragma omp simd reduction(+ : sum) + for (int k = 0; k < filter_dim; ++k) { + int input_index = k + ch * filter_dim + num_filter_elem * m + + b * num_filter_elem * output_size; + sum += host_data[input_index] * + host_filter[p * num_filter_elem + ch * filter_dim + k]; + } + output_data[b * (output_size * num_filters * channels) + + p * output_size * channels + ch * output_size + m] = sum; } + } } + } - free(host_data); - return output; + free(host_data); + return output; } -void* tensorAddCPU(void *x_ptr, void *bias_ptr) { - Tensor *x = (Tensor *)x_ptr; - Tensor *bias = (Tensor *)bias_ptr; - - float * __restrict__ x_data = (float *)x->host_data; - float * __restrict__ bias_data = (float *)bias->host_data; - int n = x->dims.dim_sizes[0]; - int c = x->dims.dim_sizes[1]; - int h = x->dims.dim_sizes[2]; - int w = x->dims.dim_sizes[3]; - - if(x->num_elems == bias->num_elems) { - int const1 = c * h * w; - int const2 = h * w; - omp_set_num_threads(4); - #pragma omp parallel for - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - #pragma omp simd collapse(2) - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - x_data[i * const1 + j * const2 + (k * w) + l] += - bias_data[i * const1 + j * const2 + (k*w) + l]; - } - } - } +void *tensorAddCPU(void *x_ptr, void *bias_ptr) { + Tensor *x = (Tensor *)x_ptr; + Tensor *bias = (Tensor *)bias_ptr; + + float *__restrict__ x_data = (float *)x->host_data; + float *__restrict__ bias_data = (float *)bias->host_data; + int n = x->dims.dim_sizes[0]; + int c = x->dims.dim_sizes[1]; + int h = x->dims.dim_sizes[2]; + int w = x->dims.dim_sizes[3]; + + if (x->num_elems == bias->num_elems) { + int const1 = c * h * w; + int const2 = h * w; + omp_set_num_threads(4); +#pragma omp parallel for + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { +#pragma omp simd collapse(2) + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + x_data[i * const1 + j * const2 + (k * w) + l] += + bias_data[i * const1 + j * const2 + (k * w) + l]; + } } - } else { - omp_set_num_threads(4); - #pragma omp parallel for - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - #pragma omp simd collapse(2) - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; - } - } - } - } + } + } + } else { + omp_set_num_threads(4); +#pragma omp parallel for + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { +#pragma omp simd collapse(2) + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + x_data[i * (c * h * w) + j * (h * w) + k * w + l] += bias_data[j]; + } + } + } } - - return x; + } + + return x; } float max(float v1, float v2) __attribute__((always_inline)); -inline float maximum(float v1, float v2){ - return (v1 < v2) ? v2 : v1; -} +inline float maximum(float v1, float v2) { return (v1 < v2) ? 
v2 : v1; } void *tensorPoolingCPU(void *input_ptr, int poolFunction, int window_height, - int window_width, int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride) { - - Tensor *input = (Tensor *)input_ptr; - float * __restrict__ input_data = (float *)input->host_data; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - - int output_height = - 1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride); - int output_width = - 1 + ((image_width - window_width + 2 * horizontal_pad) / horizontal_stride); - - int center_x = (window_width - 1) / 2 - horizontal_pad; - int center_y = (window_height - 1) / 2 - vertical_pad; - int x_radius = (window_width - 1) / 2; - int y_radius = (window_height - 1) / 2; - - Tensor *output = (Tensor *) create4DTensorCPU(0, 0, batch_size, channels, - output_height, output_width); - float * __restrict__ output_data = (float *)output->host_data; - - omp_set_num_threads(4); - #pragma omp parallel for - for (int b = 0; b < batch_size; b++) { - for (int ch = 0; ch < channels; ch++) { - int ii = 0, jj = 0; - for (int r = center_y; r < image_height + vertical_pad - y_radius; - r += vertical_stride) { - for (int c = center_x; c < image_width + horizontal_pad - x_radius; - c += horizontal_stride) { - float val = (poolFunction == 0) ? -3.40282e+38 : 0; - int y_radius_var = y_radius - r; - int y_radius_var_max = y_radius_var + image_height; - int x_radius_var = x_radius - c; - int x_radius_var_max = x_radius_var + image_width; - int ki_min = (y_radius_var > 0) ? - ((y_radius_var < window_height) ? y_radius_var : -1) : 0; - int ki_max = (y_radius_var_max < window_height) ? - ((y_radius_var_max >= 0) ? y_radius_var_max : -1) : window_height; - int kj_min = (x_radius_var > 0) ? - ((x_radius_var < window_width) ? x_radius_var : -1) : 0; - int kj_max = (x_radius_var_max < window_width) ? - ((x_radius_var_max >= 0) ? 
x_radius_var_max : -1) : window_width; - - if(ki_min != ki_max && kj_min != kj_max && ki_min != -1 - && ki_max != -1 && kj_min != -1 && kj_max != -1) { - if(!poolFunction) { - for (int ki = 0; ki < window_height; ki++) { - for (int kj = 0; kj < window_width; kj++) { - val = maximum( - val, - input_data[b * (channels * image_height * image_width) + - ch * (image_height * image_width) + - (r - y_radius + ki) * image_width + (c - x_radius + kj)]); - } - } - } else { - for (int ki = 0; ki < window_height; ki++) { - for (int kj = 0; kj < window_width; kj++) { - val += input_data[b * (channels * image_height * image_width) - + ch * (image_height * image_width) + - (r - y_radius + ki) * image_width + (c - x_radius + kj)]; - } - } - } - } - if (poolFunction == 1) { - val /= window_height * window_width; - } - output_data[b * (channels * output_height * output_width) + - ch * (output_height * output_width) + ii * output_width + jj] = val; - jj++; - if (jj == output_width) { - jj = 0; - ii++; - } + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { + + Tensor *input = (Tensor *)input_ptr; + float *__restrict__ input_data = (float *)input->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + + int output_height = + 1 + ((image_height - window_height + 2 * vertical_pad) / vertical_stride); + int output_width = 1 + ((image_width - window_width + 2 * horizontal_pad) / + horizontal_stride); + + int center_x = (window_width - 1) / 2 - horizontal_pad; + int center_y = (window_height - 1) / 2 - vertical_pad; + int x_radius = (window_width - 1) / 2; + int y_radius = (window_height - 1) / 2; + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, batch_size, channels, + output_height, output_width); + float *__restrict__ output_data = (float *)output->host_data; + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + int ii = 0, jj = 0; + for (int r = center_y; r < image_height + vertical_pad - y_radius; + r += vertical_stride) { + for (int c = center_x; c < image_width + horizontal_pad - x_radius; + c += horizontal_stride) { + float val = (poolFunction == 0) ? -3.40282e+38 : 0; + int y_radius_var = y_radius - r; + int y_radius_var_max = y_radius_var + image_height; + int x_radius_var = x_radius - c; + int x_radius_var_max = x_radius_var + image_width; + int ki_min = + (y_radius_var > 0) + ? ((y_radius_var < window_height) ? y_radius_var : -1) + : 0; + int ki_max = (y_radius_var_max < window_height) + ? ((y_radius_var_max >= 0) ? y_radius_var_max : -1) + : window_height; + int kj_min = (x_radius_var > 0) + ? ((x_radius_var < window_width) ? x_radius_var : -1) + : 0; + int kj_max = (x_radius_var_max < window_width) + ? ((x_radius_var_max >= 0) ? 
x_radius_var_max : -1) + : window_width; + + if (ki_min != ki_max && kj_min != kj_max && ki_min != -1 && + ki_max != -1 && kj_min != -1 && kj_max != -1) { + if (!poolFunction) { + for (int ki = 0; ki < window_height; ki++) { + for (int kj = 0; kj < window_width; kj++) { + val = maximum( + val, + input_data[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + (r - y_radius + ki) * image_width + + (c - x_radius + kj)]); + } + } + } else { + for (int ki = 0; ki < window_height; ki++) { + for (int kj = 0; kj < window_width; kj++) { + val += + input_data[b * (channels * image_height * image_width) + + ch * (image_height * image_width) + + (r - y_radius + ki) * image_width + + (c - x_radius + kj)]; } + } } + } + if (poolFunction == 1) { + val /= window_height * window_width; + } + output_data[b * (channels * output_height * output_width) + + ch * (output_height * output_width) + ii * output_width + + jj] = val; + jj++; + if (jj == output_width) { + jj = 0; + ii++; + } } + } } - - return output; + } + + return output; } void *tensorTanhCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; - - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - - omp_set_num_threads(4); - #pragma omp parallel for - for (size_t i = 0; i < num_elems; i++) { - input_data[i] = tanhf(input_data[i]); - } - - return input; + Tensor *input = (Tensor *)input_ptr; + + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + + omp_set_num_threads(4); +#pragma omp parallel for + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = tanhf(input_data[i]); + } + + return input; } void *tensorGemmCPU(void *lhs_ptr, void *rhs_ptr) { - Tensor *lhs = (Tensor *)lhs_ptr; - Tensor *rhs = (Tensor *)rhs_ptr; - - int m = lhs->dims.dim_sizes[0]; - int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; - - Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1); - - float * __restrict__ lhs_arr = (float *)lhs->host_data; - float * __restrict__ rhs_arr = (float *)rhs->host_data; - float * __restrict__ output_arr = (float *)output->host_data; - - int k = 1; - #pragma unroll 4 // Can we unroll more??? - for (int j = 1; j < lhs->dims.num_dims; j++) { - k = k * lhs->dims.dim_sizes[j]; // input neurons - } - float *tran_rhs = (float *) malloc(sizeof(float) * k * n); - omp_set_num_threads(4); - #pragma omp parallel for simd - for (int l = 0; l < k; l++) { - for (int j = 0; j < n; j++) { - tran_rhs[j * k + l] = rhs_arr[l * n + j]; - } + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; + + int m = lhs->dims.dim_sizes[0]; + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; + + Tensor *output = (Tensor *)create4DTensorCPU(0, 0, m, n, 1, 1); + + float *__restrict__ lhs_arr = (float *)lhs->host_data; + float *__restrict__ rhs_arr = (float *)rhs->host_data; + float *__restrict__ output_arr = (float *)output->host_data; + + int k = 1; +#pragma unroll 4 // Can we unroll more??? 
+ for (int j = 1; j < lhs->dims.num_dims; j++) { + k = k * lhs->dims.dim_sizes[j]; // input neurons + } + float *tran_rhs = (float *)malloc(sizeof(float) * k * n); + omp_set_num_threads(4); +#pragma omp parallel for simd + for (int l = 0; l < k; l++) { + for (int j = 0; j < n; j++) { + tran_rhs[j * k + l] = rhs_arr[l * n + j]; } - - #pragma omp parallel for - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - float sum = 0.0; - #pragma omp simd reduction(+:sum) - for (int l = 0; l < k; l++) { - sum += lhs_arr[i * k + l] * tran_rhs[j * k + l]; - } - output_arr[i * n + j] = sum; - } + } + +#pragma omp parallel for + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + float sum = 0.0; +#pragma omp simd reduction(+ : sum) + for (int l = 0; l < k; l++) { + sum += lhs_arr[i * k + l] * tran_rhs[j * k + l]; + } + output_arr[i * n + j] = sum; } - free(tran_rhs); - return output; + } + free(tran_rhs); + return output; } void *tensorSoftmaxCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; - - float *logits = (float *)input->host_data; - int n = input->dims.dim_sizes[0]; - int c = input->dims.dim_sizes[1]; - - omp_set_num_threads(4); - #pragma omp parallel for - for (int i = 0; i < n; i++) { - float x = 0; - for(int j = i*c; j < c + i*c; j++) { - logits[j] = expf(logits[j]); - } - - #pragma omp simd reduction(+:x) - for(int j = i*c; j < i*c+c; j++) { - x += logits[j]; - } - - #pragma omp simd - for(int j = i*c; j < i*c + c; j++) { - logits[j] /= x; - } + Tensor *input = (Tensor *)input_ptr; + + float *logits = (float *)input->host_data; + int n = input->dims.dim_sizes[0]; + int c = input->dims.dim_sizes[1]; + + omp_set_num_threads(4); +#pragma omp parallel for + for (int i = 0; i < n; i++) { + float x = 0; + for (int j = i * c; j < c + i * c; j++) { + logits[j] = expf(logits[j]); } - return input; -} - -void *tensorBatchNormCPU(void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon) { - - Tensor* input = (Tensor*) input_ptr; - Tensor* gamma = (Tensor*) gamma_ptr; - Tensor* beta = (Tensor*) beta_ptr; - Tensor* mean = (Tensor*) mean_ptr; - Tensor* variance = (Tensor*) variance_ptr; - - float * __restrict__ host_image = (float *)input->host_data; - float * __restrict__ host_beta = (float *)beta->host_data; - float * __restrict__ host_gamma = (float *)gamma->host_data; - float * __restrict__ host_mean = (float *)mean->host_data; - float * __restrict__ host_variance = (float *)variance->host_data; - - float alpha_val = 1.0f, beta_val = 0.0f; - size_t num_elems = input->num_elems; - - int batch_size = input->dims.dim_sizes[0]; - int channels = input->dims.dim_sizes[1]; - int image_height = input->dims.dim_sizes[2]; - int image_width = input->dims.dim_sizes[3]; - int image_dim = image_height * image_width; +#pragma omp simd reduction(+ : x) + for (int j = i * c; j < i * c + c; j++) { + x += logits[j]; + } - omp_set_num_threads(4); - #pragma omp parallel for - for(int b = 0; b < batch_size; b++) { - for(int ch = 0; ch < channels; ch++) { - float mean = 0; - #pragma omp simd reduction(+:mean) - for(int i = 0; i < image_dim; i++) { - int index = b * channels * image_dim + ch * image_dim + i; - mean += host_image[index]; - } - mean = mean / channels; - - float variance = 0; - #pragma omp simd reduction(+:variance) - for(int i = 0; i < image_dim; i++) { - int index = b * channels * image_dim + ch * image_dim + i; - float tmp = host_image[index] - mean; - variance += (tmp * tmp); - } - variance = variance / channels; - - #pragma 
omp simd - for(int i = 0; i < image_dim; i++) { - int index = b * channels * image_dim + ch * image_dim + i; - host_image[index] = host_beta[ch] - + (host_gamma[ch] * ((host_image[index] - mean) / sqrt(epsilon + variance))); - } - } +#pragma omp simd + for (int j = i * c; j < i * c + c; j++) { + logits[j] /= x; } - return input; + } + + return input; } - void *tensorReluCPU(void *input_ptr) { - Tensor *input = (Tensor *)input_ptr; - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - - #pragma omp simd - for (size_t i = 0; i < num_elems; i++) { - input_data[i] = (input_data[i] < 0) ? 0 : input_data[i]; +void *tensorBatchNormCPU(void *input_ptr, void *gamma_ptr, void *beta_ptr, + void *mean_ptr, void *variance_ptr, double epsilon) { + + Tensor *input = (Tensor *)input_ptr; + Tensor *gamma = (Tensor *)gamma_ptr; + Tensor *beta = (Tensor *)beta_ptr; + Tensor *mean = (Tensor *)mean_ptr; + Tensor *variance = (Tensor *)variance_ptr; + + float *__restrict__ host_image = (float *)input->host_data; + float *__restrict__ host_beta = (float *)beta->host_data; + float *__restrict__ host_gamma = (float *)gamma->host_data; + float *__restrict__ host_mean = (float *)mean->host_data; + float *__restrict__ host_variance = (float *)variance->host_data; + + float alpha_val = 1.0f, beta_val = 0.0f; + size_t num_elems = input->num_elems; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + int image_height = input->dims.dim_sizes[2]; + int image_width = input->dims.dim_sizes[3]; + int image_dim = image_height * image_width; + + omp_set_num_threads(4); +#pragma omp parallel for + for (int b = 0; b < batch_size; b++) { + for (int ch = 0; ch < channels; ch++) { + float mean = 0; +#pragma omp simd reduction(+ : mean) + for (int i = 0; i < image_dim; i++) { + int index = b * channels * image_dim + ch * image_dim + i; + mean += host_image[index]; + } + mean = mean / channels; + + float variance = 0; +#pragma omp simd reduction(+ : variance) + for (int i = 0; i < image_dim; i++) { + int index = b * channels * image_dim + ch * image_dim + i; + float tmp = host_image[index] - mean; + variance += (tmp * tmp); + } + variance = variance / channels; + +#pragma omp simd + for (int i = 0; i < image_dim; i++) { + int index = b * channels * image_dim + ch * image_dim + i; + host_image[index] = + host_beta[ch] + (host_gamma[ch] * ((host_image[index] - mean) / + sqrt(epsilon + variance))); + } } + } + return input; +} - return input; +void *tensorReluCPU(void *input_ptr) { + Tensor *input = (Tensor *)input_ptr; + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + +#pragma omp simd + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = (input_data[i] < 0) ? 0 : input_data[i]; + } + + return input; } void *tensorRelu2CPU(void *input_ptr, float min, float max) { - Tensor *input = (Tensor *)input_ptr; - float *input_data = (float *)input->host_data; - size_t num_elems = input->num_elems; - - #pragma omp simd - for (size_t i = 0; i < num_elems; i++) { - input_data[i] = (input_data[i] < min) ? min : ((input_data[i] > max) ? - max : input_data[i]); - } - - return input; -} + Tensor *input = (Tensor *)input_ptr; + float *input_data = (float *)input->host_data; + size_t num_elems = input->num_elems; + +#pragma omp simd + for (size_t i = 0; i < num_elems; i++) { + input_data[i] = (input_data[i] < min) + ? min + : ((input_data[i] > max) ? 
max : input_data[i]); + } + + return input; +} diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu index 1b28ccaa191ba3e52255b934c894e543dd052773..253f7614337908e72c82ba986f860dd58c7c9b3f 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu @@ -1,8 +1,9 @@ -/* This file includes the API implementation of the HPVM tensor runtime built on cublas, cudnn -** -** Author: Hashim Sharif -** Email: hsharif3@illinois.edu -*/ +/* This file includes the API implementation of the HPVM tensor runtime built on + *cublas, cudnn + ** + ** Author: Hashim Sharif + ** Email: hsharif3@illinois.edu + */ #include <stdio.h> #include <stdarg.h> @@ -31,7 +32,6 @@ #include <cuda_fp16.h> #include <driver_types.h> - // Tensor runtime header files #include "tensor_runtime.h" #include "tensor_utils.h" @@ -46,202 +46,177 @@ #include "half_precision_api.h" #include "approx_simulation.h" +// FIXIT: tensorAdd currently only works for 4D tensors +void *tensorAdd(void *x_ptr, void *bias_ptr) { + Tensor *x = (Tensor *)x_ptr; + Tensor *bias = (Tensor *)bias_ptr; - - -// FIXIT: tensorAdd currently only works for 4D tensors -void* tensorAdd(void* x_ptr, void* bias_ptr){ - - Tensor* x = (Tensor*) x_ptr; - Tensor* bias = (Tensor*) bias_ptr; - - INFO("*** TensorAdd \n"); + INFO("*** TensorAdd \n"); profileEvent("Add"); - + float alpha = 1.0f; - //float beta = 0.0f; + // float beta = 0.0f; hostToDeviceCopy(x); hostToDeviceCopy(bias); convertToFP32(x); convertToFP32(bias); - DEBUG("x->num_elems = %d \n", x->num_elems); DEBUG("bias->num_elems = %d \n", bias->num_elems); - if(cudnnHandle == NULL){ - ERROR("cudnnHandle NOT initialized!! \n"); + if (cudnnHandle == NULL) { + ERROR("cudnnHandle NOT initialized!! 
\n"); } - + // FIXIT: routine fails for 3D tensors checkCUDNN(cudnnAddTensor(cudnnHandle, &alpha, bias->tensor_desc, - bias->gpu_data, &alpha, x->tensor_desc, x->gpu_data)); + bias->gpu_data, &alpha, x->tensor_desc, + x->gpu_data)); profileEvent("Add_end", true); return x; } - // FIXIT: Generalize all of the routines for types {half, float, double} -void* tensorConvolution(void* input_ptr, void* filter_ptr, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - +void *tensorConvolution(void *input_ptr, void *filter_ptr, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups) { + INFO("*** TensorConvolution \n"); profileEvent("Conv"); - Tensor* input = (Tensor*) input_ptr; - Tensor* filter = (Tensor*) filter_ptr; - + Tensor *input = (Tensor *)input_ptr; + Tensor *filter = (Tensor *)filter_ptr; + cudnnConvolutionDescriptor_t convDesc; cudnnConvolutionFwdAlgo_t convAlgo; cudnnConvolutionMode_t mode; - if(conv_mode == 0) + if (conv_mode == 0) mode = CUDNN_CONVOLUTION; - else if(conv_mode == 1) + else if (conv_mode == 1) mode = CUDNN_CROSS_CORRELATION; mode = CUDNN_CROSS_CORRELATION; // FIXIT: Need to be more aware of the implications of alpha and beta float alpha = 1.0f, beta = 0.0f; - - // TODO: Support other cases; + + // TODO: Support other cases; hostToDeviceCopy(input); hostToDeviceCopy(filter); convertToFP32(input); convertToFP32(filter); - - DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride); + DEBUG("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, + horizontal_stride); checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); - //FIXME: Current hack to preserve backward compatibilty - if(conv_groups == 0){ + // FIXME: Current hack to preserve backward compatibilty + if (conv_groups == 0) { conv_groups = 1; - } - - + } + cudnnDataType_t computeType = CUDNN_DATA_FLOAT; - - checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc, - vertical_pad, horizontal_pad, // conv padding - vertical_stride, horizontal_stride, // conv strides - 1, 1, // upscaling values - mode , // mode is configurable - computeType)); // defines compute precision + + checkCUDNN(cudnnSetConvolution2dDescriptor( + convDesc, vertical_pad, horizontal_pad, // conv padding + vertical_stride, horizontal_stride, // conv strides + 1, 1, // upscaling values + mode, // mode is configurable + computeType)); // defines compute precision // NOTE: Set conv groups for grouped convolution e.g., depthwise convolution checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups)); - int n, c, h, w; // output dimensions + int n, c, h, w; // output dimensions // Find dimension of convolution output - if(input->tensor_desc == NULL || filter->filter_desc == NULL) + if (input->tensor_desc == NULL || filter->filter_desc == NULL) ERROR("Input or Filter descriptor is NULL"); - - checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc, - input->tensor_desc, - filter->filter_desc, - &n, &c, &h, &w)); - + checkCUDNN(cudnnGetConvolution2dForwardOutputDim( + convDesc, input->tensor_desc, filter->filter_desc, &n, &c, &h, &w)); + DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w); - Tensor* output; - if(input->data_format == CUDNN_TENSOR_NCHW) - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, n, c, h, w); - else if(input->data_format == CUDNN_TENSOR_NHWC){ + Tensor *output; + if (input->data_format 
== CUDNN_TENSOR_NCHW) + output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + else if (input->data_format == CUDNN_TENSOR_NHWC) { DEBUG("* NHWC Format \n"); - output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NHWC, n, h, w, c); - } - else + output = (Tensor *)create4DTensor((cudnnDataType_t)float_type, + CUDNN_TENSOR_NHWC, n, h, w, c); + } else ERROR("Unsupported Tensor Type"); // NOTE: Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); + changeTensorPlacement(output, DEVICE); // NOTE: Necessary to insert the above call for every output tensor - - DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n", - output->data_type, output->data_format, output->dims.dim_sizes[0], - output->dims.dim_sizes[1], - output->dims.dim_sizes[2], output->dims.dim_sizes[3]); - - if(convDesc == NULL || input->tensor_desc == NULL || - filter->filter_desc == NULL || output->tensor_desc == NULL) - ERROR("NULL descriptor! \n"); + DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = " + "%d, W = %d \n", + output->data_type, output->data_format, output->dims.dim_sizes[0], + output->dims.dim_sizes[1], output->dims.dim_sizes[2], + output->dims.dim_sizes[3]); + + if (convDesc == NULL || input->tensor_desc == NULL || + filter->filter_desc == NULL || output->tensor_desc == NULL) + ERROR("NULL descriptor! \n"); // Debugging info prints printTensorDescInfo(input); printTensorDescInfo(filter); printTensorDescInfo(output); - // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking - checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, - //CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, - 0, - &convAlgo)); - - + // NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support + // is lacking + checkCUDNN(cudnnGetConvolutionForwardAlgorithm( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, + // CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, + 0, &convAlgo)); + DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); - + CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD); // NOTE: Currently using GEMM based convolution - other algorithms available - // TODO: Benchmark other convolution algorithms e.g., winograd + // TODO: Benchmark other convolution algorithms e.g., winograd convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; size_t workspace_size; - checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, - input->tensor_desc, - filter->filter_desc, - convDesc, - output->tensor_desc, - convAlgo, - &workspace_size)); + checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle, input->tensor_desc, filter->filter_desc, convDesc, + output->tensor_desc, convAlgo, &workspace_size)); // Allocating memory for the convolution workspace - void* workspace; - checkCudaErrors(cudaMalloc(&workspace, workspace_size)); + void *workspace; + checkCudaErrors(cudaMalloc(&workspace, workspace_size)); DEBUG("workspace size = %d \n", workspace_size); + checkCUDNN(cudnnConvolutionForward( + cudnnHandle, &alpha, input->tensor_desc, input->gpu_data, + 
filter->filter_desc, filter->gpu_data, convDesc, convAlgo, workspace, + workspace_size, &beta, output->tensor_desc, output->gpu_data)); - checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc, - input->gpu_data, filter->filter_desc, filter->gpu_data, - convDesc, convAlgo, workspace, workspace_size, - &beta, output->tensor_desc, output->gpu_data)); - profileEvent("Conv_end", true); return output; } - - // NOTE: Supports Max and Avg Pooling -void* tensorPooling(void* input_ptr, - int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride){ +void *tensorPooling(void *input_ptr, int poolFunction, int window_height, + int window_width, int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride) { INFO("*** TensorPooling \n"); profileEvent("Pool"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; cudnnPoolingDescriptor_t poolDesc; // FIXIT: Need to be more aware of the implications of alpha and beta @@ -251,65 +226,57 @@ void* tensorPooling(void* input_ptr, convertToFP32(input); - - checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); + checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); int n = input->dims.dim_sizes[0]; int c = input->dims.dim_sizes[1]; - int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / vertical_stride; + int h = (input->dims.dim_sizes[2] + (2 * vertical_pad) - window_height) / + vertical_stride; h = h + 1; - int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / horizontal_stride; + int w = (input->dims.dim_sizes[3] + (2 * horizontal_pad) - window_width) / + horizontal_stride; w = w + 1; - DEBUG("n = %d, c = %d, h = %d, w = %d , dim1 = %d , dim2 = %d \n", - n, c, h, w, input->dims.dim_sizes[2], input->dims.dim_sizes[3]); - + DEBUG("n = %d, c = %d, h = %d, w = %d , dim1 = %d , dim2 = %d \n", n, c, h, w, + input->dims.dim_sizes[2], input->dims.dim_sizes[3]); - Tensor* output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, n, c, h, w); + Tensor *output = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, n, c, h, w); // Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); + changeTensorPlacement(output, DEVICE); // FIXIT: The output tensor is hardcoded to NCHW - checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_desc, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - n, c, - h, w)); + checkCUDNN(cudnnSetTensor4dDescriptor(output->tensor_desc, CUDNN_TENSOR_NCHW, + CUDNN_DATA_FLOAT, n, c, h, w)); // Select between Max-Pooling and Avg-Pooling cudnnPoolingMode_t pool_mode; - if(poolFunction == 0) + if (poolFunction == 0) pool_mode = CUDNN_POOLING_MAX; - else if(poolFunction == 1) + else if (poolFunction == 1) pool_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - - checkCUDNN(cudnnSetPooling2dDescriptor(poolDesc, - pool_mode, - CUDNN_PROPAGATE_NAN, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride)); - - checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, input->tensor_desc, - input->gpu_data, &beta, output->tensor_desc, output->gpu_data)); + + checkCUDNN(cudnnSetPooling2dDescriptor( + poolDesc, pool_mode, CUDNN_PROPAGATE_NAN, window_height, window_width, + vertical_pad, horizontal_pad, vertical_stride, horizontal_stride)); + + checkCUDNN(cudnnPoolingForward(cudnnHandle, poolDesc, &alpha, + input->tensor_desc, input->gpu_data, &beta, + output->tensor_desc, 
output->gpu_data)); profileEvent("Pool_end", true); return output; } - - - - -/* Reference Implementation based on: https://gist.github.com/peterwittek/6303527 */ -void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ +/* Reference Implementation based on: + * https://gist.github.com/peterwittek/6303527 */ +void *tensorGemmGPU(void *lhs_ptr, void *rhs_ptr) { INFO("*** TensorGemmGPU \n"); profileEvent("Mul"); - Tensor* lhs = (Tensor*) lhs_ptr; - Tensor* rhs = (Tensor*) rhs_ptr; - + Tensor *lhs = (Tensor *)lhs_ptr; + Tensor *rhs = (Tensor *)rhs_ptr; DEBUG("rhs->dims.num_dims = %d \n", rhs->dims.num_dims); DEBUG("lhs->dims.num_dims = %d \n", lhs->dims.num_dims); @@ -319,30 +286,30 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ // 'm' holds the batch dimension - assuming NCHW format Tensors int m = lhs->dims.dim_sizes[0]; // The rhs last dimension must contain the neurons - int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons + int n = rhs->dims.dim_sizes[rhs->dims.num_dims - 1]; // output neurons int k = 1; - + // Flattening the dimensions after the batch dimension // NOTE: Allowing any number of dimensions > 2 for lhs - for (int j = 1 ; j < lhs->dims.num_dims; j++){ + for (int j = 1; j < lhs->dims.num_dims; j++) { k = k * lhs->dims.dim_sizes[j]; // input neurons } - int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2]; + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims - 2]; // Dimension-note: Check if k is same across the two tensors DEBUG("m = %d, n = %d, k = %d \n", m, n, k); - if(rhs_k != k){ + if (rhs_k != k) { ERROR("rhs=%d and lhs=%d columns/rows don't match", rhs_k, k); } - Tensor* output = NULL; + Tensor *output = NULL; DEBUG("Creating new TENSOR * \n"); - output = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1); + output = + (Tensor *)create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, m, n, 1, 1); - DEBUG("Changing placement *\n"); // Changing output tensor placement from host to device - changeTensorPlacement(output, DEVICE); + changeTensorPlacement(output, DEVICE); DEBUG("Changed Placement * \n\n"); @@ -352,122 +319,105 @@ void* tensorGemmGPU(void* lhs_ptr, void* rhs_ptr ){ convertToFP32(lhs); convertToFP32(rhs); - DEBUG("CuBlasSgemm *\n"); - + // INFO: cuBlas uses column-major format // INFO: The leading dimension is just the FIRST Dimension - // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN expects - checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, - n, m, k, - &alpha, - (float*) rhs->gpu_data, n, - (float*) lhs->gpu_data, k, - &beta, - (float*) output->gpu_data, n)); - - + // IMP: output is N * M in column-major format, M*N in row-major - what cuDNN + // expects + checkCudaErrors(cublasSgemm(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, + &alpha, (float *)rhs->gpu_data, n, + (float *)lhs->gpu_data, k, &beta, + (float *)output->gpu_data, n)); + profileEvent("Mul_end", true); return output; } - - - - - -void* tensorRelu(void* input_ptr){ +void *tensorRelu(void *input_ptr) { DEBUG("*** TensorRelu \n"); profileEvent("Relu"); - Tensor* input = (Tensor*) input_ptr; - + Tensor *input = (Tensor *)input_ptr; + cudnnActivationDescriptor_t reluDesc; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); convertToFP32(input); - - + checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc)); checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU, - CUDNN_PROPAGATE_NAN, 0.0)); + CUDNN_PROPAGATE_NAN, 0.0)); checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha, - 
input->tensor_desc, input->gpu_data, &beta, - input->tensor_desc, input->gpu_data)); + input->tensor_desc, input->gpu_data, &beta, + input->tensor_desc, input->gpu_data)); profileEvent("Relu_end", true); return input; } - // Think: Should Softmax be broken into multiple IR operations? -void* tensorSoftmax(void* input_ptr){ +void *tensorSoftmax(void *input_ptr) { INFO("*** TensorSoftmax \n"); profileEvent("Softmax"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; float alpha = 1.0f, beta = 0.0f; hostToDeviceCopy(input); - convertToFP32(input); - - checkCUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, input->tensor_desc, input->gpu_data, &beta, - input->tensor_desc, input->gpu_data)); + convertToFP32(input); + + checkCUDNN(cudnnSoftmaxForward(cudnnHandle, CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, &alpha, + input->tensor_desc, input->gpu_data, &beta, + input->tensor_desc, input->gpu_data)); - deviceToHostCopy(input); + deviceToHostCopy(input); profileEvent("Softmax_end", true); - + return input; } - - - -void* tensorRelu2(void* input_ptr, float min, float max){ +void *tensorRelu2(void *input_ptr, float min, float max) { INFO("*** TensorClippedRelu *** \n"); profileEvent("Relu"); cudnnActivationDescriptor_t reluDesc; float alpha = 1.0f, beta = 0.0f; - - Tensor* input = (Tensor*) input_ptr; + + Tensor *input = (Tensor *)input_ptr; hostToDeviceCopy(input); convertToFP32(input); - checkCUDNN(cudnnCreateActivationDescriptor(&reluDesc)); - checkCUDNN(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, - CUDNN_PROPAGATE_NAN, max)); + checkCUDNN(cudnnSetActivationDescriptor( + reluDesc, CUDNN_ACTIVATION_CLIPPED_RELU, CUDNN_PROPAGATE_NAN, max)); checkCUDNN(cudnnActivationForward(cudnnHandle, reluDesc, &alpha, - input->tensor_desc, input->gpu_data, &beta, - input->tensor_desc, input->gpu_data)); + input->tensor_desc, input->gpu_data, &beta, + input->tensor_desc, input->gpu_data)); - - profileEvent("Relu_end", true); return input; } - -void* tensorTanh(void* input_ptr){ +void *tensorTanh(void *input_ptr) { INFO("*** TensorTanh \n"); profileEvent("Tanh"); - Tensor* input = (Tensor*) input_ptr; - + Tensor *input = (Tensor *)input_ptr; + cudnnActivationDescriptor_t tanhDesc; float alpha = 1.0f, beta = 0.0f; @@ -475,39 +425,36 @@ void* tensorTanh(void* input_ptr){ convertToFP32(input); - checkCUDNN(cudnnCreateActivationDescriptor(&tanhDesc)); checkCUDNN(cudnnSetActivationDescriptor(tanhDesc, CUDNN_ACTIVATION_TANH, - CUDNN_PROPAGATE_NAN, 0.0)); + CUDNN_PROPAGATE_NAN, 0.0)); checkCUDNN(cudnnActivationForward(cudnnHandle, tanhDesc, &alpha, - input->tensor_desc, input->gpu_data, &beta, - input->tensor_desc, input->gpu_data)); + input->tensor_desc, input->gpu_data, &beta, + input->tensor_desc, input->gpu_data)); profileEvent("Tanh_end", true); return input; } - - - -void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon){ +void *tensorBatchNorm(void *input_ptr, void *gamma_ptr, void *beta_ptr, + void *mean_ptr, void *variance_ptr, double epsilon) { INFO("*** TensorBatchNorm \n"); profileEvent("BatchNorm"); - Tensor* input = (Tensor*) input_ptr; - Tensor* gamma = (Tensor*) gamma_ptr; - Tensor* beta = (Tensor*) beta_ptr; - Tensor* mean = (Tensor*) mean_ptr; - Tensor* variance = (Tensor*) variance_ptr; + Tensor *input = (Tensor *)input_ptr; + Tensor *gamma = (Tensor *)gamma_ptr; + Tensor *beta = (Tensor *)beta_ptr; + Tensor *mean = 
(Tensor *)mean_ptr; + Tensor *variance = (Tensor *)variance_ptr; - if (input == NULL || gamma == NULL || beta == NULL || mean == NULL || variance == NULL){ + if (input == NULL || gamma == NULL || beta == NULL || mean == NULL || + variance == NULL) { ERROR("NULL Input Tensor"); } - + float alpha_val = 1.0f, beta_val = 0.0f; hostToDeviceCopy(input); hostToDeviceCopy(gamma); @@ -517,133 +464,127 @@ void* tensorBatchNorm(void* input_ptr, void* gamma_ptr, void* beta_ptr, convertToFP32(input); - - - checkCUDNN(cudnnBatchNormalizationForwardInference(cudnnHandle, CUDNN_BATCHNORM_SPATIAL, - &alpha_val, &beta_val, - input->tensor_desc, input->gpu_data, - input->tensor_desc, input->gpu_data, - gamma->tensor_desc, gamma->gpu_data, - beta->gpu_data, mean->gpu_data, - variance->gpu_data, - epsilon)); + checkCUDNN(cudnnBatchNormalizationForwardInference( + cudnnHandle, CUDNN_BATCHNORM_SPATIAL, &alpha_val, &beta_val, + input->tensor_desc, input->gpu_data, input->tensor_desc, input->gpu_data, + gamma->tensor_desc, gamma->gpu_data, beta->gpu_data, mean->gpu_data, + variance->gpu_data, epsilon)); profileEvent("BatchNorm_end", true); return input; } - - - // TODO: benchmark performance of tensorSplit -void** tensorSplit(void* tensor_ptr, int num_splits, int split_dim){ +void **tensorSplit(void *tensor_ptr, int num_splits, int split_dim) { - INFO("*** TensorSplit \n"); + INFO("*** TensorSplit \n"); profileEvent("tensorSplit"); - Tensor* tensor = (Tensor*) tensor_ptr; - + Tensor *tensor = (Tensor *)tensor_ptr; + deviceToHostCopy(tensor); // Splitting done on the host - Tensor** splits = (Tensor**) malloc(sizeof(Tensor*) * num_splits); - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * tensor->dims.num_dims); - for(unsigned int i = 0; i < tensor->dims.num_dims; i++){ + Tensor **splits = (Tensor **)malloc(sizeof(Tensor *) * num_splits); + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * tensor->dims.num_dims); + for (unsigned int i = 0; i < tensor->dims.num_dims; i++) { dim_sizes[i] = tensor->dims.dim_sizes[i]; } - dim_sizes[split_dim] = tensor->dims.dim_sizes[split_dim] / num_splits; - if(dim_sizes[split_dim] < 1) + if (dim_sizes[split_dim] < 1) ERROR("Split Dimension < 1 after splitting"); size_t copy_size = getTypeSize(tensor->data_type); - for(unsigned int i = split_dim; i < tensor->dims.num_dims; i++){ + for (unsigned int i = split_dim; i < tensor->dims.num_dims; i++) { copy_size = copy_size * dim_sizes[i]; } - - for(unsigned int i = 0; i < num_splits; i++){ - DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] = %d \n", - dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); + for (unsigned int i = 0; i < num_splits; i++) { + + DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, " + "dim_sizes[3] = %d \n", + dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); + + Tensor *split = (Tensor *)create4DTensor( + tensor->data_type, tensor->data_format, dim_sizes[0], dim_sizes[1], + dim_sizes[2], dim_sizes[3]); - Tensor* split = (Tensor*) create4DTensor(tensor->data_type, tensor->data_format, - dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); - size_t copy_start = i * copy_size; size_t copy_stride = num_splits * copy_size; - DEBUG("copy_size = %d, copy_start = %d, copy_stride = %d, tensor->size_in_bytes = %d \n", - copy_size, copy_start, copy_stride, tensor->size_in_bytes); + DEBUG("copy_size = %d, copy_start = %d, copy_stride = %d, " + "tensor->size_in_bytes = %d \n", + copy_size, copy_start, copy_stride, tensor->size_in_bytes); int index = 0; - 
while(copy_start + copy_size <= tensor->size_in_bytes){ - memcpy(((char*) split->host_data + (index * copy_size)), - ((char*)tensor->host_data + copy_start), - copy_size); + while (copy_start + copy_size <= tensor->size_in_bytes) { + memcpy(((char *)split->host_data + (index * copy_size)), + ((char *)tensor->host_data + copy_start), copy_size); copy_start += copy_stride; index++; } - - splits[i] = split; + + splits[i] = split; } profileEvent("tensorSplit_end", true); - return (void**) splits; + return (void **)splits; } +void *tensorConcat(void **tensors_ptr, int num_splits, int split_dim) { -void* tensorConcat(void** tensors_ptr, int num_splits, int split_dim){ - - INFO("*** TensorConcat \n"); + INFO("*** TensorConcat \n"); profileEvent("tensorConcat"); - Tensor** tensors = (Tensor**) tensors_ptr; + Tensor **tensors = (Tensor **)tensors_ptr; - for(int i = 0; i < num_splits; i++){ + for (int i = 0; i < num_splits; i++) { deviceToHostCopy(tensors[i]); // Concatenation done on the host } - + // The no of dimensions of concatenated tensor are the same - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * tensors[0]->dims.num_dims); - for(unsigned int i = 0; i < tensors[0]->dims.num_dims; i++){ + size_t *dim_sizes = + (size_t *)malloc(sizeof(size_t) * tensors[0]->dims.num_dims); + for (unsigned int i = 0; i < tensors[0]->dims.num_dims; i++) { dim_sizes[i] = tensors[0]->dims.dim_sizes[i]; } - + size_t copy_size = getTypeSize(tensors[0]->data_type); - for(unsigned int i = split_dim; i < tensors[0]->dims.num_dims; i++){ + for (unsigned int i = split_dim; i < tensors[0]->dims.num_dims; i++) { copy_size = copy_size * dim_sizes[i]; } dim_sizes[split_dim] = dim_sizes[split_dim] * num_splits; - if(dim_sizes[split_dim] < 1) + if (dim_sizes[split_dim] < 1) ERROR("Split Dimension < 1 after concat"); - Tensor* output = (Tensor*) create4DTensor(tensors[0]->data_type, tensors[0]->data_format, - dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); - - DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] = %d \n", - dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); + Tensor *output = (Tensor *)create4DTensor( + tensors[0]->data_type, tensors[0]->data_format, dim_sizes[0], + dim_sizes[1], dim_sizes[2], dim_sizes[3]); + DEBUG("dim_sizes[0] = %d, dim_sizes[1] = %d, dim_sizes[2] = %d, dim_sizes[3] " + "= %d \n", + dim_sizes[0], dim_sizes[1], dim_sizes[2], dim_sizes[3]); int num_copies = 1; - for(unsigned int i = 0; i < split_dim; i++){ + for (unsigned int i = 0; i < split_dim; i++) { num_copies = num_copies * dim_sizes[i]; } - + size_t copy_stride = num_splits * copy_size; - DEBUG("copy_size = %d, num_copies = %d, copy_stride = %d, output->size_in_bytes = %d \n", - copy_size, num_copies, copy_stride, output->size_in_bytes); + DEBUG("copy_size = %d, num_copies = %d, copy_stride = %d, " + "output->size_in_bytes = %d \n", + copy_size, num_copies, copy_stride, output->size_in_bytes); - for(unsigned int i = 0; i < num_copies; i++){ + for (unsigned int i = 0; i < num_copies; i++) { // FIXIT: Don't be specific to 4D tensors size_t copy_start = i * copy_stride; - - for(int j = 0; j < num_splits; j++){ - struct Tensor* split = tensors[j]; - memcpy(((char*) output->host_data + copy_start + (j * copy_size)), - ((char*) split->host_data + (i * copy_size)), - copy_size); - } + + for (int j = 0; j < num_splits; j++) { + struct Tensor *split = tensors[j]; + memcpy(((char *)output->host_data + copy_start + (j * copy_size)), + ((char *)split->host_data + (i * copy_size)), copy_size); + } } 
profileEvent("tensorConcat_end", true); @@ -651,15 +592,13 @@ void* tensorConcat(void** tensors_ptr, int num_splits, int split_dim){ return output; } +void *tensorLRN(void *input_ptr, unsigned int LRN_window, double LRN_alpha, + double LRN_beta, double LRN_k) { - -void* tensorLRN(void* input_ptr, unsigned int LRN_window, - double LRN_alpha, double LRN_beta, double LRN_k){ - - INFO("*** TensorLRN \n"); + INFO("*** TensorLRN \n"); profileEvent("tensorLRN"); - Tensor* input = (Tensor*) input_ptr; + Tensor *input = (Tensor *)input_ptr; hostToDeviceCopy(input); @@ -667,29 +606,28 @@ void* tensorLRN(void* input_ptr, unsigned int LRN_window, cudnnLRNDescriptor_t LRNDesc; checkCUDNN(cudnnCreateLRNDescriptor(&LRNDesc)); - DEBUG("window = %d, LRN_alpha = %f, LRN_beta = %f, LRN_k = %f \n", - LRN_window, LRN_alpha, LRN_beta, LRN_k); - - - checkCUDNN(cudnnSetLRNDescriptor(LRNDesc, LRN_window, LRN_alpha, LRN_beta, LRN_k)); + DEBUG("window = %d, LRN_alpha = %f, LRN_beta = %f, LRN_k = %f \n", LRN_window, + LRN_alpha, LRN_beta, LRN_k); + + checkCUDNN( + cudnnSetLRNDescriptor(LRNDesc, LRN_window, LRN_alpha, LRN_beta, LRN_k)); - size_t* dim_sizes = input->dims.dim_sizes; - Tensor* output = (Tensor*) create4DTensor((cudnnDataType_t) float_type, - CUDNN_TENSOR_NCHW, dim_sizes[0], dim_sizes[1], - dim_sizes[2], dim_sizes[3]); + size_t *dim_sizes = input->dims.dim_sizes; + Tensor *output = (Tensor *)create4DTensor( + (cudnnDataType_t)float_type, CUDNN_TENSOR_NCHW, dim_sizes[0], + dim_sizes[1], dim_sizes[2], dim_sizes[3]); - changeTensorPlacement(output, DEVICE); + changeTensorPlacement(output, DEVICE); printTensorDescInfo(input); printTensorDescInfo(output); - - checkCUDNN(cudnnLRNCrossChannelForward(cudnnHandle, LRNDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, - &alpha, input->tensor_desc, input->gpu_data, - &beta, output->tensor_desc, output->gpu_data)); + + checkCUDNN(cudnnLRNCrossChannelForward( + cudnnHandle, LRNDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha, + input->tensor_desc, input->gpu_data, &beta, output->tensor_desc, + output->gpu_data)); profileEvent("tensorLRN_end", true); - + return output; } - - diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu index 079a9898294b01ba8dfcb575f11998790f24abfa..f6bfe700b44c88fea06c6a76267b49af4a523716 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_utils.cu @@ -1,13 +1,12 @@ //===--------------------------- tensor_utils.cu --------------------------===// // //===----------------------------------------------------------------------===// -// +// // This file consists of the custom implementation of utility functions // useful for approximated and non-approximated versions of tensor operations. 
 //
 //===----------------------------------------------------------------------===//
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>
@@ -42,18 +41,15 @@
 #include "global_data.h"
 #include "fp16_gemm.h"
 
+extern "C" {
-
-extern "C"{
-
-
-void freeTensor(void* tensor_ptr){
-  Tensor* tensor = (Tensor*) tensor_ptr;
+void freeTensor(void *tensor_ptr) {
+  Tensor *tensor = (Tensor *)tensor_ptr;
 
   tensors_ptr.erase(tensor->gpu_data);
   tensors_ptr.erase(tensor->gpu_half_data);
   host_ptr.erase(tensor->host_data);
-  
+
   cudaFree(tensor->gpu_data);
   tensor->gpu_data = nullptr;
   cudaFree(tensor->gpu_half_data);
@@ -62,43 +58,42 @@ void freeTensor(void* tensor_ptr){
   tensor->host_data = nullptr;
 }
 
-
 // Returns the size of the target datatype
-int getTypeSize(int data_type){
+int getTypeSize(int data_type) {
   // TODO: Add support for more data types
   switch (data_type) {
-    case float_type:
-      return 4;
-    case double_type:
-      return 8;
-    case half_type:
-      return 2;
-    case int_type:
-      return 1;
-    case float2_type:
-      return 8;
-    case half2_type:
-      return 4;
-    default:
-      ERROR("Unknown type %d\n", data_type);
+  case float_type:
+    return 4;
+  case double_type:
+    return 8;
+  case half_type:
+    return 2;
+  case int_type:
+    return 1;
+  case float2_type:
+    return 8;
+  case half2_type:
+    return 4;
+  default:
+    ERROR("Unknown type %d\n", data_type);
   }
   return 0;
 }
 
-static int getFullPrecTypeSize(int data_type){
+static int getFullPrecTypeSize(int data_type) {
   switch (data_type) {
-    case float_type:
-    case half_type:
-      return 4;
-    case double_type:
-      return 8;
-    case int_type:
-      return 1;
-    case float2_type:
-    case half2_type:
-      return 8;
-    default:
-      ERROR("Unknown type %d\n", data_type);
+  case float_type:
+  case half_type:
+    return 4;
+  case double_type:
+    return 8;
+  case int_type:
+    return 1;
+  case float2_type:
+  case half2_type:
+    return 8;
+  default:
+    ERROR("Unknown type %d\n", data_type);
   }
   return 0;
 }
@@ -107,7 +102,7 @@ static bool isFP16Compound(int data_type) {
   return data_type == half_type || data_type == half2_type;
 }
 
-void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){
+void setSizeInBytes(struct Tensor *tensor, int data_type, size_t num_elems) {
   int type_size = getTypeSize(data_type);
   size_t size_in_bytes = type_size * num_elems;
   tensor->size_in_bytes = size_in_bytes;
@@ -115,18 +110,20 @@ void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){
   DEBUG("***--- size_in_bytes = %d \n", size_in_bytes);
 }
 
-
 // NOTE: Always allocates FP32 on Host, FP32/FP16 for Device (GPU)
-void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){
+void allocateMem(struct Tensor *tensor, int data_type, size_t num_elems) {
   setSizeInBytes(tensor, data_type, num_elems);
   tensor->data_type = data_type;
-  tensor->cur_type = data_type; // type maintained for hanlding FP32 <-> FP16 conversions
+  tensor->cur_type =
+      data_type; // type maintained for handling FP32 <-> FP16 conversions
   tensor->num_elems = num_elems;
-  
-  size_t size_on_host = num_elems * getFullPrecTypeSize(data_type); // NOTE: On host, always FP32
-  tensor->host_data = (void*) malloc(size_on_host); // Allocate memory on the host
-  tensor->data_placement = HOST; // By defaut data is on the host
-  
+
+  size_t size_on_host =
+      num_elems * getFullPrecTypeSize(data_type); // NOTE: On host, always FP32
+  tensor->host_data =
+      (void *)malloc(size_on_host); // Allocate memory on the host
+  tensor->data_placement = HOST;    // By default data is on the host
+
   DEBUG("Attempting to Allocate = %lu \n\n\n", 
tensor->size_in_bytes); if (isFP16Compound(data_type)) { @@ -142,23 +139,25 @@ void allocateMem(struct Tensor* tensor, int data_type, size_t num_elems){ } tracked_tensors[tensor] = 1; // For FP16-FP32 data handling - + host_ptr.insert(tensor->host_data); obj_ptr.insert(tensor); - //host_ptr.push_back(tensor->host_data); + // host_ptr.push_back(tensor->host_data); } /// Two tensor formats are supported: NCHW and NHWC. /// TODO: Make this more general in the future. /// -void setCudnnDataFormat(struct Tensor* tensor, int data_format){ +void setCudnnDataFormat(struct Tensor *tensor, int data_format) { - switch(data_format){ + switch (data_format) { case 0: - data_format = CUDNN_TENSOR_NCHW; break; + data_format = CUDNN_TENSOR_NCHW; + break; case 1: - data_format = CUDNN_TENSOR_NHWC; break; - + data_format = CUDNN_TENSOR_NHWC; + break; + default: break; } @@ -167,39 +166,31 @@ void setCudnnDataFormat(struct Tensor* tensor, int data_format){ DEBUG("tensor->data_format = %d \n", tensor->data_format); } - -void set4DFilterDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size){ +void set4DFilterDescriptor(struct Tensor *tensor, int data_format, + size_t dim1_size, size_t dim2_size, size_t dim3_size, + size_t dim4_size) { setCudnnDataFormat(tensor, data_format); - + checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_desc)); checkCUDNN(cudnnCreateFilterDescriptor(&tensor->filter_half_desc)); - - checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_desc, - (cudnnDataType_t) CUDNN_DATA_FLOAT, //tensor->data_type, - (cudnnTensorFormat_t) tensor->data_format, - dim1_size, - dim2_size, - dim3_size, - dim4_size)); - - checkCUDNN(cudnnSetFilter4dDescriptor(tensor->filter_half_desc, - (cudnnDataType_t) CUDNN_DATA_HALF, - (cudnnTensorFormat_t) tensor->data_format, - dim1_size, - dim2_size, - dim3_size, - dim4_size)); + checkCUDNN(cudnnSetFilter4dDescriptor( + tensor->filter_desc, + (cudnnDataType_t)CUDNN_DATA_FLOAT, // tensor->data_type, + (cudnnTensorFormat_t)tensor->data_format, dim1_size, dim2_size, dim3_size, + dim4_size)); + checkCUDNN(cudnnSetFilter4dDescriptor( + tensor->filter_half_desc, (cudnnDataType_t)CUDNN_DATA_HALF, + (cudnnTensorFormat_t)tensor->data_format, dim1_size, dim2_size, dim3_size, + dim4_size)); } - - -void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_size, - size_t dim2_size, size_t dim3_size, size_t dim4_size){ +void set4DTensorDescriptor(struct Tensor *tensor, int data_format, + size_t dim1_size, size_t dim2_size, size_t dim3_size, + size_t dim4_size) { setCudnnDataFormat(tensor, data_format); @@ -207,292 +198,270 @@ void set4DTensorDescriptor(struct Tensor* tensor, int data_format, size_t dim1_s checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_half_desc)); - // For certain operations, the strides may need to change - in which case the descriptor - // needs to be reinitialized - cudnnSetTensor4dDescriptor(tensor->tensor_desc, - (cudnnTensorFormat_t) tensor->data_format, // Data format - (cudnnDataType_t) CUDNN_DATA_FLOAT, //tensor->data_type, // Data type - dim1_size, dim2_size, - dim3_size, dim4_size); - + // For certain operations, the strides may need to change - in which case the + // descriptor needs to be reinitialized + cudnnSetTensor4dDescriptor( + tensor->tensor_desc, + (cudnnTensorFormat_t)tensor->data_format, // Data format + (cudnnDataType_t)CUDNN_DATA_FLOAT, // tensor->data_type, // Data type + dim1_size, dim2_size, dim3_size, dim4_size); - 
cudnnSetTensor4dDescriptor(tensor->tensor_half_desc, - (cudnnTensorFormat_t) tensor->data_format, // Data format - (cudnnDataType_t) CUDNN_DATA_HALF, // Data type - dim1_size, dim2_size, - dim3_size, dim4_size); + cudnnSetTensor4dDescriptor( + tensor->tensor_half_desc, + (cudnnTensorFormat_t)tensor->data_format, // Data format + (cudnnDataType_t)CUDNN_DATA_HALF, // Data type + dim1_size, dim2_size, dim3_size, dim4_size); - cudnnDataType_t dType; int nStride, cStride, hStride, wStride; int size1, size2, size3, size4; - cudnnGetTensor4dDescriptor(tensor->tensor_desc, - &dType, - &size1, &size2, &size3, &size4, - &nStride, &cStride, &hStride, &wStride); - - DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", - nStride, cStride, hStride, wStride); -} + cudnnGetTensor4dDescriptor(tensor->tensor_desc, &dType, &size1, &size2, + &size3, &size4, &nStride, &cStride, &hStride, + &wStride); + DEBUG("nStride = %d, cStride = %d, hStride = %d, wStride = %d \n", nStride, + cStride, hStride, wStride); +} // FIXIT: Striding still not working - hence 2D and 3D tensor support is missing -void setTensorDescriptor(struct Tensor* tensor, int num_dims, - size_t* dim_sizes){ +void setTensorDescriptor(struct Tensor *tensor, int num_dims, + size_t *dim_sizes) { checkCUDNN(cudnnCreateTensorDescriptor(&tensor->tensor_desc)); - int* strides = (int*) malloc(sizeof(int) * num_dims); + int *strides = (int *)malloc(sizeof(int) * num_dims); strides[num_dims - 1] = 1; - for(int i = num_dims - 2; i >= 0; i--){ - strides[i] = strides[i+1] * dim_sizes[i+1]; + for (int i = num_dims - 2; i >= 0; i--) { + strides[i] = strides[i + 1] * dim_sizes[i + 1]; } - for(int i = 0; i < num_dims; i++){ + for (int i = 0; i < num_dims; i++) { DEBUG("strides[%d] = %d \n", i, strides[i]); } - int* const_dims = (int*) malloc(sizeof(int) * num_dims); - for(int j = 0 ; j < num_dims; j++){ - const_dims[j] = (int) dim_sizes[j]; + int *const_dims = (int *)malloc(sizeof(int) * num_dims); + for (int j = 0; j < num_dims; j++) { + const_dims[j] = (int)dim_sizes[j]; DEBUG("const_dim = %d \n", const_dims[j]); } - - DEBUG("data_type = %d, cuDNN_value = %d \n", tensor->data_type, CUDNN_DATA_FLOAT); - // For certain operations, the strides may need to change - in which case the descriptor - // needs to be reinitialized - checkCUDNN(cudnnSetTensorNdDescriptor(tensor->tensor_desc, - (cudnnDataType_t) tensor->data_type, // Data type - num_dims, - (const int*) const_dims, - (const int*) strides)); + + DEBUG("data_type = %d, cuDNN_value = %d \n", tensor->data_type, + CUDNN_DATA_FLOAT); + // For certain operations, the strides may need to change - in which case the + // descriptor needs to be reinitialized + checkCUDNN(cudnnSetTensorNdDescriptor( + tensor->tensor_desc, + (cudnnDataType_t)tensor->data_type, // Data type + num_dims, (const int *)const_dims, (const int *)strides)); } +/// HPVM tensor runtime allows creation of 2D, 3D and 4D tensors. -/// HPVM tensor runtime allows creation of 2D, 3D and 4D tensors. 
+void *create2DTensor(int data_type, size_t dim1_size, size_t dim2_size) { + struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size; + allocateMem(tensor, data_type, num_elems); + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 2); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 2; + return tensor; +} - void* create2DTensor(int data_type, size_t dim1_size, size_t dim2_size){ - struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size; - allocateMem(tensor, data_type, num_elems); - // Setting the tensor dimensions - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 2); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 2; - - return tensor; - } +void *create3DTensor(int data_type, size_t dim1_size, size_t dim2_size, + size_t dim3_size) { + struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size * dim3_size; + allocateMem(tensor, data_type, num_elems); + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 3); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + dim_sizes[2] = dim3_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 3; + + return tensor; +} +void *create4DTensor(int data_type, int data_format, size_t dim1_size, + size_t dim2_size, size_t dim3_size, size_t dim4_size) { + struct Tensor *tensor = (struct Tensor *)malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; + allocateMem(tensor, data_type, num_elems); + // Setting the tensor dimensions + size_t *dim_sizes = (size_t *)malloc(sizeof(size_t) * 4); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + dim_sizes[2] = dim3_size; + dim_sizes[3] = dim4_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 4; + // Done setting tensor dimensions + // setTensorDescriptor(tensor, 4, dim_sizes); + set4DTensorDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, + dim4_size); + // FIXIT: filter descriptor should be invoked only for filters + set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, + dim4_size); + + return tensor; +} - void* create3DTensor(int data_type, size_t dim1_size, size_t dim2_size, - size_t dim3_size){ - struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size * dim3_size; - allocateMem(tensor, data_type, num_elems); - // Setting the tensor dimensions - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 3); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - dim_sizes[2] = dim3_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 3; - - return tensor; - } +void initTensorData(void *tensor_ptr, void *data_ptr, size_t size_in_bytes) { + Tensor *tensor = (Tensor *)tensor_ptr; - void* create4DTensor(int data_type, int data_format, size_t dim1_size, size_t dim2_size, - size_t dim3_size, size_t dim4_size){ - struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor)); - size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; - allocateMem(tensor, data_type, num_elems); - // Setting the tensor dimensions - size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 4); - dim_sizes[0] = dim1_size; - dim_sizes[1] = dim2_size; - dim_sizes[2] = dim3_size; - 
dim_sizes[3] = dim4_size; - tensor->dims.dim_sizes = dim_sizes; - tensor->dims.num_dims = 4; - // Done setting tensor dimensions - //setTensorDescriptor(tensor, 4, dim_sizes); - set4DTensorDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, dim4_size); - // FIXIT: filter descriptor should be invoked only for filters - set4DFilterDescriptor(tensor, data_format, dim1_size, dim2_size, dim3_size, dim4_size); - - return tensor; + size_t host_size_in_bytes = tensor->num_elems * 4; + // if(tensor->size_in_bytes != size_in_bytes){ + if (host_size_in_bytes != size_in_bytes) { + ERROR("The destination and source sizes don't match"); } + std::memcpy(tensor->host_data, data_ptr, size_in_bytes); - void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){ + changeTensorPlacement(tensor, HOST); + + tensor->cur_type = float_type; +} - Tensor* tensor = (Tensor*) tensor_ptr; +void hostToDeviceCopy(struct Tensor *tensor) { - size_t host_size_in_bytes = tensor->num_elems * 4; - //if(tensor->size_in_bytes != size_in_bytes){ - if(host_size_in_bytes != size_in_bytes){ - ERROR("The destination and source sizes don't match"); - } - - std::memcpy(tensor->host_data, data_ptr, size_in_bytes); + if (tensor->data_placement != DEVICE) { + cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes, + cudaMemcpyHostToDevice); + DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes); + tensor->data_placement = DEVICE; + } else { + DEBUG("No data movement required - Data on Device \n"); + } +} - changeTensorPlacement(tensor, HOST); +void deviceToHostCopy(struct Tensor *tensor) { - tensor->cur_type = float_type; + if (tensor->data_placement != HOST) { + cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes, + cudaMemcpyDeviceToHost); + DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes); + tensor->data_placement = HOST; + } else { + DEBUG("No data movement required - Data on Host \n"); } +} - +// void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){ - void hostToDeviceCopy(struct Tensor* tensor){ +void tensorCopy(void *srcTensor_ptr, void *dstTensor_ptr) { - if(tensor->data_placement != DEVICE){ - cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes, - cudaMemcpyHostToDevice); - DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes); - tensor->data_placement = DEVICE; - } - else{ - DEBUG("No data movement required - Data on Device \n"); - } - - } + struct Tensor *srcTensor = (struct Tensor *)srcTensor_ptr; + struct Tensor *dstTensor = (struct Tensor *)dstTensor_ptr; + if (srcTensor->data_placement == HOST) { + memcpy(dstTensor->host_data, srcTensor->host_data, + srcTensor->size_in_bytes); + DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes); + dstTensor->data_placement = HOST; + } else if (srcTensor->data_placement == DEVICE) { + cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data, + srcTensor->size_in_bytes, cudaMemcpyDeviceToDevice); + DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes); + dstTensor->data_placement = DEVICE; + } +} - void deviceToHostCopy(struct Tensor* tensor){ +void hpvm_request_tensor(void *tensor_ptr, int destination) { - if(tensor->data_placement != HOST){ + Tensor *tensor = (Tensor *)tensor_ptr; + // If destination is the host + if (destination == 0) { + if (tensor->data_placement != HOST) { cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes, - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost); DEBUG("Moving %d 
bytes from GPU to host \n", tensor->size_in_bytes); tensor->data_placement = HOST; + } else { + DEBUG("No data movement required - Data on Host \n"); } - else{ - DEBUG("No data movement required - Data on Host \n"); - } - - } - - - //void tensorCopy(struct Tensor* srcTensor, struct Tensor* dstTensor){ - - void tensorCopy(void* srcTensor_ptr, void* dstTensor_ptr){ - - struct Tensor* srcTensor = (struct Tensor*) srcTensor_ptr; - struct Tensor* dstTensor = (struct Tensor*) dstTensor_ptr; - - - if(srcTensor->data_placement == HOST){ - memcpy(dstTensor->host_data, srcTensor->host_data, srcTensor->size_in_bytes); - DEBUG("Moving %d bytes from host to host \n", srcTensor->size_in_bytes); - dstTensor->data_placement = HOST; - } - else if (srcTensor->data_placement == DEVICE){ - cudaMemcpy(dstTensor->gpu_data, srcTensor->gpu_data, srcTensor->size_in_bytes, - cudaMemcpyDeviceToDevice); - DEBUG("Moving %d bytes from GPU to GPU \n", srcTensor->size_in_bytes); - dstTensor->data_placement = DEVICE; - } - } + // If destination is the GPU + else if (destination == 1) { - - void hpvm_request_tensor(void* tensor_ptr, int destination){ - - Tensor* tensor = (Tensor*) tensor_ptr; - // If destination is the host - if(destination == 0){ - if(tensor->data_placement != HOST){ - cudaMemcpy(tensor->host_data, tensor->gpu_data, tensor->size_in_bytes, - cudaMemcpyDeviceToHost); - DEBUG("Moving %d bytes from GPU to host \n", tensor->size_in_bytes); - tensor->data_placement = HOST; - } - else{ - DEBUG("No data movement required - Data on Host \n"); - } - } - // If destination is the GPU - else if(destination == 1){ - - if(tensor->data_placement != DEVICE){ - cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes, - cudaMemcpyHostToDevice); - DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes); - tensor->data_placement = DEVICE; - } - else{ - DEBUG("No data movement required - Data on Device \n"); - } + if (tensor->data_placement != DEVICE) { + cudaMemcpy(tensor->gpu_data, tensor->host_data, tensor->size_in_bytes, + cudaMemcpyHostToDevice); + DEBUG("Moving %d bytes from host to GPU \n", tensor->size_in_bytes); + tensor->data_placement = DEVICE; + } else { + DEBUG("No data movement required - Data on Device \n"); } - } +} +void convertToFP16(struct Tensor *tensor) { - - void convertToFP16(struct Tensor* tensor){ - - if(tensor == NULL) + if (tensor == NULL) return; - + if (tensor->cur_type == half_type) return; - + DEBUG("ConvertoFP16 \n"); setSizeInBytes(tensor, half_type, tensor->num_elems); size_t size_in_bytes = tensor->size_in_bytes; DEBUG("size_in_bytes = %d \n", size_in_bytes); - - if(tensor->gpu_half_data == NULL) - checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, size_in_bytes)); // Allocate memory on GPU - // If Tensor is one of Tracked (has to free per batch) then track all data types - if(tracked_tensors.find(tensor) != tracked_tensors.end()) + + if (tensor->gpu_half_data == NULL) + checkCudaErrors(cudaMalloc(&tensor->gpu_half_data, + size_in_bytes)); // Allocate memory on GPU + // If Tensor is one of Tracked (has to free per batch) then track all data + // types + if (tracked_tensors.find(tensor) != tracked_tensors.end()) tensors_ptr.insert(tensor->gpu_half_data); - f2h((float*) tensor->gpu_data, tensor->num_elems, (half*) tensor->gpu_half_data); + f2h((float *)tensor->gpu_data, tensor->num_elems, + (half *)tensor->gpu_half_data); - tensor->cur_type = half_type; + tensor->cur_type = half_type; } +void convertToFP32(struct Tensor *tensor) { - -void convertToFP32(struct Tensor* 
tensor){ - - if(tensor == NULL) + if (tensor == NULL) return; - + // Need this check for both offline and online profiling path if (tensor->cur_type == float_type) return; - + DEBUG("ConvertoFP32 \n"); - + setSizeInBytes(tensor, float_type, tensor->num_elems); size_t size_in_bytes = tensor->size_in_bytes; - + // If FP32 data array doesn't exist, allocate - if(tensor->gpu_data == NULL){ - checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU + if (tensor->gpu_data == NULL) { + checkCudaErrors( + cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes); } - // If Tensor is one of Tracked (has to free per batch) then track all data types - if(tracked_tensors.find(tensor) != tracked_tensors.end()) + // If Tensor is one of Tracked (has to free per batch) then track all data + // types + if (tracked_tensors.find(tensor) != tracked_tensors.end()) tensors_ptr.insert(tensor->gpu_data); - h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data); + h2f((half *)tensor->gpu_half_data, tensor->num_elems, + (float *)tensor->gpu_data); tensor->cur_type = float_type; - } +void convertToFP32_offline(struct Tensor *tensor) { - -void convertToFP32_offline(struct Tensor* tensor){ - - if(tensor == NULL) + if (tensor == NULL) return; if (tensor->cur_type == half_type) @@ -504,36 +473,36 @@ void convertToFP32_offline(struct Tensor* tensor){ size_t size_in_bytes = tensor->size_in_bytes; // If FP32 data array doesn't exist, allocate - if(tensor->gpu_data == NULL){ - checkCudaErrors(cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU + if (tensor->gpu_data == NULL) { + checkCudaErrors( + cudaMalloc(&tensor->gpu_data, size_in_bytes)); // Allocate memory on GPU DEBUG("NOTE: Allocating new FP32 Array with size = %lu \n", size_in_bytes); } - // If Tensor is one of Tracked (has to free per batch) then track all data types - if(tracked_tensors.find(tensor) != tracked_tensors.end()) + // If Tensor is one of Tracked (has to free per batch) then track all data + // types + if (tracked_tensors.find(tensor) != tracked_tensors.end()) tensors_ptr.insert(tensor->gpu_data); - h2f((half*) tensor->gpu_half_data, tensor->num_elems, (float*) tensor->gpu_data); + h2f((half *)tensor->gpu_half_data, tensor->num_elems, + (float *)tensor->gpu_data); tensor->cur_type = float_type; - + cudaFree(tensor->gpu_half_data); tensors_ptr.erase(tensor->gpu_half_data); tensor->gpu_half_data = NULL; } - - - - // Called from within the runtime to change the data placement -// This routine is required to change the output data placements from host to device -void changeTensorPlacement(struct Tensor* tensor, data_location_t data_placement){ +// This routine is required to change the output data placements from host to +// device +void changeTensorPlacement(struct Tensor *tensor, + data_location_t data_placement) { - if(tensor == NULL) + if (tensor == NULL) ERROR("Tensor == NULL"); tensor->data_placement = data_placement; } - } // end of Extern"C" diff --git a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu index 5cdfdf5a55109fac66a89f544306fbe7b4b9562a..8c77234e2432bd5fe1cde144b031d42273140d42 100644 --- a/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu +++ b/hpvm/projects/hpvm-tensor-rt/tensor_runtime/src/wrapper_runtime.cu @@ -1,13 +1,13 @@ //===--------------------------- 
wrapper_runtime.cu -----------------------===// // //===----------------------------------------------------------------------===// -// -// This file contains the implementation of some of the core API to tensor runtime -// so that runtime tuning of approximations can be done on different targets. +// +// This file contains the implementation of some of the core API to tensor +// runtime so that runtime tuning of approximations can be done on different +// targets. // //===----------------------------------------------------------------------===// - #include <stdio.h> #include <cstdio> #include <cstdlib> @@ -24,7 +24,6 @@ #include <cuda_fp16.h> #include <driver_types.h> - // Tensor runtime header files #include "tensor_utils.h" #include "debug.h" @@ -37,641 +36,580 @@ #include "half_precision_api.h" #include "hpvm-rt-controller.h" -#include "approxhpvm_runtime_utils.h" +#include "approxhpvm_runtime_utils.h" #include "approx_api.h" - -extern "C"{ - - /**** Wrapper Runtime API ***/ - - - void* wrapper_ConvLayer(const char* hpvm_node_id, - void* input, - void* filter, - void* bias, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, int pool_size, - int activation_id, - // NOTE: out_min, out_max are only relevant for ClippedRelu - float out_min, float out_max){ - - NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); - - if (NodeConf->isGPUNodeConfiguration()) { - DEBUG("GPU Configuration for ConvLayer\n"); - // Mapped to GPU - get a GPU node configuration - GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; - - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Check for convolution as first operation - CUSTOM_ASSERT((ApproxChoices.size() >= 1) && - (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) && - "Incorrect number/type of operations in provided Conv layer configuration"); - - void* conv_out = handleTensorConvApproximationTuples(ApproxChoices[0].second, - input, filter, conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w); - void* add_out; - if (bias != NULL) { - // Check for add as second operation - CUSTOM_ASSERT((ApproxChoices.size() >= 2) && - (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) && - "Incorrect number/type of operations in provided Conv layer configuration"); - add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, - conv_out, bias); - } else { - add_out = conv_out; - } - - void* activation_out; - switch (activation_id) { - case -1: - { // No activation - //INFO("No activation Function\n"); - activation_out = add_out; - } - break; - case 0: - { // TanH activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = handleTensorTanhApproximationTuples(ApproxChoices[2].second, - add_out); - } - break; - case 1: - { // ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = handleTensorReluApproximationTuples(ApproxChoices[2].second, - add_out); - } - break; - case 2: - { // Clipped ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == 
GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = - handleTensorClippedReluApproximationTuples(ApproxChoices[2].second, - add_out, out_min, out_max); - } - break; - default: - { - ERROR("Activation id %d NOT supported \n", activation_id); - } - break; - } - - void* pool_out; - - if (pool_size > 0) { - switch (pool_id) { - case 0: - { - // If we remove the asserts, we can have all cases handled by a single call - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MAX) && - "Expected POOL_MAX in provided Conv layer configuration"); - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size, pool_size, 0, 0, - pool_size, pool_size); - } - break; - case 1: - { - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) && - "Expected POOL_MEAN in provided Conv layer configuration"); - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size, pool_size, 0, 0, - pool_size, pool_size); - } - break; - case 2: - { - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && - "Expected POOL_MIN in provided Conv layer configuration"); - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size, pool_size, 0, 0, - pool_size, pool_size); - } - break; - default: - { - ERROR("Pool id %d NOT supported \n", pool_id); - } - break; - } - } else { - pool_out = activation_out; - } - return pool_out; +extern "C" { + +/**** Wrapper Runtime API ***/ + +void * +wrapper_ConvLayer(const char *hpvm_node_id, void *input, void *filter, + void *bias, int conv_pad_h, int conv_pad_w, int conv_stride_h, + int conv_stride_w, int pool_id, int pool_size, + int activation_id, + // NOTE: out_min, out_max are only relevant for ClippedRelu + float out_min, float out_max) { + + NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); + + if (NodeConf->isGPUNodeConfiguration()) { + DEBUG("GPU Configuration for ConvLayer\n"); + // Mapped to GPU - get a GPU node configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; + + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + + // Check for convolution as first operation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 1) && + (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + + void *conv_out = handleTensorConvApproximationTuples( + ApproxChoices[0].second, input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w); + void *add_out; + if (bias != NULL) { + // Check for add as second operation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 2) && + (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, + conv_out, bias); + } else { + add_out = conv_out; + } + + void *activation_out; + switch (activation_id) { + case -1: { // No activation + // INFO("No activation Function\n"); + activation_out = add_out; + } break; + case 0: { // TanH activation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 
3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + activation_out = + handleTensorTanhApproximationTuples(ApproxChoices[2].second, add_out); + } break; + case 1: { // ReLU activation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + activation_out = + handleTensorReluApproximationTuples(ApproxChoices[2].second, add_out); + } break; + case 2: { // Clipped ReLU activation + CUSTOM_ASSERT((ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == + GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && + "Incorrect number/type of operations in provided Conv " + "layer configuration"); + activation_out = handleTensorClippedReluApproximationTuples( + ApproxChoices[2].second, add_out, out_min, out_max); + } break; + default: { + ERROR("Activation id %d NOT supported \n", activation_id); + } break; + } + + void *pool_out; + + if (pool_size > 0) { + switch (pool_id) { + case 0: { + // If we remove the asserts, we can have all cases handled by a single + // call + CUSTOM_ASSERT((ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MAX) && + "Expected POOL_MAX in provided Conv layer configuration"); + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size, + pool_size, 0, 0, pool_size, pool_size); + } break; + case 1: { + CUSTOM_ASSERT( + (ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) && + "Expected POOL_MEAN in provided Conv layer configuration"); + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size, + pool_size, 0, 0, pool_size, pool_size); + } break; + case 2: { + CUSTOM_ASSERT((ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && + "Expected POOL_MIN in provided Conv layer configuration"); + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size, + pool_size, 0, 0, pool_size, pool_size); + } break; + default: { + ERROR("Pool id %d NOT supported \n", pool_id); + } break; } - else { - ERROR("Unsupported Configuration"); - abort(); - } - - return NULL; + } else { + pool_out = activation_out; + } + return pool_out; + } else { + ERROR("Unsupported Configuration"); + abort(); } + return NULL; +} - - - - void* wrapper_ConvLayer2(const char* hpvm_node_id, - void* input, - void* filter, - void* bias, - int conv_pad_h, int conv_pad_w, - int conv_stride_h, int conv_stride_w, - int pool_id, - int pool_size_v, int pool_size_h, - int pool_pad_v, int pool_pad_h, - int pool_stride_v, int pool_stride_h, - int activation_id, - // NOTE: out_min, out_max are only relevant for ClippedRelu - float out_min, float out_max){ - - INFO ("*** Conv Layer \n"); - - NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); - if (NodeConf->isGPUNodeConfiguration()) { - DEBUG("GPU Configuration for ConvLayer\n"); - // Mapped to GPU - get a GPU node configuration - GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; - - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - - //printf("*** Convolution \n ApproxChoice = %d \n BatchNorm = %d \n CONV = %d \n", 
ApproxChoices[0].first, - // GPUNodeConfiguration::TENSOR_OP::BATCHNORM, - // GPUNodeConfiguration::TENSOR_OP::CONV); - - // Check for convolution as first operation - CUSTOM_ASSERT((ApproxChoices.size() >= 1) && - (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) && - "Incorrect number/type of operations in provided Conv layer configuration"); - - - - void* conv_out = handleTensorConvApproximationTuples(ApproxChoices[0].second, - input, filter, conv_pad_h, conv_pad_w, - conv_stride_h, conv_stride_w); - void* add_out; - if (bias != NULL) { - // Check for add as second operation - CUSTOM_ASSERT((ApproxChoices.size() >= 2) && - (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) && - "Incorrect number/type of operations in provided Conv layer configuration"); - add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, - conv_out, bias); - } else { - add_out = conv_out; - } - - void* activation_out; - switch (activation_id) { - case -1: - { // No activation - //INFO("No activation Function\n"); - activation_out = add_out; - } - break; - case 0: - { // TanH activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = handleTensorTanhApproximationTuples(ApproxChoices[2].second, - add_out); - } - break; - case 1: - { // ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = handleTensorReluApproximationTuples(ApproxChoices[2].second, - add_out); - } - break; - case 2: - { // Clipped ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() >= 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && - "Incorrect number/type of operations in provided Conv layer configuration"); - activation_out = - handleTensorClippedReluApproximationTuples(ApproxChoices[2].second, - add_out, out_min, out_max); - } - break; - default: - { - ERROR("Activation id %d NOT supported \n", activation_id); - } - break; - } - - void* pool_out; - - if (pool_size_v > 0) { - switch (pool_id) { - case 0: - { - // If we remove the asserts, we can have all cases handled by a single call - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MAX) && - "Expected POOL_MAX in provided Conv layer configuration"); - - pool_out = handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size_v, pool_size_h, - pool_pad_v, pool_pad_h, - pool_stride_v, pool_stride_h); - - - } - break; - case 1: - { - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) && - "Expected POOL_MEAN in provided Conv layer configuration"); - - // FIXIT: POOL_MEAN still needs fixing - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size_v, pool_size_h, - 0, 0, - pool_size_v, pool_size_h); - - } - break; - case 2: - { - CUSTOM_ASSERT((ApproxChoices.back().first == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && - "Expected POOL_MIN in provided Conv layer configuration"); - - // FIXIT: Pool_MEAN needs fixing - pool_out = - handleTensorPoolingApproximationTuples(ApproxChoices.back().second, - activation_out, pool_id, - pool_size_v, pool_size_h, 0, 0, - pool_size_v, pool_size_h); - } - break; - 
default: - { - ERROR("Pool id %d NOT supported \n", pool_id); - } - break; - } - } else { - pool_out = activation_out; - } - return pool_out; - } - else { - ERROR("Unsupported Configuration"); - abort(); +void *wrapper_ConvLayer2( + const char *hpvm_node_id, void *input, void *filter, void *bias, + int conv_pad_h, int conv_pad_w, int conv_stride_h, int conv_stride_w, + int pool_id, int pool_size_v, int pool_size_h, int pool_pad_v, + int pool_pad_h, int pool_stride_v, int pool_stride_h, int activation_id, + // NOTE: out_min, out_max are only relevant for ClippedRelu + float out_min, float out_max) { + + INFO("*** Conv Layer \n"); + + NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); + if (NodeConf->isGPUNodeConfiguration()) { + DEBUG("GPU Configuration for ConvLayer\n"); + // Mapped to GPU - get a GPU node configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; + + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + + // printf("*** Convolution \n ApproxChoice = %d \n BatchNorm = %d \n CONV = + // %d \n", ApproxChoices[0].first, + // GPUNodeConfiguration::TENSOR_OP::BATCHNORM, + // GPUNodeConfiguration::TENSOR_OP::CONV); + + // Check for convolution as first operation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 1) && + (ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CONV) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + + void *conv_out = handleTensorConvApproximationTuples( + ApproxChoices[0].second, input, filter, conv_pad_h, conv_pad_w, + conv_stride_h, conv_stride_w); + void *add_out; + if (bias != NULL) { + // Check for add as second operation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 2) && + (ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, + conv_out, bias); + } else { + add_out = conv_out; + } + + void *activation_out; + switch (activation_id) { + case -1: { // No activation + // INFO("No activation Function\n"); + activation_out = add_out; + } break; + case 0: { // TanH activation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + activation_out = + handleTensorTanhApproximationTuples(ApproxChoices[2].second, add_out); + } break; + case 1: { // ReLU activation + CUSTOM_ASSERT( + (ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && + "Incorrect number/type of operations in provided Conv layer " + "configuration"); + activation_out = + handleTensorReluApproximationTuples(ApproxChoices[2].second, add_out); + } break; + case 2: { // Clipped ReLU activation + CUSTOM_ASSERT((ApproxChoices.size() >= 3) && + (ApproxChoices[2].first == + GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && + "Incorrect number/type of operations in provided Conv " + "layer configuration"); + activation_out = handleTensorClippedReluApproximationTuples( + ApproxChoices[2].second, add_out, out_min, out_max); + } break; + default: { + ERROR("Activation id %d NOT supported \n", activation_id); + } break; + } + + void *pool_out; + + if (pool_size_v > 0) { + switch (pool_id) { + case 0: { + // If we remove the asserts, we can have 
all cases handled by a single + // call + CUSTOM_ASSERT((ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MAX) && + "Expected POOL_MAX in provided Conv layer configuration"); + + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size_v, + pool_size_h, pool_pad_v, pool_pad_h, pool_stride_v, pool_stride_h); + + } break; + case 1: { + CUSTOM_ASSERT( + (ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MEAN) && + "Expected POOL_MEAN in provided Conv layer configuration"); + + // FIXIT: POOL_MEAN still needs fixing + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size_v, + pool_size_h, 0, 0, pool_size_v, pool_size_h); + + } break; + case 2: { + CUSTOM_ASSERT((ApproxChoices.back().first == + GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && + "Expected POOL_MIN in provided Conv layer configuration"); + + // FIXIT: Pool_MEAN needs fixing + pool_out = handleTensorPoolingApproximationTuples( + ApproxChoices.back().second, activation_out, pool_id, pool_size_v, + pool_size_h, 0, 0, pool_size_v, pool_size_h); + } break; + default: { + ERROR("Pool id %d NOT supported \n", pool_id); + } break; } - - return NULL; + } else { + pool_out = activation_out; + } + return pool_out; + } else { + ERROR("Unsupported Configuration"); + abort(); } + return NULL; +} - - - - - void* wrapper_FCLayer(const char* hpvm_node_id, - void* input, - void* weights, - void* bias, - int activation_id, - // NOTE: out_min and out_max are only relevant for ClippedRelu - float out_min, float out_max){ - - INFO ("*** Dense Layer \n"); - - NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); - if (NodeConf->isGPUNodeConfiguration()) { - DEBUG("GPU Configuration for FCLayer\n"); - // Mapped to GPU - get a GPU node configuration - GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; - - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Approximation choices must be for a FC wrapper operation - CUSTOM_ASSERT((ApproxChoices.size() == 2 || ApproxChoices.size() == 3) && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MUL && - ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD && - "Invalid configuration generated for FC layer wrapper operation"); - - void* gemm_out = handleTensorMulApproximationTuples(ApproxChoices[0].second, - input, weights); - void* add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, - gemm_out, bias); - - void* activation_out; - switch (activation_id) { - case -1: - { // No activation - CUSTOM_ASSERT((ApproxChoices.size() == 2) && - "Incorrect number of operations in provided FC layer configuration"); - //INFO("No activation Function\n"); - activation_out = add_out; - } - break; - case 0: - { // TanH activation - CUSTOM_ASSERT((ApproxChoices.size() == 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && - "Incorrect number/type of operations in provided FC layer configuration"); - activation_out = handleTensorTanhApproximationTuples(ApproxChoices[1].second, - add_out); - } - break; - case 1: - { // ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() == 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && - "Incorrect number/type of operations in provided FC layer configuration"); - activation_out = 
handleTensorReluApproximationTuples(ApproxChoices[1].second, - add_out); - } - break; - case 2: - { // Clipped ReLU activation - CUSTOM_ASSERT((ApproxChoices.size() == 3) && - (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && - "Incorrect number/type of operations in provided FC layer configuration"); - activation_out = - handleTensorClippedReluApproximationTuples(ApproxChoices[1].second, - add_out, out_min, out_max); - } - break; - default: - { - ERROR("Activation id %d NOT supported \n", activation_id); - } - break; - } - return activation_out; - } - else { - ERROR("Unsupported Configuration"); - abort(); - } - - return NULL; +void * +wrapper_FCLayer(const char *hpvm_node_id, void *input, void *weights, + void *bias, int activation_id, + // NOTE: out_min and out_max are only relevant for ClippedRelu + float out_min, float out_max) { + + INFO("*** Dense Layer \n"); + + NodeConfiguration *NodeConf = RC->getNodeConfiguration(hpvm_node_id); + if (NodeConf->isGPUNodeConfiguration()) { + DEBUG("GPU Configuration for FCLayer\n"); + // Mapped to GPU - get a GPU node configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)NodeConf; + + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + + // Approximation choices must be for a FC wrapper operation + CUSTOM_ASSERT( + (ApproxChoices.size() == 2 || ApproxChoices.size() == 3) && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MUL && + ApproxChoices[1].first == GPUNodeConfiguration::TENSOR_OP::ADD && + "Invalid configuration generated for FC layer wrapper operation"); + + void *gemm_out = handleTensorMulApproximationTuples(ApproxChoices[0].second, + input, weights); + void *add_out = handleTensorAddApproximationTuples(ApproxChoices[1].second, + gemm_out, bias); + + void *activation_out; + switch (activation_id) { + case -1: { // No activation + CUSTOM_ASSERT( + (ApproxChoices.size() == 2) && + "Incorrect number of operations in provided FC layer configuration"); + // INFO("No activation Function\n"); + activation_out = add_out; + } break; + case 0: { // TanH activation + CUSTOM_ASSERT( + (ApproxChoices.size() == 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::TANH) && + "Incorrect number/type of operations in provided FC layer " + "configuration"); + activation_out = + handleTensorTanhApproximationTuples(ApproxChoices[1].second, add_out); + } break; + case 1: { // ReLU activation + CUSTOM_ASSERT( + (ApproxChoices.size() == 3) && + (ApproxChoices[2].first == GPUNodeConfiguration::TENSOR_OP::RELU) && + "Incorrect number/type of operations in provided FC layer " + "configuration"); + activation_out = + handleTensorReluApproximationTuples(ApproxChoices[1].second, add_out); + } break; + case 2: { // Clipped ReLU activation + CUSTOM_ASSERT((ApproxChoices.size() == 3) && + (ApproxChoices[2].first == + GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU) && + "Incorrect number/type of operations in provided FC layer " + "configuration"); + activation_out = handleTensorClippedReluApproximationTuples( + ApproxChoices[1].second, add_out, out_min, out_max); + } break; + default: { + ERROR("Activation id %d NOT supported \n", activation_id); + } break; + } + return activation_out; + } else { + ERROR("Unsupported Configuration"); + abort(); } + return NULL; +} +void *wrapper_tensorRelu(const char *hpvm_node_id, void *input_ptr) { + INFO("*** Relu Operation \n"); - void* 
wrapper_tensorRelu(const char* hpvm_node_id, void* input_ptr){ - - INFO("*** Relu Operation \n"); - - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); - // Approximation choices must be for a relu operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::RELU && - "Invalid configuration generated for tensor relu wrapper operation"); + // Approximation choices must be for a relu operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::RELU && + "Invalid configuration generated for tensor relu wrapper operation"); - return handleTensorReluApproximationTuples(ApproxChoices[0].second, - input_ptr); - - } + return handleTensorReluApproximationTuples(ApproxChoices[0].second, + input_ptr); +} - void* wrapper_tensorClippedRelu(const char* hpvm_node_id, - void* input_ptr, - float out_min, float out_max){ - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = +void *wrapper_tensorClippedRelu(const char *hpvm_node_id, void *input_ptr, + float out_min, float out_max) { + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Approximation choices must be for a relu operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU && - "Invalid configuration generated for tensor clipped relu wrapper operation"); + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); - return handleTensorClippedReluApproximationTuples(ApproxChoices[0].second, - input_ptr, out_min, out_max); + // Approximation choices must be for a relu operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == + GPUNodeConfiguration::TENSOR_OP::CLIPPED_RELU && + "Invalid configuration generated for tensor clipped relu " + "wrapper operation"); - } + return handleTensorClippedReluApproximationTuples( + ApproxChoices[0].second, input_ptr, out_min, out_max); +} - void* wrapper_tensorTanh(const char* hpvm_node_id, void* input_ptr){ - // return tensorTanh(input_ptr); +void *wrapper_tensorTanh(const char *hpvm_node_id, void *input_ptr) { + // return tensorTanh(input_ptr); - GPUNodeConfiguration *GPUConf = + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = 
GPUConf->getApproxChoices(); - // Approximation choices must be for a tanh operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::TANH && - "Invalid configuration generated for tensor tanh wrapper operation"); - - return handleTensorTanhApproximationTuples(ApproxChoices[0].second, - input_ptr); - - } + // Approximation choices must be for a tanh operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::TANH && + "Invalid configuration generated for tensor tanh wrapper operation"); + return handleTensorTanhApproximationTuples(ApproxChoices[0].second, + input_ptr); +} - void* wrapper_tensorBatchNorm(const char* hpvm_node_id, - void* input_ptr, void* gamma_ptr, void* beta_ptr, - void* mean_ptr, void* variance_ptr, double epsilon){ +void *wrapper_tensorBatchNorm(const char *hpvm_node_id, void *input_ptr, + void *gamma_ptr, void *beta_ptr, void *mean_ptr, + void *variance_ptr, double epsilon) { - INFO("*** BatchNorm Operation \n"); + INFO("*** BatchNorm Operation \n"); - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = - GPUConf->getApproxChoices(); + GPUConf->getApproxChoices(); - // printf("*** BatchNorm \n ApproxChoice = %d \n BatchNorm = %d \n CONV = %d \n", ApproxChoices[0].first, - // GPUNodeConfiguration::TENSOR_OP::BATCHNORM, - // GPUNodeConfiguration::TENSOR_OP::CONV); + // printf("*** BatchNorm \n ApproxChoice = %d \n BatchNorm = %d \n CONV = %d + // \n", ApproxChoices[0].first, + // GPUNodeConfiguration::TENSOR_OP::BATCHNORM, + // GPUNodeConfiguration::TENSOR_OP::CONV); - // Approximation choices must be for a batchnorm operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::BATCHNORM && - "Invalid configuration generated for tensor batchnorm wrapper operation"); - - return handleTensorBatchNormApproximationTuples(ApproxChoices[0].second, - input_ptr, gamma_ptr, beta_ptr, - mean_ptr, variance_ptr, epsilon); - - } + // Approximation choices must be for a batchnorm operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::BATCHNORM && + "Invalid configuration generated for tensor batchnorm wrapper operation"); + return handleTensorBatchNormApproximationTuples( + ApproxChoices[0].second, input_ptr, gamma_ptr, beta_ptr, mean_ptr, + variance_ptr, epsilon); +} - void* wrapper_tensorAdd(const char* hpvm_node_id, void* input_ptr, void* bias_ptr){ +void *wrapper_tensorAdd(const char *hpvm_node_id, void *input_ptr, + void *bias_ptr) { - - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + 
std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = - GPUConf->getApproxChoices(); + GPUConf->getApproxChoices(); - // Approximation choices must be for an add operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::ADD && - "Invalid configuration generated for tensor add wrapper operation"); + // Approximation choices must be for an add operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::ADD && + "Invalid configuration generated for tensor add wrapper operation"); - return handleTensorAddApproximationTuples(ApproxChoices[0].second, - input_ptr, bias_ptr); - - } + return handleTensorAddApproximationTuples(ApproxChoices[0].second, input_ptr, + bias_ptr); +} +void *wrapper_tensorPooling(const char *hpvm_node_id, void *input_ptr, + int poolFunction, int window_height, + int window_width, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride) { - void* wrapper_tensorPooling(const char* hpvm_node_id, - void* input_ptr, - int poolFunction, - int window_height, int window_width, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride){ + INFO("*** TensorPooling Operation \n"); - INFO("*** TensorPooling Operation \n"); - - // return tensorPooling(input_ptr, poolFunction, window_height, window_width, - // vertical_pad, horizontal_pad, vertical_stride, horizontal_stride); + // return tensorPooling(input_ptr, poolFunction, window_height, window_width, + // vertical_pad, horizontal_pad, vertical_stride, + // horizontal_stride); - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - - GPUConf->getApproxChoices(); - - // Approximation choices must be for a single operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - "Invalid configuration generated for tensor pool wrapper operation"); - enum GPUNodeConfiguration::TENSOR_OP top = ApproxChoices[0].first; - // Approximation choices must be for a pool operation - CUSTOM_ASSERT((top == GPUNodeConfiguration::TENSOR_OP::POOL_MAX || - top == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN || - top == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && - "Invalid configuration generated for tensor pool wrapper operation"); - - return handleTensorPoolingApproximationTuples(ApproxChoices[0].second, - input_ptr, poolFunction, - window_height, window_width, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride); - - } - + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = + + GPUConf->getApproxChoices(); + + // Approximation choices must be for a single operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + "Invalid configuration generated for tensor pool wrapper operation"); + enum GPUNodeConfiguration::TENSOR_OP top = ApproxChoices[0].first; + // Approximation choices must be for a pool operation + CUSTOM_ASSERT( + (top == GPUNodeConfiguration::TENSOR_OP::POOL_MAX || + top == GPUNodeConfiguration::TENSOR_OP::POOL_MEAN || + top == GPUNodeConfiguration::TENSOR_OP::POOL_MIN) && + "Invalid configuration generated for tensor pool wrapper 
operation"); + + return handleTensorPoolingApproximationTuples( + ApproxChoices[0].second, input_ptr, poolFunction, window_height, + window_width, vertical_pad, horizontal_pad, vertical_stride, + horizontal_stride); +} - void* wrapper_tensorGroupConvolution(const char* hpvm_node_id, - void* input, void* filter, - int vertical_pad, int horizontal_pad, - int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups){ - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = +void *wrapper_tensorGroupConvolution(const char *hpvm_node_id, void *input, + void *filter, int vertical_pad, + int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, + int conv_groups) { + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Approximation choices must be for a group_conv operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::GROUP_CONV && - "Invalid configuration generated for tensor group_conv wrapper operation"); - - return handleTensorGroupConvApproximationTuples(ApproxChoices[0].second, - input, filter, - vertical_pad, horizontal_pad, - vertical_stride, horizontal_stride, - conv_mode, conv_groups); - - } - - + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + + // Approximation choices must be for a group_conv operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == + GPUNodeConfiguration::TENSOR_OP::GROUP_CONV && + "Invalid configuration generated for tensor group_conv wrapper " + "operation"); + + return handleTensorGroupConvApproximationTuples( + ApproxChoices[0].second, input, filter, vertical_pad, horizontal_pad, + vertical_stride, horizontal_stride, conv_mode, conv_groups); +} - void* wrapper_tensorSoftmax(const char* hpvm_node_id, void* input_ptr){ - // return tensorSoftmax(input_ptr); +void *wrapper_tensorSoftmax(const char *hpvm_node_id, void *input_ptr) { + // return tensorSoftmax(input_ptr); - // Only mapped to GPU - get a GPU configuration - GPUNodeConfiguration *GPUConf = + // Only mapped to GPU - get a GPU configuration + GPUNodeConfiguration *GPUConf = (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); - std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, - std::vector< std::pair<GPUNodeConfiguration::APPROX, - int> > > > &ApproxChoices = - GPUConf->getApproxChoices(); - - // Approximation choices must be for a softmax operation - CUSTOM_ASSERT(ApproxChoices.size() == 1 && - ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::SOFTMAX && - "Invalid configuration generated for tensor softmax wrapper operation"); - - return handleTensorSoftmaxApproximationTuples(ApproxChoices[0].second, input_ptr); + std::vector< + std::pair<GPUNodeConfiguration::TENSOR_OP, + std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>> + &ApproxChoices = GPUConf->getApproxChoices(); + // Approximation choices must be for a softmax operation + CUSTOM_ASSERT( + ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::SOFTMAX && + "Invalid configuration generated for tensor softmax wrapper operation"); - } - - - 
- void* tensor_set_node_id(unsigned int node_id){ + return handleTensorSoftmaxApproximationTuples(ApproxChoices[0].second, + input_ptr); +} - currentTensorID = node_id; +void *tensor_set_node_id(unsigned int node_id) { - return NULL; - } + currentTensorID = node_id; + return NULL; +} }
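The notes and sketches below are illustrative only and are not lines of the diff above. First, the intended calling order for the data-movement and precision-conversion helpers reformatted in tensor_utils: a minimal sketch, assuming the runtime's Tensor definition and the declarations of hpvm_request_tensor, convertToFP16, and convertToFP32 are in scope; the half-precision kernel launch is a placeholder, not a runtime API.

// Minimal usage sketch (assumption: t was created by the runtime and already
// holds FP32 data in t->gpu_data / t->host_data; the FP16 kernel launch is a
// placeholder for any half-precision approximation routine).
void example_half_precision_pass(struct Tensor *t) {
  hpvm_request_tensor(t, 1); // ensure the data is resident on the GPU (1 == device)
  convertToFP16(t);          // allocate/refresh t->gpu_half_data and cast down

  // ... launch a half-precision kernel on t->gpu_half_data here ...

  convertToFP32(t);          // cast results back into t->gpu_data
  hpvm_request_tensor(t, 0); // copy results back to t->host_data (0 == host)
}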
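Next, a hypothetical call site for wrapper_ConvLayer showing its argument conventions: pooling runs only when pool_size > 0, activation_id selects none/tanh/relu/clipped-relu as -1/0/1/2, and out_min/out_max are consulted only for clipped ReLU. The node id "conv1" and the tensor handles are placeholders and must correspond to a node configuration already loaded by the runtime controller (RC).

// Hypothetical call site; "conv1" and the tensor handles are placeholders.
void *example_conv_block(void *input, void *filter, void *bias) {
  return wrapper_ConvLayer("conv1", input, filter, bias,
                           /*conv_pad_h=*/1, /*conv_pad_w=*/1,
                           /*conv_stride_h=*/1, /*conv_stride_w=*/1,
                           /*pool_id=*/0, /*pool_size=*/2, // 2x2 max pool
                           /*activation_id=*/1,            // ReLU
                           /*out_min=*/0.0f, /*out_max=*/0.0f);
}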
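The single-tensor-op wrappers (relu, clipped relu, tanh, batchnorm, add, pooling, group conv, softmax) all repeat the same fetch/assert/dispatch steps on the node's approximation choices. The sketch below factors that shared shape into a hypothetical helper, getSingleOpChoices, purely to document the pattern; the runtime itself does not define such a helper.

// Hypothetical helper illustrating the pattern shared by the single-op
// wrappers: fetch the node's GPU configuration, assert it encodes exactly one
// tensor op of the expected kind, and return that op's approximation tuples
// for the matching handleTensor*ApproximationTuples dispatcher.
static std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &
getSingleOpChoices(const char *hpvm_node_id,
                   GPUNodeConfiguration::TENSOR_OP expected_op) {
  GPUNodeConfiguration *GPUConf =
      (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id);

  std::vector<
      std::pair<GPUNodeConfiguration::TENSOR_OP,
                std::vector<std::pair<GPUNodeConfiguration::APPROX, int>>>>
      &ApproxChoices = GPUConf->getApproxChoices();

  CUSTOM_ASSERT(ApproxChoices.size() == 1 &&
                ApproxChoices[0].first == expected_op &&
                "Unexpected configuration for single-op wrapper");

  return ApproxChoices[0].second;
}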
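Finally, one detail in wrapper_FCLayer worth flagging: the tanh/relu/clipped-relu branches assert that ApproxChoices[2] holds the activation op, yet they pass ApproxChoices[1].second (the ADD entry) to the activation handlers. Whether that is intentional is not established by this reformatting patch; the sketch below only shows what dispatching on the activation's own tuples would look like, using a hypothetical helper name.

// Hedged sketch: dispatch the FC activation using the activation entry that
// the surrounding asserts check (index 2) rather than the ADD entry (index 1).
// apply_fc_activation_sketch is a hypothetical name, not runtime API.
static void *apply_fc_activation_sketch(
    std::vector<std::pair<GPUNodeConfiguration::APPROX, int>> &ActivationTuples,
    void *add_out, int activation_id, float out_min, float out_max) {
  switch (activation_id) {
  case 0: // TanH
    return handleTensorTanhApproximationTuples(ActivationTuples, add_out);
  case 1: // ReLU
    return handleTensorReluApproximationTuples(ActivationTuples, add_out);
  case 2: // Clipped ReLU
    return handleTensorClippedReluApproximationTuples(ActivationTuples, add_out,
                                                      out_min, out_max);
  default: // -1: no activation
    return add_out;
  }
}

A caller following the asserts would pass ApproxChoices[2].second here when ApproxChoices.size() == 3.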