diff --git a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt index cda77b905298bd1e518423bc6a667053c5843777..e7b6c1e9214236c5735df61d30933dec863efd02 100644 --- a/llvm/projects/hpvm-tensor-rt/CMakeLists.txt +++ b/llvm/projects/hpvm-tensor-rt/CMakeLists.txt @@ -36,6 +36,9 @@ link_directories($ENV{CUDNN_PATH} $ENV{CUDNN_PATH}/lib $ENV{CUDNN_PATH}/lib64) cuda_add_library(tensor_runtime tensor_runtime/src/tensor_runtime.cu) cuda_add_cublas_to_target(tensor_runtime) +# Adding new rule for building a cuDNN runtime library +cuda_add_library(tensor_cpu_runtime tensor_runtime/src/tensor_cpu_runtime.cc) + if(USE_GFLAGS) target_link_libraries(tensor_runtime gflags cudnn -lcurand) @@ -43,12 +46,17 @@ else() target_link_libraries(tensor_runtime cudnn -lcurand) endif() - +target_link_libraries(tensor_cpu_runtime) # Adding rule for the debugging source add_executable(test_ops dnn_sources/src/test_ops.cc) target_link_libraries(test_ops tensor_runtime) +#**** CPU sources +add_executable(fc2_cpu dnn_sources/src/fc2_cpu.cc) +target_link_libraries(fc2_cpu tensor_cpu_runtime) + + # Full-Precision versions add_executable(fc2_clipped dnn_sources/src/fc2_clipped.cc) @@ -70,6 +78,9 @@ target_link_libraries(lenet_keras tensor_runtime) add_executable(alexnet_cifar10 dnn_sources/src/alexnet_cifar10_front.cc) target_link_libraries(alexnet_cifar10 tensor_runtime) +add_executable(alexnet_cifar10_test dnn_sources/src/alexnet_cifar10_test.cc) +target_link_libraries(alexnet_cifar10_test tensor_runtime) + add_executable(alexnet_cifar10_tuner dnn_sources/src/alexnet_cifar10_tuner.cc) target_link_libraries(alexnet_cifar10_tuner tensor_runtime) @@ -100,11 +111,28 @@ target_link_libraries(vgg16_cifar100_tuner tensor_runtime) add_executable(vgg16_cifar100_top5 dnn_sources/src/vgg16_cifar100_5.cc) target_link_libraries(vgg16_cifar100_top5 tensor_runtime) -#add_executable(cifar_keras dnn_sources/src/cifar_keras.cc) -#target_link_libraries(cifar_keras tensor_runtime) +#### Image Pipeline Tuning sources + +add_executable(pipeline_GEMO dnn_sources/src/pipeline_GEMO.cc) +target_link_libraries(pipeline_GEMO tensor_runtime) + +add_executable(pipeline_GEO dnn_sources/src/pipeline_GEO.cc) +target_link_libraries(pipeline_GEO tensor_runtime) + +add_executable(pipeline_GEOM dnn_sources/src/pipeline_GEOM.cc) +target_link_libraries(pipeline_GEOM tensor_runtime) +add_executable(pipeline_GSM dnn_sources/src/pipeline_GSM.cc) +target_link_libraries(pipeline_GSM tensor_runtime) + +add_executable(pipeline_GSME dnn_sources/src/pipeline_GSME.cc) +target_link_libraries(pipeline_GSME tensor_runtime) + + + + +#*** Half precision networks -# Half precision networks add_executable(fc2_half dnn_sources/src/half/fc2_half.cc) target_link_libraries(fc2_half tensor_runtime) @@ -125,6 +153,9 @@ target_link_libraries(lenet_keras_half tensor_runtime) add_executable(lenet_keras_promise dnn_sources/src/promise/lenet_keras_promise.cc) target_link_libraries(lenet_keras_promise tensor_runtime) +add_executable(lenet_promise_relu dnn_sources/src/promise/lenet_promise_relu.cc) +target_link_libraries(lenet_promise_relu tensor_runtime) + add_executable(fc4_clipped_promise dnn_sources/src/promise/fc4_clipped_promise.cc) target_link_libraries(fc4_clipped_promise tensor_runtime) @@ -132,31 +163,16 @@ add_executable(alexnet_promise dnn_sources/src/promise/alexnet_promise.cc) target_link_libraries(alexnet_promise tensor_runtime) -#add_executable(resnet18_promise_relu dnn_sources/src/promise/resnet18_promise_relu.cc) -#target_link_libraries(resnet18_promise_relu tensor_runtime) - -#add_executable(resnet18_promise_quant dnn_sources/src/promise/resnet18_promise_quant.cc) -#target_link_libraries(resnet18_promise_quant tensor_runtime) - -#add_executable(resnet18_promise_quant2 dnn_sources/src/promise/resnet18_promise_quant2.cc) -#target_link_libraries(resnet18_promise_quant2 tensor_runtime) - # Quantized PROMISE sources add_executable(alexnet_promise_quant dnn_sources/src/promise/alexnet_promise_quant.cc) target_link_libraries(alexnet_promise_quant tensor_runtime) -#add_executable(alexnet2_promise dnn_sources/src/promise/alexnet2_promise.cc) -#target_link_libraries(alexnet2_promise tensor_runtime) - add_executable(alexnet2_promise_quant dnn_sources/src/promise/alexnet2_promise_quant.cc) target_link_libraries(alexnet2_promise_quant tensor_runtime) add_executable(resnet18_promise_quant dnn_sources/src/promise/resnet18_promise_quant2.cc) target_link_libraries(resnet18_promise_quant tensor_runtime) -#add_executable(vgg16_cifar100_promise dnn_sources/src/promise/vgg16_cifar100_promise.cc) -#target_link_libraries(vgg16_cifar100_promise tensor_runtime) - add_executable(vgg16_cifar100_promise_quant dnn_sources/src/promise/vgg16_cifar100_promise_quant.cc) target_link_libraries(vgg16_cifar100_promise_quant tensor_runtime) @@ -165,6 +181,24 @@ target_link_libraries(vgg16_cifar10_promise_quant tensor_runtime) +#### Image Pipeline PROMISE sources +add_executable(pipeline_GEMO_promise dnn_sources/src/promise/pipeline_GEMO_promise.cc) +target_link_libraries(pipeline_GEMO_promise tensor_runtime) + +add_executable(pipeline_GEO_promise dnn_sources/src/promise/pipeline_GEO_promise.cc) +target_link_libraries(pipeline_GEO_promise tensor_runtime) + +add_executable(pipeline_GEOM_promise dnn_sources/src/promise/pipeline_GEOM_promise.cc) +target_link_libraries(pipeline_GEOM_promise tensor_runtime) + +add_executable(pipeline_GSM_promise dnn_sources/src/promise/pipeline_GSM_promise.cc) +target_link_libraries(pipeline_GSM_promise tensor_runtime) + +add_executable(pipeline_GSME_promise dnn_sources/src/promise/pipeline_GSME_promise.cc) +target_link_libraries(pipeline_GSME_promise tensor_runtime) + + + ############# Promise Validation Sources ############# add_executable(alexnet_valid dnn_sources/src/promise/alexnet_valid.cc) @@ -181,3 +215,20 @@ target_link_libraries(vgg16_cifar100_valid tensor_runtime) add_executable(vgg16_cifar10_valid dnn_sources/src/promise/vgg16_cifar10_valid.cc) target_link_libraries(vgg16_cifar10_valid tensor_runtime) + +##### Image pipeline validation sources +add_executable(pipeline_GEMO_valid dnn_sources/src/promise/pipeline_GEMO_valid.cc) +target_link_libraries(pipeline_GEMO_valid tensor_runtime) + +add_executable(pipeline_GEO_valid dnn_sources/src/promise/pipeline_GEO_valid.cc) +target_link_libraries(pipeline_GEO_valid tensor_runtime) + +add_executable(pipeline_GEOM_valid dnn_sources/src/promise/pipeline_GEOM_valid.cc) +target_link_libraries(pipeline_GEOM_valid tensor_runtime) + +add_executable(pipeline_GSM_valid dnn_sources/src/promise/pipeline_GSM_valid.cc) +target_link_libraries(pipeline_GSM_valid tensor_runtime) + +add_executable(pipeline_GSME_valid dnn_sources/src/promise/pipeline_GSME_valid.cc) +target_link_libraries(pipeline_GSME_valid tensor_runtime) + diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h b/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..f0358556022d81e914af79f4080f6594aa61c924 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/include/utils_cpu.h @@ -0,0 +1,482 @@ + +// Header guards +#ifndef UTILS_HEADER +#define UTILS_HEADER + + +#include <sstream> +#include <vector> +#include <bits/stdc++.h> +#include "../../tensor_runtime/include/tensor.h" +#include "types.h" +#include <cmath> + + +std::vector<float> run_accuracies; + + +void printTensorInfo(void* tensor_ptr){ + + struct Tensor* tensor = (struct Tensor*) tensor_ptr; + + if(tensor->gpu_data != NULL){ + printf("Successful cudaMalloc \n"); + } + + printf("tensor dims = %d \n", tensor->dims.num_dims); + printf("dim1_size = %lu \n", tensor->dims.dim_sizes[0]); + printf("dim2_size = %lu \n", tensor->dims.dim_sizes[1]); + printf("num_elems = %lu \n", tensor->num_elems); +} + + + +void printTensorDims(void* tensor_ptr){ + + struct Tensor* tensor = (struct Tensor*) tensor_ptr; + + printf("Num_elems = %lu \n", tensor->num_elems); + for (int i = 0; i < tensor->dims.num_dims; i++){ + printf("dim[%d] = %lu \n", i, tensor->dims.dim_sizes[i]); + } +} + + + +void compareTensors(void* tensor1_ptr, void* tensor2_ptr){ + + struct Tensor* tensor1 = (struct Tensor*) tensor1_ptr; + struct Tensor* tensor2 = (struct Tensor*) tensor2_ptr; + + hpvm_request_tensor(tensor1, 0); + hpvm_request_tensor(tensor2, 0); + + float* tensor_data1 = (float*) tensor1->host_data; + float* tensor_data2 = (float*) tensor2->host_data; + + for(unsigned int i = 0; i < tensor1->num_elems; i++){ + if(tensor_data1[i] != tensor_data2[i]){ + printf("Tensor data mismatch at index %d \n", i); + abort(); + } + } +} + + + +//*** FIXIT: Move this to CPU-only +struct Tensor* readTrainedWeightsCPU(const char* file_name, int data_type, + int dim1_size, int dim2_size, + int dim3_size, int dim4_size){ + + // FIXIT: Don't assume floating point types + int type_size = 4; // NOTE: Assuming floating point tensors + long int num_elems = dim1_size * dim2_size * dim3_size * dim4_size; + long int size_in_bytes = type_size * dim1_size * dim2_size * dim3_size * dim4_size; + float* tensor_data = (float*) malloc(sizeof(float) * num_elems); + int file_header_size = 0; + + FILE* file = fopen(file_name, "rb"); + if(file == NULL){ + printf("Data file %s is not found. Aborting... \n", file_name); + abort(); + } + + fseek(file, file_header_size, SEEK_CUR); // Skipping the file header + size_t bytes_read = fread(tensor_data, 1, size_in_bytes, file); + + printf("size in bytes = %lu, bytes read = %lu \n", size_in_bytes, bytes_read); + + fclose(file); + + + struct Tensor* weights = (struct Tensor*) create4DTensorCPU(data_type, nchw, dim1_size, dim2_size, + dim3_size, dim4_size); + + initTensorData(weights, tensor_data, size_in_bytes); + //compareValues(weights, tensor_data, num_elems); + free(tensor_data); + + return weights; +} + + + +uint8_t* readLabels(const char* labels_file, int num_labels){ + + uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); + FILE* file = fopen(labels_file, "rb"); + if(file == NULL){ + printf("Data file %s is not found. Aborting...\n", labels_file); + abort(); + } + + size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); + + fclose(file); + + return labels; +} + + +uint8_t* readLabelsBatch(const char* labels_file, int start, int end){ + + int num_labels = end - start; + int file_header_size = sizeof(uint8_t) * start; + + uint8_t* labels = (uint8_t*) malloc(sizeof(uint8_t) * num_labels); + FILE* file = fopen(labels_file, "rb"); + if(file == NULL){ + printf("Data file %s is not found. Aborting...\n", labels_file); + abort(); + } + + fseek(file, file_header_size, SEEK_SET); // Skipping the file header + + size_t bytes_read = fread(labels, 1, sizeof(uint8_t) * num_labels, file); + + + fclose(file); + + // printf("--labels bytes_read = %lu \n", bytes_read); + return labels; +} + + + +void computeAccuracy(const char* labels_file, int num_labels, void* result_ptr){ + + struct Tensor* result = (struct Tensor*) result_ptr; + + uint8_t* labels = readLabels(labels_file, num_labels); + size_t batch_dim = result->dims.dim_sizes[0]; + size_t channels = result->dims.dim_sizes[1]; + float* data = (float*) result->host_data; + int num_errors = 0; + + for(int i = 0; i < batch_dim; i++){ + int chosen = 0; + for (int id = 1; id < 10; ++id){ + if (data[i * channels + chosen] < data[i * channels + id]) chosen = id; + } + + //printf("chosen = %d, label = %d \n", chosen, labels[i]); + if(chosen != labels[i]) + num_errors++; + } + + float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; + printf("****** Accuracy = %f \n\n", accuracy); + + + FILE* fp = fopen("final_accuracy", "w+"); + if(fp != NULL){ + + std::ostringstream ss; + ss << std::fixed << accuracy; + std::string print_str = ss.str(); + + fwrite(print_str.c_str(), 1, print_str.length(), fp); + fclose(fp); + } + +} + + + +float computeAccuracy2(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){ + + unsigned num_zeros = 0; + + struct Tensor* result = (struct Tensor*) result_ptr; + + size_t batch_dim = result->dims.dim_sizes[0]; + size_t channels = result->dims.dim_sizes[1]; + float* data = (float*) result->host_data; + int num_errors = 0; + + printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels); + + for(int i = 0; i < num_labels; i++){ + + int chosen = 0; + for (int id = 1; id < num_classes; ++id){ + if (data[i * channels + chosen] < data[i * channels + id]) chosen = id; + } + + if(labels[i] == 0) + num_zeros++; + + if(chosen != labels[i]) + num_errors++; + + //printf("chosen = %d, label = %d \n", chosen, labels[i]); + } + + float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; + printf("****** Accuracy = %f \n\n", accuracy); + //printf("****** Zero class labels %d \n", num_zeros); + + FILE* fp = fopen("final_accuracy", "w+"); + if(fp != NULL){ + + std::ostringstream ss; + ss << std::fixed << accuracy; + std::string print_str = ss.str(); + + fwrite(print_str.c_str(), 1, print_str.length(), fp); + } + + fclose(fp); + + return accuracy; +} + + +struct ClassProb{ + float prob; + int index; +}; + + +bool descendFloatComp(ClassProb obj1, ClassProb obj2){ + return obj1.prob > obj2.prob; +} + + +float computeTop5Accuracy(uint8_t* labels, int num_labels, void* result_ptr, unsigned num_classes = 10){ + + struct Tensor* result = (struct Tensor*) result_ptr; + + size_t batch_dim = result->dims.dim_sizes[0]; + size_t channels = result->dims.dim_sizes[1]; + float* data = (float*) result->host_data; + int num_errors = 0; + + printf("batch_dim = %lu, channels = %lu \n", batch_dim, channels); + + for(int i = 0; i < num_labels; i++){ + + std::vector<ClassProb> elem_probs; + for (int id = 0; id < num_classes; ++id){ + ClassProb cProb; + cProb.prob = data[i * channels + id]; + cProb.index = id; + elem_probs.push_back(cProb); + } + + std:sort(elem_probs.begin(), elem_probs.end(), descendFloatComp); + // Check if any of top-5 predictions matches + bool matched = false; + for(int j = 0; j < 5; j++){ + ClassProb cProb = elem_probs[j]; + if(cProb.index == labels[i]) + matched = true; + } + + if(!matched) + num_errors +=1; + } + + float accuracy = ((batch_dim - num_errors) * 1.0 / batch_dim * 1.0) * 100.0; + printf("****** Accuracy = %f \n\n", accuracy); + + FILE* fp = fopen("final_accuracy", "w+"); + if(fp != NULL){ + + std::ostringstream ss; + ss << std::fixed << accuracy; + std::string print_str = ss.str(); + + fwrite(print_str.c_str(), 1, print_str.length(), fp); + } + + fclose(fp); + + return accuracy; +} + + + + +void dumpFinalAccuracy(float accuracy){ + + printf("\n\n **** Final Accuracy = %f \n", accuracy); + + FILE* fp = fopen("final_accuracy", "w+"); + if(fp != NULL){ + std::ostringstream ss; + ss << std::fixed << accuracy; + std::string print_str = ss.str(); + + fwrite(print_str.c_str(), 1, print_str.length(), fp); + } + + fclose(fp); + + run_accuracies.push_back(accuracy); +} + + + +void dumpAvgPSNR(float avg_psnr){ + + FILE* fp = fopen("avg_psnr", "w+"); + if(fp != NULL){ + std::ostringstream ss; + ss << std::fixed << avg_psnr; + std::string print_str = ss.str(); + fwrite(print_str.c_str(), 1, print_str.length(), fp); + } + + fclose(fp); +} + + +void dumpPSNRStd(float psnr_std){ + + FILE* fp = fopen("psnr_std.txt", "w+"); + if(fp != NULL){ + std::ostringstream ss; + ss << std::fixed << psnr_std; + std::string print_str = ss.str(); + fwrite(print_str.c_str(), 1, print_str.length(), fp); + } + + fclose(fp); +} + + + + + +void dumpExecutionAccuracies(){ + + FILE* fp = fopen("run_accuracies.txt", "w+"); + if(fp != NULL){ + for (int i = 0; i < run_accuracies.size(); i++){ + float accuracy = run_accuracies[i]; + std::ostringstream ss; + ss << std::fixed << accuracy; + std::string print_str = ss.str(); + fwrite(print_str.c_str(), 1, print_str.length(), fp); + fwrite("\n", 1, 1, fp); + } + + } + + fclose(fp); +} + + +float readPSNRFromFile(const char* file_name){ + + float psnr; + FILE* pFile = fopen(file_name, "r"); + if(pFile == NULL){ + printf("ERROR: psnr.txt not found! \n"); + abort(); + } + + fscanf(pFile, "%f", &psnr); + printf("**** PSNR read = %f \n\n", psnr); + return psnr; +} + + +float computePSNRViolation(void* gold_ptr, void* approx_ptr, float PSNR_threshold){ + + + PSNR_threshold = readPSNRFromFile("psnr.txt"); + std::vector<float> psnr_list; + + struct Tensor* gold_tensor = (struct Tensor*) gold_ptr; + struct Tensor* approx_tensor = (struct Tensor*) approx_ptr; + + size_t* dim_sizes = gold_tensor->dims.dim_sizes; + size_t batch_dim = dim_sizes[0]; + size_t image_size = dim_sizes[1] * dim_sizes[2] * dim_sizes[3]; + + printf("batch_dim = %lu, image_size = %lu \n", batch_dim, image_size); + + float* gold_data = (float*) gold_tensor->host_data; + float* approx_data = (float*) approx_tensor->host_data; + + FILE* fp = fopen("img_psnr.txt", "w+"); + + float sum_psnr = 0.0; + int num_errors = 0; + for(size_t i = 0; i < batch_dim; i++){ + float mse_sum = 0.0; + float max_val = -999999; + size_t offset = i * image_size; + + for(size_t j = 0; j < image_size; j++){ + float diff = gold_data[offset + j] - approx_data[offset + j]; + float diff_square = diff * diff; + mse_sum += diff_square; + + if(max_val < gold_data[offset + j]){ + max_val = gold_data[offset + j]; + } + } + + mse_sum = mse_sum / image_size; + float psnr = 20 * log10(255 / sqrt(mse_sum)); + + sum_psnr += psnr; + if (psnr < PSNR_threshold) + num_errors += 1; + + printf("PSNR value = %f \n", psnr); + psnr_list.push_back(psnr); + + std::ostringstream ss; + ss << std::fixed << psnr; + std::string print_str = ss.str(); + fwrite(print_str.c_str(), 1, print_str.length(), fp); + fwrite("\n", 1, 1, fp); + } + + float violation_rate = (num_errors * 1.0) / batch_dim * 100.0; + printf("*** violation_rate= %f \n\n", violation_rate); + + float avg_psnr = sum_psnr / batch_dim; + printf("*** avg_psnr = %f \n\n", avg_psnr); + dumpAvgPSNR(avg_psnr); + + float success_rate = 100.0 - violation_rate; + dumpFinalAccuracy(success_rate); + + fclose(fp); + + + float var = 0.0; + for(size_t i = 0; i < batch_dim; i++){ + var = var + (psnr_list[i] - avg_psnr) * (psnr_list[i] - avg_psnr); + } + + var /= batch_dim; + float std = sqrt(var); + + dumpPSNRStd(std); + + return violation_rate; +} + + +void dumpOutput(void* output_ptr, const char* file_name){ + + struct Tensor* out_tensor = (struct Tensor*) output_ptr; + size_t size_in_bytes = out_tensor->size_in_bytes; + printf ("** Output size = %lu \n", size_in_bytes); + + float* host_data = (float*) out_tensor->host_data; + FILE* fd = fopen(file_name, "w+"); + fwrite(host_data, 1, size_in_bytes, fd); + fclose(fd); +} + + + +#endif diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/fc2_cpu.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/fc2_cpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..476160ac90420a6957b0e8918a3bdde48f42db01 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/fc2_cpu.cc @@ -0,0 +1,66 @@ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> + +#include "../../tensor_runtime/include/tensor_cpu_runtime.h" +#include "../include/utils_cpu.h" +#include "../include/types.h" + + +void FC2(){ + + printf("********* 2-Layer FC with clipped activations and weights ********* \n"); + + int test_batch_size = 100; + + uint8_t* labels = readLabels("../model_params/lenet_params/datasets/t10k-labels-idx1-ubyte", test_batch_size); + + void* input = readTrainedWeightsCPU("../model_params/FC_network2/mnist_float_input.bin", + float_type, test_batch_size, 1, 28, 28); + + void* fc1_weights = readTrainedWeightsCPU("../model_params/fc2_clipped/fc1.bin", + float_type, 1, 1, 784, 128); + void* fc1_bias = readTrainedWeightsCPU("../model_params/fc2_clipped/fc1_bias.bin", + float_type, 1, 128, 1, 1); + void* fc2_weights = readTrainedWeightsCPU("../model_params/fc2_clipped/fc2.bin", + float_type, 1, 1, 128, 10); + void* fc2_bias = readTrainedWeightsCPU("../model_params/fc2_clipped/fc2_bias.bin", + float_type, 1, 10, 1, 1); + + + // Layer-1 + void* fc1out = tensorGemmCPU(input, fc1_weights); + + void* fc1_bias_out = tensorAddCPU(fc1out, fc1_bias); + + //-- void* fc1_relu = tensorRelu2(fc1_bias_out, 0, 2); + + // Layer-2 + //-- void* fc2out = tensorGemmCPU(fc1_relu, fc2_weights); + + //-- void* fc2_bias_out = tensorAddCPU(fc2out, fc2_bias); + + //--- void* fc2_relu = tensorRelu2(fc2_bias_out, 0, 2); + + //--- void* result = tensorSoftmax(fc2_relu); + + //-- computeAccuracy2(labels, test_batch_size, result); + +} + + +// If an argument is passed - the run goes into OpenTuner mode - waiting on a pipe +int main(int argc, char* argv[]){ + + llvm_hpvm_initTensorRt(0); + + FC2(); + + return 0; +} + diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..dfa9511055f04d703696d65f5fd24094bc202f3e --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_cpu_runtime.h @@ -0,0 +1,94 @@ + +#include <stdio.h> +#include <cstdlib> +#include <cmath> +#include <memory> +#include <string> +//#include "runtime_types.h" + + +#ifndef CUDNN_HEADER +#define CUDNN_HEADER + + +extern "C"{ + /**** Initialization Routine - Must be inserted at program start (in the backend) ****/ + void llvm_hpvm_initTensorRt(int gpuid = 0); + void llvm_hpvm_cleanupTensorRt(); + + // Routine to moving tensor data (from and to GPU,CPU) + void hpvm_request_tensor(void* tensor, int destination); + + + // NOTE: Currently only using 4-D tensors - 2D and 3D tensors not supported for cuDNN operations + // NOTE: The only data format supported as of now is: NCHW (batch_dimension, channels, Height, Width) + void* create4DTensorCPU(int data_type, int data_format, size_t dim1_size, size_t dim2_size, + size_t dim3_size, size_t dim4_size); + void initTensorData(void* tensor, void* data_ptr, size_t size_in_bytes); + + /********** Tensor Operation API ******/ + + // NOTE: For conv_mode, only value '1' is supported + void* tensorConvolutionCPU(void* input, void* filter, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int compute_precision); + + void* tensorPoolingCPU(void* input, + int poolFunction, + int window_height, int window_width, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride); + + void* tensorGemmCPU(void* lhs, void* rhs); + // NOTE: In place operation + void* tensorAddCPU(void* x, void* bias); + // NOTE: In-place operation + void* tensorReluCPU(void* input); + + void* tensorTanhCPU(void* input); + + void* tensorSoftmaxCPU(void* input); + +} + + +/* +void dummyFunction(){ + + void* initRT = (void*) &llvm_hpvm_initTensorRt; + void* cleanRT = (void*) &llvm_hpvm_cleanupTensorRt; + void* request_tensorPtr = (void*) &hpvm_request_tensor; + void* startProf = (void*) &startProfiling; + void* stopProf = (void*) &stopProfiling; + void* create2Dptr = (void*) &create2DTensor; + void* create3Dptr = (void*) &create3DTensor; + void* create4Dptr = (void*) &create4DTensor; + void* initTensorPtr = (void*) &initTensorData; + void* tensorSplitPtr = (void*) &tensorSplit; + void* tensorConcatPtr = (void*) &tensorConcat; + void* tensorConvPtr = (void*) &tensorConvolution; + void* tensorHConvPtr = (void*) &tensorHalfConvolution; + void* tensorPoolPtr = (void*) &tensorPooling; + void* tensorHalfPoolPtr = (void*) &tensorHalfPooling; + void* tensorLRNPtr = (void*) &tensorLRN; + void* tensorGemmPr = (void*) &tensorGemm; + void* tensorGemmCPUPtr = (void*) &tensorGemmCPU; + void* tensorGemmGPUPtr = (void*) &tensorGemmGPU; + void* tensorHgemmPtr = (void*) &tensorHalfGemm; + void* tensorGemmBiasPtr = (void*) &tensorGemmBias; + void* tensorAddPtr = (void*) &tensorAdd; + void* tensorHalfAddPtr = (void*) &tensorHalfAdd; + void* tensorReluPtr = (void*) &tensorRelu; + //FIXME: --void* tensorHalfReluPtr = (void*) &tensorHalfRelu; + void* tensorRelu2Ptr = (void*) &tensorRelu2; + void* tensorHalfRelu2Ptr = (void*) &tensorHalfRelu2; + void* tensorTanhPtr = (void*) &tensorTanh; + void* tensorHalfTanhPtr = (void*) &tensorHalfTanh; + void* tensorSoftmaxPtr = (void*) &tensorSoftmax; + void* tensorAddErrorPtr = (void*) &tensorAddError; +} +*/ + + +#endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc new file mode 100644 index 0000000000000000000000000000000000000000..336c98b767d416e98de6e848d395e80f148d118a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_cpu_runtime.cc @@ -0,0 +1,158 @@ +/* This file includes the API implementation of the HPVM tensor runtime built on cublas, cudnn +** +** Author: Hashim Sharif +** Email: hsharif3@illinois.edu +*/ + +#include <stdio.h> +#include <stdarg.h> +#include <cstdio> +#include <cstdlib> +#include <cmath> +#include <ctime> +#include <cfloat> +#include <algorithm> +#include <iostream> +#include <map> +#include <memory> +#include <sstream> +#include <string> +#include <vector> +#include <stdlib.h> +#include <cstring> + +// Tensor runtime header files +#include "../include/tensor_cpu_runtime.h" +#include "../include/tensor.h" + + +void llvm_hpvm_initTensorRt(int gpuid){ + // NOTE: Do Nothing +} + +void llvm_hpvm_cleanupTensorRt(){ + // NOTE: Do Nothing +} + +void hpvm_request_tensor(void* tensor, int destination){ + // NOTE: Do Nothing +} + + +// Returns the size of the target cudnn datatype +int getTypeSize(int data_type){ + // Float/Int data type - Full Precision + if(data_type == 0) + return 4; + // Half data type + if(data_type == 1) + return 2; + + return 1; +} + + +void setSizeInBytes(struct Tensor* tensor, int data_type, size_t num_elems){ + int type_size = getTypeSize(data_type); + size_t size_in_bytes = type_size * num_elems; + tensor->size_in_bytes = size_in_bytes; +} + + +void allocateMemCPU(struct Tensor* tensor, int data_type, size_t num_elems){ + setSizeInBytes(tensor, data_type, num_elems); + tensor->data_type = data_type; + tensor->num_elems = num_elems; + tensor->host_data = (void*) malloc(tensor->size_in_bytes); // Allocate memory on the host +} + + +void initTensorData(void* tensor_ptr, void* data_ptr, size_t size_in_bytes){ + + Tensor* tensor = (Tensor*) tensor_ptr; + if(tensor->size_in_bytes != size_in_bytes){ + printf("The destination and source sizes don't match"); + } + memcpy(tensor->host_data, data_ptr, size_in_bytes); +} + + + +void* create4DTensorCPU(int data_type, int data_format, size_t dim1_size, size_t dim2_size, + size_t dim3_size, size_t dim4_size){ + + struct Tensor* tensor = (struct Tensor*) malloc(sizeof(Tensor)); + size_t num_elems = dim1_size * dim2_size * dim3_size * dim4_size; + + allocateMemCPU(tensor, data_type, num_elems); + // Setting the tensor dimensions + size_t* dim_sizes = (size_t*) malloc(sizeof(size_t) * 4); + dim_sizes[0] = dim1_size; + dim_sizes[1] = dim2_size; + dim_sizes[2] = dim3_size; + dim_sizes[3] = dim4_size; + tensor->dims.dim_sizes = dim_sizes; + tensor->dims.num_dims = 4; + + return tensor; +} + + + +void* tensorAddCPU(void* x_ptr, void* bias_ptr){ + + Tensor* x = (Tensor*) x_ptr; + Tensor* bias = (Tensor*) bias_ptr; + + float* x_data = (float*) x->host_data; + float* bias_data = (float*) bias->host_data; + + size_t num_elems = x->num_elems; + for(size_t i = 0; i < num_elems; i++){ + x_data[i] += bias_data[i]; + } + + return x; +} + + +void* tensorGemmCPU(void* lhs_ptr, void* rhs_ptr){ + + Tensor* lhs = (Tensor*) lhs_ptr; + Tensor* rhs = (Tensor*) rhs_ptr; + + // 'm' holds the batch dimension - assuming NCHW format Tensors + int m = lhs->dims.dim_sizes[0]; + // The rhs must be a 2D tensor + int n = rhs->dims.dim_sizes[rhs->dims.num_dims-1]; // output neurons + int k = 1; + // Flattening the dimensions after the batch dimension + // NOTE: Allowing any number of dimensions > 2 for lhs + for (int j = 1 ; j < lhs->dims.num_dims; j++){ + k = k * lhs->dims.dim_sizes[j]; // input neurons + } + + int rhs_k = rhs->dims.dim_sizes[rhs->dims.num_dims-2]; + + // NOTE: Creating a 4D tensor to be compatible with later called cuDNN routines + Tensor* output = (Tensor*) create4DTensorCPU(0, 0, m, n, 1, 1); + + float* lhs_arr = (float*) lhs->host_data; + float* rhs_arr = (float*) rhs->host_data; + float* output_arr = (float*) output->host_data; + + for(int i = 0; i < m; i++){ + for(int j = 0; j < n; j++){ + float sum = 0.0; + for(int l = 0; l < k; l++){ + float mul = lhs_arr[i*k+l] * rhs_arr[l*n+j]; + sum = sum + mul; + } + output_arr[i*n+j] = sum; + } + } + + return output; +} + +