diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc new file mode 100644 index 0000000000000000000000000000000000000000..e9db1ed227291f8aa60c7c919162db26dbd0ea98 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/profiling/alexnet2_profiling_tensors.cc @@ -0,0 +1,276 @@ +// Per tensor operation + +#include "/home/nvidia/Gitlab/hpvm/llvm/projects/gpu_profiler/include/profiler.h" + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <string.h> + +#include "../../../tensor_runtime/include/tensor_runtime.h" +#include "../../include/utils.h" + +/* +void add_to_map(std::unordered_map<std::string, std::pair<double, double> >& total_time_energies, double data, const std::string& key){ + auto itr = total_time_energies.find(key); + if (itr == total_time_energies.end()){ + total_time_energies.insert(std::make_pair(key, data)); + } else { + itr->second += data; + } +} +*/ + +void add_data(std::unordered_map<std::string, std::pair<double, double> >& total_time_energies, Profiler& profiler, const std::string& op_name){ + profiler.pause_profiler(); + auto time_energy = profiler.get_time_energy_realzzzz(); + + auto itr = total_time_energies.find(op_name); + if (itr == total_time_energies.end()){ + total_time_energies.insert(std::make_pair(op_name, time_energy)); + } else { + itr->second.first += time_energy.first; + itr->second.second += time_energy.second; + } + profiler.reset(); +} + +/* NOTE: Reference Architecture to use for profiling */ +void testCifarNet(){ + + printf("********* Alexnet2 CIFAR-10 DNN ********** \n"); + + std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("norm_cifar_input.bin"); + std::string labels_path = dir_prefix + std::string("test_labels.bin"); + + void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin", + float_type, 32, 3, 3, 3); + void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin", + float_type, 1, 32, 1, 1); + void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin", + float_type, 32, 32, 3, 3); + void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin", + float_type, 1, 32, 1, 1); + void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin", + float_type, 64, 32, 3, 3); + void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin", + float_type, 1, 64, 1, 1); + void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin", + float_type, 64, 64, 3, 3); + void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin", + float_type, 1, 64, 1, 1); + void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin", + float_type, 128, 64, 3, 3); + void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin", + float_type, 1, 128, 1, 1); + void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin", + float_type, 128, 128, 3, 3); + void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin", + float_type, 1, 128, 1, 1); + + void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin", + float_type, 1, 1, 2048, 10); + void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin", + float_type, 1, 10, 1, 1); + + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + + std::ofstream online_profiler_output; + online_profiler_output.open("online_output.txt"); + + startMemTracking(); + + // NOTE: CHANGED INPUT TO STANDARDIZE + int total_runs = 50; // FOR NOW 100; + + int test_input_size = 2500; //10000; + int batch_size = 2500; + int batch_count = test_input_size / batch_size; + float final_accuracy = 0.0; + + // NOTE: Starting time profiling + startProfiling(); + + Profiler profiler; + profiler.start_profiler(); + + // Get the total time and energy per tensor per run + std::unordered_map<std::string, std::pair<double, double> > total_time_energies; + + for(int i = 0; i < total_runs; i++){ + for(int i = 0; i < batch_count; i++){ + int start = i * batch_size; + int end = (i + 1) * batch_size; + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + // FIRST Tensor Runtime CALL + profiler.resume_profiler(); + void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv1"); + + profiler.resume_profiler(); + tensorAdd(conv1out, conv1_bias); + add_data(total_time_energies, profiler, "Add1"); + + profiler.resume_profiler(); + void* conv1_tanh = tensorTanh(conv1out); + add_data(total_time_energies, profiler, "Tanh1"); + + // 2nd Layer + profiler.resume_profiler(); + void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv2"); + + profiler.resume_profiler(); + tensorAdd(conv2out, conv2_bias); + add_data(total_time_energies, profiler, "Add2"); + + profiler.resume_profiler(); + void* conv2_tanh = tensorTanh(conv2out); + add_data(total_time_energies, profiler, "Tanh2"); + + profiler.resume_profiler(); + void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + add_data(total_time_energies, profiler, "Pool1"); + + // 3rd Layer + profiler.resume_profiler(); + void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv3"); + + profiler.resume_profiler(); + tensorAdd(conv3out, conv3_bias); + add_data(total_time_energies, profiler, "Add3"); + + profiler.resume_profiler(); + void* conv3_tanh = tensorTanh(conv3out); + add_data(total_time_energies, profiler, "Tanh3"); + + // 4th Layer + profiler.resume_profiler(); + void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv4"); + + profiler.resume_profiler(); + tensorAdd(conv4out, conv4_bias); + add_data(total_time_energies, profiler, "Add4"); + + profiler.resume_profiler(); + void* conv4_tanh = tensorTanh(conv4out); + add_data(total_time_energies, profiler, "Tanh4"); + + profiler.resume_profiler(); + void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + add_data(total_time_energies, profiler, "Pool2"); + + // 5th Layer + profiler.resume_profiler(); + void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv5"); + + profiler.resume_profiler(); + tensorAdd(conv5out, conv5_bias); + add_data(total_time_energies, profiler, "Add5"); + + profiler.resume_profiler(); + void* conv5_tanh = tensorTanh(conv5out); + add_data(total_time_energies, profiler, "Tanh5"); + + // 6th Layer + profiler.resume_profiler(); + void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + add_data(total_time_energies, profiler, "Conv6"); + + profiler.resume_profiler(); + tensorAdd(conv6out, conv6_bias); + add_data(total_time_energies, profiler, "Add6"); + + profiler.resume_profiler(); + void* conv6_tanh = tensorTanh(conv6out); + add_data(total_time_energies, profiler, "Tanh6"); + + profiler.resume_profiler(); + void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + add_data(total_time_energies, profiler, "Pool3"); + + // final FC Layer + profiler.resume_profiler(); + void* gemm1out = tensorGemmGPU(pool6out, fc1_weights); + add_data(total_time_energies, profiler, "Mul1"); // ASSUMING that this is mul1 + + std::cout<<"-----------------------------------ADD 7--------------------------------\n"; + profiler.resume_profiler(); + void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + add_data(total_time_energies, profiler, "Add7"); + std::cout<<"-----------------------------------ADD 7 ENDS --------------------------------\n"; + + profiler.resume_profiler(); + void* result = tensorSoftmax(gemm1biasout); + add_data(total_time_energies, profiler, "Softmax1"); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); + + float accuracy = computeAccuracy2(labels, batch_size, result); + final_accuracy += accuracy; + + freeBatchMemory(); + } + } + auto total = profiler.get_time_energy_realzzzz(); + std::cout << "total = " << total.first << ", " << total.second << std::endl; + profiler.get(); + profiler.stop_profiler(); + + stopProfiling(); + //online_profiler_output << "Total time: " << total_time << ", " << total_energy << "\n"; + // Now compute the averages across batches + std::ofstream ofs; + std::string arr[] = {"Add1", "Add2", "Add3", "Add4", "Add5", "Add6", "Add7", + "Conv1", "Conv2", "Conv3", "Conv4", "Conv5", "Conv6", + "Mul1", + "Pool1", "Pool2", "Pool3", + "Softmax1", + "Tanh1", "Tanh2", "Tanh3", "Tanh4", "Tanh5", "Tanh6"}; + ofs.open("online_profiler_tensor_data.txt"); + std::vector<std::string> ordered_keys(std::begin(arr), std::end(arr)); + for (const std::string& key : ordered_keys){ + const auto& data_pair = total_time_energies[key]; + ofs << key << ": " << data_pair.first / total_runs << "\t" << data_pair.second / total_runs << '\n'; + std::cout<< key << ": " << data_pair.first / total_runs << "\t" << data_pair.second / total_runs << '\n'; + } + + /* + ofs.open("online_profiler_tensor_data.txt"); + for (const auto& tensor_data : total_time_energies){ + ofs << tensor_data.first << ": " << tensor_data.second.first / total_runs << "\t" << tensor_data.second.second / total_runs << '\n'; + }*/ + ofs.close(); + final_accuracy = (final_accuracy / batch_count) / total_runs; + dumpFinalAccuracy(final_accuracy); + online_profiler_output.close(); +} + + +int main(int argc, char* argv[]){ + + llvm_hpvm_initTensorRt(0); + + testCifarNet(); + + llvm_hpvm_cleanupTensorRt(); + + return 0; +} +