diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py index 7e969271c20031dab9f302b333a4f7feb0338871..41bd4c73483e19472053406104f8590e54bd5d36 100644 --- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py +++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/benchmarks.py @@ -12,10 +12,18 @@ # Batch 18: Batch13 (Baseline) + ParetoCurve (1500 Runs) - BUGGY IGNORE!!! # Batch 19: (Baseline) + ParetoCurve + 2 runs in Tuning Phase (1500 Runs) + # Batch 20: 3-Skip levels + 2 runs + 1500 Runs + EnergyBandSize now % of Max (Compare against Batch19) -batch_id = "batch201" +# Batch 200: AlgoTuner - 1000 images - 1500 runs (IGNORE) +# Batch 201: AlgoTuner - 2000 images - 1500 runs +# Batch 202: AlgoTuner - 2000 images - 500 runs + + + +batch_id = "batch202" + class Benchmark: def __init__(self): @@ -65,7 +73,7 @@ Alexnet1.layer_knobs = "../opentuner/data/alexnet/knobs.txt" Alexnet1.loss1_result_file = "tuner_results/alexnet_cifar10/loss_1/promise_tuned_confs/promise_confs.txt" Alexnet1.loss2_result_file = "tuner_results/alexnet_cifar10/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet1.autotuner_runs = 1500 +Alexnet1.autotuner_runs = 500 Alexnet1.tuner_accuracy = 79.9 #Alexnet1.promise_accuracy = 79.9 Alexnet1.promise_accuracy = 79.5 @@ -96,7 +104,7 @@ Alexnet2.cost_file = "../build_tuner/tuner_results/alexnet2_cifar10/op_cost.txt" Alexnet2.layer_knobs = "../opentuner/data/alexnet2/knobs.txt" #Alexnet2.loss1_result_file = "tuner_results/alexnet2_cifar10/loss_1/promise_tuned_confs/promise_confs.txt" #Alexnet2.loss2_result_file = "tuner_results/alexnet2_cifar10/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet2.autotuner_runs = 1500 +Alexnet2.autotuner_runs = 500 Alexnet2.tuner_accuracy = 84.19 #Alexnet2.promise_accuracy = 84.19 Alexnet2.promise_accuracy = 84.8 @@ -131,7 +139,7 @@ Alexnet3.layer_knobs = "../opentuner/data/vgg16_cifar10/knobs.txt" Alexnet3.loss1_result_file = "tuner_results/vgg16_cifar10/loss_1/promise_tuned_confs/promise_confs.txt" Alexnet3.loss2_result_file = "tuner_results/vgg16_cifar10/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet3.autotuner_runs = 1500 +Alexnet3.autotuner_runs = 500 Alexnet3.tuner_accuracy = 90.19 #Alexnet3.promise_accuracy = 90.19 Alexnet3.promise_accuracy = 89.55 @@ -164,7 +172,7 @@ Alexnet4.layer_knobs = "../opentuner/data/resnet/knobs.txt" Alexnet4.loss1_result_file = "tuner_results/resnet18_cifar10/loss_1/promise_tuned_confs/promise_confs.txt" Alexnet4.loss2_result_file = "tuner_results/resnet18_cifar10/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet4.autotuner_runs = 1500 +Alexnet4.autotuner_runs = 500 Alexnet4.tuner_accuracy = 89.6 #Alexnet4.promise_accuracy = 89.59 - 1000 images Alexnet4.promise_accuracy = 89.94 @@ -199,7 +207,7 @@ Alexnet5.layer_knobs = "../opentuner/data/vgg16_cifar100/knobs.txt" Alexnet5.loss1_result_file = "tuner_results/vgg_cifar100/loss_1/promise_tuned_confs/promise_confs.txt" Alexnet5.loss2_result_file = "tuner_results/vgg_cifar100/loss_2/promise_tuned_confs/promise_confs.txt" -Alexnet5.autotuner_runs = 1500 +Alexnet5.autotuner_runs = 500 Alexnet5.tuner_accuracy = 67.95 #Alexnet5.promise_accuracy = 66.8 Alexnet5.promise_accuracy = 70.1 @@ -233,7 +241,7 @@ Alexnet6.layer_knobs = "../opentuner/data/lenet/knobs.txt" #Alexnet6.loss1_result_file = "tuner_results/vgg_cifar100/loss_1/promise_tuned_confs/promise_confs.txt" #Alexnet6.loss2_result_file = "tuner_results/vgg_cifar100/loss_2/promise_tuned_confs/promise_confs.txt" 
-Alexnet6.autotuner_runs = 900 +Alexnet6.autotuner_runs = 500 Alexnet6.tuner_accuracy = 98.9 Alexnet6.promise_accuracy = 98.9 Alexnet6.validation_accuracy = 99 @@ -268,7 +276,7 @@ Alexnet7.layer_knobs = "../opentuner/data/mobilenet/knobs.txt" #--- Files below needed for VALIDATION experiment Alexnet7.loss1_result_file = "tuner_results/mobilenet/loss_1/batch1/promise_tuner/high_confidence/promise_confs.txt" Alexnet7.loss2_result_file = "tuner_results/mobilenet/loss_2/batch1/promise_tuner/high_confidence/promise_confs.txt" -Alexnet7.autotuner_runs = 1500 +Alexnet7.autotuner_runs = 500 Alexnet7.tuner_accuracy = 84.8 #Alexnet7.promise_accuracy = 84.8 Alexnet7.promise_accuracy = 83.65 @@ -302,7 +310,7 @@ Alexnet8.layer_knobs = "../opentuner/data/mobilenet_shallow/knobs.txt" Alexnet8.loss1_result_file = "../build_tuner/tuner_results/mobilenet_shallow/loss_1/batch2/promise_tuner/high_confidence/promise_selected_confs.txt" Alexnet8.loss2_result_file = "../build_tuner/tuner_results/mobilenet_shallow/loss_2/batch2/promise_tuner/high_confidence/promise_selected_confs.txt" -Alexnet8.autotuner_runs = 1500 +Alexnet8.autotuner_runs = 500 Alexnet8.tuner_accuracy = 87.6 #Alexnet8.promise_accuracy = 87.59 Alexnet8.promise_accuracy = 89.25 diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py index 6a07ef86e53d2b4b6372e1e253611ba6f018aaad..f82b1a24f3982802c249f502161d944110b389e0 100644 --- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py +++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/buildRtConfig.py @@ -89,6 +89,9 @@ def skipFile(fname): skip_files["confidence_summary.txt"] = 1 skip_files["promise_confs.txt"] = 1 + if "accuracy" in fname: # *_accuracy files should be skipped + return True + if fname in skip_files: return True else: @@ -117,7 +120,8 @@ def loadConfigData(result_dir, baseline_accuracy): config_arr = [] - result_dir += "/promise_tuner/high_confidence/" + #result_dir += "/promise_tuner/high_confidence/" + result_dir += "/algo_tuner/high_confidence/" file_names = os.listdir(result_dir) diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py index 73d460be0c4091067c9d52e07ea7f4d421765ff3..1e533b8702139966166f860b72a3df1ccae03ee6 100644 --- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py +++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/run_autotuner.py @@ -12,7 +12,7 @@ from run_ha_tuner import runTunerBench from run_hs_tuner import runPromiseBench from run_algo_tuner import runAlgoBench from compute_confs import computePSNRBenchSwings, computeBenchSwings -from validation import runPromiseBenchValidation, runPromiseBenchValidation2, runBenchValidation +from validation import runPromiseBenchValidation2, runBenchValidation, runAlgoBenchValidate from profiling import startProfile, stopProfile, dumpProfiles from utils import createResultDirs from benchmarks import batch_id @@ -276,8 +276,8 @@ def runAlgoTuner(): Bench = bench_tuner_data["vgg16_cifar10"] runAlgoBench(Bench) - #Bench = bench_tuner_data["lenet_keras"] - #runAlgoBench(Bench) + Bench = bench_tuner_data["lenet_keras"] + runAlgoBench(Bench) Bench = bench_tuner_data["alexnet2_cifar10"] runAlgoBench(Bench) @@ -292,6 +292,36 @@ def runAlgoTuner(): +def runAlgoTunerValidation(): + + + Bench = bench_tuner_data["alexnet_cifar10"] + runAlgoBenchValidate(Bench) + + Bench = bench_tuner_data["mobilenet_shallow"] + runAlgoBenchValidate(Bench) + + Bench = 
bench_tuner_data["mobilenet_cifar10"] + runAlgoBenchValidate(Bench) + + Bench = bench_tuner_data["vgg16_cifar10"] + runAlgoBenchValidate(Bench) + + Bench = bench_tuner_data["lenet_keras"] + runAlgoBenchValidate(Bench) + + Bench = bench_tuner_data["alexnet2_cifar10"] + runAlgoBenchValidate(Bench) + + Bench = bench_tuner_data["vgg16_cifar100"] + runAlgoBenchValidate(Bench) + + Bench = bench_tuner_data["resnet18_cifar10"] + runAlgoBenchValidate(Bench) + + + + if __name__ == "__main__": @@ -305,7 +335,9 @@ if __name__ == "__main__": #runPromiseTuner() - runAlgoTuner() + #runAlgoTuner() + + runAlgoTunerValidation() #runPromiseValidation() diff --git a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/validation.py b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/validation.py index 586d23d70a661558bad0cb8ac75c2367e3e73e2f..06a076103b8052d7321951f39bab738ec6d58358 100644 --- a/llvm/projects/hpvm-tensor-rt/bin/tuner_src/validation.py +++ b/llvm/projects/hpvm-tensor-rt/bin/tuner_src/validation.py @@ -488,7 +488,7 @@ def runPromiseBenchValidation(Bench): def copyValidatedConf(result_dir, validated_confs): src_dir = result_dir + "/promise_tuner/high_confidence/" - dest_dir = result_dir + "/promise_tuner/validated/" + dest_dir = result_dir + "/promise_tuner/validated_test/" if not os.path.isdir(dest_dir): os.mkdir(dest_dir) @@ -501,7 +501,7 @@ def copyValidatedConf(result_dir, validated_confs): def copyFailedConf(result_dir, failed_confs): src_dir = result_dir + "/promise_tuner/high_confidence/" - dest_dir = result_dir + "/promise_tuner/failed/" + dest_dir = result_dir + "/promise_tuner/failed_test/" if not os.path.isdir(dest_dir): os.mkdir(dest_dir) @@ -529,6 +529,9 @@ def validateConfigs(Bench, result_dir, configs_arr, acc_thresh): copyFailedConf(result_dir, failed_confs) + + + def runPromiseBenchValidation2(Bench): @@ -543,3 +546,94 @@ def runPromiseBenchValidation2(Bench): validateConfigs(Bench, Bench.result_dir_2, config_arr2, 2.0) validateConfigs(Bench, Bench.result_dir_3, config_arr3, 3.0) + + + +### NOTE: Algo Tuner Validation routines + + +def readConfidence(target_acc): + + f = open("run_accuracies.txt") + index = 0.0 + unsuccessful = 0.0 + sum_acc = 0.0 + for x in f: + x = x.strip() + acc = float(x) + if acc < target_acc: + unsuccessful += 1 + index += 1 + sum_acc += acc + + f.close() + + confidence = ( (index - unsuccessful) / index) * 100.0 + print ("run_confidence = ", confidence) + avg_acc = sum_acc / index + + return confidence + + + +def invokeBinary(Bench, layer_swings, target_acc, runs): # threshold): + + validation_binary = Bench.promise_binary + + # Write to promise_flags + fout = open("promise_flags", "w+") + for swing in layer_swings: + int_swing = int(swing) + if int_swing > 0: + fout.write(str(swing) + "\n") + fout.close() + + # Execute Validation Run + p = subprocess.Popen("./" + validation_binary, shell=True) + p.wait() + + + + + +def validateAlgoConfigs(Bench, result_dir, configs_arr, acc_thresh): + + # NOTE: Use confidence target as 95% + confidence_target = 95 + # NOTE: 1 run sufficient for software approximations + runs = 1 + + validated_confs = [] + failed_confs = [] + + validation_acc = Bench.validation_accuracy + target_acc = validation_acc - acc_thresh + + for conf in configs_arr: + layer_swings = conf.flags + invokeBinary(Bench, layer_swings, target_acc, runs) # acc_thresh) + confidence = readConfidence(target_acc) + + if confidence >= confidence_target: + validated_confs.append(conf.fname) + else: + failed_confs.append(conf.fname) + + + copyValidatedConf(result_dir, 
validated_confs) + copyFailedConf(result_dir, failed_confs) + + + + +def runAlgoBenchValidate(Bench): + + config_arr1 = loadConfigData(Bench.result_dir_1, 100) + config_arr2 = loadConfigData(Bench.result_dir_2, 100) + config_arr3 = loadConfigData(Bench.result_dir_3, 100) + + + validateAlgoConfigs(Bench, Bench.result_dir_1, config_arr1, 1.0) + validateAlgoConfigs(Bench, Bench.result_dir_2, config_arr2, 2.0) + validateAlgoConfigs(Bench, Bench.result_dir_3, config_arr3, 3.0) + diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc index 66e824f6d098434e140d764edda7cdacd11e110f..241eb4cea8795af05983eb7e7ea7e645b42b9edb 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet2_promise.cc @@ -16,6 +16,9 @@ int to_skip = 5; int main(int argc, char* argv[]){ + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; if (argc > 1){ total_runs = atoi(argv[1]); @@ -29,6 +32,14 @@ int main(int argc, char* argv[]){ to_skip = atoi(argv[3]); } + if(argc > 4){ + test_input_size = atoi(argv[4]); + } + + if(argc > 5){ + offset = atoi(argv[5]); + } + llvm_hpvm_initTensorRt(0); @@ -41,9 +52,6 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 2000; - int batch_size = 1000; - int offset = 5000; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc index 6b951cffcaf142bd917abc7f7c04a2c691c472d7..3777d11718adf573f17348ceb0262641293fd2a5 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/alexnet_promise.cc @@ -17,6 +17,9 @@ int to_skip = 5; int main(int argc, char* argv[]){ + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; if (argc > 1){ total_runs = atoi(argv[1]); @@ -29,23 +32,29 @@ int main(int argc, char* argv[]){ if(argc > 3){ to_skip = atoi(argv[3]); } + + if(argc > 4){ + test_input_size = atoi(argv[4]); + } + + if(argc > 5){ + offset = atoi(argv[5]); + } llvm_hpvm_initTensorRt(0); + int missed = 0; for (int i = 0 ; i < total_runs; i++){ - + if (missed >= to_skip){ break; } startMemTracking(); - int test_input_size = 2000; - int batch_size = 1000; - int offset = 5000; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/lenet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/lenet_promise.cc index 37a30c55002bd2fea2ac4054649869e5cf4b5c6f..2247ccba9fcf08d2f24368907c328f4b77b173b0 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/lenet_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/lenet_promise.cc @@ -11,24 +11,24 @@ #include "../../include/utils.h" -// Piped OpenTuner usage is now deprecated -bool Opentuner_run = false; int total_runs = 1; float bench_acc = 0; int to_skip = 5; +int test_input_size = 2000; +int batch_size = 2000; +int offset = 5000; + /* NOTE: Reference Architecture to use for profiling */ void testLenetTanh(){ - - printf("********* Lenet-2 Architecture ********** \n"); - // FIXIT: Extend this to batch of images - currently 5 images - - int test_batch_size = 1000; - - uint8_t* labels = 
readLabels("../model_params/lenet_params/datasets/t10k-labels-idx1-ubyte", test_batch_size); - + printf("********* Lenet-5 Architecture ********** \n"); + + std::string dir_prefix = std::string("../model_params/lenet_params/"); + std::string input_path = dir_prefix + std::string("input.bin"); + std::string labels_path = dir_prefix + std::string("labels.bin"); + clearTensorMap(); int missed = 0; @@ -38,10 +38,13 @@ void testLenetTanh(){ break; } - void* input = readInputTensor("../model_params/lenet_params/datasets/t10k-images-idx3-ubyte", - CUDNN_DATA_FLOAT, - test_batch_size, 1, 28, 28); + int start = offset; + int end = batch_size + offset; + + // Loading Input Batch + void* input = readInputBatch(input_path.c_str(),0,start,end,1,28,28); + // Loading Weights void* conv1_filter = readTrainedWeights("../model_params/lenet_keras/conv1.bin", float_type, 32, 1, 5, 5); void* conv1_bias = readTrainedWeights("../model_params/lenet_keras/conv1_bias.bin", @@ -60,27 +63,25 @@ void testLenetTanh(){ float_type, 1, 10, 1, 1); - - - readOpenTunerFlags("promise_flags"); // Resets the OpenTuner counters - - + // DNN Operations void* conv1_out = ConvLayer_PROMISE(input, 0,1, conv1_filter, -1,1, conv1_bias, -1,1, 2, 2, 1, 1, 0, 2, 0, -1,1, 9); - void* conv2_out = ConvLayer_PROMISE(conv1_out, -1,1, conv2_filter, -1,1, conv2_bias, -1,1, 2, 2, 1, 1, 0, 2, 0, -1,1, 9); void* fc1_out = FCLayer_PROMISE(conv2_out, -1,1, fc1_weights, -1,1, fc1_bias, -1,1, - 0, -1,1, 9); - + 0, -1,1, 9); void* fc2_out = FCLayer_PROMISE(fc1_out, -1,1, fc2_weights, -1,1, fc2_bias, -1,1, 0, -1,1, 9); void* result = tensorSoftmax(fc2_out); - float accuracy = computeAccuracy2(labels, test_batch_size, result); + + uint8_t* labels = readLabelsBatch(labels_path.c_str(),start,end); + + float accuracy = computeAccuracy2(labels, batch_size, result); + freeOutputTensors(); @@ -110,8 +111,18 @@ int main(int argc, char* argv[]){ if(argc > 3){ to_skip = atoi(argv[3]); } + + if(argc > 4){ + test_input_size = atoi(argv[4]); + batch_size = atoi(argv[4]); + } + + if(argc > 5){ + offset = atoi(argv[5]); + } + llvm_hpvm_initTensorRt(0); testLenetTanh(); @@ -124,43 +135,3 @@ int main(int argc, char* argv[]){ - - - -/* if(Opentuner_run){ - - char* myfifo = "/tmp/myfifo"; - int fd = open(myfifo, O_RDONLY); - - int ret_val = fcntl(fd, F_GETFD); - if(ret_val == -1){ - printf("Invalid descriptor \n"); - abort(); - } - - char str[100]; - read(fd, str, 80); - if(strcmp(str, "stop_run") == 0){ - abort(); - } - - close(fd); - } -*/ - - -/* if(Opentuner_run){ - - char* myfifo = "/tmp/myfifo"; - int fd_out = open(myfifo, O_WRONLY); - int ret_val = fcntl(fd_out, F_GETFD); - if(ret_val == -1){ - printf("Invalid descriptor \n"); - abort(); - } - - const char* str = "completed***!\n\0"; - write(fd_out, str, 80); - close(fd_out); - } -*/ diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc index 052809f29b9d89534005e56125e66c5e4a0bd1cf..45abde0c285c858904dafc54104aec797ca0abf7 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_promise.cc @@ -17,6 +17,9 @@ int to_skip = 5; int main(int argc, char* argv[]){ + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; if (argc > 1){ total_runs = atoi(argv[1]); @@ -30,6 +33,14 @@ int main(int argc, char* argv[]){ to_skip = atoi(argv[3]); } + if(argc > 4){ + test_input_size = atoi(argv[4]); + } + + 
if(argc > 5){ + offset = atoi(argv[5]); + } + llvm_hpvm_initTensorRt(0); @@ -43,9 +54,6 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 2000; - int batch_size = 1000; - int offset = 5000; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc index 42d26d34e65939b410143485a61f23e705906bfc..2585d96530a1c089beb3db8c15ade0a99be25718 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/mobilenet_shallow_promise.cc @@ -17,6 +17,9 @@ int to_skip = 5; int main(int argc, char* argv[]){ + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; if (argc > 1){ total_runs = atoi(argv[1]); @@ -30,6 +33,14 @@ int main(int argc, char* argv[]){ to_skip = atoi(argv[3]); } + if(argc > 4){ + test_input_size = atoi(argv[4]); + } + + if(argc > 5){ + offset = atoi(argv[5]); + } + llvm_hpvm_initTensorRt(0); @@ -41,10 +52,6 @@ int main(int argc, char* argv[]){ } startMemTracking(); - - int test_input_size = 2000; - int batch_size = 1000; - int offset = 5000; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc index 0e5cdd1d284e6c7621cd3331b924c06969be79db..d2e852664e931957b518902881d813acf6692408 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/resnet18_promise.cc @@ -17,6 +17,10 @@ int to_skip = 5; int main(int argc, char* argv[]){ + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; + if (argc > 1){ total_runs = atoi(argv[1]); } @@ -29,6 +33,15 @@ int main(int argc, char* argv[]){ to_skip = atoi(argv[3]); } + if(argc > 4){ + test_input_size = atoi(argv[4]); + } + + if(argc > 5){ + offset = atoi(argv[5]); + } + + llvm_hpvm_initTensorRt(0); @@ -40,10 +53,6 @@ int main(int argc, char* argv[]){ } startMemTracking(); - - int test_input_size = 2000; - int batch_size = 1000; - int offset = 5000; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc index 33c68eae84a075f50b2bc8e7484036c54ade5620..0f4c9cd62adee6df3c93de9d99812fad96f4d650 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar100_promise.cc @@ -18,6 +18,11 @@ int to_skip = 5; int main(int argc, char* argv[]){ + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; + + if (argc > 1){ total_runs = atoi(argv[1]); } @@ -30,6 +35,14 @@ int main(int argc, char* argv[]){ to_skip = atoi(argv[3]); } + if(argc > 4){ + test_input_size = atoi(argv[4]); + } + + if(argc > 5){ + offset = atoi(argv[5]); + } + llvm_hpvm_initTensorRt(0); @@ -43,9 +56,6 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 2000; - int batch_size = 1000; - int offset = 5000; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; diff --git 
a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc index ff767235e9d44139f97ad885aa89eef1c385ad33..a9363acd7614c83f8def2e4df55e23c6f767733e 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/promise/vgg16_cifar10_promise.cc @@ -16,7 +16,12 @@ int to_skip = 5; int main(int argc, char* argv[]){ - + + int test_input_size = 2000; + int batch_size = 1000; + int offset = 5000; + + if (argc > 1){ total_runs = atoi(argv[1]); } @@ -29,6 +34,15 @@ int main(int argc, char* argv[]){ to_skip = atoi(argv[3]); } + if(argc > 4){ + test_input_size = atoi(argv[4]); + } + + if(argc > 5){ + offset = atoi(argv[5]); + } + + llvm_hpvm_initTensorRt(0); int missed = 0; @@ -40,9 +54,6 @@ int main(int argc, char* argv[]){ startMemTracking(); - int test_input_size = 2000; - int batch_size = 1000; - int offset = 5000; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; diff --git a/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/input.bin b/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/input.bin new file mode 100644 index 0000000000000000000000000000000000000000..4d2423f74188cfe0364185ccb66837785ccf4c4e Binary files /dev/null and b/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/input.bin differ diff --git a/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/labels.bin b/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/labels.bin new file mode 100644 index 0000000000000000000000000000000000000000..5e1f3881897f4729d6d90ff208a08ccdabb8fe7c Binary files /dev/null and b/llvm/projects/hpvm-tensor-rt/model_params/lenet_params/labels.bin differ diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h index 2dc985a0c14ebc18a68d5e54f78bd416f9d3b523..ff0b2a8c68d44b26982423d02ea891136ae87d8e 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_api.h @@ -31,8 +31,14 @@ extern "C"{ void* tensorConvPerfCuda(void* input, void* filter, int vertical_pad, int horizontal_pad, int vertical_stride, int horizontal_stride, - int conv_mode, int conv_groups, int row, int col, int start); - + int conv_mode, int conv_groups, + int row, int col, int start); + + void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, + int vertical_stride, int horizontal_stride, + int conv_mode, int conv_groups, + int row, int col, int start); void sampleFilter(Tensor* filter, int skip_rate, int skip_offset); diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h index a81ffe296233178126555bbb53babdcd4192a7bf..0a741316682324ca6270aab2066ebc9f0b48bcdf 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approx_techniques2.h @@ -2,6 +2,36 @@ #include "tensor_utils.cu" +//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols +__global__ void convToGemmApproxHalf(__half * const __restrict__ output, + const __half * const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, const int V_pad, + const int H_pad, const int H_out, 
const int W_out, const int V_stride, + const int H_stride, const int reduced_filter_elem, + const int skip_every) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (C * H_out * W_out); //output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number + const int h = tx % (H_out * W_out) / W_out; //output height index (row number) + const int w = tx % W_out; //output width index (col number) + const int inH = h * V_stride - V_pad; //input height index (row number) + const int inW = w * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? + for(int i = 0; i < KH; i++) { + for(int j = 0; j < KW; j++) { + const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element + if(filter_elem_num % skip_every != skip_every-1) { //are we including this filter element? + const int output_col = filter_elem_num - (filter_elem_num/skip_every); //calculate output column, taking skipping into account + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((n * reduced_filter_elem + output_col) * H_out + h) * W_out + w] = 0; + } + } + } + } +} + //This skips every xth row //H_eff is the number of rows calculated exactly @@ -350,3 +380,477 @@ void* tensorConvPerfCuda(void* input_ptr, void* filter_ptr, return new_output; } + +__global__ +void convToGemmPerfRowHalf(__half * const __restrict__ output, + const __half * const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, const int V_pad, + const int H_pad, const int H_out, const int W_out, const int V_stride, + const int H_stride, const int x, const int start, const int H_eff){ + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (C * H_eff * W_out); //output image number + const int c = tx % (C * H_eff * W_out) / (H_eff * W_out); //output chan number + const int h = tx % (H_eff * W_out) / W_out; //output height index (row number) + const int w = tx % W_out; //output width index (col number) + int past_start = (h % (x - 1) >= (x - 1 - start)); + const int inH = (h / (x - 1) * x + h % (x-1) + + past_start) * V_stride - V_pad; //input height index (row number) + const int inW = w * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? 
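+      //gather the KH x KW input window for this output pixel into the im2col buffer (filter-element-major layout); window positions that fall in the zero-padding region are written as 0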
+ for(int i = 0; i < KH; i++) { + for(int j = 0; j < KW; j++) { + const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element + + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((filter_elem_num * N + n) * H_eff + h) * W_out + w] = 0; + + } + } + } + +} + + +//For use in tensorConvPerfCuda +//Interpolates every xth row starting from x - 1 - start +//N is total number of elements in final output array +__global__ +void approxInterpolateRowHalf(int N, int old_h, int b, int c, int h, int w, + __half *old_data, __half *new_data, int x, int start){ + + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for(int i = index; i < N; i += stride){ + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n = i / (c * h * w); + int past_start = ((row % x) >= (x - 1 - start)); + + if(row == h-1) + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + (old_h - 1) * (w) + col]; + else if (row == 0) + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + 0 * (w) + col]; + else if(row % x == x - 1 - start){ + int past_startO = ((row - 1) % x) > (x - 1 - start); + int oldIdx1 = ch * (b * old_h * w) + n * (old_h * w) + + ((x-1) * ((row - 1) / x) + (row-1) % x - past_startO) * (w) + col; + + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1 * w]), 2); + } + else + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * old_h * w) + n * (old_h * w) + + ((x-1) * (row / x) + row % x - past_start ) * (w) + col]; + + + } + +} + + +//This skips every xth row +//W_eff is the number of cols calculated exactly +__global__ +void convToGemmPerfColHalf(__half * const __restrict__ output, + const __half * const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, const int V_pad, + const int H_pad, const int H_out, const int W_out, const int V_stride, + const int H_stride, const int x, const int start, const int W_eff){ + + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (C * H_out * W_eff); //output image number + const int c = tx % (C * H_out * W_eff) / (H_out * W_eff); //output chan number + const int h = tx % (H_out * W_eff) / W_eff; //output height index (row number) + const int w = tx % W_eff; //output width index (col number) + int past_start = (w % (x - 1)) >= (x - 1 - start); + const int inH = h * V_stride - V_pad; //input height index (row number) + const int inW = (w / (x - 1) * x + w % (x-1) + + past_start) * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? 
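+      //same gather as the row variant, but over the W_eff exactly-computed columns; padded positions are zero-filled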
+ for(int i = 0; i < KH; i++) { + for(int j = 0; j < KW; j++) { + const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element + + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((filter_elem_num * N + n) * H_out + h) * W_eff + w] = 0; + + } + } + } + +} + + +//For use in tensorConvPerfCuda +//Interpolates every xth col starting from x - 1 - start +//N is total number of elements in final output array +__global__ +void approxInterpolateColHalf(int N, int old_w, int b, int c, int h, int w, + __half *old_data, __half *new_data, int x, int start){ + + + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for(int i = index; i < N; i += stride){ + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n = i / (c * h * w); + int past_start = ((col % x) >= (x - 1 - start)); + + if(col == w-1) + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w) + old_w - 1]; + else if (col == 0) + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * (old_w)]; + else if(col % x == x - 1 - start){ + int past_startO = ((col - 1) % x) > (x - 1 - start); + int oldIdx1 = ch * (b * h * old_w) + n * (h * old_w) + row * old_w + + ((x-1) * ((col - 1) / x) + (col-1) % x - past_startO); + + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + __hdiv(__hadd(old_data[oldIdx1], old_data[oldIdx1 + 1]), 2); + } + else + new_data[n * (c * h * w) + ch * (h * w) + row * (w) + col] = + old_data[ch * (b * h * old_w) + n * (h * old_w) + row * old_w + + ((x-1) * (col / x) + col % x - past_start)]; + + } +} + +__global__ +void switchMatrix(int N, int n, int c, int h, int w, __half *old_data, __half *new_data){ + + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < N){ + int col = ((i % (c * h * w)) % (h * w)) % w; + int row = ((i % (c * h * w)) % (h * w)) / w; + int ch = (i % (c * h * w)) / (h * w); + int n_new = i / (c * h * w); + + new_data[((n_new * c + ch) * h + row ) * w + col] = + old_data[((ch * n + n_new) * h + row ) * w + col]; + } + +} + + +__global__ +void createNewFilter(__half *new_filter, __half *old_filter, + int newFilterSize, int oldFilterSize){ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for(int i = index; i < newFilterSize; i += stride){ + new_filter[i] = old_filter[i % oldFilterSize]; + } +} + +__global__ +void createBatches(int n, const __half * matA[], const __half * matB[], __half * matC[], + __half * convData, __half * newFilter, __half * output, + int aStride, int bStride, int cStride){ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for(int i = index; i < n; i += stride){ + matA[i] = &convData[i * aStride]; + matB[i] = &newFilter[i * bStride]; + matC[i] = &output[i * cStride]; + } +} + +//produces N COL MAJOR matrixes with H_out*W_out rows and reduced_filter_elem cols +__global__ void convToGemmApproxHalfN(__half * const __restrict__ output, + const __half * const __restrict input, const int N, const int C, + const int H, const int W, const int KH, const int KW, const int V_pad, + const int H_pad, const int H_out, const int W_out, const int V_stride, + const int 
H_stride, const int reduced_filter_elem, + const int skip_every) { + const int tx = blockDim.x * blockIdx.x + threadIdx.x; //thread id + const int n = tx / (C * H_out * W_out); //output image number + const int c = tx % (C * H_out * W_out) / (H_out * W_out); //output chan number + const int h = tx % (H_out * W_out) / W_out; //output height index (row number) + const int w = tx % W_out; //output width index (col number) + const int inH = h * V_stride - V_pad; //input height index (row number) + const int inW = w * H_stride - H_pad; //input width index (col number) + if(n < N) { //is thread id within bounds? + for(int i = 0; i < KH; i++) { + for(int j = 0; j < KW; j++) { + const int filter_elem_num = (c * KH + i) * KW + j; //index of this filter element + const int output_col = filter_elem_num; //output column (this variant skips no filter elements) + if(inH + i >= 0 && inH + i < H && inW + j >= 0 && inW + j < W) + output[((output_col * N + n) * H_out + h) * W_out + w] = + input[((n * C + c) * H + (inH + i)) * W + (inW + j)]; + else + output[((output_col * N + n) * H_out + h) * W_out + w] = 0; + + } + } + } +} + +//start has to be less than row or less than col +//row and col have to be >= 0 +//row = col = 1 means no perforation +void* tensorConvPerfCudaHalf(void* input_ptr, void* filter_ptr, + int vertical_pad, int horizontal_pad, int vertical_stride, + int horizontal_stride, int conv_mode, int conv_groups, + int row, int col, int start){ + + INFO("*** TensorConvolution half perforation \n"); + + Tensor* input = (Tensor*)input_ptr; + Tensor* filter = (Tensor*)filter_ptr; + //FIXME: Current hack to preserve backward compatibility + if (conv_groups == 0) { + conv_groups = 1; + } + + profileEvent("F2H_start"); + + hostToDeviceCopy(input); + hostToDeviceCopy(filter); + + convertToFP16(input); + convertToFP16(filter); + + /******* END OF INPUT DATA CONVERSIONS*/ + profileEvent("F2H_end"); + + profileEvent("Conv"); + + Tensor* output_half; + int n, c, h, w; // output dimensions + n = input->dims.dim_sizes[0]; + c = filter->dims.dim_sizes[0]; //number of filters + const int KH = filter->dims.dim_sizes[2]; + const int KW = filter->dims.dim_sizes[3]; + + h = (2 * vertical_pad + input->dims.dim_sizes[2] - KH) / vertical_stride + 1; + int h_eff = h - h / row; + if(h % row > row - 1 - start) + h_eff = h_eff - 1; + + w = (2 * horizontal_pad + input->dims.dim_sizes[3] - KW) / horizontal_stride + 1; + int w_eff = w - w / col; + if(w % col > col - 1 - start) + w_eff = w_eff - 1; + + + Tensor *new_output; + if(row > 1){ + output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, CUDNN_TENSOR_NCHW, + n, c, h_eff, w); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output_half, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + //total number of filter elem + const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; + + __half * convData; + int convDataSize = sizeof(__half) * n * num_filter_elem * h_eff * w; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 256; + const int gridSize = (n * input->dims.dim_sizes[1] * h_eff * w + blockSize - 1) / blockSize; + + convToGemmPerfRowHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, + horizontal_pad, h, w, + vertical_stride, horizontal_stride, row, start, h_eff); + + + checkCudaErrors(cudaDeviceSynchronize()); + + const 
__half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha_half = &alf; + const __half *beta_half = &bet; + + checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + n * h_eff * w, c, num_filter_elem, + alpha_half, + convData, CUDA_R_16F, n * h_eff * w, + (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, + (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h_eff * w, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + + + new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(new_output, DEVICE); + + //interpolate + int numBlocks = (n * c * h * w + 255) / 256; + approxInterpolateRowHalf<<<numBlocks,256>>>(n * c * h * w, h_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, + row, start); + cudaDeviceSynchronize(); + + cudaFree(output_half); + cudaFree(convData); + } + else if(col > 1){ + output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, + CUDNN_TENSOR_NCHW, n, c, h, w_eff); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output_half, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + //total number of filter elem + const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; + + __half * convData; + int convDataSize = sizeof(__half) * n * num_filter_elem * h * w_eff; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 256; + const int gridSize = (n * input->dims.dim_sizes[1] * h * w_eff + blockSize - 1) / blockSize; + + convToGemmPerfColHalf<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, + horizontal_pad, h, w, + vertical_stride, horizontal_stride, col, start, w_eff); + + + checkCudaErrors(cudaDeviceSynchronize()); + + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha_half = &alf; + const __half *beta_half = &bet; + + + checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + n * h * w_eff, c, num_filter_elem, + alpha_half, + convData, CUDA_R_16F, n * h * w_eff, + (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, + (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w_eff, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + + + new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(new_output, DEVICE); + + //interpolate + int numBlocks = (n * c * h * w + 255) / 256; + approxInterpolateColHalf<<<numBlocks,256>>>(n * c * h * w, w_eff, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data, + col, start); + + cudaDeviceSynchronize(); + + cudaFree(output_half); + cudaFree(convData); + + } + else{ + output_half = (Tensor*)create4DTensor((cudnnDataType_t) half_type, + CUDNN_TENSOR_NCHW, c, n, h, w); + + // NOTE: Changing output tensor placement from host to device + changeTensorPlacement(output_half, DEVICE); + // NOTE: Necessary to insert the above call for every output tensor + //total number of filter elem + const int num_filter_elem = KH * KW * input->dims.dim_sizes[1]; + + __half * convData; + int 
convDataSize = sizeof(__half) * n * num_filter_elem * h * w; + checkCudaErrors(cudaMalloc(&convData, convDataSize)); + + const int blockSize = 256; + const int gridSize = (n * input->dims.dim_sizes[1] * h * w + blockSize - 1) / blockSize; + convToGemmApproxHalfN<<<gridSize, blockSize>>>(convData, (__half *)input->gpu_half_data, n, + input->dims.dim_sizes[1], input->dims.dim_sizes[2], + input->dims.dim_sizes[3], KH, KW, vertical_pad, horizontal_pad, h, w, + vertical_stride, horizontal_stride, num_filter_elem, c * h * w); + checkCudaErrors(cudaDeviceSynchronize()); + //Do the matrix multiplication. Want to multiply convData by filter->gpu_data[f * chan * KH * KW] + const __half alf = approx_float_to_half(1.0); + const __half bet = approx_float_to_half(0.0); + const __half *alpha_half = &alf; + const __half *beta_half = &bet; + + checkCudaErrors(cublasGemmEx(cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N, + n * h * w, c, num_filter_elem, + alpha_half, + convData, CUDA_R_16F, n * h * w, + (__half*) filter->gpu_half_data, CUDA_R_16F, num_filter_elem, + beta_half, + (__half*) output_half->gpu_half_data, CUDA_R_16F, n * h * w, + CUDA_R_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP) ); + + + + // profileEvent("gemm_end", true); + new_output = (Tensor*)create4DTensor((cudnnDataType_t) half_type, + CUDNN_TENSOR_NCHW, n, c, h, w); + changeTensorPlacement(new_output, DEVICE); + + + int numBlocks = (n * c * h * w + 255) / 256; + switchMatrix<<<numBlocks,256>>>(n * c * h * w, n, c, h, w, + (__half *)output_half->gpu_half_data, + (__half *)new_output->gpu_half_data); + + checkCudaErrors(cudaDeviceSynchronize()); + + cudaFree(convData); + cudaFree(output_half); + } + + profileEvent("Conv_end", true); + + profileEvent("H2F_start"); + + convertToFP32(new_output); + + profileEvent("H2F_end"); + + + #ifdef ERROR_INJECTION_ENABLED + if (op_counter >= total_ops) { + ERROR("No accuracy flag found \n"); + } + int op_acc = op_accuracies[op_counter]; + // Skip errorInjection if explicitly requested + if (skip_tensors.find(op_counter) != skip_tensors.end()) { + op_acc = 0; + } + void* error_norms = tensorAddError(new_output, op_acc); + add_norms(error_norms, "tensorConv", op_acc); + add_conv_overheads(input, filter, vertical_stride, horizontal_stride, op_acc); + op_counter++; + #endif + return new_output; +} + diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..dc5cddf8a2121a937dbe8cd4582fe1022fd99f48 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/approxhpvm_img_runtime_utils.h @@ -0,0 +1,238 @@ +#ifndef APPROXHPVM_IMG_RUNTIME_UTILS +#define APPROXHPVM_IMG_RUNTIME_UTILS + +#include "configuration.h" +#include "hpvm-rt-controller.h" + +#include "img_tensor_runtime.h" + + +// Utilities header for ApproxHPVM image runtime API (wrapper runtime API) + +void* handleTensorFftApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { + + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorFft(input); //TODO: correct name here + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + 
RC->addToCurrentIterationComputeTime("tensorFft", pinfo.first); // and here + RC->addToCurrentIterationComputeEnergy("tensorFft", pinfo.second); // and here + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } + return NULL; +} + +void* handleTensorReduceApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorReduce(input); //TODO: correct name here + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorReduce", pinfo.first); // and here + RC->addToCurrentIterationComputeEnergy("tensorReduce", pinfo.second); // and here + return t_out; + } + case GPUNodeConfiguration::APPROX::REDUCTION_SAMPLING : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorReductionSamplingReduce(input); //TODO: correct name here + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorReductionSamplingReduce", + pinfo.first); // and here + RC->addToCurrentIterationComputeEnergy("tensorReductionSamplingReduce", + pinfo.second); // and here + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } + return NULL; +} + +void* handleTensorProjectiveTApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorProjectiveT(input); //TODO: correct name here + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorProjectiveT", pinfo.first); // and here + RC->addToCurrentIterationComputeEnergy("tensorProjectiveT", pinfo.second); // and here + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } + return NULL; +} + +void* handleTensorMap1ApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* 
t_out; + RC->resume_profiler(); + t_out = tensorMap1(input); //TODO: correct name here + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorMap1", pinfo.first); // and here + RC->addToCurrentIterationComputeEnergy("tensorMap1", pinfo.second); // and here + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } + return NULL; +} + +void* handleTensorMap2ApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorMap2(input); //TODO: correct name here + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorMap2", pinfo.first); // and here + RC->addToCurrentIterationComputeEnergy("tensorMap2", pinfo.second); // and here + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } + return NULL; +} + +void* handleTensorMap3ApproximationTuples( + std::vector< std::pair<GPUNodeConfiguration::APPROX, int> > &approxTuples, + void* input) { + if (approxTuples.size() == 1) { + enum GPUNodeConfiguration::APPROX approx = approxTuples[0].first; + int param = approxTuples[0].second; + switch (approx) { + case GPUNodeConfiguration::APPROX::FP32 : + { + void* t_out; + RC->resume_profiler(); + t_out = tensorMap3(input); //TODO: correct name here + RC->pause_profiler(); + std::pair<double, double> pinfo = RC->get_time_energy(); + RC->reset_profiler(); + RC->addToCurrentIterationComputeTime("tensorMap3", pinfo.first); // and here + RC->addToCurrentIterationComputeEnergy("tensorMap3", pinfo.second); // and here + return t_out; + } + default : + CUSTOM_ASSERT(false && "Unknown approximation type"); + ERROR("Unknown approximation type"); + abort(); + // TODO additional approx methods implemented here + } + } else if (approxTuples.size() == 2) { + ERROR("Currently unsupported case"); + abort(); + } else { + ERROR("Unsupported case"); + abort(); + } + return NULL; +} + + +#endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..9c098719e52e31fcd06b6425964c8e1d48a15210 --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h @@ -0,0 +1,26 @@ +#ifndef IMG_TENSOR_RUNTIME_H +#define IMG_TENSOR_RUNTIME_H + +// *** Runtime declaration *** // +void* tensorFft(void* input); +void* tensorReduce(void* input); +void* tensorReductionSamplingReduce(void* input); +void* tensorProjectiveT(void* input); +void* tensorMap1(void* input); +void* tensorMap2(void* 
input); +void* tensorMap3(void* input); + +// *** Wrapper API declaration *** // +void* wrapper_tensorFft(const char* hpvm_node_id, void* input); +void* wrapper_tensorReduce(const char* hpvm_node_id, void* input); +void* wrapper_tensorProjectiveT(const char* hpvm_node_id, void* input); +void* wrapper_tensorMap1(const char* hpvm_node_id, void* input); +void* wrapper_tensorMap2(const char* hpvm_node_id, void* input); +void* wrapper_tensorMap3(const char* hpvm_node_id, void* input); + +// Tentative +void* wrapper_tensorStencil(const char* hpvm_node_id, void* input); +void* wrapper_tensorCosineT(const char* hpvm_node_id, void* input); + + +#endif diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h index 6041bb4de989be20acef973ac7f632b838a097a4..06c492c9e8fb45e0a51de153e8cf434a79a50e23 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_runtime.h @@ -10,6 +10,7 @@ #include "tensor.h" #include "rt-controller-api.h" +#include "img_tensor_runtime.h" #ifndef CUDNN_HEADER #define CUDNN_HEADER diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu new file mode 100644 index 0000000000000000000000000000000000000000..0460e490fd2b188b85f53cf9b109f09ac3d6b83a --- /dev/null +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu @@ -0,0 +1,144 @@ +#include "../include/debug.h" +#include "../include/img_tensor_runtime.h" +#include "../include/approxhpvm_img_runtime_utils.h" + +// *** Runtime implementation *** // +void* tensorFft(void* input) { + // TODO: unimplemented stub; returns NULL for now + return NULL; +} + +void* tensorReduce(void* input) { + // TODO: unimplemented stub; returns NULL for now + return NULL; +} + +void* tensorReductionSamplingReduce(void* input) { + // TODO: unimplemented stub; returns NULL for now + return NULL; +} + +void* tensorProjectiveT(void* input) { + // TODO: unimplemented stub; returns NULL for now + return NULL; +} + +void* tensorMap1(void* input) { + // TODO: unimplemented stub; returns NULL for now + return NULL; +} + +void* tensorMap2(void* input) { + // TODO: unimplemented stub; returns NULL for now + return NULL; +} + +void* tensorMap3(void* input) { + // TODO: unimplemented stub; returns NULL for now + return NULL; +} + + +// *** Wrapper API implementation *** // + +void* wrapper_tensorFft(const char* hpvm_node_id, void* input) { + GPUNodeConfiguration *GPUConf = + (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); + std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, + std::vector< std::pair<GPUNodeConfiguration::APPROX, + int> > > > &ApproxChoices = + GPUConf->getApproxChoices(); + // Approximation choices must be for a fft operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::FFT && + "Invalid configuration generated for tensor fft wrapper operation"); + return handleTensorFftApproximationTuples(ApproxChoices[0].second, + input); +} + +void* wrapper_tensorReduce(const char* hpvm_node_id, void* input) { + GPUNodeConfiguration *GPUConf = + (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); + std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, + std::vector< std::pair<GPUNodeConfiguration::APPROX, + int> > > > &ApproxChoices = + GPUConf->getApproxChoices(); + // Approximation choices must be for a reduce operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::REDUCE && + "Invalid configuration generated for tensor reduce wrapper operation"); + return handleTensorReduceApproximationTuples(ApproxChoices[0].second, + input); +} + +void* wrapper_tensorProjectiveT(const char* hpvm_node_id, void* input) { + GPUNodeConfiguration *GPUConf = + 
(GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); + std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, + std::vector< std::pair<GPUNodeConfiguration::APPROX, + int> > > > &ApproxChoices = + GPUConf->getApproxChoices(); + // Approximation choices must be for a projectiveT operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::PROJECTIVE_T && + "Invalid configuration generated for tensor projectiveT wrapper operation"); + return handleTensorProjectiveTApproximationTuples(ApproxChoices[0].second, + input); +} + +void* wrapper_tensorMap1(const char* hpvm_node_id, void* input) { + GPUNodeConfiguration *GPUConf = + (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); + std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, + std::vector< std::pair<GPUNodeConfiguration::APPROX, + int> > > > &ApproxChoices = + GPUConf->getApproxChoices(); + // Approximation choices must be for a map1 operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP1 && + "Invalid configuration generated for tensor map1 wrapper operation"); + return handleTensorMap1ApproximationTuples(ApproxChoices[0].second, + input); +} + +void* wrapper_tensorMap2(const char* hpvm_node_id, void* input) { + GPUNodeConfiguration *GPUConf = + (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); + std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, + std::vector< std::pair<GPUNodeConfiguration::APPROX, + int> > > > &ApproxChoices = + GPUConf->getApproxChoices(); + // Approximation choices must be for a map2 operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP2 && + "Invalid configuration generated for tensor map2 wrapper operation"); + return handleTensorMap2ApproximationTuples(ApproxChoices[0].second, + input); +} + +void* wrapper_tensorMap3(const char* hpvm_node_id, void* input) { + GPUNodeConfiguration *GPUConf = + (GPUNodeConfiguration *)RC->getNodeConfiguration(hpvm_node_id); + std::vector< std::pair< GPUNodeConfiguration::TENSOR_OP, + std::vector< std::pair<GPUNodeConfiguration::APPROX, + int> > > > &ApproxChoices = + GPUConf->getApproxChoices(); + // Approximation choices must be for a map3 operation + CUSTOM_ASSERT(ApproxChoices.size() == 1 && + ApproxChoices[0].first == GPUNodeConfiguration::TENSOR_OP::MAP3 && + "Invalid configuration generated for tensor map3 wrapper operation"); + return handleTensorMap3ApproximationTuples(ApproxChoices[0].second, + input); +} + +// Tentative +void* wrapper_tensorStencil(const char* hpvm_node_id, void* input) { + ERROR("Stencil operation currently unsupported.\n"); + abort(); +} + +void* wrapper_tensorCosineT(const char* hpvm_node_id, void* input) { + ERROR("CosineT operation currently unsupported.\n"); + abort(); +} + diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu index 9e58f36a402844c33a1cb665ae4113e6e6a8534f..cc2a5d6ba91ff3c7bae29980340142656e2ef62b 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/tensor_runtime.cu @@ -49,6 +49,8 @@ #include "../include/approx_simulation.h" +// Image tensor runtime implementation +#include "img_tensor_runtime.cu" //** Potential Improvements: // 1) Add support for datatypes beyond floats and half diff --git 
a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp index d92bc0c45d1115620d529aea4636ece8d3d62127..50732550db8c8f02c940e485702c3253a7bb9760 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/src/alexnet_loop.cpp @@ -440,7 +440,7 @@ int main(){ int test_input_size = 10000; int batch_count = test_input_size / batch_size; - void* input = create4DTensor(0,nchw,batch_size,3,32,32); + // void* input = create4DTensor(0,nchw,batch_size,3,32,32); startMemTracking(); startProfiling(); @@ -450,8 +450,13 @@ int main(){ int start = i * batch_size; int end = (i + 1) * batch_size; - copyInputBatch(input_path.c_str(),start,end,3,32,32, input); - + // copyInputBatch(input_path.c_str(),start,end,3,32,32, input); + + // Replaced create4DTensor and copyInputBatch with readInputBatch + void* input = readInputBatch(input_path.c_str(), 0, + start, end, + 3, 32, 32); + args->input = input; args->input_bytes = 0; diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile index 30c80f2a1a65ad122681b76e531ed5d99ec8a12b..914c9817d64cd0e55baababe78fd7ae6e33c34ab 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile @@ -60,18 +60,17 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_loop.ll -S -o $(BUILD_DIR)/$(APP)_loop.visc.ll $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP).visc.ll -o $(BUILD_DIR)/$(APP)_cudnn.bc - $(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc + #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc - $(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc + #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked 
$(LINKER_FLAGS) - #$(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_AUTOTUNER_DIR) -o $(BUILD_DIR)/lenet_tune $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/src/alexnet2_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/src/alexnet2_loop.cpp index 03ffd82ee7df6240e565397d099e37aecd9dad9e..91bf3b0c4523e7239d7f11f6ad350f9dbb454a91 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/src/alexnet2_loop.cpp +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/src/alexnet2_loop.cpp @@ -491,16 +491,21 @@ int main(){ int batch_count = test_input_size / batch_size; std::string input_path = dir_prefix + std::string("input.bin"); - void* input = create4DTensor(0,nchw,batch_size,3,32,32); + // void* input = create4DTensor(0,nchw,batch_size,3,32,32); - startMemTracking(); + startProfiling(); + for (int i = 0; i < batch_count; i++){ int start = i * batch_size; int end = (i + 1) * batch_size; - copyInputBatch(input_path.c_str(),start,end,3,32,32, input); + // copyInputBatch(input_path.c_str(),start,end,3,32,32, input); + + void* input = readInputBatch(input_path.c_str(), 0, + start, end, + 3, 32, 32); args->input = input; args->input_bytes = 0; @@ -512,19 +517,12 @@ int main(){ void *result = static_cast<RootIn*>(args)->input; hpvm_request_tensor(result, 0); - -// uint32_t* labels = readLabelsBatch3(labels_path.c_str(),start,end); -// -// computeAccuracy3(labels, result); - llvm_hpvm_invokeRtControl(result, labels_path.c_str(), start, end); freeBatchMemory(); } - - + stopProfiling(); __visc__cleanup(); - return 0; }